### 클러스터링 기반

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition.truncated_svd import TruncatedSVD
from sklearn.cluster import KMeans, MiniBatchKMeans
from scipy.spatial.distance import euclidean
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [3]:
class movie_recommendation_cluster:
    """Cluster-based movie recommender.

    Movies are embedded from their pre-processed plot text (TF-IDF followed
    by truncated SVD) and grouped with mini-batch K-means.  Movies sharing a
    cluster with the queried title are ranked by a weighted sum of three
    min-max scaled scores: closeness in SVD space, genre cosine similarity
    and an IMDB-style weighted vote average.

    The data frame is expected to provide the columns ``title``, ``genre``
    (``|``-separated), ``preprocessed_plot``, ``vote_count`` and ``rating``;
    these are the columns the methods below read.  The code assumes the
    frame's index labels are unique and numeric.

    Keyword arguments (all optional):
        topn: number of rows returned by ``getMovies`` (default 10).
        data: data frame to use; when omitted ``./your/data/here.csv`` is read.
        a, b, c: weights of the distance, genre and vote scores
            (defaults 0.8, 0.1, 0.1).
        n_clusters: number of K-means clusters (default 30).
        n_components: requested SVD dimensionality (default 500).
        vote_thres: only movies with ``vote_count`` above this value are
            clustered (default 100).
        verbose: ``1`` prints parameters and the cluster size table.
        re_cluster: when truthy, clustering is repeated (at most 20 extra
            times) while the queried title lands in one of the three
            largest clusters.
        batch_size, max_iter: passed to ``MiniBatchKMeans``.
        stopwords_path: stop-word file
            (default ``./stopwords/total_stopwords``).
    """

    def __init__(self, **kargs):
        self.topn = kargs.get('topn', 10)
        # Read the default CSV only when no frame was supplied; using
        # read_csv as the `.get` default would hit the disk on every call.
        data = kargs.get('data')
        self.df = pd.read_csv('./your/data/here.csv') if data is None else data
        self.a, self.b, self.c = kargs.get('a', 0.8), kargs.get('b', 0.1), kargs.get('c', 0.1)
        self.n_clusters = kargs.get('n_clusters', 30)  # kmeans
        self.n_components = kargs.get('n_components', 500)  # svd
        self.vote_thres = kargs.get('vote_thres', 100)  # vote_count
        self.verbose = kargs.get('verbose', 1)
        self.re_cluster = kargs.get('re_cluster', 1)  # kmeans
        self.batch_size = kargs.get('batch_size', 2000)
        self.max_iter = kargs.get('max_iter', 500)

        # min_df=1 keeps every term that occurs at all, which is what the
        # previous min_df=0 meant, and passes scikit-learn's validation.
        self.cvec = CountVectorizer(min_df=1, ngram_range=(1, 2))

        # Load every stop word in the file, not just the first line.
        # NOTE(review): assumes whitespace/newline separated tokens - confirm
        # against the actual stop-word file format.
        stopwords_path = kargs.get('stopwords_path', './stopwords/total_stopwords')
        with open(stopwords_path, encoding='utf-8') as f:
            self.stops = f.read().split()

        if self.verbose == 1:
            print('-'*35)
            print('# Parameters')
            print('      a, b, c        : {0}, {1}, {2}'.format(self.a, self.b, self.c))
            print('vote count threshold :', self.vote_thres)
            print("n_components of SVD  :", self.n_components)
            print("n_clusters of KMeans :", self.n_clusters)
            print('batch_size of Kmeans :', self.batch_size)
            print('max_iter of Kmeans   :', self.max_iter)
            print('weighted_sum = dist_scaled*{0}(a) + genre_scaled*{1}(b) + wvote_scaled*{2}(c)'.format(self.a, self.b, self.c))
            print('-'*35)

    def search_title(self, title_name):
        """Return the ``title`` values that contain ``title_name``.

        ``title_name`` is interpreted by ``Series.str.contains`` (a regular
        expression by default).  Rows with a missing title never match.
        """
        mask = self.df['title'].str.contains(title_name, na=False)
        return self.df.loc[mask, 'title']

    def genre_sim_sorted(self, title_idx):
        """Return genre cosine similarities between ``title_idx`` and all rows.

        Args:
            title_idx: index label of the reference movie in ``self.df``.

        Returns:
            Array of shape ``(len(self.df), 2)`` whose columns are the index
            label of each movie and its similarity to the reference movie.
        """
        genre_literal = (self.df['genre'].fillna('').astype(str)
                         .str.replace('|', ' ', regex=False))
        genre = self.cvec.fit_transform(genre_literal)

        # Only the reference row is needed, so compare that single row with
        # the whole matrix instead of building the full N x N similarity.
        row = int(self.df.index.get_loc(title_idx))
        sims = cosine_similarity(genre[row:row + 1], genre).ravel()

        # Pair similarities with index labels (not row positions) so the
        # result lines up with the labels used by the other methods.
        return np.column_stack((self.df.index.to_numpy(), sims))

    def raw_to_tfidf(self, data_preprocess):
        """Vectorize an iterable of plot strings into a TF-IDF matrix."""
        tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), stop_words=self.stops,
                                min_df=3, max_df=0.95, max_features=10000)
        return tfidf.fit_transform(data_preprocess)

    def tfidf_to_svd(self, data_tfidf):
        """Reduce a TF-IDF matrix with truncated SVD.

        The number of components is capped just below the number of TF-IDF
        features so a small corpus does not request more components than
        exist.
        """
        n_features = data_tfidf.shape[1]
        n_components = max(1, min(self.n_components, n_features - 1))
        svd = TruncatedSVD(n_components=n_components, n_iter=10)
        return svd.fit_transform(data_tfidf)

    def similar_cluster_movies(self, title_idx):
        """Cluster the movies and return those sharing a cluster with one title.

        Args:
            title_idx: index label of the queried movie in ``self.df``.

        Returns:
            A pair ``(closest, movies)``: ``closest`` is an array of shape
            ``(k, 2)`` with columns (index label, Euclidean distance to the
            queried movie in SVD space); ``movies`` holds the corresponding
            rows of ``self.df`` ordered by increasing distance.

        Raises:
            ValueError: if the queried movie does not pass the vote-count
                threshold, or if no other movie shares its cluster.
        """
        # data preprocessing
        data_tfidf = self.raw_to_tfidf(list(map(str, self.df['preprocessed_plot'].values)))
        data_svd = self.tfidf_to_svd(data_tfidf)

        # Only movies above the vote threshold take part in the clustering.
        keep_mask = (self.df['vote_count'] > self.vote_thres).to_numpy()
        kept_labels = self.df.index[keep_mask]
        if title_idx not in kept_labels:
            raise ValueError('The selected title has vote_count <= vote_thres '
                             'and is therefore not part of the clustering.')
        kept_svd = data_svd[keep_mask]
        title_pos = kept_labels.get_loc(title_idx)

        # K-means clustering
        print('Clustering...')
        loop_cnt = 0
        while True:
            kmeans = MiniBatchKMeans(n_clusters=self.n_clusters, batch_size=self.batch_size,
                                     max_iter=self.max_iter, verbose=0, n_init=3)
            labels = kmeans.fit(kept_svd).labels_
            clusters = np.bincount(labels, minlength=self.n_clusters)
            title_label = labels[title_pos]

            # (optional) avoid the three biggest clusters
            bad_clusters = clusters.argsort()[-3:]
            if not self.re_cluster or title_label not in bad_clusters:
                break
            if loop_cnt >= 20:
                print('Loop count exceeded')
                break
            loop_cnt += 1
            print('Re-clustering...(%d)' % (loop_cnt))

        if self.verbose == 1:
            print('-'*35)
            print('# K-means clustering distribution')
            for i, size in enumerate(clusters):
                postfix = '<==' if i == title_label else ''
                print('cluster #%3d : %4d items %s' % (i, size, postfix))
            print('-'*35)

        # Other members of the queried movie's cluster, in data-frame order.
        member_pos = np.flatnonzero(labels == title_label)
        member_pos = member_pos[member_pos != title_pos]
        if member_pos.size == 0:
            raise ValueError('No other movie shares a cluster with the selected title.')

        distances = np.linalg.norm(kept_svd[member_pos] - kept_svd[title_pos], axis=1)
        member_labels = kept_labels[member_pos]
        closest = np.column_stack((member_labels.to_numpy(), distances))

        # Stable sort keeps data-frame order among equally distant movies.
        order = np.argsort(distances, kind='stable')
        return closest, self.df.loc[member_labels[order]]

    def result_by_weights(self, dataf):
        """Add ``weighted_sum`` to ``dataf`` and return it sorted descending.

        ``weighted_sum`` is ``dist_scaled*a + genre_scaled*b + wvote_scaled*c``.
        The column is written into ``dataf`` itself; the sorted frame that is
        returned is a new object.
        """
        dataf['weighted_sum'] = (dataf['dist_scaled'] * self.a
                                 + dataf['genre_scaled'] * self.b
                                 + dataf['wvote_scaled'] * self.c)

        return dataf.sort_values('weighted_sum', ascending=False)

    def getMovies(self, title):
        """Return the ``topn`` recommendations for the movie named ``title``.

        Args:
            title: exact value of the ``title`` column; the first matching
                row is used.

        Returns:
            Data frame of the best ``self.topn`` cluster neighbours sorted by
            ``weighted_sum``, including the intermediate score columns.

        Raises:
            ValueError: if no row has this exact title (or if clustering
                cannot produce neighbours, see ``similar_cluster_movies``).
        """
        # no title result
        matches = self.df.index[self.df['title'] == title]
        if len(matches) == 0:
            raise ValueError('There is no such title name. Search with "search_title" function')
        title_idx = matches[0]

        # get movies in same cluster
        dist, result = self.similar_cluster_movies(title_idx)

        # merge with distance
        result = pd.merge(result, pd.Series(dist[:, 1], name='dist'),
                          left_on=result.index, right_on=dist[:, 0])
        result.rename(columns={'key_0': 'idx'}, inplace=True)

        # IMDB's weighted_vote: blend each rating with the mean rating c,
        # trusting the rating more as vote_count grows relative to m.
        c = result['rating'].mean()
        m = result['vote_count'].quantile(.6)
        v, r = result['vote_count'], result['rating']
        result['weighted_vote'] = (v / (v + m)) * r + (m / (m + v)) * c

        # merge with genre
        genre_sim = self.genre_sim_sorted(title_idx)
        result_with_genre = pd.merge(result, pd.Series(genre_sim[:, 1], name='genre_sim'),
                                     left_on=result['idx'], right_on=genre_sim[:, 0])

        # minmax scale (flattened so each score is stored as a plain column)
        def scaled(values):
            column = np.asarray(values, dtype=float).reshape(-1, 1)
            return MinMaxScaler().fit_transform(column).ravel()

        result_with_genre['wvote_scaled'] = scaled(result_with_genre['weighted_vote'])
        result_with_genre['genre_scaled'] = scaled(result_with_genre['genre_sim'])
        # Invert distances so the closest movie gets the highest score.
        result_with_genre['dist_scaled'] = scaled(result_with_genre['dist'].max() - result_with_genre['dist'])

        # (optional) remove data with 0 genre score
        no_genre_score_idx = result_with_genre[result_with_genre['genre_sim'] == 0].index
        result_with_genre.drop(no_genre_score_idx, inplace=True)

        result_with_genre = self.result_by_weights(result_with_genre)
        return result_with_genre.head(self.topn)

In [4]:
# Build the recommender with default parameters; re_cluster=True asks it to
# repeat K-means while the queried title falls into one of the largest clusters.
recom = movie_recommendation_cluster(re_cluster=True)

-----------------------------------
# Parameters
      a, b, c        : 0.8, 0.1, 0.1
vote count threshold : 100
n_components of SVD  : 500
n_clusters of KMeans : 30
batch_size of Kmeans : 2000
max_iter of Kmeans   : 500
weighted_sum = dist_scaled*0.8(a) + genre_scaled*0.1(b) + wvote_scaled*0.1(c)
-----------------------------------


In [5]:
# Recommend movies for the exact title '라라랜드'; the call prints the cluster
# size table and marks the cluster containing the queried title with '<=='.
result = recom.getMovies('라라랜드')

Clustering...
-----------------------------------
# K-means clustering distribution
cluster #  0 :   72 items 
cluster #  1 :   46 items 
cluster #  2 :   49 items 
cluster #  3 :   14 items 
cluster #  4 :  107 items 
cluster #  5 :  228 items 
cluster #  6 :  176 items 
cluster #  7 :   51 items 
cluster #  8 :   79 items 
cluster #  9 :  196 items 
cluster # 10 :   45 items 
cluster # 11 :  124 items 
cluster # 12 :   29 items 
cluster # 13 :  105 items 
cluster # 14 :  165 items 
cluster # 15 :   14 items 
cluster # 16 :  156 items 
cluster # 17 :   24 items 
cluster # 18 :   90 items <==
cluster # 19 :   60 items 
cluster # 20 :  147 items 
cluster # 21 :  569 items 
cluster # 22 :   25 items 
cluster # 23 :   42 items 
cluster # 24 : 2125 items 
cluster # 25 :    9 items 
cluster # 26 :   26 items 
cluster # 27 :   33 items 
cluster # 28 :    9 items 
cluster # 29 :  181 items 
-----------------------------------


In [6]:
# Show the final score and its three scaled components for each recommendation.
result[['weighted_sum','title','dist_scaled','genre_scaled','wvote_scaled']]

Unnamed: 0,weighted_sum,title,dist_scaled,genre_scaled,wvote_scaled
0,0.944647,풀잎들,1.0,0.866025,0.580445
1,0.90901,천국의 계단,0.952551,0.866025,0.603665
2,0.906568,올 더 킹즈 맨,0.951658,0.866025,0.586395
3,0.8702,머시니스트,0.940601,0.5,0.677193
6,0.868231,일일시호일,0.877278,0.866025,0.798066
5,0.856161,감각의 제국,0.910575,0.774597,0.502417
8,0.808294,윈터 슬립,0.826365,0.866025,0.605994
10,0.797094,사랑에 대한 모든 것,0.788572,1.0,0.662364
12,0.79166,워크 투 리멤버,0.760409,1.0,0.833333
17,0.760394,너의 이름은.,0.734046,0.774597,0.956976


In [18]:
# Show the score columns again.
# NOTE(review): the titles printed below differ from the previous cell, so
# `result` was presumably reassigned by a getMovies call not shown here.
result[['weighted_sum','title','dist_scaled','genre_scaled','wvote_scaled']]

Unnamed: 0,weighted_sum,title,dist_scaled,genre_scaled,wvote_scaled
0,0.922174,아이언맨,1.0,0.338062,0.883676
1,0.820435,아이언맨 3,0.789447,1.0,0.888776
3,0.720282,레지던트 이블,0.743657,0.596285,0.657284
6,0.718579,매트릭스 2 - 리로디드,0.708401,0.845154,0.67343
2,0.706637,엑스맨 2 - 엑스투,0.747227,0.507093,0.581464
4,0.679487,블랙 팬서,0.734005,0.4,0.522834
32,0.675552,스타워즈 에피소드 3 - 시스의 복수,0.618427,1.0,0.808105
17,0.675446,어벤져스: 인피니티 워,0.663013,0.507093,0.943256
8,0.659528,로보캅,0.704137,0.338062,0.624128
5,0.65702,그린 존,0.730919,0.169031,0.553817


In [13]:
# Show the score columns again.
# NOTE(review): the titles printed below differ from the earlier cells, so
# `result` presumably comes from another getMovies call not shown here.
result[['weighted_sum','title','dist_scaled','genre_scaled','wvote_scaled']]

Unnamed: 0,weighted_sum,title,dist_scaled,genre_scaled,wvote_scaled
2,0.74287,이니셜 D - 극장판,0.860964,0.436436,0.104553
7,0.685518,니드 포 스피드,0.72422,0.436436,0.624982
6,0.664949,스피드 레이서,0.728113,0.333333,0.491258
9,0.656259,패스트 & 퓨리어스 2,0.693833,0.333333,0.678588
8,0.631071,패스트 & 퓨리어스 - 도쿄 드리프트,0.702729,0.258199,0.430677
11,0.619151,와즈다,0.612536,0.57735,0.713876
14,0.549487,드리븐,0.512179,0.774597,0.622841
17,0.37599,로건 럭키,0.345293,0.436436,0.561119
23,0.287251,러시 : 더 라이벌,0.110163,1.0,0.991212
20,0.275992,DOA,0.243763,0.57735,0.232467
