### 벡터 기반

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [3]:
class movie_recommendation_vec:
    """Recommend movies by blending plot, genre and vote scores.

    The data frame held in ``self.df`` must provide the columns ``title``,
    ``genre`` ('|'-separated), ``preprocessed_plot``, ``vote_count`` and
    ``rating``.

    Keyword arguments (all optional):
        topn: number of rows returned by ``getMovies`` (default 10).
        vec_type: 'tfidf' or 'count', the plot vectorizer (default 'tfidf').
        data: data frame to use; when omitted './your/data/here.csv' is read.
        a, b, c: weights of the scaled plot similarity, the scaled weighted
            vote and the scaled genre similarity (defaults 0.8, 0.1, 0.1).
        vote_thres: only movies with ``vote_count`` above this are kept
            (default 100).
        verbose: 1 prints a parameter summary on construction (default 1).

    Raises:
        ValueError: if ``vec_type`` is neither 'tfidf' nor 'count'.
    """

    def __init__(self, **kargs):
        self.topn = kargs.get('topn', 10)
        self.vec_type = kargs.get('vec_type', 'tfidf')
        # Validate before any file is touched so a typo fails immediately.
        if self.vec_type not in ('tfidf', 'count'):
            raise ValueError('vec_type is not in [tfidf, count]')

        # Read the default CSV only when no frame was supplied; a default
        # passed to dict.get() would be evaluated on every call.
        supplied = kargs.get('data')
        self.df = supplied if supplied is not None else pd.read_csv('./your/data/here.csv')

        self.a, self.b, self.c = kargs.get('a', 0.8), kargs.get('b', 0.1), kargs.get('c', 0.1)
        self.vote_thres = kargs.get('vote_thres', 100)
        self.verbose = kargs.get('verbose', 1)

        # min_df=1 keeps every term, exactly like the former min_df=0, but is
        # also accepted by scikit-learn releases that validate integer min_df >= 1.
        self.cvec = CountVectorizer(min_df=1, ngram_range=(1, 2))

        # One stop word per line; blank lines are ignored.
        with open('./stopwords/total_stopwords', encoding='utf-8') as f:
            self.stops = [word for word in (line.strip() for line in f) if word]

        if self.vec_type == 'tfidf':
            self.vec = TfidfVectorizer(analyzer='word', ngram_range=(1, 5), stop_words=self.stops,
                                       min_df=10, max_df=0.95, max_features=10000)
        else:
            self.vec = CountVectorizer(analyzer='word', ngram_range=(1, 1), stop_words=self.stops,
                                       min_df=10, max_df=0.95, max_features=10000)

        if self.verbose == 1:
            print('-'*35)
            print('# Parameters')
            print('      a, b, c        : {0}, {1}, {2}'.format(self.a, self.b, self.c))
            print('vote count threshold :', self.vote_thres)
            print('vec_type :', self.vec_type.capitalize())
            # The term order mirrors result_by_weights: b weights the vote, c the genre.
            print('weighted_sum = vec_sim_scaled*{0}(a) + wvote_scaled*{1}(b) + genre_scaled*{2}(c)'.format(self.a, self.b, self.c))
            print('-'*35)

    @staticmethod
    def _minmax(values):
        """Min-max scale a 1-D collection to a flat float array (empty in, empty out)."""
        column = np.asarray(values, dtype=float).reshape(-1, 1)
        if column.shape[0] == 0:
            # MinMaxScaler refuses to fit on zero samples.
            return column.ravel()
        return MinMaxScaler().fit_transform(column).ravel()

    def _position_of(self, title_idx):
        """Return the row position in ``self.df`` of index label ``title_idx``.

        Raises:
            KeyError: if the label is not present in the index.
        """
        hits = np.flatnonzero(np.asarray(self.df.index == title_idx))
        if hits.size == 0:
            raise KeyError(title_idx)
        return int(hits[0])

    def _genre_similarities(self, title_idx):
        """Return the genre cosine similarity of every row to ``title_idx`` (row order)."""
        genre_literal = self.df['genre'].fillna('').str.replace('|', ' ', regex=False)
        genre = self.cvec.fit_transform(genre_literal)
        # Only the query row is needed, so skip building the N x N matrix.
        return cosine_similarity(genre[self._position_of(title_idx)], genre).ravel()

    def _row_similarities(self, data_preprocess, title_idx):
        """Return (row positions, similarities) of all rows except ``title_idx``.

        ``data_preprocess`` must hold one document per row of ``self.df``.
        """
        matrix = self.raw_to_vec(data_preprocess)
        sims = cosine_similarity(matrix[self._position_of(title_idx)], matrix).ravel()
        others = np.flatnonzero(np.asarray(self.df.index != title_idx))
        return others, sims[others]

    def search_title(self, title_name):
        """Return the titles that contain ``title_name`` (missing titles never match)."""
        return self.df[self.df['title'].str.contains(title_name, na=False)].title

    def genre_sim_sorted(self, title_idx):
        """Return an (N, 2) array of ``(index label, genre similarity)`` rows.

        Rows follow the order of ``self.df``; despite the name they are not sorted.
        """
        return np.array(list(zip(self.df.index, self._genre_similarities(title_idx))))

    def raw_to_vec(self, data_preprocess):
        """Fit the configured vectorizer on ``data_preprocess`` and return the matrix."""
        return self.vec.fit_transform(data_preprocess)

    def cos_sim(self, data_preprocess, title_idx):
        """Return ``[(index label, 1x1 similarity array), ...]`` for every other row.

        The similarity is kept as a 1x1 array to preserve the historical
        return format of this method.
        """
        positions, sims = self._row_similarities(data_preprocess, title_idx)
        labels = self.df.index[positions]
        return [(label, np.array([[sim]])) for label, sim in zip(labels, sims)]

    def similar_vec_movies(self, title_idx):
        """Return the other movies ordered by scaled plot similarity.

        The result carries a ``vec_sim`` column (min-max scaled, descending)
        and only contains rows whose ``vote_count`` exceeds ``self.vote_thres``.
        """
        positions, sims = self._row_similarities(self.df['preprocessed_plot'], title_idx)
        scaled = self._minmax(sims)
        # Stable sort on the negated score: descending, ties keep row order.
        order = np.argsort(-scaled, kind='stable')

        result_df = self.df.iloc[positions[order]].copy()
        result_df['vec_sim'] = scaled[order]

        return result_df[result_df['vote_count'] > self.vote_thres].copy()

    def result_by_weights(self, dataf):
        """Add ``weighted_sum`` to ``dataf`` (in place) and return it sorted descending.

        weighted_sum = vec_sim_scaled*a + wvote_scaled*b + genre_scaled*c
        """
        dataf['weighted_sum'] = dataf['vec_sim_scaled']*self.a + dataf['wvote_scaled']*self.b + dataf['genre_scaled']*self.c

        return dataf.sort_values('weighted_sum', ascending=False)

    def getMovies(self, title):
        """Return the top ``self.topn`` recommendations for the exact ``title``.

        Raises:
            ValueError: if no row has that title.
        """
        matches = self.df.index[np.asarray(self.df['title'] == title)]
        if len(matches) == 0:
            raise ValueError('There is no such title name in data. Search with "search_title" function')
        title_idx = matches[0]

        # get movies
        result = self.similar_vec_movies(title_idx)

        # IMDB's weighted_vote: shrink each rating towards the mean rating c,
        # using the 60th percentile of vote_count as the prior weight m.
        c = result['rating'].mean()
        m = result['vote_count'].quantile(.6)
        v, r = result['vote_count'], result['rating']
        result['weighted_vote'] = (v/(v+m))*r + (m/(m+v))*c

        # merge with genre, matching on index labels
        genre = pd.Series(self._genre_similarities(title_idx), name='genre_sim')
        result_with_genre = pd.merge(result, genre, left_on=result.index,
                                     right_on=np.asarray(self.df.index))

        # minmax scale
        result_with_genre['vec_sim_scaled'] = self._minmax(result_with_genre['vec_sim'])
        result_with_genre['wvote_scaled'] = self._minmax(result_with_genre['weighted_vote'])
        result_with_genre['genre_scaled'] = self._minmax(result_with_genre['genre_sim'])

        # (optional) remove rows whose genre score is 0
        result_with_genre = result_with_genre[result_with_genre['genre_sim'] != 0].copy()

        result_with_genre = self.result_by_weights(result_with_genre)
        return result_with_genre.head(self.topn)

In [4]:
# Build the recommender with the TF-IDF plot vectorizer. With `verbose` left
# at its default of 1, the constructor prints the parameter summary below.
recom = movie_recommendation_vec(vec_type='tfidf')

-----------------------------------
# Parameters
      a, b, c        : 0.8, 0.1, 0.1
vote count threshold : 100
vec_type : Tfidf
weighted_sum = vec_sim_scaled*0.8(a) + genre_scaled*0.1(b) + wvote_scaled*0.1(c)
-----------------------------------


In [5]:
# Look up the exact title and fetch at most `topn` recommendations,
# ordered by descending weighted_sum.
result = recom.getMovies(title='라라랜드')

In [6]:
# Show the final score next to the three scaled components it is built from.
result[['weighted_sum', 'title', 'vec_sim_scaled', 'genre_scaled', 'wvote_scaled']]

Unnamed: 0,weighted_sum,title,vec_sim_scaled,genre_scaled,wvote_scaled
0,0.926463,필름스타 인 리버풀,1.0,0.516398,0.748232
1,0.726387,남자가 사랑할 때,0.77177,0.447214,0.642497
2,0.632774,이브닝,0.644759,0.447214,0.722458
3,0.624868,피아니스트의 전설,0.616026,0.4,0.920472
4,0.594844,더 와이프,0.590518,0.447214,0.777082
6,0.506938,르누아르,0.486259,0.447214,0.732088
10,0.478159,쇼를 사랑한 남자,0.44061,0.516398,0.740314
9,0.468454,매란방,0.441682,0.447214,0.703876
7,0.46581,청춘만화,0.475365,0.258199,0.596984
11,0.465,천국의 책방 - 연화,0.430921,0.447214,0.755421


In [12]:
# Same column selection on the same `result`, evaluated again in a later cell.
result[['weighted_sum', 'title', 'vec_sim_scaled', 'genre_scaled', 'wvote_scaled']]

Unnamed: 0,weighted_sum,title,vec_sim_scaled,genre_scaled,wvote_scaled
0,0.926463,필름스타 인 리버풀,1.0,0.516398,0.748232
1,0.726387,남자가 사랑할 때,0.77177,0.447214,0.642497
2,0.632774,이브닝,0.644759,0.447214,0.722458
3,0.624868,피아니스트의 전설,0.616026,0.4,0.920472
4,0.594844,더 와이프,0.590518,0.447214,0.777082
6,0.506938,르누아르,0.486259,0.447214,0.732088
10,0.478159,쇼를 사랑한 남자,0.44061,0.516398,0.740314
9,0.468454,매란방,0.441682,0.447214,0.703876
7,0.46581,청춘만화,0.475365,0.258199,0.596984
11,0.465,천국의 책방 - 연화,0.430921,0.447214,0.755421
