##  키워드 기반

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

In [2]:
from gensim.models import word2vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import ast

In [5]:
class movie_recommendation:
    def __init__(self, **kargs):
        self.topn = kargs.get('topn', 10)
        self.vote_thres = kargs.get('vote_thres', 100)
        self.df = kargs.get('data', pd.read_csv('./your/data/here.csv'))
        self.a, self.b, self.c = kargs.get('a',0.4), kargs.get('b',0.1), kargs.get('c',0.5)
        self.verbose = kargs.get('verbose', 1)
        
        self.cvec = CountVectorizer(min_df=0, ngram_range=(1,2))
        self.w2v = word2vec.Word2Vec.load('./model/naver_plot_kmr_min10_N_only.model')
        self.scaler = MinMaxScaler()
        
        if self.verbose == 1:
            print('-'*35)
            print('# Parameters')
            print('      a, b, c        : {0}, {1}, {2}'.format(self.a, self.b, self.c))
            print('vote count threshold :', self.vote_thres)
            print('weighted_sum = keywords*{0}(a) + genre*{1}(b) + weighted vote*{2}(c)'.format(self.a, self.b, self.c))
            print('-'*35)
        
    def search_title(self, title_name):
        return self.df[self.df['title'].str.contains(title_name)].title   
    
    def genre_sim_sorted(self, title_idx):
        genre_literal = self.df['genre'].apply(lambda x: x.replace('|',' '))
        genre = self.cvec.fit_transform(genre_literal)
        genre_sim = cosine_similarity(genre,genre)
        
        return np.array([(idx,sim) for idx,sim in enumerate(genre_sim[title_idx])])
           
    def cos_sim(self, corp1, corp2):
        vec1, vec2 = [], []
        for word1, word2 in zip(corp1,corp2):
            vec1.append(self.w2v[word1])
            vec2.append(self.w2v[word2])

        vec1, vec2 = np.array(vec1).mean(axis=0), np.array(vec2).mean(axis=0)
        return np.inner(vec1,vec2) / (np.linalg.norm(vec1)*np.linalg.norm(vec2))

    def similar_keywords_movies(self, title_idx):
        self.df['keywords_literal'] = self.df['keywords'].apply(lambda x: ' '.join(ast.literal_eval(x)))
        keywords_sims = []
        for idx in self.df.index:
            keywords_src = ast.literal_eval(self.df.loc[title_idx,'keywords'])
            keywords_tgt = ast.literal_eval(self.df.loc[idx,'keywords'])
            keywords_sims.append((idx,self.cos_sim(keywords_src, keywords_tgt)))

        df_with_ksim = self.df.copy()
        df_with_ksim['keywords_sim'] = np.array(keywords_sims)[:,1]
        df_with_ksim = df_with_ksim[df_with_ksim['vote_count'] > self.vote_thres]
        
        return df_with_ksim.sort_values('keywords_sim',ascending=False)[1:]

    def result_by_weights(self, dataf):
        dataf['weighted_sum'] = dataf['keywords_sim_scaled']*self.a + dataf['wvote_scaled']*self.b + dataf['genre_scaled']*self.c
        
        return dataf.sort_values('weighted_sum', ascending=False)
    
    def getMovies(self, title):
        # no title result
        try: title_idx = self.df[self.df['title']== title].index.values[0]
        except:
            raise ValueError('There is no such title name. Search with "search_title" function')
        
        # get movies
        result = self.similar_keywords_movies(title_idx)

        # IMDB's weighted_vote
        def weighted_vote_average(record):
            v, r = record['vote_count'], record['rating']
            return (v/(v+m))*r + (m/(m+v))*c
        c = result['rating'].mean()
        m = result['vote_count'].quantile(.6)
        result['weighted_vote'] = result.apply(weighted_vote_average,axis=1)
        
        # merge with genre
        genre_sim = self.genre_sim_sorted(title_idx)
        result_with_genre = pd.merge(result, pd.Series(genre_sim[:,1], name='genre_sim'), left_on=result.index, right_on=genre_sim[:,0],)
             
        # minmax scale
        result_with_genre['keywords_sim_scaled'] = MinMaxScaler().fit_transform(result_with_genre['keywords_sim'].values.reshape(-1,1))
        result_with_genre['wvote_scaled'] = MinMaxScaler().fit_transform(result_with_genre['weighted_vote'].values.reshape(-1,1))
        result_with_genre['genre_scaled'] = MinMaxScaler().fit_transform(result_with_genre['genre_sim'].values.reshape(-1,1))
        
        # (optional)remove data genre score is 0
        no_genre_score_idx = result_with_genre[result_with_genre['genre_sim'] == 0].index
        result_with_genre.drop(no_genre_score_idx, inplace=True)
        
        result_with_genre = self.result_by_weights(result_with_genre)
        return result_with_genre.head(self.topn)

In [6]:
recom = movie_recommendation()

-----------------------------------
# Parameters
      a, b, c        : 0.4, 0.1, 0.5
vote count threshold : 100
weighted_sum = keywords*0.4(a) + genre*0.1(b) + weighted vote*0.5(c)
-----------------------------------


In [7]:
result = recom.getMovies(title='엽문')

In [9]:
result[['weighted_sum','title', 'keywords_literal', 'keywords_sim_scaled', 'genre_scaled', 'wvote_scaled']]

Unnamed: 0,weighted_sum,title,keywords_literal,keywords_sim_scaled,genre_scaled,wvote_scaled
2,0.953589,샤오린 : 최후의 결전,혼돈 소림사 공화국 초기 대륙 중국 시대 함락 시작 반란군,0.952831,1.0,0.724563
3,0.948686,엽문 2,무예 폭력 도전 소극 영춘권 최고 제자 두지 자비 일본,0.909114,1.0,0.850399
11,0.922646,바람의 파이터,배달 일본 범수 후예 도전 무예 연명 머슴 목숨 무도 인간 소년 고수 친구 조선인 ...,0.856488,1.0,0.800506
17,0.916003,엽문3: 최후의 대결,암흑 존경 영춘권 무예 보스 타진 부지 학생 정착 성품,0.837161,1.0,0.811383
33,0.911065,명량,패배 두려움 전쟁 백성 속도 한양 국가 위기 배 조선,0.811388,1.0,0.865102
15,0.910507,삼국-무영자,장군 장수 권력 야망 눈 시대 중국 싸움 이용,0.844093,1.0,0.728697
54,0.904713,글래디에이터,황제 무스 아들 아프리카 하늘 황제 서기 땅 병사 검투사 로마 분노 사랑,0.772738,1.0,0.956179
20,0.903967,뮬란: 전사의 귀환,족 장군 계략 호시 남장 시대 각지 위나라 무술 시작,0.832484,1.0,0.709733
16,0.899858,공자춘추전국시대,열망 나라 왕 당대 최고 왕권 부활 천하 통일 전쟁,0.842378,1.0,0.629066
97,0.889199,암살,사령관 임시 정부 의뢰 청부살인 업자 시대 일본 암살 작전,0.73873,1.0,0.937073
