## Content Based Recommender

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from contextlib import contextmanager
import pickle
import time
import os

In [2]:
@contextmanager
def timer(msg):
    t0 = time.time()
    print(f'[{msg}] start.')
    yield
    elapsed_time = time.time() - t0
    print(f'[{msg}] done in {elapsed_time / 60:.2f} min.')

In [3]:
#actual path
fileDir = os.path.dirname(os.path.realpath('__file__'))
#correct path
movies_path = os.path.join(fileDir, '../processed_data/movies_content.csv')
ratings_path = os.path.join(fileDir, '../processed_data/ratings_content.csv')
tags_path = os.path.join(fileDir, '../data/tags.csv')
tags = pd.read_csv(tags_path)
movies = pd.read_csv(movies_path)
reviews = pd.read_csv(ratings_path)

In [4]:
# Break up the big genre string into a string array
movies['genres'] = movies['genres'].str.split('|')
# Convert genres to string value
movies['genres'] = movies['genres'].fillna("")
# .astype('str')

In [5]:
new = pd.qcut(movies['n_ratings'], [0.1, 0.19,0.31, 0.4, 0.5, 0.6, 0.75,0.85, 0.95, 0.99, 1],
        labels=[0.3, 0.40,0.5, 0.58, 0.63, .7, 0.75, 0.82,0.9,1], duplicates='drop')
movies['weight_quantile'] = new
movies['weighted_mean_rating'] = movies['mean_rating'] * movies['weight_quantile'].astype(float)

## TAGS

In [6]:
list_tags = tags.groupby('movieId')['tag'].apply(set).apply(list)
list_tags = list_tags.reset_index()
list_tags.columns = ['orig_movieId', 'tag']

In [7]:
merged_movies = pd.merge(movies, list_tags, how='left', on='orig_movieId')

In [8]:
merged_movies['genre_tag'] = merged_movies['genres'] + merged_movies['tag'].fillna('').apply(lambda x: list(x))

In [9]:
import re
import string
import gensim

In [10]:
alphabetic = lambda x: re.sub('\w*\d\w*', ' ', x.lower())
# punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())
# text
text = merged_movies['genre_tag'].astype('str').map(alphabetic)
text.head() 

0    ['adventure', 'animation', 'children', 'comedy...
1    ['adventure', 'children', 'fantasy', 'see also...
2    ['comedy', 'romance', 'fishing', 'moldy', 'bes...
3    ['comedy', 'drama', 'romance', 'clv', 'girl mo...
4    ['comedy', 'comedy', 'aging', 'parent child re...
Name: genre_tag, dtype: object

In [11]:
merged_movies['genre_tag'] = text

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',min_df=0.005, stop_words='english')
tfidf_matrix = tf.fit_transform(merged_movies['genre_tag'])
tfidf_matrix.shape

(53889, 394)

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
movie_df = pd.DataFrame(tfidf_matrix.todense())

In [None]:
# with open('merged_movies.pkl', 'wb') as f:
#     pickle.dump(merged_movies, f)

In [None]:
# movie_df.to_csv('movie_tdidf_matrix.csv', index=False)

In [None]:
# merged_movies.to_csv('merged_movies.csv', index=False)

In [15]:
def movie_recommendations(movie):
    
    requested_movie_id = merged_movies[merged_movies['key'] == movie].index
    requested_movie_values = (movie_df.iloc[requested_movie_id]
                                      .values
                                      .reshape((-1,)))    
    num_recs = 10
    movie_scores = []
    
    for movie_id, movie_values in enumerate(movie_df.values):
        score = cosine_similarity([requested_movie_values],[movie_values])[0][0]
        title = merged_movies.loc[movie_id, 'key']
        movie_scores.append((title, score))

    return sorted(movie_scores, key = lambda x:x[1], reverse = True)[1:num_recs]

In [16]:
movie_recommendations('Call Me by Your Name')

[('Brokeback Mountain', 0.6614054469713097),
 ('Noordzee, Texas (North Sea Texas)', 0.602780273610671),
 ('Kiss of the Spider Woman', 0.5966146043266278),
 ('Milk (2008)', 0.5963469204439947),
 ('The Crying Game', 0.5949283053664574),
 ('Moonlight', 0.5928417396385717),
 ('Postman, The (Postino, Il)', 0.5647994898064955),
 ('The Imitation Game', 0.5634925215070292),
 ('Terms of Endearment', 0.5631017245351335)]

## Do matrix similarity first and short DB

In [17]:
with timer('tdidf'):
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

[tdidf] start.
[tdidf] done in 1.49 min.


In [18]:
# cosine_path = os.path.join(fileDir, '../model/cosine.pkl')
# with open(cosine_path, 'rb') as f:
#     cosine_sim = pickle.load(f)

In [19]:
movies_path_short = os.path.join(fileDir, '../processed_data/movies_content_short_rating.csv')

In [20]:
movies_short = pd.read_csv(movies_path_short)

In [21]:
# Sorted by Similarity and Rating
titles = movies_short['key']
indices = pd.Series(movies_short.index, index=movies_short['key'])
def recommend_by_genre(title):
    idx = indices[title]
    sim_scores = cosine_sim[idx]
    datas = pd.concat([pd.Series(sim_scores), movies_short['weighted_mean_rating']], axis=1)
    datas.columns = ['similarity', 'weighted_mean_rating']
    datas = datas.sort_values(by=["similarity", 'weighted_mean_rating'], ascending=False)
    index = datas.iloc[1:11].index
    return titles.iloc[index]

In [24]:
movies_short.shape

(28791, 10)

In [22]:
recommend_by_genre('Call Me by Your Name')

IndexError: positional indexers are out-of-bounds