## Content-Based Recommender

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from contextlib import contextmanager
import pickle
import time
import os
import re
import string
import gensim

In [2]:
@contextmanager
def timer(msg):
    """Context manager that prints how long the enclosed block took.

    Prints ``[msg] start.`` on entry and ``[msg] done in X min.`` on exit,
    where X is the elapsed time in minutes with two decimals.

    Parameters
    ----------
    msg : str
        Label shown in both messages.
    """
    # perf_counter is monotonic, so the measurement cannot jump or go negative
    # when the system clock is adjusted (time.time() can).
    t0 = time.perf_counter()
    print(f'[{msg}] start.')
    try:
        yield
    finally:
        # Report the elapsed time even when the timed block raises; the
        # exception still propagates to the caller.
        elapsed_time = time.perf_counter() - t0
        print(f'[{msg}] done in {elapsed_time / 60:.2f} min.')

In [14]:
# Base directory for the relative data paths. The argument is the literal
# string '__file__' (not the __file__ variable), so realpath resolves it
# against the current working directory and dirname yields that directory.
fileDir = os.path.dirname(os.path.realpath('__file__'))

# Input files, addressed relative to fileDir.
tags_path = os.path.join(fileDir, '../data/tags.csv')
movies_path = os.path.join(fileDir, '../processed_data/movies_content.csv')
# ratings_path = os.path.join(fileDir, '../processed_data/ratings_content.csv')

# Load both tables (tags first, then movies).
tags, movies = (pd.read_csv(csv_path) for csv_path in (tags_path, movies_path))
# reviews = pd.read_csv(ratings_path)

In [15]:
# Sanity check: (rows, columns) of the tags table and its first few records.
print(tags.shape)
tags.head()

(1108997, 4)


Unnamed: 0,userId,movieId,tag,timestamp
0,14,110,epic,1443148538
1,14,110,Medieval,1443148532
2,14,260,sci-fi,1442169410
3,14,260,space action,1442169421
4,14,318,imdb top 250,1442615195


In [None]:
# Sanity check: (rows, columns) of the movies table and its first few records.
print(movies.shape)
movies.head()

In [None]:
# print(reviews.shape)
# reviews.head()

In [None]:
# Turn the pipe-delimited genre string of every movie into a list of genres.
# This is done per value (instead of `.str.split('|')` + `fillna("")`) so that:
#   * re-running the cell is a no-op -- `.str.split` on already-split lists
#     yields NaN, which `fillna("")` then turned into "" for every row;
#   * a missing genre becomes [] rather than "", so the later concatenation
#     with the tag lists never has to add a str to a list.
def _split_genres(value):
    """Return the genres of one movie as a list of strings."""
    if isinstance(value, str):
        return value.split('|')
    if isinstance(value, list):
        # Already split by an earlier run of this cell.
        return value
    # Missing value (NaN/None): no genres.
    return []


movies['genres'] = movies['genres'].apply(_split_genres)

In [None]:
# Bucket movies by how many ratings they received and scale the mean rating
# by a per-bucket weight, so that sparsely rated movies are damped.
# NOTE(review): the lowest edge is the 0.1 quantile, so movies whose n_ratings
# fall below it get NaN as weight (and as weighted_mean_rating) -- confirm
# this is intended.
# NOTE(review): with explicit labels, duplicates='drop' presumably cannot
# rescue duplicate edges (the label count would no longer match) -- verify.
quantile_edges = [0.1, 0.19, 0.31, 0.4, 0.5, 0.6, 0.75, 0.85, 0.95, 0.99, 1]
bucket_weights = [0.3, 0.4, 0.5, 0.58, 0.63, 0.7, 0.75, 0.82, 0.9, 1]

new = pd.qcut(
    movies['n_ratings'],
    quantile_edges,
    labels=bucket_weights,
    duplicates='drop',
)
movies['weight_quantile'] = new

# The weights are stored as categorical labels; cast to float before multiplying.
weight_as_float = movies['weight_quantile'].astype(float)
movies['weighted_mean_rating'] = movies['mean_rating'] * weight_as_float

## TAGS

In [None]:
# One row per movie with the list of its distinct tags.
# Going through `set` removes repeated tags; the order inside each list is
# whatever the set iteration produces.
list_tags = (
    tags.groupby('movieId')['tag']
        .apply(set)
        .apply(list)
        .reset_index()
)
# Rename the key so it lines up with the movies table's `orig_movieId` column.
list_tags.columns = ['orig_movieId', 'tag']

In [None]:
# Preview the per-movie tag lists.
list_tags.head()

In [None]:
# Attach the tag list to every movie; the left join keeps movies without tags.
merged_movies = movies.merge(list_tags, on='orig_movieId', how='left')

# Movies without tags have NaN after the join: fill with '' first, which
# list('') then turns into an empty list.
tag_lists = merged_movies['tag'].fillna('').apply(list)

# Genres followed by tags, as one list per movie.
merged_movies['genre_tag'] = merged_movies['genres'] + tag_lists

In [None]:
# Raw string: in a plain literal '\w' and '\d' are invalid escape sequences,
# which newer Python versions warn about. The pattern itself is unchanged.
_TOKEN_WITH_DIGIT = re.compile(r'\w*\d\w*')


def alphabetic(x):
    """Lower-case ``x`` and blank out every word that contains a digit.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        ``x.lower()`` with each run of word characters that contains at least
        one digit replaced by a single space.
    """
    return _TOKEN_WITH_DIGIT.sub(' ', x.lower())
# punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())
# Clean the string form of each movie's genre+tag list: astype('str') turns
# the list into its textual representation before `alphabetic` is applied.
text = merged_movies['genre_tag'].astype('str').map(alphabetic)
text.head() 

In [None]:
# Replace the genre+tag lists with their cleaned text form.
merged_movies['genre_tag'] = text

In [None]:
# merged_movies.to_csv('merged_movies.csv', index=False)

## Create TF-IDF Matrix and Cosine Similarity Matrix

In [None]:
# Word-level TF-IDF over the cleaned genre/tag text. English stop words are
# removed and min_df=0.005 ignores terms found in fewer than 0.5% of the movies.
tf = TfidfVectorizer(
    analyzer='word',
    min_df=0.005,
    stop_words='english',
)
corpus = merged_movies['genre_tag']
tfidf_matrix = tf.fit_transform(corpus)
tfidf_matrix.shape

In [None]:
# from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# movie_df = pd.DataFrame(tfidf_matrix.todense())

In [None]:
# print(movie_df.shape)
# movie_df.head()

In [None]:
# with open('merged_movies.pkl', 'wb') as f:
#     pickle.dump(merged_movies, f)

In [None]:
# movie_df.to_csv('movie_tdidf_matrix.csv', index=False)

In [None]:
# def movie_recommendations(movie):
#     """
#     What this function does is: 
#     It uses the matrix of TFIDF scores and it finds the cosine similarity for each score compared to the movie that is chosen. 
#     It then returns a list of the top 10 movies with the highest score.
#     """
#     requested_movie_id = merged_movies[merged_movies['key'] == movie].index
#     requested_movie_values = (movie_df.iloc[requested_movie_id]
#                                       .values
#                                       .reshape((-1,)))    
#     num_recs = 10
#     movie_scores = []
    
#     for movie_id, movie_values in enumerate(movie_df.values):
#         score = cosine_similarity([requested_movie_values],[movie_values])[0][0]
#         title = merged_movies.loc[movie_id, 'key']
#         movie_scores.append((title, score))

#     return sorted(movie_scores, key = lambda x:x[1], reverse = True)[1:num_recs]

In [None]:
# movie_recommendations('Call Me by Your Name')

## Do matrix similarity first and short DB

In [None]:
# Pairwise similarity of every movie with every other movie, timed.
# TfidfVectorizer L2-normalises its rows by default (norm is not overridden
# where `tf` is created), so this plain dot product equals cosine similarity.
# NOTE(review): the result is a dense (n_movies, n_movies) array -- check that
# it fits in memory for the full catalogue.
with timer('tdidf'):
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# with open('cosine.pkl', 'wb') as f:
#     pickle.dump(cosine_sim, f)

In [None]:
# Shape and contents of the similarity matrix.
print(cosine_sim.shape)
cosine_sim

In [16]:
# Load a previously saved similarity matrix from disk. This rebinds
# `cosine_sim`, replacing whatever the compute cell above produced.
# SECURITY: pickle.load can execute arbitrary code -- only load files you
# created yourself.
# NOTE(review): nothing here checks that the pickled matrix was built from the
# same rows (and order) as the merged movies table loaded below -- confirm.
cosine_path = os.path.join(fileDir, '../model/cosine.pkl')
with open(cosine_path, 'rb') as f:
    cosine_sim = pickle.load(f)

In [17]:
# Load the saved merged movies+tags table. This rebinds `merged_movies`,
# replacing the frame built earlier in the notebook.
merged_path = os.path.join(fileDir, '../processed_data/merged_movies_tags.csv')
merged_movies = pd.read_csv(merged_path)

In [18]:
# Alias only -- no copy is made, so both names refer to the same DataFrame.
movies_short = merged_movies

In [33]:
# Sorted by Similarity and Rating
# `titles`: the `key` column, used to turn selected rows back into movie keys.
titles = movies_short['key']
# `indices`: reverse lookup from a movie `key` to its row index label.
# NOTE(review): a `key` that occurs more than once makes `indices[key]` return
# several rows instead of one -- confirm keys are unique.
indices = pd.Series(movies_short.index, index=movies_short['key'])
def recommend_by_genre(title, top_n=10):
    """Recommend the movies whose genre/tag profile is closest to ``title``.

    Candidates are ranked by similarity to the requested movie and, for equal
    similarity, by ``weighted_mean_rating`` (both descending). The requested
    movie itself is never part of the result.

    Relies on the notebook-level ``indices``, ``cosine_sim``, ``movies_short``
    and ``titles`` objects.

    Parameters
    ----------
    title : str
        Value of the ``key`` column identifying the movie.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        ``key`` values of the recommended movies, best match first.

    Raises
    ------
    KeyError
        If ``title`` is not a known ``key``.
    """
    if title not in indices.index:
        raise KeyError(f'{title!r} is not a known movie key')
    # `.loc[[title]]` always yields a Series, so a key that occurs more than
    # once resolves to its first row instead of a 2-D similarity slice.
    idx = indices.loc[[title]].iloc[0]
    sim_scores = cosine_sim[idx]
    datas = pd.concat([pd.Series(sim_scores), movies_short['weighted_mean_rating']], axis=1)
    datas.columns = ['similarity', 'weighted_mean_rating']
    # Remove the requested movie explicitly. Skipping the first sorted row is
    # not enough: a movie with identical similarity and a higher rating sorts
    # ahead of it, in which case the query itself was returned and a genuine
    # match was dropped.
    datas = datas.drop(index=idx)
    datas = datas.sort_values(by=['similarity', 'weighted_mean_rating'], ascending=False)
    index = datas.iloc[:top_n].index
    return titles.iloc[index]

In [35]:
# Example query for a single movie key.
recommend_by_genre('Call Me by Your Name')

10469                   Brokeback Mountain
13119                          Milk (2008)
1072                       The Crying Game
6674              Kiss of the Spider Woman
32212                            Moonlight
21590    Noordzee, Texas (North Sea Texas)
23095                        Hawaii (2013)
57              Postman, The (Postino, Il)
879      Wild Reeds (Les roseaux sauvages)
33975                              Esteros
Name: key, dtype: object

## Checking for duplicate titles

In [None]:
# Old titles that do not appear among the current titles.
# The current titles are collected into a set once; the previous version
# rebuilt and scanned list(movies_short.title.unique()) for every old title.
current_titles = set(movies_short.title.unique())
[x for x in movies_short.old_title.unique() if x not in current_titles]

In [None]:
# `old` holds each distinct title once; `titl` holds one title per row
# (repeats kept).
old = list(movies_short.title.unique())
titl = list(movies_short.title)

In [None]:
# Number of distinct titles.
len(old)

In [None]:
# Number of rows, i.e. titles counted with repeats.
len(titl)

In [None]:
# How often each title occurs, most frequent first.
pd.Series(titl).value_counts()

In [28]:
# Inspect the first rows that share the bare title 'Cinderella'.
movies_short[movies_short['title'] == 'Cinderella'].head()

Unnamed: 0,movieId,title,genres,orig_movieId,key,year,full_title,n_ratings,mean_rating,weight_quantile,weighted_mean_rating,tag,genre_tag
1003,1003,Cinderella,"['Animation', 'Children', 'Fantasy', 'Musical'...",1022,Cinderella (1950),1950,Cinderella (1950),11387,3.538948,0.95,3.362001,"['Disney animated feature', 'animated', 'CLV',...","['animation', 'children', 'fantasy', 'musical'..."
13067,13149,Cinderella,"['Children', 'Fantasy', 'Musical', 'Romance']",63239,Cinderella (1997),1997,Cinderella (1997),177,3.132768,0.85,2.662853,"['musical', 'CLV', 'fairy tale', 'itaege', 'ra...","['children', 'fantasy', 'musical', 'romance', ..."
22216,23795,Cinderella,"['Animation', 'Children', 'Fantasy', 'Romance']",111961,Cinderella (1994),1994,Cinderella (1994),9,1.555556,0.65,1.011111,"['GoodTimes Entertainment', 'direct-to-video',...","['animation', 'children', 'fantasy', 'romance'..."
25280,28829,Cinderella,"['Drama', 'Romance']",129195,Cinderella (2011),2011,Cinderella (2011),6,3.083333,0.6,1.85,"['cinderella', 'miniseries', '1950s', 'pianist...","['drama', 'romance', 'cinderella', 'miniseries..."
25442,29116,Cinderella,"['Children', 'Drama', 'Fantasy', 'Romance']",130073,Cinderella (2015),2015,Cinderella (2015),1140,3.357018,0.9,3.021316,"['feel-good', 'cheesy', 'ballroom dancing', 'c...","['children', 'drama', 'fantasy', 'romance', 'f..."
