### 1 Import Modules

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns',101)
pd.set_option('display.max_rows',101)
import datetime
import os
import re 
import string
import itertools
import ast
from scipy.sparse import coo_matrix, hstack

import nltk
# nltk.download()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
# linear_kernel is much faster than cosine_similarity

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

### 2 Load Data

In [2]:
DATA_PATH = '../data'
OUTPUT_PATH = '../output'
# df = pd.read_csv(os.path.join(DATA_PATH, 'movies_metadata.csv'))
df = pd.read_pickle(os.path.join(OUTPUT_PATH, 'movies_metadata.pkl'))
credits_df = pd.read_csv(os.path.join(DATA_PATH, 'credits.csv'))
keywords_df = pd.read_csv(os.path.join(DATA_PATH, 'keywords.csv'))
links_df = pd.read_csv(os.path.join(DATA_PATH, 'links_small.csv'), dtype='Int64')
ratings_df = pd.read_csv(os.path.join(DATA_PATH, 'ratings_small.csv'))

In [3]:
# merge metadata, credits and keywords
df = df.merge(credits_df, on = 'id', how = 'left').merge(keywords_df, on = 'id', how = 'left')

In [4]:
df.head()

Unnamed: 0,budget,genres,id,original_language,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,belongs_to_collection_ind,homepage_ind,profit,year,cast,crew,keywords
0,30000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.9469,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,[Pixar Animation Studios],[United States of America],1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,True,True,11.451801,1995,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,65000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,When siblings Judy and Peter discover an encha...,17.0155,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[TriStar Pictures, Teitler Film, Interscope Co...",[United States of America],1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,False,False,3.043035,1995,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[Warner Bros., Lancaster Gate]",[United States of America],1995-12-22,,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,True,False,,1995,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,16000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,"Cheated on, mistreated and stepped on, the wom...",3.85949,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[Twentieth Century Fox Film Corporation],[United States of America],1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,False,False,4.09076,1995,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,,"[{'id': 35, 'name': 'Comedy'}]",11862,en,Just when George Banks has recovered from his ...,8.38752,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[Sandollar Productions, Touchstone Pictures]",[United States of America],1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,True,False,,1995,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


### 3 Recommender System

#### 3.1 Popularity-based
__Top 250 rated movies chart on IMDb:__
$$
\text{weighted rate (WR)} = \frac{v}{v+m} \times R + \frac{m}{v+m} \times C
$$
* R = average for the movie (mean) = (Rating)
* v = number of votes for the movie = (votes)
* m = minimum votes required to be listed in the Top 250 (currently 25,000 - 2016)
* C = the mean vote across the whole report

In [31]:
def popularity_based_recommender(df, m_threshold = 0.95, top_n = 10):
    # calculate m and C
    m = df['vote_count'].quantile(m_threshold)
    C = df['vote_average'].mean()
    # filter qualified movies with minimum vote threshold
    qualified_movies = df.loc[df.vote_count>m, ['title', 'vote_count', 'vote_average']].copy()
    # calculate R,v, and WR
    R = qualified_movies.vote_average
    v = qualified_movies.vote_count
    wr = (v/(v+m)*R)+(m/(v+m)*C)
    qualified_movies['weighted_rate_score'] = wr
    # sort and recommender top n movies
    qualified_movies.sort_values('weighted_rate_score', ascending=False, inplace=True)
    if top_n is None:
        return qualified_movies
    return qualified_movies.head(top_n)

In [32]:
popularity_based_recommender(df)

Unnamed: 0,title,vote_count,vote_average,weighted_rate_score
314,The Shawshank Redemption,8358.0,8.5,8.360234
837,The Godfather,6024.0,8.5,8.30965
12541,The Dark Knight,12269.0,8.3,8.209992
2858,Fight Club,9678.0,8.3,8.186908
292,Pulp Fiction,8670.0,8.3,8.174374
351,Forrest Gump,8147.0,8.2,8.071667
522,Schindler's List,4436.0,8.3,8.064952
23817,Whiplash,4376.0,8.3,8.062015
5505,Spirited Away,3968.0,8.3,8.039912
1163,The Empire Strikes Back,5998.0,8.2,8.02873


#### 3.2 Content-based filtering
##### 3.2.1 Plot description based

In [10]:
def remove_punctuations(text):
    translator = str.maketrans('','',string.punctuation)
    return text.translate(translator)
def remove_numbers(text):
    return re.sub(r'[\d+]','',text)
def tokenize(text):
    if len(text)==0:
        return []
    word_tokens = word_tokenize(text)
    return word_tokens
def remove_non_english_words(word_tokens):
    pattern = re.compile(r'^([a-zA-Z0-9\s]+)')
    filtered_tokens = [word for word in word_tokens if pattern.match(word) is not None]
    return filtered_tokens
def remove_stopwords(word_tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in word_tokens if word not in stop_words]
    return filtered_tokens
def stem_words(word_tokens):
    stems = [stemmer.stem(word) for word in word_tokens]
    stems = [word for word in stems if len(word)>2]
    return stems
def lemmatize_words(word_tokens):
    lemmas = [lemmatizer.lemmatize(word, pos = 'v') for word in word_tokens]
    lemmas = [word for word in lemmas if len(word)>2]
    return lemmas

In [11]:
df = (df.assign(cleaned_overview = lambda x: x['overview'].fillna('').astype(str)
                .str.lower()
                .apply(remove_punctuations)
                .apply(remove_numbers))
      .assign(word_tokens = lambda x: x['cleaned_overview'].fillna('').astype(str)
               .apply(tokenize)
               .apply(remove_non_english_words)
               .apply(remove_stopwords))
      .assign(stems = lambda x: x['word_tokens'].apply(stem_words),
              lemmas = lambda x: x['word_tokens'].apply(lemmatize_words))
      .assign(stemmed_overview = lambda x: x['stems'].apply(lambda x: ' '.join(x)),
              lemmatized_overview = lambda x: x['lemmas'].apply(lambda x: ' '.join(x))))

In [12]:
# calculate TF-IDF matrix
tfidf = TfidfVectorizer(analyzer='word', 
                        #ngram_range=(1, 2),
                        min_df=2,
                       )
tfidf_matrix = tfidf.fit_transform(df['stemmed_overview'])
# compute the cosine similarity matrix (for tfidf, dot product = cosine similairty score)
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [13]:
print('# unique words:',tfidf_matrix.shape[1])

# unique words: 29257


In [33]:
# define recommendation function
def get_recommendations(title, df = df, cosine_sim = cosine_sim):
    idx = df.index[df.title==title][0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    movie_titles = df.title.iloc[movie_indices]
    return movie_titles

# define qualified recommendation function
def get_qualified_recommendations(title, df = df, cosine_sim = cosine_sim):
    idx = df.index[df.title==title][0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:51]
    movie_indices = [i[0] for i in sim_scores]
    movies = df.iloc[movie_indices].copy()
    qualified_movies = popularity_based_recommender(movies, m_threshold = 0.8, top_n = 10)
    return qualified_movies

In [34]:
get_recommendations('The Dark Knight Rises')

12541                                      The Dark Knight
1337                                        Batman Returns
585                                                 Batman
150                                         Batman Forever
15607                           Batman: Under the Red Hood
21326    Batman Unmasked: The Psychology of the Dark Kn...
9272                    Batman Beyond: Return of the Joker
19906              Batman: The Dark Knight Returns, Part 1
29707                   Batman Unlimited: Animal Instincts
3110                          Batman: Mask of the Phantasm
Name: title, dtype: object

In [35]:
get_qualified_recommendations('The Dark Knight Rises')

Unnamed: 0,title,vote_count,vote_average,weighted_rate_score
12541,The Dark Knight,12269.0,8.3,8.232359
10170,Batman Begins,7511.0,7.5,7.435412
43126,The Lego Batman Movie,1473.0,7.2,6.998865
15607,Batman: Under the Red Hood,459.0,7.6,6.976039
3282,JFK,513.0,7.5,6.95742
585,Batman,2145.0,7.0,6.884868
1337,Batman Returns,1706.0,6.6,6.542147
31246,Batman v Superman: Dawn of Justice,7189.0,5.7,5.73485
150,Batman Forever,1529.0,5.2,5.445675
1503,Batman & Robin,1447.0,4.2,4.686548


##### 3.2.2 Metadata based

In [17]:
# clean genres
df['genres'] = df['genres'].fillna('[]').apply(ast.literal_eval)
df['genres'] = df['genres'].apply(lambda x: [elem['name'] for elem in x] if isinstance(x, list) else [])
# clean crew
df['crew'] = df['crew'].fillna('[]').apply(ast.literal_eval)
df['director'] = df['crew'].apply(lambda x: [elem['name'] for elem in x if elem['job']=='Director'] if isinstance(x, list) else [])
df['producer'] = df['crew'].apply(lambda x: [elem['name'] for elem in x if elem['job']=='Producer'] if isinstance(x, list) else [])
df['crew_size'] = df['crew'].apply(len)
# clean cast
df['cast'] = df['cast'].fillna('[]').apply(ast.literal_eval)
df['cast_name'] = df['cast'].apply(lambda x: [elem['name'] for elem in x] if isinstance(x, list) else [])
df['cast_name_top3'] = df['cast_name'].apply(lambda x: x[:3])
df['cast_gender'] = df['cast'].apply(lambda x: [elem['gender'] for elem in x] if isinstance(x, list) else [])
df = pd.concat([df, df['cast_gender'].apply(pd.Series.value_counts).fillna(0).astype(int).rename({0:'cast_gender0_size',1:'cast_gender1_size',2:'cast_gender2_size'},axis=1)], axis=1)
# clean keywords
df['keywords'] = df['keywords'].fillna('[]').apply(ast.literal_eval)
df['keywords'] = df['keywords'].apply(lambda x: [elem['name'] for elem in x] if isinstance(x, list) else [])
# df['stemmed_keywords'] = df['keywords'].apply(stem_words)

In [18]:
# top keywords
keywords = df['keywords'].apply(pd.Series).stack().reset_index(level=1, drop=True)
print(keywords.value_counts().head())
print('# unique keywords:', len(keywords.value_counts()))
print('# unique keywords that appear more than once:', len(keywords.value_counts()[keywords.value_counts()>1]))
print('# unique stemmed keywords that appear more than 10 times:', len(keywords.value_counts()[keywords.value_counts()>10]))

# stemmed_keywords = df['stemmed_keywords'].apply(pd.Series).stack().reset_index(level=1, drop=True)
# print(stemmed_keywords.value_counts().head())
# print('# unique stemmed keywords:', len(stemmed_keywords.value_counts()))
# print('# unique stemmed keywords that appear more than once:', len(stemmed_keywords.value_counts()[stemmed_keywords.value_counts()>1]))
# print('# unique stemmed keywords that appear more than 10 times:', len(stemmed_keywords.value_counts()[stemmed_keywords.value_counts()>10]))

filtered_keywords = keywords.value_counts()[keywords.value_counts()>1].index.values.tolist()
df['filtered_keywords'] = df['keywords'].apply(lambda x: [elem for elem in x if elem in filtered_keywords])

woman director      3128
independent film    1942
murder              1314
based on novel       840
musical              734
dtype: int64
# unique keywords: 19956
# unique keywords that appear more than once: 11397
# unique stemmed keywords that appear more than 10 times: 2694


In [19]:
def convert_to_count_matrix(col, weight=1):
    '''this function convert feature column to count matrix'''
    df[col+'_cleaned'] = df[col].apply(lambda x: ' '.join([str.lower(i.replace(" ", "")) for i in x]))
    count = CountVectorizer(analyzer='word',
                            #ngram_range=(1, 2),
                            min_df=1, 
                            stop_words='english')
    count_matrix = count.fit_transform(df[col+'_cleaned'])
    return count_matrix * weight

In [20]:
# concat the count matrices of all the features
features_to_convert = ['genres','director','cast_name_top3','filtered_keywords']
count_matrix1 = convert_to_count_matrix('genres', 2)
count_matrix2 = convert_to_count_matrix('director', 4)
count_matrix3 = convert_to_count_matrix('cast_name_top3')
count_matrix4 = convert_to_count_matrix('filtered_keywords')
count_matrix = hstack([count_matrix1,count_matrix2,count_matrix3,count_matrix4])
del count_matrix1, count_matrix2, count_matrix3, count_matrix4

In [21]:
# calculate cosine similarity score
if 'cosine_sim2' in globals():
    del cosine_sim2

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [22]:
get_recommendations('The Dark Knight Rises', df, cosine_sim2)

12541    The Dark Knight
10170      Batman Begins
2478           Following
11411       The Prestige
45828            Dunkirk
5278            Insomnia
15576          Inception
18866          Last Exit
34456               Rege
34603        Khiladi 420
Name: title, dtype: object

In [23]:
get_qualified_recommendations('The Dark Knight Rises', df, cosine_sim2)

Unnamed: 0,title,vote_count,vote_average,weighted_rate_score
12541,The Dark Knight,12269.0,8.3,8.252954
15576,Inception,14075.0,8.1,8.062091
23017,Interstellar,11187.0,8.1,8.0525
4114,Memento,4168.0,8.1,7.976669
11411,The Prestige,4510.0,8.0,7.890394
10170,Batman Begins,7511.0,7.5,7.447661
45828,Dunkirk,2712.0,7.5,7.362246
2478,Following,363.0,7.2,6.631205
5278,Insomnia,1181.0,6.8,6.62615
15004,Harry Brown,351.0,6.7,6.316634


#### 3.3 Collaborative filtering
SVD: build a latent factor model to capture the similarity between users and items

In [24]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [26]:
# create reader object
reader = Reader()
# load data
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)
# single value decomposition + cross validation
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9044  0.8908  0.8932  0.8914  0.9021  0.8964  0.0057  
MAE (testset)     0.6966  0.6860  0.6847  0.6845  0.6972  0.6898  0.0058  
Fit time          5.16    5.52    5.86    5.23    5.40    5.43    0.25    
Test time         0.19    0.17    0.15    0.16    0.17    0.17    0.01    


{'test_rmse': array([0.90439682, 0.89077087, 0.89324086, 0.89137703, 0.90208754]),
 'test_mae': array([0.69656136, 0.68599272, 0.68471428, 0.68452473, 0.69718884]),
 'fit_time': (5.163435935974121,
  5.517941951751709,
  5.855656147003174,
  5.229674816131592,
  5.395166873931885),
 'test_time': (0.19231081008911133,
  0.1675121784210205,
  0.15033888816833496,
  0.15548396110534668,
  0.1735081672668457)}

In [27]:
# fit model with full data
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x108c099d0>

In [29]:
ratings_df[ratings_df['userId']==2].head()

Unnamed: 0,userId,movieId,rating,timestamp
20,2,10,4.0,835355493
21,2,17,5.0,835355681
22,2,39,5.0,835355604
23,2,47,4.0,835355552
24,2,50,4.0,835355586


In [30]:
# predict user, item, true rating
svd.predict(2, 10, 4)

Prediction(uid=2, iid=10, r_ui=4, est=3.315663560708013, details={'was_impossible': False})

In [None]:
# create movieId to Id map
links_df.dropna(axis=0, subset=['movieId', 'tmdbId'], inplace=True)
links_df['movieId'] = links_df['movieId'].astype(int)
links_df['tmdbId'] = links_df['tmdbId'].astype(int)
movieId_map = links_df[['movieId','tmdbId']].rename({'tmdbId':'id'},axis=1).set_index('id')

In [103]:
def cf_recommender(userId, title, model = svd):
    Id = df.id[df.title==title].values[0]
    movieId =  movieId_map.loc[Id]['movieId']
    return svd.predict(userId, movieId)

cf_recommender(1, 'The Dark Knight Rises')

Prediction(uid=1, iid=91529, r_ui=None, est=2.926132607471471, details={'was_impossible': False})

#### 3.4 Hybrid

In [65]:
def hybrid_recommendation(userId, title, df = df, cosine_sim = cosine_sim, model = svd):
    idx = df.index[df.title==title][0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:51]
    movie_indices = [i[0] for i in sim_scores]
    movies = df.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']].copy()
    movies['estimated_score'] = movies['id'].apply(lambda x: model.predict(userId, movieId_map.loc[x]['movieId']).est if x in movieId_map.index else np.nan)
    movies.sort_values('estimated_score', ascending=False, inplace=True)
    return movies.head(10)

In [98]:
hybrid_recommendation(1, 'The Dark Knight Rises')

Unnamed: 0,title,vote_count,vote_average,year,id,estimated_score
3282,JFK,513.0,7.5,1991,820,3.198647
19906,"Batman: The Dark Knight Returns, Part 1",410.0,7.7,2012,123025,3.160642
12541,The Dark Knight,12269.0,8.3,2008,155,3.046462
3110,Batman: Mask of the Phantasm,218.0,7.4,1993,14919,3.00865
20352,"Batman: The Dark Knight Returns, Part 2",426.0,7.9,2013,142061,2.963366
15607,Batman: Under the Red Hood,459.0,7.6,2010,40662,2.793932
18151,Batman: Year One,255.0,7.1,2011,69735,2.774763
10170,Batman Begins,7511.0,7.5,2005,272,2.759002
4384,Criminal Law,20.0,5.5,1988,41952,2.613691
6075,Q & A,22.0,6.6,1990,31598,2.591932
