# Movies Recommender System

![](http://labs.criteo.com/wp-content/uploads/2017/08/CustomersWhoBought3.jpg)

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from ast import literal_eval
from nltk.stem.snowball import SnowballStemmer
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

import warnings; warnings.simplefilter('ignore')

## Simple Recommender

In [35]:
md = pd. read_csv('./input/movies_metadata.csv')
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [36]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [37]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('float')
C = vote_averages.mean()
C

5.618207215134184

In [38]:
m = vote_counts.quantile(0.95)
m

434.0

In [39]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [40]:
qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('float')
qualified.shape

(2274, 6)

In [41]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [42]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [43]:
qualified = qualified.sort_values('wr', ascending=False).head(250)

### Top Movies

In [45]:
qualified.head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
314,The Shawshank Redemption,1994,8358,8.5,51.645403,"[Drama, Crime]",8.357746
834,The Godfather,1972,6024,8.5,41.109264,"[Drama, Crime]",8.306334
12481,The Dark Knight,2008,12269,8.3,123.167259,"[Drama, Action, Crime, Thriller]",8.208376
2843,Fight Club,1999,9678,8.3,63.869599,[Drama],8.184899
292,Pulp Fiction,1994,8670,8.3,140.950236,"[Thriller, Crime]",8.172155
351,Forrest Gump,1994,8147,8.2,48.307194,"[Comedy, Drama, Romance]",8.069421
522,Schindler's List,1993,4436,8.3,41.725123,"[Drama, History, War]",8.061007
23673,Whiplash,2014,4376,8.3,64.29999,[Drama],8.058025
5481,Spirited Away,2001,3968,8.3,41.048867,"[Fantasy, Adventure, Animation, Family]",8.035598
1154,The Empire Strikes Back,1980,5998,8.2,19.470959,"[Adventure, Action, Science Fiction]",8.025793


### Data Prep

In [12]:
links_small = pd.read_csv('./input/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [13]:
md = md.drop([19730, 29503, 35587])

In [14]:
credits = pd.read_csv('./input/credits.csv')
keywords = pd.read_csv('./input/keywords.csv')

In [15]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [16]:
md.shape

(45463, 25)

In [17]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [18]:
smd = md[md['id'].isin(links_small)]
smd = smd.reset_index()
titles = smd['title']
print(smd['title'])
smd.shape

0                                               Toy Story
1                                                 Jumanji
2                                        Grumpier Old Men
3                                       Waiting to Exhale
4                             Father of the Bride Part II
                              ...                        
9214                       The Last Brickmaker in America
9215                                               Rustom
9216                                         Mohenjo Daro
9217                                        Shin Godzilla
9218    The Beatles: Eight Days a Week - The Touring Y...
Name: title, Length: 9219, dtype: object


(9219, 29)

## Collaborative Filtering

In [19]:
reader = Reader()

In [20]:
ratings = pd.read_csv('./input/ratings_small.csv')

In [21]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
# data.split(n_folds=5)
# cross_validate(svd, data, measures=['RMSE'], cv=10)
# kf = KFold(n_splits=5)
# kf.split(data)

In [22]:
svd = SVD()
# evaluate(svd, data, measures=['RMSE', 'MAE'])
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8924  0.9041  0.8900  0.9015  0.8956  0.8967  0.0053  
MAE (testset)     0.6862  0.6968  0.6853  0.6949  0.6895  0.6906  0.0046  
Fit time          0.68    0.74    0.70    0.70    0.68    0.70    0.02    
Test time         0.07    0.06    0.12    0.06    0.12    0.09    0.03    


{'test_rmse': array([0.8923664 , 0.90409456, 0.890006  , 0.90146419, 0.89558167]),
 'test_mae': array([0.68615538, 0.69683861, 0.6853181 , 0.69492325, 0.68954593]),
 'fit_time': (0.6792447566986084,
  0.7367193698883057,
  0.6955678462982178,
  0.6961543560028076,
  0.6841263771057129),
 'test_time': (0.0675361156463623,
  0.06474947929382324,
  0.12285542488098145,
  0.06323695182800293,
  0.1241145133972168)}

In [23]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fcd0db78ac0>

In [24]:
ranking = []

for i in range(len(titles)):
    ranking.append([i, svd.predict(1, i)[3]])

ranking.sort(key = lambda ranking: ranking[1])
ranking.reverse()
print([titles[ranking[i][0]] for i in range(10)])

['Die Hard', 'Jean de Florette', 'Layer Cake', 'Inventing the Abbotts', 'Lamerica', 'The Crow', 'Fools Rush In', 'Rosewood', 'Love and Other Catastrophes', 'The Apartment']
