# Movie Recommendation

In [213]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

import surprise
from surprise.prediction_algorithms import *
from surprise import accuracy, Dataset, Reader, BaselineOnly

from surprise.model_selection import cross_validate, train_test_split
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV


## Business Problem

Build a model that provides top 5 movie recommendations to a user, based on their ratings of other movies

## Data Exploration

In [214]:
links_df = pd.read_csv('data/links.csv')
movies_df = pd.read_csv('data/movies.csv')
ratings_df = pd.read_csv('data/ratings.csv')
tags_df = pd.read_csv('data/tags.csv')

ratings.csv fields:  
 - userId,movieId,rating,timestamp  
 
tags.csv fields:  

 - userId,movieId,tag,timestamp  
 
movie.csv fields:  
 - movieId,title,genres  
 
links.csv fields:  
 - MovieId,imdbId,tmdbId    

In [215]:
links_df.head()
movies_df.head()
ratings_df.head()
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [216]:
links_df.info()
movies_df.info()
ratings_df.info()
tags_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  fl

We will be looking at ratings_df first, as it contains all the relevant columns.
Since we are interested in recommending movies users based on their ratings, target column is movieId, with userId as a feature. 

In [217]:
#Combine dfs to get a larger understanding of data
df = ratings_df.merge(movies_df, how="left", on="movieId")
df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama|Horror|Thriller
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action|Crime|Thriller
100833,610,168250,5.0,1494273047,Get Out (2017),Horror
100834,610,168252,5.0,1493846352,Logan (2017),Action|Sci-Fi


In [218]:
df['genres'].nunique()

951

Our unique genres consist of: 
Action
* Adventure
* Animation
* Children's
* Comedy
* Crime
* Documentary
* Drama
* Fantasy
* Film-Noir
* Horror
* Musical
* Mystery
* Romance
* Sci-Fi
* Thriller
* War
* Western
* (no genres listed)

But We have 951 types of genres, which include combinations of the above

In [219]:
df["rating"].describe()

count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

In [220]:
print("userIds", df['userId'].nunique())
print("movieIds", df['movieId'].nunique())
## We have 610 unique users
## and 9724 different movies

userIds 610
movieIds 9724


In [221]:
df['timestamp']

0          964982703
1          964981247
2          964982224
3          964983815
4          964982931
             ...    
100831    1493848402
100832    1493850091
100833    1494273047
100834    1493846352
100835    1493846415
Name: timestamp, Length: 100836, dtype: int64

Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
Which is hard to interpret. Converting to date-time format:



In [222]:
#print(dt.datetime.fromtimestamp(df.loc['timestamp']))
#print(dt.datetime.fromtimestamp(df.loc['timestamp']))

#dt.datetime.fromtimestamp(df['timestamp'])

df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
df['timestamp']
df.head()
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## Data Processing for Modeling

We will be looking and modeling ratings_df, as it contains all the columns we are interested in.

In [223]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [306]:
#remove timestamp column, which is unnecessary for modeling
ratings_df = ratings_df.drop("timestamp", axis=1)
ratings_df

KeyError: "['timestamp'] not found in axis"

In [307]:
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1,4
1,1,3,4
2,1,6,4
3,1,47,5
4,1,50,5
...,...,...,...
0,1000,274,5
1,1000,2518,4
2,1000,46967,3
3,1000,6596,2


Transform dataset into suprise compatible data

In [225]:
from surprise import Reader, Dataset

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df, reader)

In [226]:
full_dataset = data.build_full_trainset

#view the number of users and items
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  610 

Number of items:  9724


Splitting Data into Data set A, for training, and B for testing:

In [227]:
##Manual Train/test split:

# A = 80% of the data, B = 20% of the data
#raw_ratings = data.raw_ratings
#threshold = int(0.8 * len(raw_ratings))
#A_raw_ratings = raw_ratings[:threshold]
#B_raw_ratings = raw_ratings[threshold:]

##Data is now the set A
#data.raw_ratings = A_raw_ratings

#trainset = data.build_full_trainset()
#testset = data.construct_testset(B_raw_ratings)

#trainset is set on dataset A
#testset is set on dataset B
# this ensures no dataleakage

# we can now use 
    #algo.fit(trainset)
    #algo.test(testset)

In [250]:
## We could just use train_test_split instead of doing it manually 

trainset, testset = train_test_split(data, test_size=0.2,random_state=42)

In [248]:
#function for outputting the accuracy predictions on an algorithm

def train_and_test_pred (algo, trainset, testset): 
    algo.fit(trainset)
   
    train_predictions = algo.test(trainset.build_testset())
    print('biased accuracy on train set: ')
    print(accuracy.rmse(train_predictions))
    test_predictions = algo.test(testset)
    print('unbiased accuracy on test set: ')
    print(accuracy.rmse(test_predictions))
    


## Determining the Best Model

Compare different models and determine which is the best.   
We will use RMSE to evaluate models for now

### SVD:

In [251]:
# performing a gridsearch with SVD

params = {
    'n_factors': [20,50,100],
    'reg_all': [0.02, 0.05, 0.1]
}

SVD_grid = GridSearchCV(SVD, param_grid = params, cv=5)
SVD_grid.fit(data)


In [252]:
#view grid search results
print(SVD_grid.best_params)
print(SVD_grid.best_score)

{'rmse': {'n_factors': 20, 'reg_all': 0.05}, 'mae': {'n_factors': 20, 'reg_all': 0.02}}
{'rmse': 0.8752694123265279, 'mae': 0.6738782839797175}


In [256]:
## choose the best algo
algo = SVD_grid.best_estimator['rmse']

In [254]:
# Use our function to output train/test predictions
train_and_test_pred(algo, trainset, testset)

biased accuracy on train set: 
RMSE: 0.8056
0.8055865758331578
unbiased accuracy on test set: 
RMSE: 0.8830
0.8830479090492301


### KNNBasic:

In [257]:
## view crossvalidate score with KNNBasic:

knn_basic = KNNBasic(sim_options={"name": 'pearson', 'user_based':True})
cv_knn_basic = cross_validate(knn_basic, data, n_jobs=-1)
cv_knn_basic

{'test_rmse': array([0.97647125, 0.96876095, 0.98759337, 0.98465294, 0.98363855]),
 'test_mae': array([0.75442713, 0.74512977, 0.76320275, 0.75784858, 0.75892987]),
 'fit_time': (0.784174919128418,
  0.8055026531219482,
  0.8020296096801758,
  0.7598710060119629,
  0.7648298740386963),
 'test_time': (1.3605248928070068,
  1.3406856060028076,
  1.362013578414917,
  1.313901662826538,
  1.2811660766601562)}

In [258]:
#average cross validation score:
np.mean(cv_knn_basic['test_rmse'])

0.9802234138647543

In [294]:
## GridSearch with KNNBasic
## note: this takes several minutes to run
params = {
    'k': [20,40,60],
    'sim_options': {
        'name':['msd', 'cosine', 'pearson'],
        'user_based': [True,False]   
    }
}
KNNBasic_grid = GridSearchCV(KNNBasic, param_grid = params, cv=5)
KNNBasic_grid.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...


In [295]:
#view grid search results
print(KNNBasic_grid.best_params)
print(KNNBasic_grid.best_score)

{'rmse': {'k': 60, 'sim_options': {'name': 'msd', 'user_based': False}}, 'mae': {'k': 60, 'sim_options': {'name': 'msd', 'user_based': False}}}
{'rmse': 0.9110794614999813, 'mae': 0.7013363323211289}


In [296]:
#take the best model and 
#test on train/test 

algo = KNNBasic_grid.best_estimator['rmse']
train_and_test_pred(algo, trainset, testset)

Computing the msd similarity matrix...
Done computing similarity matrix.
biased accuracy on train set: 
RMSE: 0.5014
0.5013979150537194
unbiased accuracy on test set: 
RMSE: 0.9168
0.9167722493305924


### KNNBaseline

In [260]:
##view our cross validation score with KNNBaseline:

knn_baseline = KNNBaseline(sim_options={"name": 'pearson', 'user_based':True})
cv_knn_baseline = cross_validate(knn_baseline, data)
cv_knn_baseline

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([0.88677525, 0.89052149, 0.8885817 , 0.8857827 , 0.88932634]),
 'test_mae': array([0.67833764, 0.68362775, 0.67740689, 0.67544341, 0.68272225]),
 'fit_time': (0.7053110599517822,
  0.7182059288024902,
  0.9954416751861572,
  0.7135488986968994,
  0.7271335124969482),
 'test_time': (1.4418697357177734,
  1.4488139152526855,
  1.4840292930603027,
  1.420046091079712,
  1.4101536273956299)}

In [261]:
np.mean(cv_knn_baseline['test_rmse'])

0.8881974948440214

In [262]:
## now view our accuracy with train/test data
train_and_test_pred(knn_baseline, trainset, testset)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
biased accuracy on train set: 
RMSE: 0.5759
0.5759278845918823
unbiased accuracy on test set: 
RMSE: 0.8928
0.8927787505891083


## Making Recommendations

We want to return the actual title of a movie instead of an ID

In [242]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Making simple predictions

Taking our best algorithm so far: SVD

In [264]:
best_algo = SVD(n_factors=50, reg_all = 0.05)
best_algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1cdfc44ca30>

In [266]:
best_algo.predict(2,4)

Prediction(uid=2, iid=4, r_ui=None, est=3.3024815397133183, details={'was_impossible': False})

In [267]:
## our prediction output is userId, itemId, r_ui, estimated rating, details

### Obtaining User Rankings

Creating a function that allows the current user to rate movies in our database. The function will save the ratings of the current user and give recommendations based on their preferences.

The function `movie_rater()` should take as parameters: 

* `movie_df`: DataFrame - a dataframe containing the movie ids, name of movie, and genres
* `num`: int - number of ratings
* `genre`: string - a specific genre from which to draw movies

The function returns:
* rating_list : list - a collection of dictionaries in the format of {'userId': int , 'movieId': int , 'rating': float}

In [269]:
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [299]:
def movie_rater(movie_df,num, genre=None):
    userId = 1000
    rating_list = []
    
    while num > 0:    
        if genre:
            movie = movie_df[movie_df['genres'].str.contains(genre)].sample(1)
        else:
            movie = movie_df.sample(1)
            
        print(movie)
        rating = input("Please rate this movie from 1-5, press n if you have not seen :\n")
    
        try: 
            if 1 <= int(rating) <= 5:
                rating_movie = {'userId': userId, "movieId":movie['movieId'].values[0], "rating": rating}
                rating_list.append(rating_movie)
                num -= 1
        except:
            print('Please enter a valid rating from 1-5')
            continue
    return rating_list


In [273]:
movies_df.loc[movies_df["movieId"]==663]

Unnamed: 0,movieId,title,genres
553,663,Kids in the Hall: Brain Candy (1996),Comedy


In [311]:
## Trying out the function:
user_rating = movie_rater(movies_df, 5, "Comedy")
user_rating

      movieId               title          genres
7565    85565  Chalet Girl (2011)  Comedy|Romance
Please rate this movie from 1-5, press n if you have not seen :
4
      movieId                                        title            genres
7779    91671  Alvin and the Chipmunks: Chipwrecked (2011)  Animation|Comedy
Please rate this movie from 1-5, press n if you have not seen :
4
      movieId                                 title                genres
5668    27644  Remember Me (Ricordati di me) (2003)  Comedy|Drama|Romance
Please rate this movie from 1-5, press n if you have not seen :
4
      movieId                 title  genres
5403    25752  Freshman, The (1925)  Comedy
Please rate this movie from 1-5, press n if you have not seen :
4
      movieId                       title  genres
3331     4509  Great Outdoors, The (1988)  Comedy
Please rate this movie from 1-5, press n if you have not seen :
4


[{'userId': 1000, 'movieId': 85565, 'rating': '4'},
 {'userId': 1000, 'movieId': 91671, 'rating': '4'},
 {'userId': 1000, 'movieId': 27644, 'rating': '4'},
 {'userId': 1000, 'movieId': 25752, 'rating': '4'},
 {'userId': 1000, 'movieId': 4509, 'rating': '4'}]

### Making Predictions with the New Ratings

In [276]:
user_rating

[{'userId': 1000, 'movieId': 274, 'rating': '5'},
 {'userId': 1000, 'movieId': 2518, 'rating': '4'},
 {'userId': 1000, 'movieId': 46967, 'rating': '3'},
 {'userId': 1000, 'movieId': 6596, 'rating': '2'},
 {'userId': 1000, 'movieId': 274, 'rating': '1'}]

In [313]:
## add the new ratings to the original ratings DataFrame
user_ratings_df = pd.DataFrame(user_rating)
ratings_df = pd.concat([ratings_df, user_ratings_df], axis=0)
new_data = Dataset.load_from_df(ratings_df, reader)


<surprise.dataset.DatasetAutoFolds at 0x1cd87e89160>

In [281]:
# make predictions for the user
# you'll probably want to create a list of tuples in the format (movie_id, predicted_score)
recc_movies = []
for m_id in new_df['movieId'].unique():
    prediction = best_algo.predict(1000,m_id)[3]
    recc_movies.append( (m_id, prediction) )

In [282]:
# order the predictions from highest to lowest rated

ranked_movies = sorted(recc_movies, key=lambda x:x[1], reverse = True)

In [302]:
ranked_movies

[(858, 4.267089211806522),
 (296, 4.2544888292107785),
 (318, 4.239404622316781),
 (1221, 4.231819254507933),
 (2959, 4.218571408559769),
 (750, 4.217218497526687),
 (50, 4.196163515648796),
 (1213, 4.182435668340897),
 (1276, 4.176829968961727),
 (1204, 4.175200920696795),
 (898, 4.171818590045663),
 (1261, 4.159018750933442),
 (2019, 4.150946001583618),
 (933, 4.150397848386026),
 (48516, 4.1439813120552875),
 (4973, 4.137416739307285),
 (2324, 4.137131647789212),
 (1228, 4.135482403014425),
 (56782, 4.126052161274939),
 (1233, 4.125357807216946),
 (1136, 4.118282185335452),
 (1201, 4.117427543141067),
 (1248, 4.1143224173895225),
 (904, 4.113924405538391),
 (1262, 4.1112671614303995),
 (1104, 4.11087450041822),
 (720, 4.109810540203233),
 (7153, 4.104908305159104),
 (1208, 4.104852940914796),
 (1223, 4.10420501778623),
 (1089, 4.100385525384872),
 (4993, 4.098349607155989),
 (1196, 4.090136423851055),
 (1237, 4.088837773855071),
 (3275, 4.087340681020859),
 (2571, 4.0853148406597555

We create a function `recommended_movies()` that takes in the parameters:
* `user_ratings`: list - list of tuples formulated as (user_id, movie_id) (should be in order of best to worst for this individual)
* `movie_title_df`: DataFrame 
* `n`: int - number of recommended movies 

We print out each recommended *n* movies in order from best to worst

In [286]:
# return the top n recommendations using the 
def recommended_movies(user_ratings,movie_title_df,n):
    for idx, rec in enumerate(user_ratings):
        title = movie_title_df.loc[movie_title_df['movieId'] == int(rec[0])]['title']
        print('Recommendation # ', idx+1, ': ', title, '\n')
        n-= 1
        if n == 0:
            break

Recommendation #  1 :  659    Godfather, The (1972)
Name: title, dtype: object 

Recommendation #  2 :  257    Pulp Fiction (1994)
Name: title, dtype: object 

Recommendation #  3 :  277    Shawshank Redemption, The (1994)
Name: title, dtype: object 

Recommendation #  4 :  922    Godfather: Part II, The (1974)
Name: title, dtype: object 

Recommendation #  5 :  2226    Fight Club (1999)
Name: title, dtype: object 



In [303]:
recommended_movies(ranked_movies, movies_df, 5)

Recommendation #  1 :  659    Godfather, The (1972)
Name: title, dtype: object 

Recommendation #  2 :  257    Pulp Fiction (1994)
Name: title, dtype: object 

Recommendation #  3 :  277    Shawshank Redemption, The (1994)
Name: title, dtype: object 

Recommendation #  4 :  922    Godfather: Part II, The (1974)
Name: title, dtype: object 

Recommendation #  5 :  2226    Fight Club (1999)
Name: title, dtype: object 



Combining all into one function:

In [309]:
def rate_and_rec(ratings_df, movies_df,num, genre=None):
    
    user_rating = movie_rater(movies_df, num, genre)
    
    ## add the new ratings to the original ratings DataFrame
    user_ratings_df = pd.DataFrame(user_rating)
    ratings_df = pd.concat([ratings_df, user_ratings_df], axis=0)
    new_data = Dataset.load_from_df(ratings_df, reader)

    # make predictions for the user
    # you'll probably want to create a list of tuples in the format (movie_id, predicted_score)
    recc_movies = []
    for m_id in new_df['movieId'].unique():
        prediction = best_algo.predict(1000,m_id)[3]
        recc_movies.append( (m_id, prediction) )
    
    # order the predictions from highest to lowest rated
    ranked_movies = sorted(recc_movies, key=lambda x:x[1], reverse = True)
    
    recommended_movies(ranked_movies,movies_df,num)

In [310]:
rate_and_rec(ratings_df, movies_df, 5, "Action")

      movieId               title                        genres
7221    73268  Daybreakers (2010)  Action|Drama|Horror|Thriller
Please rate this movie from 1-5, press n if you have not seen :
4
      movieId                                      title         genres
1908     2532  Conquest of the Planet of the Apes (1972)  Action|Sci-Fi
Please rate this movie from 1-5, press n if you have not seen :
4
      movieId                title                               genres
7271    74685  Crazies, The (2010)  Action|Drama|Horror|Sci-Fi|Thriller
Please rate this movie from 1-5, press n if you have not seen :
5
      movieId          title                 genres
6627    56156  Hitman (2007)  Action|Crime|Thriller
Please rate this movie from 1-5, press n if you have not seen :
1
      movieId               title            genres
8263   105121  Inescapable (2012)  Action|Drama|War
Please rate this movie from 1-5, press n if you have not seen :
2
Recommendation #  1 :  659    Godfather, The (