# Movie Recommendation

In [173]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

import surprise
from surprise.prediction_algorithms import *
from surprise import accuracy, Dataset, Reader, BaselineOnly

from surprise.model_selection import cross_validate, train_test_split
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV


## Business Problem

Build a model that provides top 5 movie recommendations to a user, based on their ratings of other movies

## Data Exploration

In [174]:
links_df = pd.read_csv('data/links.csv')
movies_df = pd.read_csv('data/movies.csv')
ratings_df = pd.read_csv('data/ratings.csv')
tags_df = pd.read_csv('data/tags.csv')

ratings.csv fields:  
 - userId,movieId,rating,timestamp  
 
tags.csv fields:  

 - userId,movieId,tag,timestamp  
 
movie.csv fields:  
 - movieId,title,genres  
 
links.csv fields:  
 - MovieId,imdbId,tmdbId    

In [175]:
links_df.head()
movies_df.head()
ratings_df.head()
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [176]:
links_df.info()
movies_df.info()
ratings_df.info()
tags_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  fl

We will be looking at ratings_df first, as it contains all the relevant columns.
Since we are interested in recommending movies users based on their ratings, target column is movieId, with userId as a feature. 

In [177]:
#Combine dfs to get a larger understanding of data
df = ratings_df.merge(movies_df, how="left", on="movieId")
df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama|Horror|Thriller
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action|Crime|Thriller
100833,610,168250,5.0,1494273047,Get Out (2017),Horror
100834,610,168252,5.0,1493846352,Logan (2017),Action|Sci-Fi


In [178]:
df['genres'].nunique()

951

Our unique genres consist of: 
Action
* Adventure
* Animation
* Children's
* Comedy
* Crime
* Documentary
* Drama
* Fantasy
* Film-Noir
* Horror
* Musical
* Mystery
* Romance
* Sci-Fi
* Thriller
* War
* Western
* (no genres listed)

But We have 951 types of genres, which include combinations of the above

In [179]:
df["rating"].describe()

count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

In [180]:
print("userIds", df['userId'].nunique())
print("movieIds", df['movieId'].nunique())
## We have 610 unique users
## and 9724 different movies

userIds 610
movieIds 9724


In [181]:
df['timestamp']

0          964982703
1          964981247
2          964982224
3          964983815
4          964982931
             ...    
100831    1493848402
100832    1493850091
100833    1494273047
100834    1493846352
100835    1493846415
Name: timestamp, Length: 100836, dtype: int64

Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
Which is hard to interpret. Converting to date-time format:



In [182]:
#print(dt.datetime.fromtimestamp(df.loc['timestamp']))
#print(dt.datetime.fromtimestamp(df.loc['timestamp']))

#dt.datetime.fromtimestamp(df['timestamp'])

df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
df['timestamp']
df.head()
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [183]:
##surprise needs a reader, but only on rating_scale if loading from a df
reader = Reader(rating_scale=(1, 5))

In [184]:
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[["userId", "movieId", "rating"]], reader)

In [185]:
##we can cross_validate
from surprise.model_selection import cross_validate
cross_validate(NormalPredictor(), data, cv=2, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 2 split(s).

                  Fold 1  Fold 2  Mean    Std     
RMSE (testset)    1.4171  1.4167  1.4169  0.0002  
MAE (testset)     1.1326  1.1304  1.1315  0.0011  
Fit time          0.07    0.07    0.07    0.00    
Test time         0.29    0.55    0.42    0.13    


{'test_rmse': array([1.41714807, 1.41666375]),
 'test_mae': array([1.13256752, 1.13035451]),
 'fit_time': (0.06745672225952148, 0.07093477249145508),
 'test_time': (0.2872030735015869, 0.5500357151031494)}

## Data Processing for Modeling

We will be looking and modeling ratings_df, as it contains all the columns we are interested in.

In [186]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [187]:
#remove timestamp column, which is unnecessary for modeling
ratings_df = ratings_df.drop("timestamp", axis=1)
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


Transform dataset into suprise compatible data

In [188]:
from surprise import Reader, Dataset

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df, reader)

In [189]:
full_dataset = data.build_full_trainset

#view the number of users and items
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  610 

Number of items:  9724


Splitting Data into Data set A, for training, and B for testing:

In [190]:
# A = 80% of the data, B = 20% of the data
raw_ratings = data.raw_ratings
threshold = int(0.9 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]


In [191]:
##Data is now the set A
data.raw_ratings = A_raw_ratings

In [192]:
trainset = data.build_full_trainset()
testset = data.construct_testset(B_raw_ratings)

#trainset is set on dataset A
#testset is set on dataset B
# this ensures no dataleakage

# we can now use 
    #algo.fit(trainset)
    #algo.test(testset)

In [199]:
#function for outputting the accuracy predictions on an algorithm

def train_and_test_pred (algo, trainset, testset): 
    algo.fit(trainset)
   
    train_predictions = algo.test(trainset.build_testset())
    print('biased accuracy on train set: ')
    print(accuracy.rmse(train_predictions))
    test_predictions = algo.test(testset)
    print('unbiased accuracy on test set: ')
    print(accuracy.rmse(test_predictions))
    


## Determining the Best Model

Compare different models and determine which is the best.   
We will use RMSE to evaluate models for now

### SVD:

In [206]:
# performing a gridsearch with SVD

params = {
    'n_factors': [20,50,100],
    'reg_all': [0.02, 0.05, 0.1]
}

grid_search = GridSearchCV(SVD, param_grid = params, cv=5)
grid_search.fit(data)


AttributeError: 'function' object has no attribute 'raw_ratings'

In [196]:
#view grid search results
print(grid_search.best_params)
print(grid_search.best_score)

{'rmse': {'n_factors': 50, 'reg_all': 0.05}, 'mae': {'n_factors': 50, 'reg_all': 0.05}}
{'rmse': 0.875402958832254, 'mae': 0.673759521498005}


In [197]:
## choose the best algo
algo = grid_search.best_estimator['rmse']

In [200]:
# Use our function to output train/test predictions
train_and_test_pred(algo, trainset, testset)

biased accuracy on train set: 
RMSE: 0.7831
0.7831045294294952
unbiased accuracy on test set: 
RMSE: 0.9772
0.9771716157293666


### KNNBasic:

In [208]:
## view crossvalidate score with KNNBasic:

knn_basic = KNNBasic(sim_options={"name": 'pearson', 'user_based':True})
cv_knn_basic = cross_validate(knn_basic, data, n_jobs=-1)
cv_knn_basic

{'test_rmse': array([0.97772026, 0.98612306, 0.98134088, 0.98265038, 0.97750178]),
 'test_mae': array([0.75216182, 0.75936307, 0.75841816, 0.75967003, 0.75049198]),
 'fit_time': (0.7911174297332764,
  0.7380468845367432,
  0.7588791847229004,
  0.8476622104644775,
  0.7950868606567383),
 'test_time': (1.3064630031585693,
  1.2762067317962646,
  1.1740305423736572,
  1.2528941631317139,
  1.2851343154907227)}

In [209]:
#average cross validation score:
np.mean(cv_knn_basic['test_rmse'])

0.9810672687440805

In [202]:
##test on train/test 

algo = knn_basic
train_and_test_pred(algo, trainset, testset)

Computing the pearson similarity matrix...
Done computing similarity matrix.
biased accuracy on train set: 
RMSE: 0.6744
0.6744138315275588
unbiased accuracy on test set: 
RMSE: 1.0474
1.0473584701569942


### KNNBaseline

In [210]:
##view our cross validation score with KNNBaseline:

knn_baseline = KNNBaseline(sim_options={"name": 'pearson', 'user_based':True})
cv_knn_baseline = cross_validate(knn_baseline, data)
cv_knn_baseline

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([0.89157978, 0.88014826, 0.88437932, 0.89261357, 0.88842335]),
 'test_mae': array([0.68093303, 0.67278037, 0.67569177, 0.68238241, 0.68049459]),
 'fit_time': (0.7638404369354248,
  1.3139007091522217,
  0.767310619354248,
  0.7360351085662842,
  0.7886383533477783),
 'test_time': (1.9715969562530518,
  1.51676607131958,
  1.4959335327148438,
  1.4656777381896973,
  1.4458370208740234)}

In [211]:
np.mean(cv_knn_baseline['test_rmse'])

0.8874288554755895

In [212]:
## now view our accuracy with train/test data
train_and_test_pred(knn_baseline, trainset, testset)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
biased accuracy on train set: 
RMSE: 0.5955
0.5954646024520138
unbiased accuracy on test set: 
RMSE: 0.9833
0.9832845005797262


## Making Recommendations