In [47]:
import os
import pandas as pd
import numpy as np

In [54]:
#actual path
fileDir = os.path.dirname(os.path.realpath('__file__'))
#correct path
rating_file = os.path.join(fileDir, '../data/ratings_4std.csv')
movies_filename = 'movies_preprocessed.csv'
# read data
movies = pd.read_csv(movies_filename)
ratings = pd.read_csv(ratings_filename, usecols=['userId', 'movieId', 'rating'],
                     dtype={'userId':np.int32, 'movieId':np.int32, 'rating':np.float32})

In [55]:
# Removing the years from the 'title' column
# Strip function to get rid of any ending whitespace characters that may have appeared
movies['title'] = (movies.title.str.replace('(\(\d\d\d\d\))', '')
                               .apply(lambda x: x.strip()))

# Every genre is separated by a | so we simply have to call the split function on |
movies['genres'] = movies.genres.str.split('|')

In [56]:
movies.head()

Unnamed: 0,movieId,title,genres,key,year,n_ratings,mean_rating
0,0,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",Toy Story,1995,68469,3.886649
1,1,Jumanji,"[Adventure, Children, Fantasy]",Jumanji,1995,27143,3.246583
2,2,Grumpier Old Men,"[Comedy, Romance]",Grumpier Old Men,1995,15585,3.173981
3,3,Waiting to Exhale,"[Comedy, Drama, Romance]",Waiting to Exhale,1995,2989,2.87454
4,4,Father of the Bride Part II,[Comedy],Father of the Bride Part II,1995,15474,3.077291


In [57]:
import time
from contextlib import contextmanager
@contextmanager
def timer(msg):
    t0 = time.time()
    print(f'[{msg}] start.')
    yield
    elapsed_time = time.time() - t0
    print(f'[{msg}] done in {elapsed_time / 60:.2f} min.')

In [58]:
print('shape: ', movies.shape)
movies.head()

shape:  (53889, 7)


Unnamed: 0,movieId,title,genres,key,year,n_ratings,mean_rating
0,0,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",Toy Story,1995,68469,3.886649
1,1,Jumanji,"[Adventure, Children, Fantasy]",Jumanji,1995,27143,3.246583
2,2,Grumpier Old Men,"[Comedy, Romance]",Grumpier Old Men,1995,15585,3.173981
3,3,Waiting to Exhale,"[Comedy, Drama, Romance]",Waiting to Exhale,1995,2989,2.87454
4,4,Father of the Bride Part II,[Comedy],Father of the Bride Part II,1995,15474,3.077291


In [59]:
print('shape: ', ratings.shape)
ratings.head()

shape:  (18578174, 3)


Unnamed: 0,userId,movieId,rating
0,99948,11043,4.0
1,151227,1181,4.5
2,143305,11792,5.0
3,258710,1793,3.0
4,152942,143,3.0


In [60]:
ratings['rating'] = ratings['rating'] * 2
ratings['rating'] = ratings['rating'].astype(np.int8)
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18578174 entries, 0 to 18578173
Data columns (total 3 columns):
userId     int32
movieId    int32
rating     int8
dtypes: int32(2), int8(1)
memory usage: 159.5 MB


## Recommend based on popularity

In [61]:
most_voted = (ratings.groupby('movieId')[['rating']]
                     .count()
                     .sort_values('rating', ascending=False)
                     .reset_index())
most_voted = pd.merge(most_voted, movies, on='movieId').drop('rating', axis=1)
most_voted.head()

Unnamed: 0,movieId,title,genres,key,year,n_ratings,mean_rating
0,315,The Shawshank Redemption,"[Crime, Drama]",The Shawshank Redemption,1994,97999,4.424188
1,352,Forrest Gump,"[Comedy, Drama, Romance, War]",Forrest Gump,1994,97040,4.056585
2,293,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",Pulp Fiction,1994,92406,4.173971
3,587,The Silence of the Lambs,"[Crime, Horror, Thriller]",The Silence of the Lambs,1991,87899,4.151412
4,2487,The Matrix,"[Action, Sci-Fi, Thriller]",The Matrix,1999,84545,4.149695


## Corr based on recommenders

Collaborative filtering

Takes into account user preferences

Uses Pearson's R correlation

Chooses item based on previous chosen item

In [62]:
# Due to problems with pandas, we can't use pivot_table with our all data as it throws MemoryError.
# Therefore, for this part we will work with a sample data
sample_ratings = ratings.sample(n=100000, random_state=20)

# Creating our sparse matrix and fill NA's with 0 to avoid high memory usage.
pivot = pd.pivot_table(sample_ratings, values='rating', index='userId', columns='movieId', fill_value=0)
pivot.head()

movieId,0,1,2,3,4,5,6,7,8,9,...,51284,51356,51505,51528,51529,52174,52215,52311,52801,53277
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
pivot = pivot.astype(np.int8)
pivot.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66640 entries, 5 to 283228
Columns: 7609 entries, 0 to 53277
dtypes: int8(7609)
memory usage: 484.1 MB


In [64]:
rating_count = (ratings.groupby('movieId')[['rating']]
                       .count()
                       .sort_values('rating', ascending=False)
                       .reset_index())
rating_count = pd.merge(rating_count, movies, on='movieId')
rating_count.head()

Unnamed: 0,movieId,rating,title,genres,key,year,n_ratings,mean_rating
0,315,89412,The Shawshank Redemption,"[Crime, Drama]",The Shawshank Redemption,1994,97999,4.424188
1,352,87917,Forrest Gump,"[Comedy, Drama, Romance, War]",Forrest Gump,1994,97040,4.056585
2,293,83481,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",Pulp Fiction,1994,92406,4.173971
3,587,79145,The Silence of the Lambs,"[Crime, Horror, Thriller]",The Silence of the Lambs,1991,87899,4.151412
4,2487,75295,The Matrix,"[Action, Sci-Fi, Thriller]",The Matrix,1999,84545,4.149695


In [68]:
# Let's look something similar to 
rand_movie = 0

similar = pivot.corrwith(pivot[rand_movie], drop=True).to_frame(name='PearsonR')

In [69]:
similar_sum = similar.join(rating_count['rating'])
similar_top10 = similar_sum[similar_sum['rating']>=500].sort_values(['PearsonR', 'rating'], 
                                                            ascending=[False, False]).head(11)
# Add movie names
similar_top10 = pd.merge(similar_top10[0:11], movies[['title', 'movieId']], on='movieId')
similar_top10

Unnamed: 0,movieId,PearsonR,rating,title
0,0,1.0,89412.0,Toy Story
1,260,0.066556,14143.0,Ladybird Ladybird
2,1005,0.059409,4464.0,The Three Caballeros
3,912,0.037269,5079.0,Foreign Correspondent
4,1487,0.036296,2669.0,Love! Valour! Compassion!
5,3763,0.035137,512.0,"Affair of Love, An (Liaison pornographique, Une)"
6,1160,0.033153,3744.0,Mediterraneo
7,2702,0.03103,965.0,Cat's Eye
8,485,0.028229,9344.0,Made in America
9,1964,0.023546,1693.0,The Gnome-Mobile


## Truncated SVD

In [74]:
from sklearn.decomposition import TruncatedSVD
with timer('SVD'):
    X = pivot.T
    # 500 components
    SVD = TruncatedSVD(n_components=500, random_state=20)
    SVD_matrix = SVD.fit_transform(X)

[SVD] start.
[SVD] done in 3.29 min.


In [75]:
SVD.explained_variance_ratio_.sum()
#It covers about 50% of the whole data

0.5662047381989668

In [76]:
corr_mat = np.corrcoef(SVD_matrix)
corr_mat.shape

(7609, 7609)

In [77]:
corr_pulp_fiction = corr_mat[rand_movie]

# Recommending a Highly Correlated Movie.
# We will get different results due to decompression with svd
idx = X[(corr_pulp_fiction < 1.0) & (corr_pulp_fiction > 0.5)].index
movies.index = movies.movieId
movies.loc[idx, 'title']

movieId
0                                               Toy Story
260                                     Ladybird Ladybird
485                                       Made in America
1005                                 The Three Caballeros
3763     Affair of Love, An (Liaison pornographique, Une)
4578                                         The Tall Guy
6267                                       Pokémon Heroes
6669                 Journey of Hope (Reise der Hoffnung)
12096                                    The King of Kong
16566                                        The Way Back
Name: title, dtype: object