In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
#actual path
fileDir = os.path.dirname(os.path.realpath('__file__'))
#correct path
rating_file = os.path.join(fileDir, '../processed_data/ratings_4std.csv')
movies_file = os.path.join(fileDir, '../processed_data/movies_preprocessed.csv')
# read data
movies = pd.read_csv(movies_file)
ratings = pd.read_csv(rating_file, usecols=['userId', 'movieId', 'rating'],
                     dtype={'userId':np.int32, 'movieId':np.int32, 'rating':np.float32})

In [3]:
# Removing the years from the 'title' column
# Strip function to get rid of any ending whitespace characters that may have appeared
movies['title'] = (movies.title.str.replace('(\(\d\d\d\d\))', '')
                               .apply(lambda x: x.strip()))

# Every genre is separated by a | so we simply have to call the split function on |
movies['genres'] = movies.genres.str.split('|')

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres,key,year,n_ratings,mean_rating
0,0,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",Toy Story,1995,68469,3.886649
1,1,Jumanji,"[Adventure, Children, Fantasy]",Jumanji,1995,27143,3.246583
2,2,Grumpier Old Men,"[Comedy, Romance]",Grumpier Old Men,1995,15585,3.173981
3,3,Waiting to Exhale,"[Comedy, Drama, Romance]",Waiting to Exhale,1995,2989,2.87454
4,4,Father of the Bride Part II,[Comedy],Father of the Bride Part II,1995,15474,3.077291


In [5]:
import time
from contextlib import contextmanager
@contextmanager
def timer(msg):
    t0 = time.time()
    print(f'[{msg}] start.')
    yield
    elapsed_time = time.time() - t0
    print(f'[{msg}] done in {elapsed_time / 60:.2f} min.')

In [6]:
print('shape: ', movies.shape)
movies.head()

shape:  (53889, 7)


Unnamed: 0,movieId,title,genres,key,year,n_ratings,mean_rating
0,0,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",Toy Story,1995,68469,3.886649
1,1,Jumanji,"[Adventure, Children, Fantasy]",Jumanji,1995,27143,3.246583
2,2,Grumpier Old Men,"[Comedy, Romance]",Grumpier Old Men,1995,15585,3.173981
3,3,Waiting to Exhale,"[Comedy, Drama, Romance]",Waiting to Exhale,1995,2989,2.87454
4,4,Father of the Bride Part II,[Comedy],Father of the Bride Part II,1995,15474,3.077291


In [7]:
print('shape: ', ratings.shape)
ratings.head()

shape:  (15394772, 3)


Unnamed: 0,userId,movieId,rating
0,99948,11043,4.0
1,151227,1181,4.5
2,143305,11792,5.0
3,152942,143,3.0
4,128941,2823,3.5


In [8]:
ratings['rating'] = ratings['rating'] * 2
ratings['rating'] = ratings['rating'].astype(np.int8)
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15394772 entries, 0 to 15394771
Data columns (total 3 columns):
 #   Column   Dtype
---  ------   -----
 0   userId   int32
 1   movieId  int32
 2   rating   int8 
dtypes: int32(2), int8(1)
memory usage: 132.1 MB


## Recommend based on popularity

In [9]:
most_voted = (ratings.groupby('movieId')[['rating']]
                     .count()
                     .sort_values('rating', ascending=False)
                     .reset_index())
most_voted = pd.merge(most_voted, movies, on='movieId').drop('rating', axis=1)
most_voted.head()

Unnamed: 0,movieId,title,genres,key,year,n_ratings,mean_rating
0,315,The Shawshank Redemption,"[Crime, Drama]",The Shawshank Redemption,1994,97999,4.424188
1,352,Forrest Gump,"[Comedy, Drama, Romance, War]",Forrest Gump,1994,97040,4.056585
2,293,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",Pulp Fiction,1994,92406,4.173971
3,587,The Silence of the Lambs,"[Crime, Horror, Thriller]",The Silence of the Lambs,1991,87899,4.151412
4,2487,The Matrix,"[Action, Sci-Fi, Thriller]",The Matrix,1999,84545,4.149695


## Corr based on recommenders

Collaborative filtering

Takes into account user preferences

Uses Pearson's R correlation

Chooses item based on previous chosen item

In [10]:
# Due to problems with pandas, we can't use pivot_table with our all data as it throws MemoryError.
# Therefore, for this part we will work with a sample data
sample_ratings = ratings.sample(n=100000, random_state=20)

# Creating our sparse matrix and fill NA's with 0 to avoid high memory usage.
pivot = pd.pivot_table(sample_ratings, values='rating', index='userId', columns='movieId', fill_value=0)
pivot.head()

movieId,0,1,2,3,4,5,6,7,8,9,...,51529,51538,51582,51852,51860,52109,52237,52363,52589,52813
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
pivot = pivot.astype(np.int8)
pivot.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68358 entries, 5 to 283215
Columns: 7268 entries, 0 to 52813
dtypes: int8(7268)
memory usage: 474.3 MB


In [12]:
rating_count = (ratings.groupby('movieId')[['rating']]
                       .count()
                       .sort_values('rating', ascending=False)
                       .reset_index())
rating_count = pd.merge(rating_count, movies, on='movieId')
rating_count.head()

Unnamed: 0,movieId,rating,title,genres,key,year,n_ratings,mean_rating
0,315,83576,The Shawshank Redemption,"[Crime, Drama]",The Shawshank Redemption,1994,97999,4.424188
1,352,81493,Forrest Gump,"[Comedy, Drama, Romance, War]",Forrest Gump,1994,97040,4.056585
2,293,77322,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",Pulp Fiction,1994,92406,4.173971
3,587,73223,The Silence of the Lambs,"[Crime, Horror, Thriller]",The Silence of the Lambs,1991,87899,4.151412
4,2487,68873,The Matrix,"[Action, Sci-Fi, Thriller]",The Matrix,1999,84545,4.149695


In [13]:
# Let's look something similar to 
rand_movie = 0

similar = pivot.corrwith(pivot[rand_movie], drop=True).to_frame(name='PearsonR')

In [14]:
similar_sum = similar.join(rating_count['rating'])
similar_top10 = similar_sum[similar_sum['rating']>=500].sort_values(['PearsonR', 'rating'], 
                                                            ascending=[False, False]).head(11)
# Add movie names
similar_top10 = pd.merge(similar_top10[0:11], movies[['title', 'movieId']], on='movieId')
similar_top10

Unnamed: 0,movieId,PearsonR,rating,title
0,0,1.0,83576.0,Toy Story
1,3000,0.065317,602.0,Babes in Toyland
2,2715,0.053352,723.0,Little Nemo: Adventures in Slumberland
3,2155,0.052198,1103.0,Swept Away (Travolti da un insolito destino ne...
4,788,0.045991,4928.0,Harriet the Spy
5,1405,0.036715,2308.0,Amos & Andrew
6,1371,0.033369,2401.0,Night Falls on Manhattan
7,2725,0.025487,718.0,Perfect Blue
8,1960,0.024366,1307.0,Darby O'Gill and the Little People
9,3310,0.022718,500.0,The Great Muppet Caper


## Truncated SVD

In [None]:
from sklearn.decomposition import TruncatedSVD
with timer('SVD'):
    X = pivot.T
    # 500 components
    SVD = TruncatedSVD(n_components=500, random_state=20)
    SVD_matrix = SVD.fit_transform(X)

[SVD] start.


In [19]:
SVD.explained_variance_ratio_.sum()
#It covers about 50% of the whole data

0.5877481951922316

In [20]:
corr_mat = np.corrcoef(SVD_matrix)
corr_mat.shape

(7268, 7268)

In [25]:
rand_movie = 101

corr_pulp_fiction = corr_mat[rand_movie]

# Recommending a Highly Correlated Movie.
# We will get different results due to decompression with svd
idx = X[(corr_pulp_fiction < 1.0) & (corr_pulp_fiction > 0.5)].index
movies.index = movies.movieId
movies.loc[idx, 'title']

movieId
265                                 Little Odessa
305      Three Colors: White (Trzy kolory: Bialy)
345                      Clear and Present Danger
755                        Someone Else's America
1681                                    Firestorm
2932                                  Creepshow 2
6137          Europa Europa (Hitlerjunge Salomon)
12193                   The Jane Austen Book Club
12631                              Happy-Go-Lucky
Name: title, dtype: object