In [17]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity

In [2]:
ratings = pd.read_csv('./Raw Data/ratings_large.csv')
movies = pd.read_csv('./Raw Data/movies_large.csv')

#### Drop timestamp and genres columns, merge ratings and movies, then create pivot table

In [3]:
ratings.drop(columns='timestamp', inplace=True)
movies.drop(columns='genres', inplace=True)

In [4]:
df = pd.merge(ratings, movies, left_on='movieId', right_on='movieId')
df.head(5)

Unnamed: 0,userId,movieId,rating,title
0,1,307,3.5,Three Colors: Blue (Trois couleurs: Bleu) (1993)
1,6,307,4.0,Three Colors: Blue (Trois couleurs: Bleu) (1993)
2,56,307,4.0,Three Colors: Blue (Trois couleurs: Bleu) (1993)
3,71,307,5.0,Three Colors: Blue (Trois couleurs: Bleu) (1993)
4,84,307,3.0,Three Colors: Blue (Trois couleurs: Bleu) (1993)


In [7]:
#https://stackoverflow.com/questions/61757170/python-unstacked-dataframe-is-too-big-causing-int32-overflow

chunk_size = 50000
chunks = [x for x in range(0, df.shape[0], chunk_size)]

for i in range(0, len(chunks) - 1):
    print(chunks[i], chunks[i + 1] - 1)

0 49999
50000 99999
100000 149999
150000 199999
200000 249999
250000 299999
300000 349999
350000 399999
400000 449999
450000 499999
500000 549999
550000 599999
600000 649999
650000 699999
700000 749999
750000 799999
800000 849999
850000 899999
900000 949999
950000 999999
1000000 1049999
1050000 1099999
1100000 1149999
1150000 1199999
1200000 1249999
1250000 1299999
1300000 1349999
1350000 1399999
1400000 1449999
1450000 1499999
1500000 1549999
1550000 1599999
1600000 1649999
1650000 1699999
1700000 1749999
1750000 1799999
1800000 1849999
1850000 1899999
1900000 1949999
1950000 1999999
2000000 2049999
2050000 2099999
2100000 2149999
2150000 2199999
2200000 2249999
2250000 2299999
2300000 2349999
2350000 2399999
2400000 2449999
2450000 2499999
2500000 2549999
2550000 2599999
2600000 2649999
2650000 2699999
2700000 2749999
2750000 2799999
2800000 2849999
2850000 2899999
2900000 2949999
2950000 2999999
3000000 3049999
3050000 3099999
3100000 3149999
3150000 3199999
3200000 3249999
3250000 

In [None]:
df_new = pd.concat([df.iloc[chunks[i]:chunks[i + 1] - 1 ].pivot_table(index='title', columns='userId', values='rating') for i in range(0, len(chunks) - 1)])

In [None]:
#pivot = df.pivot_table(values='rating', index='title', columns='userId')

### Create spare matrix and then calculate cosine similarity

In [None]:
pivot_sparse = sparse.csr_matrix(pivot.fillna(0))

In [None]:
distances = pairwise_distances(pivot_sparse, metric='cosine')

similarities = 1.0 - distances

In [None]:
recommender = pd.DataFrame(similarities, index=pivot.index, columns=pivot.index)
recommender

### explore recommender

In [None]:
title_word = ''

for title in movies.loc[movies['title'].str.contains(title_word), 'title']:
    print(title)
    print('Average rating', pivot.loc[title, :].mean())

In [None]:
title_word = ''
titles = pivot[pivot.index.str.contains(title_word)].index

for title in titles:
    print(title)
    print('10 closest movies')
    print(recommender_df[title].sort_values()[1:11])
    print('')
    print('*******************************************************************************************')
    print('')

# On the small dataset

In [5]:
ratings_small = pd.read_csv('./Raw Data/ratings_small.csv')
movies_small = pd.read_csv('./Raw Data/movies_small.csv')

In [6]:
df_small = pd.merge(ratings_small, movies_small, on='movieId')

In [7]:
df_small.drop(columns=['timestamp','genres'], inplace=True)

In [8]:
pivot = df_small.pivot_table(values='rating', index='title', columns='userId')

In [15]:
pivot_sparse = sparse.csr_matrix(pivot.fillna(0))

In [18]:
distances = pairwise_distances(pivot_sparse, metric='cosine')

similarities = 1.0 - distances

In [19]:
recommender = pd.DataFrame(similarities, index=pivot.index, columns=pivot.index)
recommender

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.141653,0.000000,...,0.000000,0.342055,0.543305,0.707107,0.0,0.000000,0.139431,0.327327,0.000000,0.0
'Hellboy': The Seeds of Creation (2004),0.000000,1.000000,0.707107,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
'Round Midnight (1986),0.000000,0.707107,1.000000,0.000000,0.000000,0.0,0.176777,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
'Salem's Lot (2004),0.000000,0.000000,0.000000,1.000000,0.857493,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
'Til There Was You (1997),0.000000,0.000000,0.000000,0.857493,1.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.211467,0.216295,0.097935,0.132489,...,0.000000,0.000000,0.000000,0.000000,0.0,1.000000,0.192259,0.000000,0.170341,0.0
xXx (2002),0.139431,0.000000,0.000000,0.000000,0.000000,0.0,0.089634,0.000000,0.276512,0.019862,...,0.069716,0.305535,0.173151,0.246482,0.0,0.192259,1.000000,0.270034,0.100396,0.0
xXx: State of the Union (2005),0.327327,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.156764,0.000000,...,0.000000,0.382543,0.177838,0.231455,0.0,0.000000,0.270034,1.000000,0.000000,0.0
¡Three Amigos! (1986),0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.372876,0.180009,0.169385,0.249586,...,0.180009,0.000000,0.000000,0.000000,0.0,0.170341,0.100396,0.000000,1.000000,0.0


In [29]:
title_word = 'matrix'

for title in movies.loc[movies['title'].str.contains(title_word), 'title']:
    print(title)
    print('Average rating', pivot.loc[title, :].mean())

Animatrix, The (2003)
Average rating 3.7


In [30]:
title_word = 'matrix'
titles = pivot[pivot.index.str.contains(title_word)].index

for title in titles:
    print(title)
    print('10 closest movies')
    print(recommender[title].sort_values()[1:6])
    print('')
    print('*******************************************************************************************')
    print('')

Animatrix, The (2003)
10 closest movies
title
Lucky One, The (2012)             0.0
Luck by Chance (2009)             0.0
Low Down Dirty Shame, A (1994)    0.0
Loving Vincent (2017)             0.0
Lovesick (2014)                   0.0
Name: Animatrix, The (2003), dtype: float64

*******************************************************************************************

