# Recommender System Modeling
In this notebook, we will work through multiple iterations of our movie recommender system model. First, we will create a simpler model using just the movie reviews from IMDb. This will give us a baseline model that we can potentially improve on by adding more detailed features.

In [1]:
# Imports
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity

## Simple Recommender System
This is a simple recommender system using cosine similarity that can be improved upon and replaced when all our data is properly formatted.

In [2]:
# Let's try this with only our MovieLens reviews first
ml_reviews = pd.read_csv('../Data/Large-Data/ml_reviews.csv')
ml_reviews

Unnamed: 0,user_id,imdb_id,scaled_rating,title
0,1,tt0110912,10.0,Pulp Fiction
1,1,tt0111495,7.0,Trois couleurs: Rouge
2,1,tt0108394,10.0,Trois couleurs: Bleu
3,1,tt0114787,10.0,Underground
4,1,tt0045152,7.0,Singin' in the Rain
...,...,...,...,...
24969860,162541,tt0382932,9.0,Ratatouille
24969861,162541,tt0389790,5.0,Bee Movie
24969862,162541,tt0952640,4.0,Alvin and the Chipmunks
24969863,162541,tt0468569,8.0,The Dark Knight


In [3]:
# First let's create a pivot table with a small sample of movie reviews
sampled_reviews = ml_reviews.sample(frac=0.01, random_state=42)

In [23]:
sampled_reviews['title'].nunique()

13868

In [29]:
print(f"Reviews: {len(sampled_reviews['imdb_id'])}")
print(f"Movies: {sampled_reviews['imdb_id'].nunique()}")
print(f"Users: {sampled_reviews['user_id'].nunique()}")

Reviews: 249699
Movies: 14341
Users: 89946


In [4]:
pivot = sampled_reviews.pivot_table(values='scaled_rating', index='title', columns='user_id')
pivot.head()

user_id,1,2,3,4,5,8,9,10,12,13,...,162528,162529,162532,162533,162534,162535,162536,162538,162539,162540
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#Followme,,,,,,,,,,,...,,,,,,,,,,
#Nerealnaya lyubov,,,,,,,,,,,...,,,,,,,,,,
$5 a Day,,,,,,,,,,,...,,,,,,,,,,
$9.99,,,,,,,,,,,...,,,,,,,,,,
$ellebrity,,,,,,,,,,,...,,,,,,,,,,


In [25]:
pivot.loc['#Followme']

user_id
1        NaN
2        NaN
3        NaN
4        NaN
5        NaN
          ..
162535   NaN
162536   NaN
162538   NaN
162539   NaN
162540   NaN
Name: #Followme, Length: 89946, dtype: float64

In [None]:
#pivot = ml_reviews.pivot_table(values = 'scaled_rating', index = 'title', columns = 'user_id')

#pivot.head()

In [5]:
pivot.shape

(13868, 89946)

In [6]:
pivot_sparse = sparse.csr_matrix(pivot.fillna(0))

In [7]:
distances = pairwise_distances(pivot_sparse, metric = 'cosine')

In [9]:
similarities = 1.0 - distances

In [10]:
recommender = pd.DataFrame(similarities, index = pivot.index, columns=pivot.index)

recommender.head()

title,#Followme,#Nerealnaya lyubov,$5 a Day,$9.99,$ellebrity,'71,'A' gai wak 2,'Breaker' Morant,'R Xmas,'Round Midnight,...,État de siège,Évolution,Être et avoir,"Ó Paí, Ó",Ôdishon,Ôkami kodomo no Ame to Yuki,Ônibus 174,Ôritsu uchûgun Oneamisu no tsubasa,Üvegtigris,Üç Maymun
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#Followme,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#Nerealnaya lyubov,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$5 a Day,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$9.99,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$ellebrity,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
recommender.loc['#Followme']

title
#Followme                             1.0
#Nerealnaya lyubov                    0.0
$5 a Day                              0.0
$9.99                                 0.0
$ellebrity                            0.0
                                     ... 
Ôkami kodomo no Ame to Yuki           0.0
Ônibus 174                            0.0
Ôritsu uchûgun Oneamisu no tsubasa    0.0
Üvegtigris                            0.0
Üç Maymun                             0.0
Name: #Followme, Length: 13868, dtype: float64

In [15]:
imdb_titles = pd.read_csv('../Data/Large-Data/imdb_titles.csv')

In [16]:
imdb_titles.head()

Unnamed: 0,titleId,title
0,tt0000001,Carmencita
1,tt0000002,Le clown et ses chiens
2,tt0000003,Pauvre Pierrot
3,tt0000004,Un bon bock
4,tt0000005,Blacksmith Scene


In [17]:
q = 'Toy Story'

for title in imdb_titles.loc[imdb_titles['title'].str.contains(q), 'title']:
    print(title)
    print(recommender[title].sort_values(ascending = False)[1:11])
    print()
    print()

Toy Story
title
Flicka                                                              0.050089
They Wait                                                           0.050089
DOA: Dead or Alive                                                  0.048198
Fancy Pants                                                         0.047519
Godspell: A Musical Based on the Gospel According to St. Matthew    0.045080
Courageous                                                          0.045080
Talk Radio                                                          0.044801
Ball of Fire                                                        0.043786
Punchline                                                           0.042475
Scooby-Doo and the Loch Ness Monster                                0.040071
Name: Toy Story, dtype: float64


Toy Story 2
title
Nightmare at Noon                                                0.077706
Her Alibi                                                        0.077706
Gertrud       

KeyError: "The Story Behind 'Toy Story'"

## Scale to include more reviews

In [30]:
over_100 = pd.read_csv('../Data/Large-Data/over_100_reviews.csv')
over_100

  over_100 = pd.read_csv('../Data/Large-Data/over_100_reviews.csv')


Unnamed: 0,user_id,imdb_id,scaled_rating,title
0,1,tt0110912,10.0,Pulp Fiction
1,1,tt0111495,7.0,Trois couleurs: Rouge
2,1,tt0108394,10.0,Trois couleurs: Bleu
3,1,tt0114787,10.0,Underground
4,1,tt0045152,7.0,Singin' in the Rain
...,...,...,...,...
27522147,ur3174947,tt0096895,8.0,Batman
27522148,ur0581842,tt0107977,6.0,Robin Hood: Men in Tights
27522149,ur3174947,tt0103776,8.0,Batman Returns
27522150,ur4581944,tt0102614,8.0,Out for Justice


In [34]:
print(f"Reviews: {len(over_100['imdb_id'])}")
print(f"Movies: {over_100['imdb_id'].nunique()}")
print(f"Users: {over_100['user_id'].nunique()}")

Reviews: 27522152
Movies: 14339
Users: 1229633


In [35]:
pivot_100 = over_100.pivot_table(values='scaled_rating', index='title', columns='user_id')
pivot_100.head()

  pivot_100 = over_100.pivot_table(values='scaled_rating', index='title', columns='user_id')


: 

In [None]:
pivot_100 = ml_reviews.pivot_table(values = 'scaled_rating', index = 'title', columns = 'user_id')

pivot_100.head()

In [32]:
pivot_100.shape

In [None]:
pivot_100_sparse = sparse.csr_matrix(pivot_100.fillna(0))

In [None]:
distances_100 = pairwise_distances(pivot_100_sparse, metric = 'cosine')

In [None]:
similarities_100 = 1.0 - distances_100

In [33]:
recommender_100 = pd.DataFrame(similarities_100, index = pivot_100.index, columns=pivot_100.index)

recommender_100.head()

In [None]:
imdb_titles.head()

Unnamed: 0,titleId,title
0,tt0000001,Carmencita
1,tt0000002,Le clown et ses chiens
2,tt0000003,Pauvre Pierrot
3,tt0000004,Un bon bock
4,tt0000005,Blacksmith Scene


In [None]:
q = 'Toy Story'

for title in imdb_titles.loc[imdb_titles['title'].str.contains(q), 'title']:
    print(title)
    print(recommender_100[title].sort_values(ascending = False)[1:11])
    print()
    print()

Toy Story
title
Flicka                                                              0.050089
They Wait                                                           0.050089
DOA: Dead or Alive                                                  0.048198
Fancy Pants                                                         0.047519
Godspell: A Musical Based on the Gospel According to St. Matthew    0.045080
Courageous                                                          0.045080
Talk Radio                                                          0.044801
Ball of Fire                                                        0.043786
Punchline                                                           0.042475
Scooby-Doo and the Loch Ness Monster                                0.040071
Name: Toy Story, dtype: float64


Toy Story 2
title
Nightmare at Noon                                                0.077706
Her Alibi                                                        0.077706
Gertrud       

KeyError: "The Story Behind 'Toy Story'"