## MovieRec

In this notebook, I'll build a deep learning model movie recommendations system on the MovieLens 20M dataset.

In [1]:
from fastai import *
from fastai.collab import *
from fastai.tabular import *

### Loading MovieLens 20m dataset
Download [link](http://files.grouplens.org/datasets/movielens/ml-20m.zip) and dataset [readme.](http://files.grouplens.org/datasets/movielens/ml-20m-README.html)

In [2]:
path = 'data/'
ml20 = path + 'ml-20m/' 
!ls {ml20}

README.txt	   genome-tags.csv  movies.csv	 tags.csv
genome-scores.csv  links.csv	    ratings.csv


#### ratings.csv

In [3]:
ratings = pd.read_csv(ml20 + 'ratings.csv', encoding='latin-1')
print('Size:', len(ratings))
ratings.head(2)

Size: 20000263


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676


#### movies.csv

In [4]:
movies = pd.read_csv(ml20 + 'movies.csv', encoding='latin-1')
print('Size:', len(movies))
movies.head(2)

Size: 27278


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


#### links.csv

In [5]:
links = pd.read_csv(ml20 + 'links.csv', encoding='latin-1')
print('Size:', len(links))
links.head(2)

Size: 27278


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0


#### Or load 100k dataset

In [6]:
ml100 = path + 'ml-100k/' 
r_names = ['userId','movieId','rating','timestamp']
m_names = ['movieId', 'title', 'date', 'N', 'url', *[f'g{i}' for i in range(19)]]
#ratings = pd.read_csv(ml100 + 'u.data', delimiter='\t', header=None, names=r_names)
#movies = pd.read_csv(ml100 + 'u.item',  delimiter='|', encoding='latin-1', header=None, names=m_names)

### Loading my [IMDb ratings](https://www.imdb.com/user/ur15834927/ratings)

In [7]:
my_ratings = pd.read_csv(path + 'my_ratings.csv', encoding='latin-1')
my_ratings = my_ratings.sort_values('Date Rated').reset_index(drop=True)
print('Size:', len(my_ratings))

Size: 568


#### First and last movie I rated

In [8]:
my_ratings.iloc[[0, -1]]

Unnamed: 0,Const,Your Rating,Date Rated,Title,URL,Title Type,IMDb Rating,Runtime (mins),Year,Genres,Num Votes,Release Date,Directors
0,tt0111161,10,2007-08-12,The Shawshank Redemption,https://www.imdb.com/title/tt0111161/,movie,9.3,142.0,1994,Drama,2034334,1994-09-10,Frank Darabont
567,tt2397535,10,2015-11-28,Predestination,https://www.imdb.com/title/tt2397535/,movie,7.5,97.0,2014,"Drama, Mystery, Sci-Fi, Thriller",218780,2014-03-08,"Michael Spierig, Peter Spierig"


In [9]:
my_ratings.describe().round(decimals=1)

Unnamed: 0,Your Rating,IMDb Rating,Runtime (mins),Year,Num Votes
count,568.0,568.0,565.0,568.0,568.0
mean,8.0,7.6,109.4,2002.2,323541.1
std,1.9,0.9,37.7,10.3,332702.2
min,1.0,1.6,8.0,1942.0,13.0
25%,7.0,7.1,96.0,1999.8,84620.5
50%,8.0,7.7,108.0,2005.0,215472.0
75%,10.0,8.1,125.0,2008.0,454300.2
max,10.0,9.5,533.0,2014.0,2034334.0


### Format my_ratings

#### Add userId, rating, timestamp and imdbId to my_ratings

In [10]:
my_ratings['userId'] = max(ratings['userId']) + 1
my_ratings['rating'] = my_ratings['Your Rating'] / 2
my_ratings['timestamp'] = my_ratings['Date Rated'].astype('datetime64[ns]').astype('int64')//1000000000
my_ratings['imdbId'] = my_ratings['Const'].apply(lambda x: x[2:]).map(int)
my_ratings.iloc[[0, -1]]

Unnamed: 0,Const,Your Rating,Date Rated,Title,URL,Title Type,IMDb Rating,Runtime (mins),Year,Genres,Num Votes,Release Date,Directors,userId,rating,timestamp,imdbId
0,tt0111161,10,2007-08-12,The Shawshank Redemption,https://www.imdb.com/title/tt0111161/,movie,9.3,142.0,1994,Drama,2034334,1994-09-10,Frank Darabont,138494,5.0,1186876800,111161
567,tt2397535,10,2015-11-28,Predestination,https://www.imdb.com/title/tt2397535/,movie,7.5,97.0,2014,"Drama, Mystery, Sci-Fi, Thriller",218780,2014-03-08,"Michael Spierig, Peter Spierig",138494,5.0,1448668800,2397535


#### Merge my_ratings with links and select only necessary columns

In [11]:
my_movies = my_ratings.merge(links, on='imdbId')[['userId', 'movieId', 'rating', 'timestamp']]

my_movies.tail(2)

Unnamed: 0,userId,movieId,rating,timestamp
495,138494,77800,5.0,1448496000
496,138494,114935,5.0,1448668800


#### Concat my ratings with all ratings

In [12]:
all_ratings = pd.concat([ratings, my_movies], ignore_index=True)
all_ratings.tail(2)

Unnamed: 0,userId,movieId,rating,timestamp
20000758,138494,77800,5.0,1448496000
20000759,138494,114935,5.0,1448668800


#### Merge titles

In [13]:
all_ratings['order'] = all_ratings.index
all_ratings = pd.merge(all_ratings, movies[['movieId', 'title']])
all_ratings = all_ratings.sort_values('order')
all_ratings = all_ratings.reset_index(drop=True)
all_ratings = all_ratings.drop(columns='order')
#all_ratings.to_csv(path + 'all_ratings.csv', index=False)

all_ratings.tail(2)

Unnamed: 0,userId,movieId,rating,timestamp,title
20000758,138494,77800,5.0,1448496000,Four Lions (2010)
20000759,138494,114935,5.0,1448668800,Predestination (2014)


### Collaborative filtering

In [14]:
data = CollabDataBunch.from_df(all_ratings, item_name='title')
data.show_batch(2)

userId,title,target
110609,"Frighteners, The (1996)",3.0
28652,Ace Ventura: When Nature Calls (1995),2.0


In [15]:
learn = collab_learner(data, n_factors=50, y_range=(0.,5.))

In [16]:
learn.fit_one_cycle(1, 5e-3, wd=0.1)

epoch,train_loss,valid_loss
1,0.922044,0.929842
,,


In [17]:
#learn.save('dotprod')
#learn.load('20m_dotprod')

### Interpretation

In [18]:
learn.model

EmbeddingDotBias(
  (u_weight): Embedding(138495, 50)
  (i_weight): Embedding(25858, 50)
  (u_bias): Embedding(138495, 1)
  (i_bias): Embedding(25858, 1)
)

In [19]:
g = all_ratings.groupby('title')['rating'].count()
top_movies = g.sort_values(ascending=False).index.values[:1000]
top_movies[:10]

array(['Pulp Fiction (1994)', 'Forrest Gump (1994)', 'Shawshank Redemption, The (1994)',
       'Silence of the Lambs, The (1991)', 'Jurassic Park (1993)', 'Star Wars: Episode IV - A New Hope (1977)',
       'Braveheart (1995)', 'Terminator 2: Judgment Day (1991)', 'Matrix, The (1999)', "Schindler's List (1993)"],
      dtype=object)

#### Movie Bias

In [20]:
movie_bias = learn.bias(top_movies, is_item=True)

In [21]:
movie_bias

tensor([ 1.0209,  0.9481,  1.3641,  1.0751,  0.6756,  1.0189,  0.9420,  0.8335,
         0.9809,  1.1530,  0.8110,  0.9423,  0.8372,  0.4630,  1.1437,  0.8543,
         0.4571,  0.9365,  0.9464,  0.8049,  0.7296,  0.9644,  0.9016,  0.8714,
         0.5593,  0.6539,  0.5644,  0.7380,  1.0734,  0.9819,  0.8454,  0.7213,
         0.1922,  0.9019,  0.4874,  0.8543,  0.4958,  0.6260,  0.4883,  0.3187,
         0.8354,  0.5485,  0.4895,  0.7006,  0.8630,  0.7483,  0.2164,  0.6029,
         0.9172,  0.3767,  0.6090,  0.3635,  0.1641,  0.6695,  0.6602,  0.4359,
         0.8680,  0.6726,  0.7715,  0.7846,  0.8450,  0.9147,  0.7252,  0.9150,
         0.2435,  0.6629,  0.4951,  0.5911,  0.4746,  0.5410,  0.2708,  0.5809,
         0.7795,  0.1641,  0.5977,  0.5722,  0.7974,  0.5023,  0.9260,  0.7333,
         0.6762,  0.5127,  0.7799,  0.8833,  0.4317,  0.4284,  0.5000,  0.7728,
         0.5515,  0.6492,  0.6721,  0.6521,  0.3092,  0.6844,  0.2998,  0.3188,
         0.7010,  0.4488,  0.6197,  0.83