In [40]:
# import dependencies
import pandas as pd
import numpy as np
from numpy import array
from numpy import asarray
from numpy import zeros
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os.path
import re
import pickle
import requests
import math


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


In [41]:
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
tags = pd.read_csv("tags.csv")
links = pd.read_csv("links.csv")
genome_tags = pd.read_csv("genome_tags.csv")
genome_scores = pd.read_csv("genome_scores.csv")

In [42]:
%%time
user_ids = ratings.userId.unique().tolist()
user2idx = {userId: idx for (idx, userId) in enumerate(user_ids)}
idx2user = {idx: userId for (idx, userId) in enumerate(user_ids)}
ratings.userId = ratings.userId.map(user2idx)

movie_ids = ratings.movieId.unique().tolist()
movie2idx = {movieId: idx for (idx, movieId) in enumerate(movie_ids)}
idx2movie = {idx: movieId for (idx, movieId) in enumerate(movie_ids)}
ratings.movieId = ratings.movieId.map(movie2idx)

# map rest
tags.movieId = tags.movieId.map(movie2idx)
movies.movieId = movies.movieId.map(movie2idx)
links.movieId = links.movieId.map(movie2idx)

CPU times: user 402 ms, sys: 252 ms, total: 655 ms
Wall time: 723 ms


In [43]:
num_users = int(ratings.userId.nunique())
num_movies = int(ratings.movieId.nunique())
ratings["rating"] = ratings["rating"].values.astype(np.float32)
# min and max ratings will be used to normalize the ratings later
min_rating = min(ratings["rating"])
max_rating = max(ratings["rating"])

print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, min_rating, max_rating
    )
)


Number of users: 162541, Number of Movies: 59047, Min rating: 0.5, Max rating: 5.0


In [44]:
# drop some columns we do not care about
ratings = ratings.drop("timestamp", axis=1)
tags = tags.drop("timestamp", axis=1)

In [45]:
# drop duplicates for ratings for content 
ratings_content = ratings.drop_duplicates("movieId")

In [46]:
# we need take make sure that we have equal movies and ratings.
movies.isnull().sum()

movieId    3376
title         0
genres        0
dtype: int64

In [47]:

movies[~movies.movieId.isin(ratings_content.movieId)]

Unnamed: 0,movieId,title,genres
8371,,Break of Hearts (1935),Drama|Romance
8763,,Baby Blue Marine (1976),Drama
11556,,"Thousand and One Nights, A (1001 Nights) (1945)",Adventure
11997,,Suicide Killers (2006),Documentary
12173,,Alex in Wonder (Sex and a Girl) (2001),Comedy|Drama
...,...,...,...
62238,,Eternal Blood (2002),Action|Horror|Thriller
62239,,Big Business (1929),Comedy
62240,,The Student of Prague (1926),Horror
62298,,The Coldest Game (2019),(no genres listed)


In [48]:

movies = movies[movies['movieId'].notna()]

In [49]:
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [50]:
# basic stats
print("{} unique movies in ratings_content".format(len(ratings_content.movieId.unique())))
print("{} unique movies in tags.csv".format(len(tags.movieId.unique())))
print("{} unique movies in movies.csv".format(len(movies.movieId.unique())))

59047 unique movies in ratings_content
41876 unique movies in tags.csv
59047 unique movies in movies.csv


In [51]:
# do we have tags with movies that do NOT have a ratings?
len(tags[~tags["movieId"].isin(ratings_content.movieId)]["movieId"].unique())

1

In [52]:
# movies that are not in ratings, should be removed (using right join)
tags_content = pd.merge(tags, ratings_content, on="movieId", how="right")

In [53]:

tags_content[tags_content.movieId == 0]

Unnamed: 0,userId_x,movieId,tag,userId_y,rating
0,264.0,0.0,assassin,0,5.0
1,264.0,0.0,Black comedy,0,5.0
2,264.0,0.0,cult film,0,5.0
3,264.0,0.0,dark comedy,0,5.0
4,264.0,0.0,Quentin Tarantino,0,5.0
...,...,...,...,...,...
4762,162400.0,0.0,Oscar Nominee: Best Picture,0,5.0
4763,162400.0,0.0,Quentin Tarantino,0,5.0
4764,162400.0,0.0,satire,0,5.0
4765,162400.0,0.0,Steve Buscemi,0,5.0


In [54]:
tags_content.head()

Unnamed: 0,userId_x,movieId,tag,userId_y,rating
0,264.0,0.0,assassin,0,5.0
1,264.0,0.0,Black comedy,0,5.0
2,264.0,0.0,cult film,0,5.0
3,264.0,0.0,dark comedy,0,5.0
4,264.0,0.0,Quentin Tarantino,0,5.0


In [55]:
len(tags[~tags["movieId"].isin(ratings_content.movieId)]["movieId"].unique())

1

In [56]:
# after some clean
print("{} unique movies in ratings_content".format(len(ratings_content.movieId.unique())))
print("{} unique movies in tags_content".format(len(tags_content.movieId.unique())))
print("{} unique movies in movies.csv".format(len(movies.movieId.unique())))

59047 unique movies in ratings_content
59047 unique movies in tags_content
59047 unique movies in movies.csv


In [57]:
# handle genres (remove all non alphabet characters)
movies['genres'] = movies['genres'].str.replace(pat="|", repl=" ")
movies['genres'] = movies['genres'].str.replace(pat="-", repl="")
movies['genres'] = movies['genres'].str.replace(pat="(no genres listed)", repl="")
movies['genres'] = movies['genres'].str.replace(pat="(", repl="")
movies['genres'] = movies['genres'].str.replace(pat=")", repl="")
movies.head()

  movies['genres'] = movies['genres'].str.replace(pat="|", repl=" ")
  movies['genres'] = movies['genres'].str.replace(pat="(no genres listed)", repl="")
  movies['genres'] = movies['genres'].str.replace(pat="(", repl="")
  movies['genres'] = movies['genres'].str.replace(pat=")", repl="")


Unnamed: 0,movieId,title,genres
0,70.0,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,1103.0,Jumanji (1995),Adventure Children Fantasy
2,1017.0,Grumpier Old Men (1995),Comedy Romance
3,4270.0,Waiting to Exhale (1995),Comedy Drama Romance
4,1858.0,Father of the Bride Part II (1995),Comedy


In [58]:
# merge movies with ratings and map idx
movies_content = pd.merge(movies, ratings_content, on="movieId", how="inner")

In [59]:
movies.query("title == 'Batman Begins (2005)'")

Unnamed: 0,movieId,title,genres
10002,245.0,Batman Begins (2005),Action Crime IMAX


In [60]:
# there is some nans in tags
tags_content.isnull().sum().sort_values(ascending = False)

tag         17188
userId_x    17172
movieId         0
userId_y        0
rating          0
dtype: int64

In [61]:
# fill some nans 
tags_content.fillna("", inplace=True)

In [62]:
tags_content.isnull().sum().sort_values(ascending = False)

userId_x    0
movieId     0
tag         0
userId_y    0
rating      0
dtype: int64

In [63]:
tags_content = pd.DataFrame(tags_content.groupby('movieId')['tag'].apply(lambda x: "%s" % ' '.join(x)))
tags_content.reset_index(inplace=True)

In [64]:
tags_content.head()

Unnamed: 0,movieId,tag
0,0.0,assassin Black comedy cult film dark comedy Qu...
1,1.0,atmospheric enigmatic gentle lyrical meditativ...
2,2.0,atmospheric CRISIS OF FAITH DEATH OF A CHILD D...
3,3.0,biting cerebral cynical harsh irreverent madca...
4,4.0,Dance 50s imdb top 250 musical romance happy m...


In [65]:
movies_content.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,70.0,Toy Story (1995),Adventure Animation Children Comedy Fantasy,1,3.5
1,1103.0,Jumanji (1995),Adventure Children Fantasy,8,5.0
2,1017.0,Grumpier Old Men (1995),Comedy Romance,7,4.0
3,4270.0,Waiting to Exhale (1995),Comedy Drama Romance,140,3.0
4,1858.0,Father of the Bride Part II (1995),Comedy,17,4.0


In [66]:
# group tags by movie id to create corpus
content_data = pd.merge(movies_content, tags_content, on="movieId", how="right")
content_data["corpus"] = content_data[["genres", "tag"]].apply(lambda x: " ".join(x), axis=1)

In [67]:

movies_content[movies_content.movieId == 245]

Unnamed: 0,movieId,title,genres,userId,rating
10000,245.0,Batman Begins (2005),Action Crime IMAX,1,5.0


In [68]:
content_data[content_data.movieId == 245]

Unnamed: 0,movieId,title,genres,userId,rating,tag,corpus
245,245.0,Batman Begins (2005),Action Crime IMAX,1,5.0,action batman billionaire Christian Bale comic...,Action Crime IMAX action batman billionaire Ch...


In [69]:
# drop items we do not need 
content_data = content_data.drop("rating", axis=1)
content_data = content_data.drop("userId", axis=1)

In [70]:
content_data.head()

Unnamed: 0,movieId,title,genres,tag,corpus
0,0.0,Pulp Fiction (1994),Comedy Crime Drama Thriller,assassin Black comedy cult film dark comedy Qu...,Comedy Crime Drama Thriller assassin Black com...
1,1.0,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama,atmospheric enigmatic gentle lyrical meditativ...,Drama atmospheric enigmatic gentle lyrical med...
2,2.0,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama,atmospheric CRISIS OF FAITH DEATH OF A CHILD D...,Drama atmospheric CRISIS OF FAITH DEATH OF A C...
3,3.0,Underground (1995),Comedy Drama War,biting cerebral cynical harsh irreverent madca...,Comedy Drama War biting cerebral cynical harsh...
4,4.0,Singin' in the Rain (1952),Comedy Musical Romance,Dance 50s imdb top 250 musical romance happy m...,Comedy Musical Romance Dance 50s imdb top 250 ...


In [71]:
# find ratings count and mean
movie_summary = pd.DataFrame(ratings.groupby('movieId')['rating'].mean())
movie_summary['ratings_count'] = ratings.groupby('movieId')['rating'].count()
movie_summary.head()

Unnamed: 0_level_0,rating,ratings_count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4.188912,79672
1,4.072967,7058
2,3.981409,6616
3,3.946021,1269
4,4.050987,10895


In [72]:
# merge with content data
content_data = pd.merge(content_data, movie_summary, on="movieId", how="left")
content_data.head()


Unnamed: 0,movieId,title,genres,tag,corpus,rating,ratings_count
0,0.0,Pulp Fiction (1994),Comedy Crime Drama Thriller,assassin Black comedy cult film dark comedy Qu...,Comedy Crime Drama Thriller assassin Black com...,4.188912,79672
1,1.0,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama,atmospheric enigmatic gentle lyrical meditativ...,Drama atmospheric enigmatic gentle lyrical med...,4.072967,7058
2,2.0,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama,atmospheric CRISIS OF FAITH DEATH OF A CHILD D...,Drama atmospheric CRISIS OF FAITH DEATH OF A C...,3.981409,6616
3,3.0,Underground (1995),Comedy Drama War,biting cerebral cynical harsh irreverent madca...,Comedy Drama War biting cerebral cynical harsh...,3.946021,1269
4,4.0,Singin' in the Rain (1952),Comedy Musical Romance,Dance 50s imdb top 250 musical romance happy m...,Comedy Musical Romance Dance 50s imdb top 250 ...,4.050987,10895


In [73]:
content_data.shape

(59047, 7)

In [74]:

C = content_data["rating"].mean()
print(C)
m = content_data["ratings_count"].quantile(0.90)
print(m)

3.071374
413.0


In [75]:
# weighted ratings based on IMDB
def weighted_rating(x):
    v = x['ratings_count']
    R = x['rating']
    return (v / (v + m) * R) + (m / (m + v) * C)

In [76]:

content_data["weighted_rating"] = content_data.apply(weighted_rating, axis=1)

In [77]:
content_data = pd.merge(content_data, links, on="movieId", how="left")

In [78]:
content_data.sort_values("weighted_rating", ascending=False).head(20)

Unnamed: 0,movieId,title,genres,tag,corpus,rating,ratings_count,weighted_rating,imdbId,tmdbId
79,79.0,"Shawshank Redemption, The (1994)",Crime Drama,bad ending stephan king freedom hope inspirati...,Crime Drama bad ending stephan king freedom ho...,4.413576,81482,4.406807,111161,278.0
96,96.0,"Godfather, The (1972)",Crime Drama,italian mafia italy Mafia Marlon Brando Mafia ...,Crime Drama italian mafia italy Mafia Marlon B...,4.324336,52498,4.314556,68646,238.0
252,252.0,"Usual Suspects, The (1995)",Crime Mystery Thriller,imdb top 250 heist suspense thriller twist end...,Crime Mystery Thriller imdb top 250 heist susp...,4.284353,55366,4.275372,114814,629.0
276,276.0,"Godfather: Part II, The (1974)",Crime Drama,imdb top 250 Oscar (Best Picture) 100 Greatest...,Crime Drama imdb top 250 Oscar (Best Picture) ...,4.261758,34188,4.24755,71562,240.0
89,89.0,Schindler's List (1993),Drama War,based on a true story true story imdb top 250 ...,Drama War based on a true story true story imd...,4.247579,60411,4.239593,108052,424.0
297,297.0,Fight Club (1999),Action Crime Drama Thriller,complicated mindfuck violence atmospheric dark...,Action Crime Drama Thriller complicated mindfu...,4.228311,58773,4.220237,137523,550.0
288,288.0,Seven Samurai (Shichinin no samurai) (1954),Action Adventure Drama,Akira Kurosawa atmospheric epic historical lon...,Action Adventure Drama Akira Kurosawa atmosphe...,4.254769,13367,4.219302,47478,346.0
1063,1063.0,Rear Window (1954),Mystery Thriller,50s imdb top 250 Edgar Award (Best Motion Pict...,Mystery Thriller 50s imdb top 250 Edgar Award ...,4.237947,20162,4.214531,47396,567.0
999,999.0,12 Angry Men (1957),Drama,classic courtroom courtroom drama group psycho...,Drama classic courtroom courtroom drama group ...,4.243014,16569,4.21452,50083,389.0
1844,1844.0,One Flew Over the Cuckoo's Nest (1975),Drama,imdb top 250 Oscar (Best Picture) asylum based...,Drama imdb top 250 Oscar (Best Picture) asylum...,4.218662,36058,4.20567,73486,510.0
