In [1]:
# import dependencies
import pandas as pd
import numpy as np
from numpy import array
from numpy import asarray
from numpy import zeros
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os.path
import re
import pickle
import requests
import math


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Read the six CSV tables from the current working directory.
movies, ratings, tags, links, genome_tags, genome_scores = (
    pd.read_csv(csv_path)
    for csv_path in (
        "movies.csv",
        "ratings.csv",
        "tags.csv",
        "links.csv",
        "genome_tags.csv",
        "genome_scores.csv",
    )
)

In [None]:
%%time
# Re-index users to consecutive integers, in order of first appearance in ratings.
user_ids = ratings["userId"].unique().tolist()
user2idx = dict(zip(user_ids, range(len(user_ids))))
idx2user = dict(enumerate(user_ids))
ratings["userId"] = ratings["userId"].map(user2idx)

# Same re-indexing for movies, again driven by the ids present in ratings.
movie_ids = ratings["movieId"].unique().tolist()
movie2idx = dict(zip(movie_ids, range(len(movie_ids))))
idx2movie = dict(enumerate(movie_ids))
ratings["movieId"] = ratings["movieId"].map(movie2idx)

# Apply the movie index to the remaining tables; ids that are not keys of
# movie2idx come out of .map() as NaN.
for frame in (tags, movies, links):
    frame["movieId"] = frame["movieId"].map(movie2idx)

In [None]:
# Count distinct users and movies in the ratings table.
num_users = int(ratings["userId"].nunique())
num_movies = int(ratings["movieId"].nunique())
ratings["rating"] = ratings["rating"].astype(np.float32)
# min and max ratings will be used to normalize the ratings later.
# Series.min()/max() are vectorised; the Python builtins min()/max() would
# iterate over every element of the column one by one.
min_rating = ratings["rating"].min()
max_rating = ratings["rating"].max()

print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, min_rating, max_rating
    )
)


In [None]:
# The timestamp column is not used below; remove it from both tables.
ratings = ratings.drop(columns="timestamp")
tags = tags.drop(columns="timestamp")

In [None]:
# Keep a single (first-seen) rating row per movie for the content tables.
ratings_content = ratings.drop_duplicates(subset="movieId", keep="first")

In [None]:
# We need to make sure that movies and ratings cover the same movies:
# count the missing values per column of movies.
movies.isna().sum()

In [None]:

# Rows of movies whose movieId does not occur in ratings_content.
movies.loc[~movies["movieId"].isin(ratings_content["movieId"])]

In [None]:

# Keep only movies that received an index from movie2idx (non-NaN movieId).
# .copy() makes the result an independent frame, so the later column
# assignments on movies["genres"] do not raise SettingWithCopyWarning.
movies = movies[movies['movieId'].notna()].copy()

In [None]:
# Re-check the per-column missing-value counts after filtering movies.
movies.isna().sum()

In [None]:
# basic stats
# nunique() skips NaN, so tag rows whose movie id had no entry in movie2idx
# (NaN after the mapping step) are not counted as one extra "movie".
print("{} unique movies in ratings_content".format(ratings_content.movieId.nunique()))
print("{} unique movies in tags.csv".format(tags.movieId.nunique()))
print("{} unique movies in movies.csv".format(movies.movieId.nunique()))

In [None]:
# Do we have tags for movies that do NOT have a rating?
# NOTE: movieId was already mapped through movie2idx, so every such movie is
# NaN here and this count (NaN included) can be at most 1.
tags.loc[~tags["movieId"].isin(ratings_content["movieId"]), "movieId"].nunique(dropna=False)

In [None]:
# Right join on ratings_content: tag rows for movies without a rating are
# dropped, and every rated movie gets at least one row (tag NaN if untagged).
tags_content = tags.merge(ratings_content, how="right", on="movieId")

In [None]:

# Inspect the joined rows for the movie with index 0.
tags_content.loc[tags_content["movieId"].eq(0)]

In [None]:
# Preview the first rows of the tags/ratings_content join.
tags_content.head()

In [None]:
# Same check as before the join: distinct movieId values (NaN included) in
# tags that do not occur in ratings_content. tags itself was not modified.
tags.loc[~tags["movieId"].isin(ratings_content["movieId"]), "movieId"].nunique(dropna=False)

In [None]:
# Distinct movie counts after cleaning (NaN, if any, still counted as a value).
print("{} unique movies in ratings_content".format(ratings_content["movieId"].nunique(dropna=False)))
print("{} unique movies in tags_content".format(tags_content["movieId"].nunique(dropna=False)))
print("{} unique movies in movies.csv".format(movies["movieId"].nunique(dropna=False)))

In [None]:
# Normalise the genres column into space-separated alphabetic tokens:
#   "|"                  -> " "  (genre separator)
#   "-"                  -> ""   (hyphens inside a genre name)
#   "(no genres listed)" -> ""   (placeholder for missing genres)
#   "(" / ")"            -> ""   (any parentheses left over)
# regex=False forces literal matching. "|", "(" and ")" are regex
# metacharacters, and the default value of `regex` differs between pandas
# versions, so relying on the default is not portable.
movies['genres'] = (
    movies['genres']
    .str.replace("|", " ", regex=False)
    .str.replace("-", "", regex=False)
    .str.replace("(no genres listed)", "", regex=False)
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
)
movies.head()

In [None]:
# Inner join: keep only movies that appear in both movies and ratings_content.
movies_content = movies.merge(ratings_content, how="inner", on="movieId")

In [None]:
# Spot-check a single title in the movies table.
movies.loc[movies["title"] == 'Batman Begins (2005)']

In [None]:
# There are some NaNs in tags_content: missing values per column, largest first.
tags_content.isna().sum().sort_values(ascending=False)

In [None]:
# Replace every missing value in tags_content (all columns) with an empty
# string, so the tags can be joined as text afterwards.
tags_content = tags_content.fillna("")

In [None]:
# Missing values per column after the fill, largest first.
tags_content.isna().sum().sort_values(ascending=False)

In [None]:
# Collapse to one row per movie: all of its tag strings joined by a single
# space, with movieId restored as a regular column.
tags_content = (
    tags_content
    .groupby("movieId")["tag"]
    .apply(" ".join)
    .reset_index()
)

In [None]:
# Preview the per-movie tag strings (one row per movieId).
tags_content.head()

In [None]:
# Preview the movies joined with their ratings_content row.
movies_content.head()

In [None]:
# Attach the per-movie tag string to the movie metadata. The right join keeps
# every movieId present in tags_content, even if it is missing from
# movies_content (its movie columns are then NaN).
content_data = pd.merge(movies_content, tags_content, on="movieId", how="right")
# corpus = "<genres> <tag>". Vectorised string concatenation replaces the
# row-wise apply(" ".join, axis=1); fillna("") keeps a missing genres/tag
# value from raising TypeError (str.join cannot join NaN).
content_data["corpus"] = (
    content_data["genres"].fillna("") + " " + content_data["tag"].fillna("")
)

In [None]:

# Spot-check the movie with index 245 in movies_content.
movies_content.loc[movies_content["movieId"].eq(245)]

In [None]:
# Same spot-check (movie index 245) in content_data.
content_data.loc[content_data["movieId"].eq(245)]

In [None]:
# Remove the single-rating columns carried over from ratings_content.
content_data = content_data.drop(columns=["rating", "userId"])

In [None]:
# Preview content_data after dropping the rating and userId columns.
content_data.head()

In [None]:
# Per-movie mean rating ("rating") and number of ratings ("ratings_count"),
# computed in one pass over the grouped ratings; indexed by movieId.
movie_summary = ratings.groupby("movieId")["rating"].agg(
    rating="mean",
    ratings_count="count",
)
movie_summary.head()

In [None]:
# Add the mean rating and ratings count to each content_data row.
content_data = content_data.merge(movie_summary, how="left", on="movieId")
content_data.head()


In [None]:
# (rows, columns) of content_data after merging in movie_summary.
content_data.shape

In [None]:

# Inputs of the weighted rating formula:
#   C - mean of the per-movie mean ratings
#   m - 90th percentile of the per-movie ratings count
C = content_data["rating"].mean()
m = content_data["ratings_count"].quantile(q=0.90)
print(C)
print(m)

In [None]:
# weighted ratings based on IMDB
def weighted_rating(x):
    """Return the IMDB-style weighted rating for ``x``.

    The item's own rating is blended with the global value ``C``; the more
    ratings the item has relative to ``m``, the more its own rating counts.

    Parameters
    ----------
    x : mapping-like
        Anything indexable by ``'ratings_count'`` and ``'rating'``.

    Returns
    -------
    ``v / (v + m) * R + m / (m + v) * C`` where ``v = x['ratings_count']``
    and ``R = x['rating']``.

    Notes
    -----
    ``m`` and ``C`` are not parameters: they are looked up in the enclosing
    global (notebook) scope at call time.
    """
    votes = x['ratings_count']
    own_rating = x['rating']
    own_weight = votes / (votes + m)
    prior_weight = m / (m + votes)
    return own_weight * own_rating + prior_weight * C

In [None]:

# weighted_rating only performs element-wise arithmetic on the
# 'ratings_count' and 'rating' lookups, so it can be evaluated once on whole
# columns instead of once per row through apply(axis=1).
content_data["weighted_rating"] = weighted_rating(content_data)

In [None]:
# Attach the columns of links to each movie; movies without a links row keep NaN.
content_data = content_data.merge(links, how="left", on="movieId")

In [None]:
# Show the 20 rows with the highest weighted rating.
content_data.sort_values(by="weighted_rating", ascending=False).iloc[:20]