In [None]:
# import comet_ml at the top of your file
from comet_ml import Experiment

# Create an experiment. The API key is read from the COMET_API_KEY
# environment variable so the secret is never stored in the notebook; export
# it before starting the kernel. When the variable is unset, None is passed
# and comet_ml resolves the key from its own configuration (see Comet docs).
# NOTE(review): the key that used to be hardcoded here was published with the
# notebook and should be revoked and regenerated.
import os

experiment = Experiment(
    api_key=os.environ.get("COMET_API_KEY"),
    project_name="recommender-system",
    workspace="celelwazi",
)

## import libraries

In [1]:
import numpy as np 
import pandas as pd  

#visualization
import scipy as sp  
from scipy import sparse 
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
import urllib.request

# Entity featurization and similarity computation
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

#recommender systems
from surprise import *
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate

#saving model
import pickle

## load datasets

In [2]:
import os
from pathlib import Path

# Directory holding the competition CSV files. Override it with the
# MOVIE_DATA_DIR environment variable; the fallback is the location the
# notebook was originally run from, so existing setups keep working.
DATA_DIR = Path(os.environ.get(
    "MOVIE_DATA_DIR",
    "C:/Users/lwazi/Downloads/edsa-movie-recommendation-wilderness"))

train_df = pd.read_csv(DATA_DIR / "train.csv")
test_df = pd.read_csv(DATA_DIR / "test.csv")
genome_scores = pd.read_csv(DATA_DIR / "genome_scores.csv")
genome_tags = pd.read_csv(DATA_DIR / "genome_tags.csv")
imdb_data = pd.read_csv(DATA_DIR / "imdb_data.csv")
links_df = pd.read_csv(DATA_DIR / "links.csv")
movies_df = pd.read_csv(DATA_DIR / "movies.csv")
sample_submission = pd.read_csv(DATA_DIR / "sample_submission.csv")
tags_df = pd.read_csv(DATA_DIR / "tags.csv")

In [3]:
# Preview the first rows of the training data read from train.csv.
train_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739
2,146790,5459,5.0,1076215539
3,106362,32296,2.0,1423042565
4,9041,366,3.0,833375837


In [4]:
# Preview the first rows of the test data read from test.csv.
test_df.head()

Unnamed: 0,userId,movieId
0,1,2011
1,1,4144
2,1,5767
3,1,6711
4,1,7318


In [7]:
# Per-column count of missing values, followed by the (rows, columns) shape.
missing_per_column = train_df.isna().sum()
print(missing_per_column)
print(train_df.shape)

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
(10000038, 4)


In [8]:
# Preview the first rows of the movie metadata read from movies.csv.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
# Count rows whose title repeats an earlier row's title. duplicated() flags
# every occurrence after the first, so the sum is the number of surplus rows.
# (A previous bare `.loc[...]` lookup here was removed: its result was
# neither displayed nor stored.)
duplicates = movies_df.duplicated(subset=["title"])
print(f"Number of duplicate rows = {duplicates.sum()}")

Number of duplicate rows = 98


In [10]:
# Peek at movies whose genre is the "(no genres listed)" placeholder.
no_genre_mask = movies_df["genres"].eq("(no genres listed)")
movies_df[no_genre_mask].head()

Unnamed: 0,movieId,title,genres
15881,83773,Away with Words (San tiao ren) (1999),(no genres listed)
16060,84768,Glitterbug (1994),(no genres listed)
16351,86493,"Age of the Earth, The (A Idade da Terra) (1980)",(no genres listed)
16491,87061,Trails (Veredas) (1978),(no genres listed)
17404,91246,Milky Way (Tejút) (2007),(no genres listed)


In [18]:
# Non-null count per column among the rows with the placeholder genre.
movies_df.loc[movies_df["genres"].eq("(no genres listed)")].count()

movieId    5062
title      5062
genres     5062
dtype: int64

## data cleaning

In [19]:
# Work on an independent copy (copy() is deep by default) so that train_df
# itself stays untouched by the cleaning steps.
df = train_df.copy()

In [20]:
# Remove the timestamp column, which is not used when modelling user ratings.
# Dropping by name (instead of slicing off the last column) keeps this cell
# idempotent: re-running it no longer strips `rating` as well.
df = df.drop(columns=["timestamp"], errors="ignore")

In [56]:
# Collect every row whose title already appeared earlier in movies_df.
duplicate = movies_df[movies_df.duplicated(subset="title")]

In [54]:
# Display the rows of movies_df whose title repeats an earlier row.
duplicate

Unnamed: 0,movieId,title,genres
9065,26982,Men with Guns (1997),Drama
12909,64997,War of the Worlds (2005),Action|Sci-Fi
12984,65665,Hamlet (2000),Drama
13177,67459,Chaos (2005),Crime|Drama|Horror
16120,85070,Blackout (2007),Drama
...,...,...,...
61521,206117,The Lonely Island Presents: The Unauthorized B...,Comedy
61525,206125,Lost & Found (2018),Comedy|Drama
61697,206674,Camino (2016),Comedy
61800,206925,The Plague (2006),Documentary


In [60]:
# Spot-check one repeated title: compare its first and second occurrence.
movies_df[movies_df["title"].eq("Men with Guns (1997)")]

Unnamed: 0,movieId,title,genres
1710,1788,Men with Guns (1997),Action|Drama
9065,26982,Men with Guns (1997),Drama


In [61]:
# Second spot-check of a repeated title.
movies_df[movies_df["title"].eq("War of the Worlds (2005)")]

Unnamed: 0,movieId,title,genres
10055,34048,War of the Worlds (2005),Action|Adventure|Sci-Fi|Thriller
12909,64997,War of the Worlds (2005),Action|Sci-Fi


In [62]:
# Remove repeated movie titles, keeping only the first occurrence of each;
# in the titles inspected, the first occurrence lists more genres.
movies_df = movies_df.drop_duplicates(subset="title", keep="first")


In [63]:
# Confirm the clean-up: every per-column count should now be zero.
repeated_titles = movies_df["title"].duplicated(keep="first")
movies_df[repeated_titles].count()

movieId    0
title      0
genres     0
dtype: int64

In [64]:
# Turn the "|" separator between genres into a single space.
genres_spaced = movies_df["genres"].str.replace("|", " ", regex=False)
movies_df["genres"] = genres_spaced
# Show the result
movies_df.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


## merging datasets

In [100]:
# Attach title and genres to every rating by joining on the shared movieId
# column (default inner join).
df_movie_info = train_df.merge(movies_df, on="movieId")

In [67]:
# Combine the movie metadata with the IMDB runtime/budget columns on
# "movieId" and derive numeric `year` and `budget` columns.
movies_merge = pd.merge(
    movies_df, imdb_data[["runtime", "movieId", "budget"]], on="movieId")

# The release year is taken from the last whitespace-separated token of the
# title with its parentheses removed, e.g. "Toy Story (1995)" -> "1995".
year_token = (
    movies_merge["title"].str.split().str[-1]
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
)

# Keep rows whose token is exactly four digits; rows with no token at all are
# kept with a missing year, as before. The previous filter excluded tokens
# matching "[A-z]" -- a range that also spans the punctuation between "Z" and
# "a" -- followed by a redundant "[a-z]" pass, and any other non-numeric token
# would have crashed the float conversion.
has_year = year_token.str.match(r"^[0-9]{4}$", na=False)
keep = has_year | year_token.isna()
# .assign on the filtered frame avoids writing into a slice of the original.
movies_merge = movies_merge.loc[keep].assign(
    year=year_token[keep].astype(float))

# Snapshot with numeric years, taken before incomplete rows are removed.
movies_merge1 = movies_merge.copy()

# Drop rows with any missing value, keep only budgets written with a leading
# "$", then strip the "$" and thousands separators and convert to float.
movies_merge = movies_merge.dropna()
movies_merge = movies_merge[movies_merge["budget"].str.startswith("$")]
movies_merge = movies_merge.assign(
    budget=movies_merge["budget"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(float))
movies_merge.head()

Unnamed: 0,movieId,title,genres,runtime,budget,year
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,81.0,30000000.0,1995.0
1,2,Jumanji (1995),Adventure Children Fantasy,104.0,65000000.0,1995.0
2,3,Grumpier Old Men (1995),Comedy Romance,101.0,25000000.0,1995.0
3,4,Waiting to Exhale (1995),Comedy Drama Romance,124.0,16000000.0,1995.0
4,5,Father of the Bride Part II (1995),Comedy,106.0,30000000.0,1995.0


## EXPLORATORY DATA ANALYSIS

In [68]:
# Headline figures for the ratings table.
n_movies = np.unique(df["movieId"]).size
n_users = np.unique(df["userId"]).size
print("Summary of df:")
print(f"Number of unique movies = {n_movies}")
print(f"Number of unique users = {n_users}")
print(f"Total number of movie ratings = {df.shape[0]}")

Summary of df:
Number of unique movies = 48213
Number of unique users = 162541
Total number of movie ratings = 10000038


### ratings analysis

In [70]:
# 10 users with the most ratings
# Number of ratings submitted by each user (Series indexed by userId).
ratings_per_user = df_movie_info.groupby(by="userId")["rating"].count()

no_of_rated_movies_per_user = df_movie_info.copy(deep=True)
# Attach each user's rating count to every one of that user's rows. The
# lookup must go through userId: assigning the per-user Series directly
# aligns it on the frame's row index, which silently stores counts that
# belong to unrelated users (or NaN).
no_of_rated_movies_per_user["user_ratings"] = (
    no_of_rated_movies_per_user["userId"].map(ratings_per_user))
ratings_per_user.sort_values(ascending=False)[:10]

userId
72315     12934
80974      3679
137293     3586
33844      3207
20055      3050
109731     2670
49403      2667
92046      2578
110971     2287
30879      2283
Name: rating, dtype: int64

In [72]:
# Largest and smallest number of ratings submitted by a single user.
per_user_counts = no_of_rated_movies_per_user.groupby(
    by="userId")["rating"].count()
most_ratings = max(per_user_counts)
fewest_ratings = min(per_user_counts)

print("Summary of user ratings: ")
print(f"Highest number of ratings by single user in database = {most_ratings}")

print(f"Lowest number of ratings by single user in database = {fewest_ratings}")


Summary of user ratings: 
Highest number of ratings by single user in database = 12934
Lowest number of ratings by single user in database = 1


In [73]:
# Highest-rated movies: mean rating per title, sorted ascending so that
# tail() shows the titles with the highest averages.
ratings_per_movie = df_movie_info.groupby(
    by="title")["rating"].mean().sort_values(ascending=True)
ratings_per_movie.tail()

# NOTE(review): this ranking does not give a true reflection -- presumably
# titles with very few ratings reach a 5.0 average; confirm by showing the
# rating count next to the mean. Elaboration to be completed.

title
Detective Conan: The Raven Chaser (2009)    5.0
Rockabilly Zombie Weekend (2013)            5.0
Sweaty Betty (2015)                         5.0
Robert Williams Mr. Bitchin' (2013)         5.0
Day the Universe Changed, The (1985)        5.0
Name: rating, dtype: float64

In [99]:
# Ten titles that received the largest number of ratings.
rating_counts = df_movie_info.groupby("title")["rating"].count()
rating_counts.sort_values(ascending=False).head(10)

title
Shawshank Redemption, The (1994)             32831
Forrest Gump (1994)                          32383
Pulp Fiction (1994)                          31697
Silence of the Lambs, The (1991)             29444
Matrix, The (1999)                           29014
Star Wars: Episode IV - A New Hope (1977)    27560
Jurassic Park (1993)                         25518
Schindler's List (1993)                      24004
Braveheart (1995)                            23722
Fight Club (1999)                            23536
Name: rating, dtype: int64

In [77]:
# NOTE(review): bar chart of the rating distribution, currently disabled --
# every statement below is commented out, so this cell does nothing.
# Either re-enable it or delete the cell.
#plt.figure(figsize=(12, 7))
#df_movie_info['rating'].value_counts().plot(kind='bar',
                                            #title='Distribution of the ratings')
#plt.ylabel('count')
#plt.xlabel('rating')
#plt.rcParams['patch.force_edgecolor'] = True
#experiment.log_figure(figure=plt, figure_name='Ratings per Movie')
#plt.show()

### genres analysis

In [105]:
# Flatten the genre column into one list holding every genre occurrence.
genres = (
    movies_df["genres"]
    .str.replace("|", " ", regex=False)
    .str.replace("(no genres listed)", "Not_listed", regex=False)
)
genres_string = " ".join(genres)
all_genres = genres_string.split()


# function to get unique genres, to avoid repetition in the dataframe
def unique_genres(genre_list):
    """Return the distinct genres found in ``genre_list``.

    Parameters
    ----------
    genre_list : iterable of str
        Genre names, possibly containing repeats.

    Returns
    -------
    list of str
        Each genre exactly once, in order of first appearance.
    """
    # The previous version built a set but then returned list(genre_list),
    # i.e. the input with all its duplicates. dict keys are unique and keep
    # insertion order, so the result is also deterministic.
    return list(dict.fromkeys(genre_list))

# Report how many distinct genres occur in the movie data.
all_genres = unique_genres(all_genres)
genre_total = len(set(all_genres))
print(f"The number are {genre_total} unique movie genres in the dataframe")


The number are 20 unique movie genres in the dataframe


In [None]:
# Plot Movie count per Genre
from collections import Counter

# `genre_dict` was not defined anywhere above, so this cell raised a
# NameError on a fresh kernel. It is built here as genre -> number of movies
# from `genres`, the Series of space-separated genre strings created in the
# genres-analysis cell. NOTE(review): assumes that is the mapping the plot
# was meant to show -- confirm.
genre_dict = Counter(" ".join(genres).split())

fig, ax = plt.subplots(figsize=(10, 10))
ax.bar(x=list(genre_dict.keys()), height=list(genre_dict.values()))
ax.set_title("Movie count per genre")
ax.set_xlabel("Genre")
ax.set_ylabel("Number of movies")
ax.tick_params(axis="x", rotation=90)
plt.show()