In [1]:
import pandas as pd
import os

In [2]:
import random

### Read the data

In [3]:
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')

In [4]:
movies_df.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Data preprocessing

#### 1. Create user and item columns for LensKit

In [6]:
# Rename the id columns to the generic 'user' / 'item' names and keep the raw
# movie title under 'title_original'
ratings_df = ratings_df.rename(columns = {'userId' : 'user', 'movieId': 'item'})
movies_df = movies_df.rename(columns={'movieId': 'item','title' : 'title_original'})

In [7]:
def compute_title_year(row):
    """Split a title such as ``"Toy Story (1995)"`` into its year and clean title.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'title_original'`` (a DataFrame row, a dict, ...).

    Returns
    -------
    tuple of (int, str)
        ``(year, title)``. ``title`` is lower-cased and has the trailing
        ``"(YYYY)"`` suffix removed. When the title does not end with a
        parenthesised four-digit year, ``year`` is ``0`` and the complete
        lower-cased title is returned.
    """
    title_original = str(row['title_original']).strip()

    # The suffix must be exactly "(YYYY)". Checking the parentheses as well as
    # the digits keeps titles that merely end in digits (e.g. "Movie 12345")
    # from being truncated and given a bogus year.
    suffix = title_original[-6:]
    year = suffix[1:-1]
    has_year_suffix = (
        len(suffix) == 6
        and suffix[0] == '('
        and suffix[-1] == ')'
        and year.isdecimal()  # isdecimal() only accepts characters int() can parse
    )

    if has_year_suffix:
        return int(year), title_original[:-6].strip().lower()
    return 0, title_original.lower()

In [8]:

# Run compute_title_year on every row and store the two returned values in the
# new 'year' and 'title' columns
movies_df[['year', 'title']] = movies_df.apply(compute_title_year, axis=1, result_type ='expand')
display(movies_df.head(10))

Unnamed: 0,item,title_original,genres,year,title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,toy story
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,jumanji
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,grumpier old men
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,waiting to exhale
4,5,Father of the Bride Part II (1995),Comedy,1995,father of the bride part ii
5,6,Heat (1995),Action|Crime|Thriller,1995,heat
6,7,Sabrina (1995),Comedy|Romance,1995,sabrina
7,8,Tom and Huck (1995),Adventure|Children,1995,tom and huck
8,9,Sudden Death (1995),Action,1995,sudden death
9,10,GoldenEye (1995),Action|Adventure|Thriller,1995,goldeneye


#### 2. Remove duplicated records from the dataframe:

In [9]:
movies_count = movies_df.groupby(['title','year'],as_index=False).size()

In [10]:
movies_count.loc[movies_count['size'] > 1]

Unnamed: 0,title,year,size
1942,confessions of a dangerous mind,2002,2
2727,emma,1996,2
2789,eros,2004,2
7395,saturn 3,1980,2
9289,war of the worlds,2005,2


In [11]:
# When several rows share the same (title, year), keep only the last one
movies_df= movies_df.drop_duplicates(subset=['title','year'], keep="last")

#### Check to see if the duplicated records are removed:

In [12]:
movies_count = movies_df.groupby(['title','year'],as_index=False).size()
movies_count.loc[movies_count['size'] > 1]

Unnamed: 0,title,year,size


In [13]:
plots_df = pd.read_csv('wiki_movie_plots_deduped.csv')

In [14]:
# Reduce the plot table to the (title, year, plot) columns needed for the join
# with movies_df; titles are stripped and lower-cased
plots_df = (
    plots_df
    .assign(title=plots_df['Title'].str.strip().str.lower())
    .rename(columns={'Release Year': 'year', 'Plot': 'plot'})
    [['title', 'year', 'plot']]
)

In [15]:
plots_count = plots_df.groupby(['title','year'],as_index=False).size()
plots_count.loc[plots_count['size'] > 1]

Unnamed: 0,title,year,size
61,12,2006,2
69,12 years a slave,2013,2
71,123,2002,3
75,13,2006,2
87,14,2006,3
...,...,...,...
34002,woman in gold,2015,2
34100,x-men: days of future past,2014,2
34101,x-men: first class,2011,2
34179,yashoda krishna,1975,2


In [16]:
plots_df = plots_df.drop_duplicates(subset=['title','year'], keep="last")

In [17]:
plots_count = plots_df.groupby(['title','year'],as_index=False).size()
plots_count.loc[plots_count['size'] > 1]

Unnamed: 0,title,year,size


In [18]:

# Keep only the movies that have a plot for the same (title, year)
movies_df = (
    movies_df
    .merge(plots_df, how='inner', on=['title', 'year'])
    [['item', 'title', 'year', 'genres', 'plot']]
)

In [19]:
movies_df.head(5)

Unnamed: 0,item,title,year,genres,plot
0,1,toy story,1995,Adventure|Animation|Children|Comedy|Fantasy,In a world where toys are living things who pr...
1,2,jumanji,1995,Adventure|Children|Fantasy,"In 1869, near Brantford, New Hampshire, two br..."
2,3,grumpier old men,1995,Comedy|Romance,The feud between Max (Walter Matthau) and John...
3,4,waiting to exhale,1995,Comedy|Drama|Romance,"""Friends are the People who let you be yoursel..."
4,5,father of the bride part ii,1995,Comedy,The film begins five years after the events of...


In [20]:
ratings_df.head(5)

Unnamed: 0,user,item,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


#### 2. Non-personalized recommendations: ordering by weighted average rating

In [21]:
test_movies = movies_df.copy()

In [22]:
# Number of ratings per movie, returned as the columns 'item' and 'user'
user_rating_count = ratings_df.groupby('item', as_index=False)['user'].count()

In [23]:
# Mean rating per movie, returned as the columns 'item' and 'rating'
rating_avg = ratings_df.groupby('item', as_index=False)['rating'].mean()

In [24]:
# Attach the per-movie rating count; movies without any rating are dropped
test_movies = test_movies.merge(user_rating_count, how='inner', on='item')

In [25]:
# Attach the per-movie mean rating
test_movies = test_movies.merge(rating_avg, how='inner', on='item')

In [26]:
# After the count aggregation the 'user' column holds the number of ratings per movie
test_movies = test_movies.rename(columns = {'user':'vote_count'})

In [27]:
# After the mean aggregation the 'rating' column holds the average rating per movie
test_movies = test_movies.rename(columns = {'rating':'avg_rating'})

In [28]:
test_movies.head(5)

Unnamed: 0,item,title,year,genres,plot,vote_count,avg_rating
0,1,toy story,1995,Adventure|Animation|Children|Comedy|Fantasy,In a world where toys are living things who pr...,215,3.92093
1,2,jumanji,1995,Adventure|Children|Fantasy,"In 1869, near Brantford, New Hampshire, two br...",110,3.431818
2,3,grumpier old men,1995,Comedy|Romance,The feud between Max (Walter Matthau) and John...,52,3.259615
3,4,waiting to exhale,1995,Comedy|Drama|Romance,"""Friends are the People who let you be yoursel...",7,2.357143
4,5,father of the bride part ii,1995,Comedy,The film begins five years after the events of...,49,3.071429


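We calculate a weighted rating for each movie using IMDb's formula:

$$WR = \frac{v}{v+m}\,R + \frac{m}{v+m}\,C$$

where $v$ is the number of ratings the movie received, $m$ is the minimum number of ratings required, $R$ is the movie's average rating and $C$ is the mean of the average ratings across all movies.

![IMDb weighted rating formula](ww.png "IMDb weighted rating formula")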

However, some movies have only a single rating, so their averages are unreliable. To reduce this bias we keep only the movies whose number of ratings is at or above the 90th percentile (roughly the 10% most-rated movies) and use that threshold as $m$:

In [29]:
# C: mean of the per-movie average ratings
C= test_movies['avg_rating'].mean()
# m: 90th percentile of the per-movie vote counts; used as the vote-count
# threshold that selects roughly the top 10% most rated movies
m= test_movies['vote_count'].quantile(0.9)

In [30]:
# Filter first, then copy: the result is an independent frame, so adding the
# 'score' column later does not trigger SettingWithCopyWarning (copying before
# .loc still leaves the filtered result flagged as a slice).
quantiled_movies = test_movies.loc[test_movies['vote_count'] >= m].copy()

We define the function that calculates the weighted rating as follows:

In [31]:
def weighted_rating(x, m=m, C=C):
    """Return the IMDB-style weighted rating for ``x``.

    ``x`` must be indexable by ``'vote_count'`` and ``'avg_rating'``. ``m`` and
    ``C`` default to the values the notebook-level names ``m`` and ``C`` hold
    when this cell is executed.
    """
    votes = x['vote_count']
    mean_rating = x['avg_rating']
    total = votes + m
    # Blend the movie's own average with C, weighted by its share of votes
    return (votes / total) * mean_rating + (m / total) * C

In [32]:
# weighted_rating only uses column lookups and arithmetic, so it can be applied
# to the whole frame at once instead of row by row with apply(axis=1).
quantiled_movies['score'] = weighted_rating(quantiled_movies)

We then sort the result and recommend the movies with highest weighted rating:

In [33]:
# Rank the movies from highest to lowest weighted score
quantiled_movies = quantiled_movies.sort_values('score', ascending=False)

# Show the 10 best-scoring movies
quantiled_movies[['title', 'vote_count', 'avg_rating', 'score']].head(10)

Unnamed: 0,title,vote_count,avg_rating,score
1317,fight club,218,4.272936,4.105076
160,pulp fiction,307,4.197068,4.080868
281,schindler's list,220,4.225,4.065501
186,forrest gump,329,4.164134,4.058309
533,goodfellas,126,4.25,3.993289
890,saving private ryan,188,4.146277,3.977431
1034,american history x,129,4.217054,3.972451
526,one flew over the cuckoo's nest,133,4.203008,3.967248
500,reservoir dogs,131,4.20229,3.963901
359,dr. strangelove or: how i learned to stop worr...,97,4.268041,3.951106


In [34]:
def get_best_movies_for_genre(genre, top_n=10, movies=None):
    """Return the highest-scoring movies whose genre list contains ``genre``.

    Parameters
    ----------
    genre : str
        Genre name to look for inside the pipe-separated ``genres`` column
        (for example ``'Comedy'``). It is matched as a literal substring.
    top_n : int, default 10
        Maximum number of rows to return.
    movies : pandas.DataFrame, optional
        Frame with ``title``, ``genres`` and ``score`` columns. Defaults to the
        notebook-level ``quantiled_movies`` frame.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows with the columns ``title``, ``genres`` and
        ``score``, ordered by descending ``score``.
    """
    candidates = quantiled_movies if movies is None else movies
    # regex=False: genre names such as "(no genres listed)" contain regex
    # metacharacters and must be matched literally.
    # na=False: rows without genre information are excluded instead of putting
    # NaN into the boolean mask.
    has_genre = candidates['genres'].str.contains(genre, regex=False, na=False)
    # Sort here rather than relying on an earlier cell having sorted the frame;
    # a stable sort leaves an already-sorted frame in the same order.
    ranked = candidates.loc[has_genre].sort_values('score', ascending=False, kind='stable')
    return ranked[['title', 'genres', 'score']].head(top_n)

In [35]:
get_best_movies_for_genre('Comedy')

Unnamed: 0,title,genres,score
160,pulp fiction,Comedy|Crime|Drama|Thriller,4.080868
186,forrest gump,Comedy|Drama|Romance|War,4.058309
359,dr. strangelove or: how i learned to stop worr...,Comedy|War,3.951106
317,fargo,Comedy|Crime|Drama|Thriller,3.947029
515,monty python and the holy grail,Adventure|Comedy|Fantasy,3.939077
558,back to the future,Adventure|Comedy|Sci-Fi,3.875116
1765,snatch,Comedy|Crime|Thriller,3.861827
1134,office space,Comedy|Crime,3.817419
0,toy story,Adventure|Animation|Children|Comedy|Fantasy,3.80395
365,trainspotting,Comedy|Crime|Drama,3.79571


#### 3. Content based recommender system:

##### 3.1 Content similarity:


We want to find movies that are similar to a given movie. We use the movie plots to measure similarity, following the steps below:

1. Remove the stop words from the plots of each movie

2. Create TF-IDF matrix of the plots without stop words

3. Use cosine similarity to measure how similar the movies' plots are

4. Given a movie, recommend the movies with the most similar plots

In [36]:
# Work on a copy of movies_df for the content-based steps
content_df = movies_df.copy()

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectoriser that drops scikit-learn's built-in English stop words
tf_idf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the plots and turn each plot into a TF-IDF row
tfidf_matrix = tf_idf.fit_transform(content_df['plot'])

In [38]:
tfidf_matrix

<4783x58746 sparse matrix of type '<class 'numpy.float64'>'
	with 1046003 stored elements in Compressed Sparse Row format>

In [39]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between all TF-IDF rows.
# NOTE(review): this equals cosine similarity only if the TF-IDF rows are
# L2-normalised (TfidfVectorizer's default norm) - confirm if the vectoriser
# settings change.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [40]:
# Map every title to the *position* of its row in content_df (the same position
# is used for its row in the similarity matrix). Movies were de-duplicated on
# (title, year), so one title can still appear for several years; keep the
# first occurrence so that indices[title] is always a single position.
# (Series.drop_duplicates() would compare the values, not the titles.)
indices = pd.Series(range(len(content_df)), index=content_df['title'])
indices = indices[~indices.index.duplicated(keep='first')]


def get_recommendations(title, cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title; it is stripped and lower-cased before the lookup because
        the titles in ``content_df`` are stored lower-cased.
    cosine_sim : 2-D array
        Pairwise similarity matrix whose rows and columns follow the row order
        of ``content_df``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles ordered from most to least similar. The query
        movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``content_df``.
    """
    key = str(title).strip().lower()
    if key not in indices.index:
        raise KeyError(f"Unknown movie title: {title!r}")
    idx = indices[key]

    # Remove the query movie by position instead of assuming it sorts first:
    # another movie with an identical score could otherwise take its place.
    sim_scores = pd.Series(cosine_sim[idx]).drop(idx)
    top = sim_scores.sort_values(ascending=False, kind='stable').head(top_n)
    return content_df['title'].iloc[top.index.tolist()]

In [41]:
get_recommendations('jumanji', cosine_sim)

4280        alan partridge: alpha papa
2289                       harvard man
820                     small soldiers
2494                             equus
4762    jumanji: welcome to the jungle
3517                     reign over me
4516                too late for tears
773                          b. monkey
2153                white water summer
4086                           carnage
Name: title, dtype: object

#### 4. Collaborative filtering

In [42]:
cf_movies =  movies_df.copy()

In [43]:
cf_ratings = ratings_df.copy()

In [44]:
cf_ratings

Unnamed: 0,user,item,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [45]:
# All user ids as a flat array, then the distinct ids among them
column_values = cf_ratings['user'].to_numpy()
unique_users = pd.unique(column_values)

In [46]:
# Use a dedicated, seeded generator so that "Restart & Run All" always picks
# the same user, without touching the global random state.
RANDOM_SEED = 42
random_user = random.Random(RANDOM_SEED).choice(unique_users)

In [47]:
from surprise import Reader, Dataset, SVD

In [48]:
from surprise.model_selection.validation import cross_validate

In [49]:
# Declare the real bounds of the rating column. Reader() assumes a 1-5 scale by
# default, which misrepresents the data whenever ratings fall outside it.
rating_scale = (cf_ratings['rating'].min(), cf_ratings['rating'].max())
reader = Reader(rating_scale=rating_scale)
data = Dataset.load_from_df(cf_ratings[['user', 'item', 'rating']], reader)
svd = SVD()
# 5-fold cross-validation reporting RMSE and MAE for every fold
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8755  0.8733  0.8815  0.8698  0.8725  0.8745  0.0039  
MAE (testset)     0.6731  0.6704  0.6760  0.6721  0.6679  0.6719  0.0027  
Fit time          9.08    8.87    8.67    8.68    8.62    8.78    0.17    
Test time         0.31    0.14    0.15    0.19    0.14    0.19    0.06    


{'test_rmse': array([0.87545264, 0.87325345, 0.8814591 , 0.86984016, 0.87252703]),
 'test_mae': array([0.67309901, 0.67041482, 0.67602052, 0.67206251, 0.66788055]),
 'fit_time': (9.076875925064087,
  8.865256786346436,
  8.67281460762024,
  8.679453611373901,
  8.621935844421387),
 'test_time': (0.3061521053314209,
  0.13763213157653809,
  0.15159344673156738,
  0.192474365234375,
  0.13763213157653809)}

In [None]:
trainset = data.build_full_trainset()
svd.trai