In [1]:
import pandas as pd
import numpy as np

## Load data

MovieLens data sets were collected by the GroupLens Research Project
at the University of Minnesota.
 
This data set consists of:
- 100,000 ratings (1-5) from 943 users on 1682 movies. 
- Each user has rated at least 20 movies. 


The data was collected through the MovieLens web site
(movielens.umn.edu) during the seven-month period from September 19th, 
1997 through April 22nd, 1998. This data has been cleaned up - users
who had less than 20 ratings or did not have complete demographic
information were removed from this data set

In [2]:
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('./ml-100k/u.data', names=ratings_cols, sep='\t')
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
movies_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv('./ml-100k/u.item', names=movies_cols, sep='|', encoding='latin-1')
movies.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Pre process data

In [4]:
# remove timestamp from ratings
ratings = ratings.drop(columns=['timestamp'])

In [5]:
# remove unwanted columns from movies
movies = movies.drop(columns=['video release date', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])

In [6]:
# strip year of release from `movie title`
movies['movie title'] = movies['movie title'].apply(lambda x: ' '.join(x.split(' ')[:-1]))

## Create user input

In [7]:
# user input as a dictionary
user_input = {
    'title':['Beauty and the Beast', 'Lion King, The', 'Toy Story', 'Jumanji', 'Cinderella', 'Space Jam'],
    'rating':[4, 5, 5, 2, 4.5, 4]
}

In [8]:
# convert user input to a pandas df
input_data = pd.DataFrame.from_dict(user_input)
input_data

Unnamed: 0,title,rating
0,Beauty and the Beast,4.0
1,"Lion King, The",5.0
2,Toy Story,5.0
3,Jumanji,2.0
4,Cinderella,4.5
5,Space Jam,4.0


Add movie_id to input_data

In [9]:
# filter out movies dataframe using isin
filtered_movies = movies[movies['movie title'].isin(input_data.title.to_list())]
filtered_movies

Unnamed: 0,movie id,movie title,release date,IMDb URL
0,1,Toy Story,01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...
70,71,"Lion King, The",01-Jan-1994,"http://us.imdb.com/M/title-exact?Lion%20King,%..."
417,418,Cinderella,01-Jan-1950,http://us.imdb.com/M/title-exact?Cinderella%20...
587,588,Beauty and the Beast,01-Jan-1991,http://us.imdb.com/M/title-exact?Beauty%20and%...
754,755,Jumanji,01-Jan-1995,http://us.imdb.com/M/title-exact?Jumanji%20(1995)
819,820,Space Jam,15-Nov-1996,http://us.imdb.com/M/title-exact?Space%20Jam%2...


In [10]:
# merge input_data and filtered_movies
input_data_with_id = pd.merge(input_data, filtered_movies, left_on='title', right_on='movie title')
input_data_with_id = input_data_with_id.drop(columns=['release date', 'IMDb URL', 'movie title'], axis=0)

In [11]:
input_data_with_id = input_data_with_id.sort_values(by=['movie id'])
input_data_with_id

Unnamed: 0,title,rating,movie id
2,Toy Story,5.0,1
1,"Lion King, The",5.0,71
4,Cinderella,4.5,418
0,Beauty and the Beast,4.0,588
3,Jumanji,2.0,755
5,Space Jam,4.0,820


## Filter out user ratings

Here I filter out the ratings df to find out the ratings of movies present in user_input

In [12]:
filtered_ratings = ratings[ratings.movie_id.isin(input_data_with_id['movie id'].tolist())]

Group filtered ratings based on user_id and sort them based on number of movies rated by each user

In [13]:
user_subset_groups = filtered_ratings.groupby(by='user_id')

In [14]:
# sort the groups based on the number of ratings in each group
user_subset_groups = sorted(user_subset_groups, key=lambda x:len(x[1]), reverse=True)
user_subset_groups = user_subset_groups[:100]

## Calculate Pearson Correlation Coefficient


In [15]:
# from scipy.stats import pearsonr

### Pearson Correlation Coefficient
The Pearson correlation coefficient shows the similarity of two variables. It can have values between -1 and 1. The value 1 shows that there is a strong positive association between the variables. The value -1 shows a strong negetive correlation. The value 0 means there is no relation between two variables.

Given below is the formula for calculating Pearson Correlation Coefficient:

![pcc_formula](pearson_correlation_formula.png)

In [16]:
p_c_c_dict = {}

for user_id, group in user_subset_groups:

    group = group.sort_values(by=['movie_id'])

    n_ratings_group = len(group)

    # find ratings of the movies that are common in both group rating as well as users ratings
    common_ratings = input_data_with_id[input_data_with_id['movie id'].isin(group['movie_id'])]

    group_ratings_list = group['rating'].tolist()
    common_ratings_list = common_ratings['rating'].tolist()

    # using scipy library
    # pcc = pearsonr(group_ratings_list, common_ratings_list)
    
    x_mean = sum(group_ratings_list)/len(group_ratings_list)
    y_mean = sum(common_ratings_list)/len(common_ratings_list)

    Sxy = sum([(i-x_mean)*(j-y_mean) for i, j in zip(group_ratings_list, common_ratings_list)])

    Sxx = sum([(i-x_mean)**2 for i in group_ratings_list])
    Syy = sum([(i-y_mean)**2 for i in common_ratings_list])

    if Sxx != 0 and Syy!=0:
        pcc = Sxy / (Sxx*Syy)**.5
    else:
        pcc = 0
    
    # populate pcc dictionary
    p_c_c_dict[user_id] = pcc

In [17]:
# convert dictionary to df
pcc_df = pd.DataFrame.from_dict(p_c_c_dict, orient='index')
pcc_df.columns = ['pearson_similarity']
pcc_df['user_id'] = pcc_df.index
pcc_df = pcc_df.reset_index().drop(columns=['index'])

Select 50 most similar users

In [18]:
# sort pcc_df based on pearson similarity
pcc_df = pcc_df.sort_values(by=['pearson_similarity'], ascending=False)
most_similar_users = pcc_df[:5]

Get all movie ratings of the most similar users

In [19]:
# user inner join to merge ratings df and most_similar_users df
similar_users_ratings = ratings.merge(most_similar_users, how='inner')

In [20]:
# make weighted rating of similar users ratings
similar_users_ratings['weighted rating'] = similar_users_ratings['pearson_similarity'] * similar_users_ratings['rating']
similar_users_ratings

Unnamed: 0,user_id,movie_id,rating,pearson_similarity,weighted rating
0,298,474,4,0.912871,3.651484
1,298,317,4,0.912871,3.651484
2,298,281,3,0.912871,2.738613
3,298,486,3,0.912871,2.738613
4,298,181,4,0.912871,3.651484
...,...,...,...,...,...
1123,642,1182,2,0.942928,1.885856
1124,642,998,3,0.942928,2.828784
1125,642,756,5,0.942928,4.714639
1126,642,955,3,0.942928,2.828784


## Make recommendations

The ratings of the similar users are grouped based on movie_id. The weighted rating and pearson similarity are summed for each movies. The sum of these weighted rating and pearson similarity are used to calculate a recommendation score for each movies. The movies with the most recommendation score are recommended to the user. 

![formula for recommendation score](recommendation_score.png)

In [21]:
# group by movie_id
user_ratings_grouped = similar_users_ratings.groupby(by=['movie_id'])
sum_ratings = user_ratings_grouped[['weighted rating', 'pearson_similarity']].sum()
sum_ratings.columns = ['sum_weighted_rating', 'sum_p_similarity']

In [22]:
sum_ratings

Unnamed: 0_level_0,sum_weighted_rating,sum_p_similarity
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,22.212790,4.631120
2,3.771711,0.942928
3,2.783934,0.927978
4,6.540696,1.870906
5,4.639890,0.927978
...,...,...
1503,1.885856,0.942928
1505,3.711912,0.927978
1508,2.828427,0.942809
1509,1.885856,0.942928


In [23]:
# create a new df and calculate recommendation score
recommendation_df = pd.DataFrame()
recommendation_df['movie_id'] = sum_ratings.index
recommendation_df['recommendation score'] = sum_ratings['sum_weighted_rating']/sum_ratings['sum_p_similarity']

In [24]:
# sort the df and get the top 20 movies
recommendation_df = recommendation_df.sort_values(by=['recommendation score'], ascending=False)[:20]

In [25]:
recommendation_df

Unnamed: 0,movie_id,recommendation score
333,520,5.0
178,239,5.0
259,398,5.0
370,578,5.0
354,561,5.0
347,552,5.0
50,73,5.0
102,144,5.0
316,482,5.0
313,475,5.0


In [26]:
# show the details of movies that are in recommendation_df
movies[movies['movie id'].isin(recommendation_df['movie_id'])]

Unnamed: 0,movie id,movie title,release date,IMDb URL
72,73,Maverick,01-Jan-1994,http://us.imdb.com/M/title-exact?Maverick%20(1...
121,122,"Cable Guy, The",14-Jun-1996,"http://us.imdb.com/M/title-exact?Cable%20Guy,%..."
143,144,Die Hard,01-Jan-1988,http://us.imdb.com/M/title-exact?Die%20Hard%20...
226,227,Star Trek VI: The Undiscovered Country,01-Jan-1991,http://us.imdb.com/M/title-exact?Star%20Trek%2...
233,234,Jaws,01-Jan-1975,http://us.imdb.com/M/title-exact?Jaws%20(1975)
238,239,Sneakers,01-Jan-1992,http://us.imdb.com/M/title-exact?Sneakers%20(1...
256,257,Men in Black,04-Jul-1997,http://us.imdb.com/M/title-exact?Men+in+Black+...
332,333,"Game, The",01-Jan-1997,http://us.imdb.com/M/title-exact?Game%2C+The+(...
368,369,Black Sheep,02-Feb-1996,http://us.imdb.com/M/title-exact?Black%20Sheep...
397,398,Super Mario Bros.,01-Jan-1993,http://us.imdb.com/M/title-exact?Super%20Mario...
