In [1]:
import numpy as np
import pandas as pd
from plotly.offline import init_notebook_mode,iplot
import cufflinks as cf
import plotly.figure_factory as ff
# Set up Plotly and cufflinks for use inside the notebook.
init_notebook_mode(connected=True)
cf.go_offline(connected=True)
# NOTE(review): offline=False is passed right after go_offline() above — confirm which mode is intended.
cf.set_config_file(offline=False, world_readable=True, theme='space')
from sklearn.metrics import mean_squared_error
import math

Download Dataset [here](https://raw.githubusercontent.com/sureshgorakala/RecommenderSystems_R/master/movie_rating.csv)

In [2]:
# Load the ratings from a CSV expected in the current working directory.
ratings=pd.read_csv('movie_rating.csv')

In [3]:
# Show the column labels of the loaded frame.
ratings.columns

Index(['critic', 'title', 'rating'], dtype='object')

The dataset has three columns: `critic` is the name of the user, `title` is the name of the movie, and `rating` is the rating that user gave to the movie.

In [4]:
# Summary statistics of the frame's numeric data.
ratings.describe()

Unnamed: 0,rating
count,31.0
mean,3.225806
std,0.920495
min,1.0
25%,3.0
50%,3.0
75%,3.75
max,5.0


Ratings range from 1 to 5.

In [5]:
# Build a Plotly table figure from the whole ratings frame.
table_fig=ff.create_table(ratings)

In [6]:
# Render the table figure built from `ratings`.
iplot(table_fig)

In [7]:
# Count the non-null `title` and `rating` entries for each critic.
ratings.groupby(['critic']).count()

Unnamed: 0_level_0,title,rating
critic,Unnamed: 1_level_1,Unnamed: 2_level_1
Claudia Puig,5,5
Gene Seymour,6,6
Jack Matthews,5,5
Lisa Rose,6,6
Mick LaSalle,6,6
Toby,3,3


In [8]:
# Number of ratings submitted by each critic, drawn as a bar chart.
critics_ratings=ratings.groupby(['critic']).count()
# Title and axis labels are set so the chart can be read on its own.
critics_ratings.iplot(
    kind='bar',
    y='rating',
    theme='pearl',
    title='Number of movies rated per critic',
    xTitle='Critic',
    yTitle='Number of ratings',
)

There are 6 users. Everyone apart from Toby rated 5 or 6 movies, whereas Toby rated only 3.
So let's consider recommending movies to Toby.

# The Aim is to recommend movies to Toby

## User-based Collaborative Filtering using Euclidean Distance

In [9]:
# Movie-by-critic rating matrix: one row per title, one column per critic.
critics_and_ratings = ratings.pivot_table(values='rating', index='title', columns='critic')
# Show the matrix as a table with the movie titles as the labelled index column.
ratings_by_movie_table = ff.create_table(critics_and_ratings, index=True, index_title='Movies')
iplot(ratings_by_movie_table)

In [10]:
# Pairwise critic similarity, derived from the mean squared error (MSE) between the
# ratings two critics gave to the movies they both rated.
#
# MSE is a distance: 0 for identical ratings, growing with disagreement. Using it
# directly as a weight would favour the critics who disagree most, so it is mapped to a
# similarity with 1 / (1 + MSE): identical critics score 1.0 and the score falls towards
# 0 as disagreement grows. The ordering of "closest" critics is unchanged by this mapping.
critic_names = critics_and_ratings.columns
mses=[]
for column1 in critic_names:
    mses.append([])
    for column2 in critic_names:
        # Compare only the movies that both critics have rated.
        mask = critics_and_ratings[column1].notna() & critics_and_ratings[column2].notna()
        if not mask.any():
            # No co-rated movie: the distance is undefined (and the MSE call would raise).
            mses[-1].append(np.nan)
            continue
        x=critics_and_ratings[column1][mask]
        y=critics_and_ratings[column2][mask]
        mses[-1].append(mean_squared_error(x,y))
mse_matrix = pd.DataFrame(mses,index=critic_names,columns=critic_names)
# Pairs without any co-rated movie get similarity 0, i.e. they carry no weight.
sim_matrix = (1 / (1 + mse_matrix)).fillna(0.0)

In [11]:
# Annotated heatmap of the critic-by-critic `sim_matrix`, values rounded to 3 decimals.
critic_x_labels = list(critics_and_ratings.columns)
critic_y_labels = list(critics_and_ratings.columns)
critic_heatmap = ff.create_annotated_heatmap(
    z=sim_matrix.values.round(3),
    x=critic_x_labels,
    y=critic_y_labels,
    colorscale='Greys',
)
iplot(critic_heatmap)

Mick LaSalle, Claudia Puig and Lisa Rose are closely related to Toby.

Toby rated only 3 movies, so let's calculate Toby's predicted ratings for the other 3 movies using user similarity.

In [12]:
# Titles with no rating from Toby: these are the movies to predict ratings for.
toby_has_not_rated = critics_and_ratings['Toby'].isna()
recom_movies = critics_and_ratings.loc[toby_has_not_rated].index.values

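Ratings for a movie are predicted as a similarity-weighted average of the other users' ratings, summing only over the users who rated that movie:
$$rating_{movie,toby}=\frac{\sum_{for\ all\ other\ users} rating_{movie,user} * similarity_{toby,user}}{\sum_{for\ all\ other\ users} similarity_{toby,user}}$$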

In [13]:
# Predict Toby's rating for each movie he has not rated as a weighted average of the
# other critics' ratings, weighted by the Toby-vs-critic entry of `sim_matrix`.
# Critics are visited in column order so the floating-point sums are reproducible.
other_critics = [critic for critic in critics_and_ratings.columns if critic != 'Toby']
pred_ratings=[]
for movie in recom_movies:
    num=0
    den=0
    for user in other_critics:
        user_rating = critics_and_ratings.loc[movie, user]
        # Skip critics who did not rate this movie.
        if pd.notna(user_rating):
            weight = sim_matrix.loc['Toby', user]
            num += user_rating * weight
            den += weight

    # Without any usable weight the average is undefined: record NaN instead of
    # dividing by zero.
    pred_ratings.append([num/den if den != 0 else np.nan])

In [14]:
# Tabulate the user-based predictions, one row per movie in `recom_movies`.
toby_rat = pd.DataFrame(data=pred_ratings, columns=['Predicted_Toby_Ratings'], index=recom_movies)
user_based_table = ff.create_table(toby_rat, index=True, index_title='Movies')
iplot(user_based_table)

The above table shows the approximate ratings Toby would have given these movies had he watched and rated them.

## Item based Collaborative Filtering using Cosine Similarity

In [15]:
# Critic-by-movie rating matrix: one row per critic, one column per title.
item_ratings = ratings.pivot_table(index='critic', columns=['title'], values='rating')

In [16]:
# The index of `item_ratings` holds critics (it was pivoted with index='critic'),
# so the index column is labelled accordingly.
iplot(ff.create_table(item_ratings, index=True, index_title='Critics'))

## Movie Similarity using Cosine Similarity

In [17]:
# Column labels of `item_ratings`, i.e. the movie titles.
movies=item_ratings.columns

In [18]:
def dot_prod(a,b):
    """Return the dot product of two numeric sequences.

    Elements are paired with ``zip``, so if the lengths differ the surplus
    elements of the longer sequence are ignored. Empty input yields 0.
    """
    return sum(i * j for i, j in zip(a, b))


def cosine_similarity(a,b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Returns 0.0 when either vector is empty or has zero magnitude. The angle
    is undefined in that case, and the plain formula would divide by zero.
    """
    dot=dot_prod(a,b)
    norm_a=math.sqrt(dot_prod(a,a))
    norm_b=math.sqrt(dot_prod(b,b))
    norm_product = norm_a * norm_b
    if norm_product == 0:
        # No direction to compare: treat the pair as not similar.
        return 0.0
    return dot / norm_product

In [19]:
# Nested list of pairwise movie similarities: one row per movie, in `movies` order.
item_cosine_similarity = []
for first_title in movies:
    first_scores = item_ratings[first_title]
    similarity_row = []
    for second_title in movies:
        second_scores = item_ratings[second_title]
        # Keep only the critics who rated both movies.
        rated_both = ~(first_scores.isna() | second_scores.isna())
        similarity_row.append(
            cosine_similarity(first_scores[rated_both].values, second_scores[rated_both].values)
        )
    item_cosine_similarity.append(similarity_row)

In [20]:
# Wrap the nested similarity list in a DataFrame labelled by `movies` on both axes.
item_similarity = pd.DataFrame(item_cosine_similarity, index=movies, columns=movies)

In [21]:
# Annotated heatmap of the movie-by-movie `item_similarity`, values rounded to 3 decimals.
movie_x_labels = list(movies)
movie_y_labels = list(movies)
movie_heatmap = ff.create_annotated_heatmap(
    z=item_similarity.values.round(3),
    x=movie_x_labels,
    y=movie_y_labels,
    colorscale='Greys',
)
iplot(movie_heatmap)

In [22]:
# Titles for which Toby's rating is missing in `critics_and_ratings`.
recom_movies

array(['Just My Luck', 'Lady in the Water', 'The Night Listener'],
      dtype=object)

Ratings for a movie are predicted as a similarity-weighted average of Toby's ratings for the movies he has already rated:
$$rating_{unrated\_movie,toby}=\frac{\sum_{for\ all\ rated\_movies} rating_{rated\_movie,toby} * similarity_{rated\_movie,unrated\_movie}}{\sum_{for\ all\ rated\_movies} similarity_{rated\_movie,unrated\_movie}}$$

In [23]:
# Titles in `movies` that are not in `recom_movies`.
set(movies)-set(recom_movies)

{'Snakes on a Plane', 'Superman Returns', 'You Me and Dupree'}

In [24]:
# Predict Toby's rating for each movie in `recom_movies` as a weighted average of the
# ratings he gave to the remaining movies, weighted by the `item_similarity` entry
# between each of those movies and the movie being predicted.
# Movies are visited in `movies` order so the floating-point sums are reproducible.
unrated_titles = set(recom_movies)
rated_titles = [title for title in movies if title not in unrated_titles]
pred_ratings=[]
for unrated_movie in recom_movies:
    num=0
    den=0
    for movie in rated_titles:
        toby_rating = critics_and_ratings.loc[movie, 'Toby']
        # Skip any movie for which Toby's rating is missing.
        if pd.notna(toby_rating):
            weight = item_similarity.loc[movie, unrated_movie]
            num += toby_rating * weight
            den += weight

    # Without any usable weight the average is undefined: record NaN instead of
    # dividing by zero.
    pred_ratings.append([num/den if den != 0 else np.nan])

In [25]:
# Tabulate the item-based predictions, one row per movie in `recom_movies`.
toby_item_rat = pd.DataFrame(data=pred_ratings, columns=['Predicted_Toby_Ratings'], index=recom_movies)
item_based_table = ff.create_table(toby_item_rat, index=True, index_title='Movies')
iplot(item_based_table)

The average of the predicted ratings from the two techniques above is:

In [26]:
# Element-wise mean of the item-based and user-based prediction tables.
mean_predicted_ratings = (toby_item_rat + toby_rat) / 2
iplot(ff.create_table(mean_predicted_ratings, index=True, index_title='Movies'))