<h1 align="center"> Collaborative Filtering Movie Recommendation</h1>
<h4 align="center">by Shivam Shukla</h4>

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the two input tables from the local dataset/ folder:
# per-user ratings and the movie metadata (title, genres).
ratings = pd.read_csv('dataset/ratings.csv')
movies = pd.read_csv('dataset/movies.csv')

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Attach the movie title/genres to every rating row. The join key is spelled
# out so the merge cannot silently start joining on any other column name the
# two tables might come to share.
user_rating = pd.merge(movies, ratings, on='movieId', how='inner')

In [6]:
user_rating.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [7]:
# Keep only the columns the recommender uses (this discards 'genres' and
# 'timestamp'). Selecting columns instead of dropping in place means the cell
# can be re-run without raising KeyError on the already-removed columns.
user_rating = user_rating[['movieId', 'title', 'userId', 'rating']]

In [8]:
user_rating.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [9]:
user_rating.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 4 columns):
movieId    100836 non-null int64
title      100836 non-null object
userId     100836 non-null int64
rating     100836 non-null float64
dtypes: float64(1), int64(2), object(1)
memory usage: 3.8+ MB


In [10]:
user_rating.describe()

Unnamed: 0,movieId,userId,rating
count,100836.0,100836.0,100836.0
mean,19435.295718,326.127564,3.501557
std,35530.987199,182.618491,1.042529
min,1.0,1.0,0.5
25%,1199.0,177.0,3.0
50%,2991.0,325.0,3.5
75%,8122.0,477.0,4.0
max,193609.0,610.0,5.0


In [11]:
user_rating.groupby('title')['rating'].count().sort_values(ascending=False)

title
Forrest Gump (1994)                                                               329
Shawshank Redemption, The (1994)                                                  317
Pulp Fiction (1994)                                                               307
Silence of the Lambs, The (1991)                                                  279
Matrix, The (1999)                                                                278
                                                                                 ... 
Late Night Shopping (2001)                                                          1
Late Night with Conan O'Brien: The Best of Triumph the Insult Comic Dog (2004)      1
Late Shift, The (1996)                                                              1
Latter Days (2003)                                                                  1
'71 (2014)                                                                          1
Name: rating, Length: 9719, dtype: int64

In [12]:
# Average rating of every title, held in a one-column frame ('rating')
# so further per-title statistics can be added next to it.
rating_data = (
    user_rating
    .groupby('title')['rating']
    .mean()
    .to_frame(name='rating')
)
rating_data.head()

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
'71 (2014),4.0
'Hellboy': The Seeds of Creation (2004),4.0
'Round Midnight (1986),3.5
'Salem's Lot (2004),5.0
'Til There Was You (1997),4.0


In [13]:
# Number of ratings each title received, aligned to rating_data by title.
rating_data['no of ratings'] = (
    user_rating
    .groupby('title')['rating']
    .agg('count')
)
# Most-rated titles first.
rating_data.sort_values(by='no of ratings', ascending=False)

Unnamed: 0_level_0,rating,no of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Forrest Gump (1994),4.164134,329
"Shawshank Redemption, The (1994)",4.429022,317
Pulp Fiction (1994),4.197068,307
"Silence of the Lambs, The (1991)",4.161290,279
"Matrix, The (1999)",4.192446,278
...,...,...
King Solomon's Mines (1950),3.000000,1
King Solomon's Mines (1937),2.500000,1
King Ralph (1991),1.500000,1
King Kong Lives (1986),2.000000,1


In [43]:
# How many titles have five or fewer ratings (note the inclusive <= 5).
(rating_data['no of ratings'] <= 5).sum()

6451

Since 6451 movies are rated by 5 or fewer users, we should drop such sparsely rated movies, as they will create noise in our recommendations. We will remove them after creating the matrix of users and movies.

---

---
We have to create a matrix which contains users as rows and movies as columns.

---

In [44]:
# One row per userId, one column per title; cells hold the rating and stay NaN
# where the user has not rated the title.
movie_user_matrix = user_rating.pivot_table(
    values='rating',
    index='userId',
    columns='title',
)

In [47]:
movie_user_matrix.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [48]:
movie_user_matrix[movie_user_matrix["'71 (2014)"]==4]

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
610,4.0,,,,,,,,3.5,,...,,4.0,3.5,3.0,,,2.0,1.5,,


In [49]:
# Keep only the titles (columns) that have at least 5 non-missing ratings,
# then treat every remaining missing rating as 0.
movie_user_matrix = (
    movie_user_matrix
    .dropna(axis='columns', thresh=5)
    .fillna(0)
)

In [51]:
movie_user_matrix.head()

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),[REC] (2007),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## In collaborative filtering we have 2 models-

* ***USER-ITEM*** - We recommend items based on similar users' preferences.
* ***ITEM-ITEM*** - We recommend the item based on similar items.

## Item - Item CF model

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

***First we have to normalise the data so that each movie's ratings have mean = 0 and range = 1, i.e. max - min = 1.***

In [55]:
def mean_normalize(row):
    """Centre a vector on its mean and scale it by its range (max - min).

    Despite the parameter name, the notebook passes this to
    ``DataFrame.apply`` with the default axis, so it receives one column
    (one movie's ratings across all users) at a time.

    Parameters
    ----------
    row : pandas.Series or array-like exposing ``mean``/``max``/``min``
        Values to normalise.

    Returns
    -------
    Same type as ``row - row.mean()``
        ``(row - mean) / (max - min)``. If every value is identical the range
        is zero; in that case zeros are returned instead of the NaN that 0/0
        would produce (NaN would make the later cosine-similarity step fail).
    """
    centered = row - row.mean()
    spread = row.max() - row.min()
    # Only guard the scalar case (Series / 1-D input); a DataFrame argument
    # yields a per-column Series here and keeps the plain element-wise division.
    if np.ndim(spread) == 0 and spread == 0:
        # x - x is exactly 0.0 and keeps the original index/shape.
        return centered - centered
    return centered / spread

# Normalise each title's column independently (axis=0 hands mean_normalize
# one column at a time), then preview the result.
movie_user_normalized_matrix = movie_user_matrix.apply(mean_normalize, axis=0)
movie_user_normalized_matrix.head()

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),[REC] (2007),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.017705,-0.050492,-0.009426,-0.016885,-0.062459,-0.018852,-0.047377,-0.049508,-0.009107,-0.077541,...,-0.019508,-0.04623,-0.067377,-0.062131,-0.04082,-0.010656,-0.027869,-0.027254,-0.006557,0.773279
2,-0.017705,-0.050492,-0.009426,-0.016885,-0.062459,-0.018852,-0.047377,-0.049508,-0.009107,-0.077541,...,-0.019508,-0.04623,0.532623,-0.062131,-0.04082,-0.010656,-0.027869,-0.027254,-0.006557,-0.026721
3,-0.017705,-0.050492,-0.009426,-0.016885,-0.062459,-0.018852,-0.047377,-0.049508,-0.009107,-0.077541,...,-0.019508,-0.04623,-0.067377,-0.062131,-0.04082,-0.010656,-0.027869,-0.027254,-0.006557,-0.026721
4,-0.017705,-0.050492,-0.009426,-0.016885,-0.062459,-0.018852,-0.047377,-0.049508,-0.009107,0.922459,...,-0.019508,-0.04623,-0.067377,-0.062131,-0.04082,-0.010656,-0.027869,-0.027254,-0.006557,-0.026721
5,-0.017705,-0.050492,-0.009426,-0.016885,-0.062459,-0.018852,-0.047377,-0.049508,-0.009107,-0.077541,...,-0.019508,-0.04623,-0.067377,-0.062131,-0.04082,-0.010656,-0.027869,-0.027254,-0.006557,-0.026721


### Finding Item-Item similarity using COSINE similarity.

Here each column is an ITEM, and the i-th row holds the i-th user's rating for each item.
___

In [59]:
# Transpose so that each row handed to cosine_similarity is one title;
# the result is a square title-by-title similarity array.
item_similarity = cosine_similarity(movie_user_normalized_matrix.transpose())

***Creating a dataframe to represent the similarity between each pair of movies***

In [60]:
# Label both axes of the similarity array with the movie titles so that
# similarities can be looked up by name.
item_similarity_matrix = pd.DataFrame(
    data=item_similarity,
    index=movie_user_normalized_matrix.columns,
    columns=movie_user_normalized_matrix.columns,
)

In [62]:
item_similarity_matrix.head()

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),[REC] (2007),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",1.0,0.063117,0.235908,-0.023768,0.143482,0.011998,0.087931,0.224052,-0.018608,0.034223,...,0.134701,0.153158,0.101301,0.049897,0.003233,-0.017905,0.187953,0.062174,-0.014025,0.353194
(500) Days of Summer (2009),0.063117,1.0,0.133949,0.142471,0.273989,0.19396,0.148903,0.142141,0.066567,0.159756,...,0.068407,0.414585,0.355723,0.252226,0.216007,0.126147,0.053614,0.241092,0.139511,0.125905
*batteries not included (1987),0.235908,0.133949,1.0,0.035596,0.061144,-0.017106,0.073459,0.1061,-0.012561,0.026377,...,0.039055,0.19453,0.12101,0.071852,-0.024573,-0.012086,0.115396,-6e-05,-0.009467,0.234514
10 Cloverfield Lane (2016),-0.023768,0.142471,0.035596,1.0,-0.005799,0.112396,0.006139,-0.016835,-0.017692,0.031704,...,-0.023477,0.272347,0.241751,0.195054,0.319371,0.082246,0.177846,0.096638,0.081429,0.002733
10 Things I Hate About You (1999),0.143482,0.273989,0.061144,-0.005799,1.0,0.24467,0.223481,0.211473,0.109729,0.011784,...,0.13246,0.091853,0.158637,0.281934,0.050031,0.088391,0.121029,0.130813,0.068745,0.110612


In [65]:

def get_similar(movie_name, rating, similarity_matrix=None, neutral_rating=2.5):
    """Score every movie by its similarity to ``movie_name``, weighted by a rating.

    The similarity column for ``movie_name`` is multiplied by
    ``rating - neutral_rating``. Ratings above the neutral point therefore push
    similar movies up, while ratings below it push similar movies down (the
    more disliked the movie, the more negative the weight).

    Parameters
    ----------
    movie_name : str
        Title to look up; must be a column of the similarity matrix.
    rating : float
        The user's rating for ``movie_name``.
    similarity_matrix : pandas.DataFrame, optional
        Movie-by-movie similarity table. Defaults to the notebook-level
        ``item_similarity_matrix``.
    neutral_rating : float, optional
        Rating treated as "neither liked nor disliked". Defaults to 2.5.

    Returns
    -------
    pandas.Series
        Weighted similarity score per movie, sorted from highest to lowest.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of the similarity matrix.
    """
    if similarity_matrix is None:
        # Resolved at call time so the notebook-level matrix may be rebuilt
        # after this function has been defined.
        similarity_matrix = item_similarity_matrix
    if movie_name not in similarity_matrix.columns:
        raise KeyError(
            "{!r} is not present in the similarity matrix".format(movie_name)
        )
    weight = rating - neutral_rating
    weighted_scores = similarity_matrix[movie_name] * weight
    return weighted_scores.sort_values(ascending=False)

In [66]:

# Example user profile: (title, rating) pairs used as recommendation seeds.
romantic_lover = [("(500) Days of Summer (2009)",5),("Alice in Wonderland (2010)",3),("Aliens (1986)",1),("2001: A Space Odyssey (1968)",2)]

# One row of weighted similarity scores per seed movie. The rows are collected
# first and assembled in a single constructor call: DataFrame.append was
# removed in pandas 2.0, and before that it re-copied the frame on every
# iteration.
similar_movies = (
    pd.DataFrame([get_similar(movie, rating) for movie, rating in romantic_lover])
    .reset_index(drop=True)   # number the rows 0..n-1, as ignore_index=True did
    .sort_index(axis=1)       # titles in alphabetical column order
)

similar_movies.head(10)

Unnamed: 0,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),[REC] (2007),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
0,0.157792,2.5,0.334873,0.356179,0.684973,0.4849,0.372257,0.355353,0.166417,0.399389,...,0.171018,1.036463,0.889309,0.630565,0.540017,0.315367,0.134034,0.60273,0.348776,0.314763
1,-0.016276,0.203998,0.016161,0.126834,0.113241,0.092218,0.08579,0.072825,0.037458,0.097794,...,0.011564,0.176888,0.168302,0.12171,0.07259,0.050178,0.025695,0.081764,0.032384,0.02754
2,-0.304722,-0.062634,-0.099756,-0.2147,-0.118754,-0.037059,-0.063992,-0.170195,-0.0107,-0.28009,...,-0.368712,-0.281119,-0.263686,-0.228562,-0.144292,-0.319902,-0.410899,-0.242935,-0.106132,-0.23841
3,-0.102988,-0.056808,-0.076462,-0.049655,-0.042987,-0.021729,-0.055422,-0.051115,-0.014088,-0.097954,...,-0.128795,-0.175166,-0.098088,-0.074205,-0.049626,-0.026909,-0.153017,-0.082048,-0.013598,-0.091432


In [67]:
# Add up the per-seed scores for every title and list the 20 highest totals.
# NOTE(review): the seed titles are not filtered out, so movies the user has
# already rated can appear at the top of this list — consider dropping them.
similar_movies.sum().sort_values(ascending=False).head(20)

(500) Days of Summer (2009)                      2.584556
Alice in Wonderland (2010)                       1.395229
Silver Linings Playbook (2012)                   1.254800
Yes Man (2008)                                   1.116264
Adventureland (2009)                             1.112235
Marley & Me (2008)                               1.108381
About Time (2013)                                1.102192
Crazy, Stupid, Love. (2011)                      1.088757
50/50 (2011)                                     1.086517
Help, The (2011)                                 1.075963
Up in the Air (2009)                             1.053037
Holiday, The (2006)                              1.034470
Friends with Benefits (2011)                     1.030875
Notebook, The (2004)                             1.025880
Easy A (2010)                                    1.015771
Secret Life of Walter Mitty, The (2013)          0.997979
Perks of Being a Wallflower, The (2012)          0.967425
Toy Story 3 (2

In [68]:
# Second example profile: (title, rating) pairs; every seed here is rated 0.
action_lover = [("Amazing Spider-Man, The (2012)",0),("Mission: Impossible III (2006)",0),("Toy Story 3 (2010)",0),("2 Fast 2 Furious (Fast and the Furious 2, The) (2003)",0)]

# One row of weighted similarity scores per seed movie, built in a single
# constructor call because DataFrame.append was removed in pandas 2.0.
similar_movies = (
    pd.DataFrame([get_similar(movie, rating) for movie, rating in action_lover])
    .reset_index(drop=True)   # number the rows 0..n-1, as ignore_index=True did
    .sort_index(axis=1)       # titles in alphabetical column order
)

# The former mid-cell `similar_movies.head(10)` displayed nothing (only a
# cell's last expression is rendered), so only the ranking is kept.
similar_movies.sum().sort_values(ascending=False).head(20)

Piano, The (1993)                                             0.541369
Postman, The (Postino, Il) (1994)                             0.505397
Madness of King George, The (1994)                            0.503704
Like Water for Chocolate (Como agua para chocolate) (1992)    0.468719
Remains of the Day, The (1993)                                0.442919
Grifters, The (1990)                                          0.411111
Clear and Present Danger (1994)                               0.389026
Nixon (1995)                                                  0.363701
Bridges of Madison County, The (1995)                         0.360745
Arsenic and Old Lace (1944)                                   0.354892
Shadowlands (1993)                                            0.338869
Eat Drink Man Woman (Yin shi nan nu) (1994)                   0.332963
Lone Star (1996)                                              0.332481
Fly Away Home (1996)                                          0.329514
Disclo