## USER BASED COLLABORATIVE FILTERING

In [1]:
import pandas as pd
import numpy as np
import warnings

In [2]:
# Silence every warning category for the rest of the session.
# NOTE: this also hides deprecation/future warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

### Loading the Dataset

In [3]:
# The ratings file is tab-separated; column names are supplied explicitly
cols1 = ['user_id', 'item_id', 'rating', 'timestamp']

data = pd.read_csv("ml-100k/u.data", delimiter='\t', names=cols1)
data.head(10)

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


In [4]:
# Movie metadata is pipe-delimited with no header row.
# A plain "|" is used instead of "\|": the backslash form is an invalid string escape
# and makes pandas treat the separator as a regular expression (slower Python engine).
# latin-1 is given explicitly because the file is assumed to be the stock MovieLens 100K
# u.item, which is not UTF-8 encoded -- confirm if the file has been re-encoded.
item = pd.read_csv("ml-100k/u.item", sep="|", header=None, encoding="latin-1")
item.head(n=10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,01-Jan-1995,,http://us.imdb.com/Title?Yao+a+yao+yao+dao+wai...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,7,Twelve Monkeys (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Twelve%20Monk...,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,8,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,9,Dead Man Walking (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Dead%20Man%20...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,10,Richard III (1995),22-Jan-1996,,http://us.imdb.com/M/title-exact?Richard%20III...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [5]:
# Keep only the first two columns (labels 0 and 1): the movie id and the movie title
item = item.loc[:, [0, 1]]
item.head(n=10)

Unnamed: 0,0,1
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...
6,7,Twelve Monkeys (1995)
7,8,Babe (1995)
8,9,Dead Man Walking (1995)
9,10,Richard III (1995)


In [6]:
# Give the two retained columns descriptive names; 'item_id' matches the key name used in the ratings frame
item.columns=['item_id','movie_title']

In [7]:
item.head(n=10)

Unnamed: 0,item_id,movie_title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...
6,7,Twelve Monkeys (1995)
7,8,Babe (1995)
8,9,Dead Man Walking (1995)
9,10,Richard III (1995)


### Creating normalized ratings dataframe

In [8]:
# Mean rating per user, kept as a two-column frame (user_id, rating)
Mean = data.groupby('user_id')['rating'].mean().reset_index()
print(Mean)

     user_id    rating
0          1  3.610294
1          2  3.709677
2          3  2.796296
3          4  4.333333
4          5  2.874286
..       ...       ...
938      939  4.265306
939      940  3.457944
940      941  4.045455
941      942  4.265823
942      943  3.410714

[943 rows x 2 columns]


In [9]:
# Attach each user's mean rating to every one of that user's rating rows.
# Both frames have a 'rating' column, so the merge yields rating_x (from data) and rating_y (from Mean).
avg_rating = data.merge(Mean, on='user_id')

In [10]:
avg_rating.head(5)

Unnamed: 0,user_id,item_id,rating_x,timestamp,rating_y
0,196,242,3,881250949,3.615385
1,196,393,4,881251863,3.615385
2,196,381,4,881251728,3.615385
3,196,251,3,881251274,3.615385
4,196,655,5,881251793,3.615385


In [11]:
# rating_x   : the rating a user gave to a particular movie
# rating_y   : the average rating given by that user
# new_rating : the normalized (mean-centred) rating, rating_x - rating_y

avg_rating['new_rating'] = avg_rating['rating_x'].sub(avg_rating['rating_y'])
avg_rating.head(5)

Unnamed: 0,user_id,item_id,rating_x,timestamp,rating_y,new_rating
0,196,242,3,881250949,3.615385,-0.615385
1,196,393,4,881251863,3.615385,0.384615
2,196,381,4,881251728,3.615385,0.384615
3,196,251,3,881251274,3.615385,-0.615385
4,196,655,5,881251793,3.615385,1.384615


In [12]:
# user_id x item_id matrix holding the actual ratings
xrating_matrix = avg_rating.pivot_table(index='user_id', columns='item_id', values='rating_x')
xrating_matrix.head(5)

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [13]:
# user_id x item_id matrix holding the normalized (mean-centred) ratings
newrating_matrix = avg_rating.pivot_table(index='user_id', columns='item_id', values='new_rating')
newrating_matrix.head(5)

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.389706,-0.610294,0.389706,-0.610294,-0.610294,1.389706,0.389706,-2.610294,1.389706,-0.610294,...,,,,,,,,,,
2,0.290323,,,,,,,,,-1.709677,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,1.125714,0.125714,,,,,,,,,...,,,,,,,,,,


### Filling up NaN values
- The NaN values (movies a user has not rated) can be filled with either
    - the movie's average rating, taken over its column, or
    - the user's average rating, taken along their row
     

In [14]:
# Replace each user's missing entries with the mean of that user's own row
user_rating = newrating_matrix.apply(
    lambda user_row: user_row.fillna(user_row.mean()),
    axis='columns',
)
user_rating.head(5)

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.389706,-0.6102941,0.3897059,-0.6102941,-0.6102941,1.389706,0.3897059,-2.610294,1.389706,-0.6102941,...,-5.2245790000000005e-17,-5.2245790000000005e-17,-5.2245790000000005e-17,-5.2245790000000005e-17,-5.2245790000000005e-17,-5.2245790000000005e-17,-5.2245790000000005e-17,-5.2245790000000005e-17,-5.2245790000000005e-17,-5.2245790000000005e-17
2,0.2903226,1.86231e-16,1.86231e-16,1.86231e-16,1.86231e-16,1.86231e-16,1.86231e-16,1.86231e-16,1.86231e-16,-1.709677,...,1.86231e-16,1.86231e-16,1.86231e-16,1.86231e-16,1.86231e-16,1.86231e-16,1.86231e-16,1.86231e-16,1.86231e-16,1.86231e-16
3,1.6447750000000002e-17,1.6447750000000002e-17,1.6447750000000002e-17,1.6447750000000002e-17,1.6447750000000002e-17,1.6447750000000002e-17,1.6447750000000002e-17,1.6447750000000002e-17,1.6447750000000002e-17,1.6447750000000002e-17,...,1.6447750000000002e-17,1.6447750000000002e-17,1.6447750000000002e-17,1.6447750000000002e-17,1.6447750000000002e-17,1.6447750000000002e-17,1.6447750000000002e-17,1.6447750000000002e-17,1.6447750000000002e-17,1.6447750000000002e-17
4,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,...,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16
5,1.125714,0.1257143,1.725604e-16,1.725604e-16,1.725604e-16,1.725604e-16,1.725604e-16,1.725604e-16,1.725604e-16,1.725604e-16,...,1.725604e-16,1.725604e-16,1.725604e-16,1.725604e-16,1.725604e-16,1.725604e-16,1.725604e-16,1.725604e-16,1.725604e-16,1.725604e-16


In [15]:
# Replace missing entries with the mean of the corresponding movie column
# (DataFrame.mean() averages down each column by default)
movie_rating = newrating_matrix.fillna(newrating_matrix.mean())
movie_rating.head(5)

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.389706,-0.610294,0.389706,-0.610294,-0.610294,1.389706,0.389706,-2.610294,1.389706,-0.610294,...,-1.147059,-0.137056,-0.45933,-1.45933,-0.211982,-2.121495,-0.121495,-1.121495,0.019337,-0.365931
2,0.290323,-0.253455,-0.406476,-0.02917,-0.206708,0.099592,0.241369,0.370904,0.316282,-1.709677,...,-1.147059,-0.137056,-0.45933,-1.45933,-0.211982,-2.121495,-0.121495,-1.121495,0.019337,-0.365931
3,0.299264,-0.253455,-0.406476,-0.02917,-0.206708,0.099592,0.241369,0.370904,0.316282,0.251461,...,-1.147059,-0.137056,-0.45933,-1.45933,-0.211982,-2.121495,-0.121495,-1.121495,0.019337,-0.365931
4,0.299264,-0.253455,-0.406476,-0.02917,-0.206708,0.099592,0.241369,0.370904,0.316282,0.251461,...,-1.147059,-0.137056,-0.45933,-1.45933,-0.211982,-2.121495,-0.121495,-1.121495,0.019337,-0.365931
5,1.125714,0.125714,-0.406476,-0.02917,-0.206708,0.099592,0.241369,0.370904,0.316282,0.251461,...,-1.147059,-0.137056,-0.45933,-1.45933,-0.211982,-2.121495,-0.121495,-1.121495,0.019337,-0.365931


In [16]:
# Pairwise cosine similarity between users, computed on the user-mean-filled matrix
from sklearn.metrics.pairwise import cosine_similarity
cos_user_similar = cosine_similarity(user_rating)

# Zero the diagonal so a user's similarity with themselves does not top the neighbour ranking
np.fill_diagonal(cos_user_similar, 0)

# Wrap the array in a DataFrame labelled by user_id on both axes
similarity_of_users = pd.DataFrame(
    cos_user_similar,
    index=user_rating.index,
    columns=user_rating.index,
)
similarity_of_users.head(10)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.043411,0.011051,0.059303,0.134514,0.103373,0.110556,0.180891,0.012253,-0.000621,...,0.025835,-0.047952,0.087224,0.007718,0.074378,0.078714,0.067433,0.02879,-0.03127,0.032123
2,0.043411,0.0,0.013658,-0.017016,0.03577,0.094503,0.089408,0.05564,0.027294,0.097846,...,0.012853,-0.028798,0.056659,0.197835,0.090009,0.032505,0.015053,-0.017344,0.012068,0.039173
3,0.011051,0.013658,0.0,-0.059638,0.016037,-0.017158,0.016141,0.041177,-0.010093,0.023856,...,0.001615,0.000658,-0.006888,0.036157,-0.018513,-0.00624,-0.023907,0.034414,-0.009187,0.001489
4,0.059303,-0.017016,-0.059638,0.0,0.007373,-0.053929,-0.025604,0.136046,0.016082,-0.013588,...,0.011895,0.002174,-0.028,-0.025021,0.022882,-0.00596,0.279818,0.258594,0.064504,-0.019222
5,0.134514,0.03577,0.016037,0.007373,0.0,0.038484,0.067874,0.140106,0.010195,0.014335,...,0.070014,-0.070821,0.024278,0.038672,0.093567,0.051782,0.02954,0.036234,0.043318,0.099324
6,0.103373,0.094503,-0.017158,-0.053929,0.038484,0.0,0.109288,0.087991,0.019336,0.147415,...,-0.003298,-0.05023,0.071964,0.075697,0.036463,-0.04752,-0.012071,0.001559,0.036605,0.013969
7,0.110556,0.089408,0.016141,-0.025604,0.067874,0.109288,0.0,0.083913,-0.024401,0.145122,...,-0.000368,-0.036151,0.033468,0.075249,0.010144,0.013584,0.005844,0.001943,0.106252,0.112547
8,0.180891,0.05564,0.041177,0.136046,0.140106,0.087991,0.083913,0.0,-0.017857,0.097316,...,0.054016,0.00244,0.060706,0.051131,0.057601,0.025026,0.078222,0.057949,0.030609,0.042047
9,0.012253,0.027294,-0.010093,0.016082,0.010195,0.019336,-0.024401,-0.017857,0.0,0.040276,...,0.026783,0.034075,0.039073,0.073518,0.026682,0.018475,0.004491,0.040748,0.026259,0.034393
10,-0.000621,0.097846,0.023856,-0.013588,0.014335,0.147415,0.145122,0.097316,0.040276,0.0,...,-0.028213,-0.007988,0.013252,0.066074,0.037995,0.014387,0.044572,0.026179,0.062067,0.047657


In [17]:
# Pairwise cosine similarity between users again, this time on the movie-mean-filled matrix
# (rows of movie_rating are users, so the result is still user x user)
cos_movie_similar = cosine_similarity(movie_rating)

# Zero the diagonal so a user's similarity with themselves does not top the neighbour ranking
np.fill_diagonal(cos_movie_similar, 0)

# Wrap the array in a DataFrame labelled by user_id on both axes
similarity_with_movie = pd.DataFrame(
    cos_movie_similar,
    index=movie_rating.index,
    columns=movie_rating.index,
)
similarity_with_movie.head(10)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.843356,0.826035,0.857827,0.764312,0.779988,0.713977,0.853533,0.855259,0.816118,...,0.780533,0.83732,0.818577,0.83578,0.794697,0.851909,0.817304,0.859819,0.831436,0.745902
2,0.843356,0.0,0.927383,0.956761,0.843712,0.872662,0.804099,0.941021,0.956912,0.933291,...,0.884964,0.946499,0.906733,0.951505,0.885671,0.952297,0.909712,0.961835,0.947336,0.857147
3,0.826035,0.927383,0.0,0.93998,0.82725,0.852937,0.779676,0.923743,0.939207,0.917426,...,0.867559,0.930667,0.889823,0.926949,0.868076,0.934073,0.892905,0.94632,0.924428,0.839377
4,0.857827,0.956761,0.93998,0.0,0.855949,0.879266,0.8018,0.959257,0.974333,0.94694,...,0.898505,0.964633,0.919858,0.953728,0.900577,0.967555,0.939123,0.98232,0.963425,0.868326
5,0.764312,0.843712,0.82725,0.855949,0.0,0.768636,0.706424,0.844057,0.854108,0.82936,...,0.791333,0.837326,0.806724,0.840056,0.797082,0.851854,0.811623,0.858221,0.843267,0.771807
6,0.779988,0.872662,0.852937,0.879266,0.768636,0.0,0.738928,0.869299,0.882723,0.860679,...,0.806767,0.868896,0.840463,0.873479,0.812566,0.869683,0.833814,0.886511,0.866416,0.778813
7,0.713977,0.804099,0.779676,0.8018,0.706424,0.738928,0.0,0.792667,0.804805,0.801276,...,0.73164,0.795876,0.766481,0.803615,0.741661,0.799894,0.762568,0.812112,0.80653,0.725921
8,0.853533,0.941021,0.923743,0.959257,0.844057,0.869299,0.792667,0.0,0.952733,0.930135,...,0.881836,0.945228,0.902357,0.939172,0.882877,0.949623,0.909567,0.959258,0.936613,0.85007
9,0.855259,0.956912,0.939207,0.974333,0.854108,0.882723,0.804805,0.952733,0.0,0.949024,...,0.900045,0.964529,0.922029,0.958285,0.899504,0.967211,0.929998,0.979256,0.961161,0.871264
10,0.816118,0.933291,0.917426,0.94694,0.82936,0.860679,0.801276,0.930135,0.949024,0.0,...,0.872557,0.939537,0.894182,0.93202,0.874321,0.940556,0.909082,0.952376,0.937395,0.843653


Both matrices above hold the cosine similarity between every pair of users (with the diagonal set to 0).

### Finding nearest K neighbours

In [18]:
def nearestK(df,k):
    """Return, for every row of a similarity matrix, the labels of its k largest columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Similarity matrix; each row is ranked independently over its column labels.
    k : int
        Number of neighbours to keep per row.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df``, with columns ``top1`` ... ``top{k}`` holding the column
        labels ordered from the highest to the lowest value in that row.
    """
    # Built once instead of once per row
    top_labels = ['top{}'.format(i) for i in range(1, k + 1)]

    def _top_k(row):
        # Labels of the k highest values in this row, best first
        return pd.Series(row.sort_values(ascending=False).iloc[:k].index, index=top_labels)

    return df.apply(_top_k, axis=1)

In [19]:
# 30 nearest neighbours of every user, based on the user-mean-filled similarity matrix
top_30 = nearestK(similarity_of_users, k=30)
top_30.head(5)

Unnamed: 0_level_0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10,...,top21,top22,top23,top24,top25,top26,top27,top28,top29,top30
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,773,868,592,880,429,276,916,222,457,8,...,246,886,92,682,648,268,643,933,94,44
2,651,34,310,485,296,329,781,46,743,547,...,209,384,356,460,444,582,33,266,26,414
3,808,687,656,512,783,266,556,220,155,284,...,817,819,150,569,369,546,46,225,345,140
4,35,841,353,940,431,443,626,341,725,511,...,464,105,893,282,552,570,165,717,240,408
5,268,497,276,92,650,868,622,44,22,457,...,429,453,407,748,267,307,308,545,13,727


In [20]:
# 30 nearest neighbours of every user, based on the movie-mean-filled similarity matrix
top_30_movie = nearestK(similarity_with_movie, k=30)
top_30_movie.head(5)

Unnamed: 0_level_0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10,...,top21,top22,top23,top24,top25,top26,top27,top28,top29,top30
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,225,549,895,266,105,800,594,926,384,769,...,876,628,359,631,400,662,120,376,27,369
2,384,33,849,888,800,171,252,482,651,728,...,549,687,359,544,895,273,400,163,477,266
3,33,810,687,47,191,266,284,155,512,369,...,800,631,926,772,876,547,74,252,728,400
4,849,888,431,827,631,33,800,384,876,941,...,441,926,171,895,273,810,832,784,516,685
5,584,369,728,384,800,571,319,849,565,549,...,252,909,266,171,165,827,513,273,564,431


In [21]:
def get_common_movies(user1,user2):
    """Return the movies rated by both users, one row per shared movie.

    The rating rows of ``user1`` and ``user2`` are taken from the module-level
    ``avg_rating`` frame and inner-joined on ``item_id``; the result is then joined
    with the module-level ``item`` frame to add ``movie_title``. Columns coming from
    ``user1`` carry the ``_x`` suffix and those from ``user2`` the ``_y`` suffix.
    """
    ratings_user1 = avg_rating.loc[avg_rating.user_id == user1]
    ratings_user2 = avg_rating.loc[avg_rating.user_id == user2]
    shared = pd.merge(ratings_user1, ratings_user2, on='item_id', how='inner')
    return pd.merge(shared, item, on='item_id')

In [22]:
# Movies rated by both user_id 1 and user_id 2
a = get_common_movies(user1=1, user2=2)
a.head(5)

Unnamed: 0,user_id_x,item_id,rating_x_x,timestamp_x,rating_y_x,new_rating_x,user_id_y,rating_x_y,timestamp_y,rating_y_y,new_rating_y,movie_title
0,1,272,3,887431647,3.610294,-0.610294,2,5,888979061,3.709677,1.290323,Good Will Hunting (1997)
1,1,258,5,878873389,3.610294,1.389706,2,3,888549961,3.709677,-0.709677,Contact (1997)
2,1,14,5,874965706,3.610294,1.389706,2,4,888551853,3.709677,0.290323,"Postino, Il (1994)"
3,1,242,5,889751633,3.610294,1.389706,2,5,888552084,3.709677,1.290323,Kolya (1996)
4,1,10,3,875693118,3.610294,-0.610294,2,2,888551853,3.709677,-1.709677,Richard III (1995)


In [23]:
# Keep only the two users' raw ratings and the movie title
a = a[['rating_x_x', 'rating_x_y', 'movie_title']]
a.head(5)

Unnamed: 0,rating_x_x,rating_x_y,movie_title
0,3,5,Good Will Hunting (1997)
1,5,3,Contact (1997)
2,5,4,"Postino, Il (1994)"
3,5,5,Kolya (1996)
4,3,2,Richard III (1995)


In [24]:
def Score_unwatched_movie(user,item):
    """Predict the rating `user` would give to `item` from the user's nearest neighbours.

    The prediction is the user's mean rating plus the similarity-weighted average of
    the neighbours' normalized ratings for the item::

        mean(user) + sum(similarity * rating) / sum(similarity)

    Reads the module-level frames ``top_30_movie``, ``movie_rating``, ``Mean`` and
    ``similarity_with_movie``.

    Parameters
    ----------
    user : user id, looked up in ``top_30_movie``, ``Mean`` and ``similarity_with_movie``.
    item : item id, a column label of ``movie_rating``.

    Returns
    -------
    float
        The predicted rating. When the similarities of the usable neighbours sum to
        zero (including the case of no usable neighbour) the user's mean rating is
        returned instead of dividing by zero.

    Raises
    ------
    KeyError
        If ``item`` is not a column of ``movie_rating``.
    IndexError
        If ``user`` has no row in ``Mean``.
    """
    # Ids of the user's nearest neighbours; ravel() always yields a flat list, whereas
    # squeeze() would collapse a single neighbour into a bare scalar
    neighbours = top_30_movie[top_30_movie.index == user].values.ravel().tolist()

    # Normalized ratings for the item, restricted to the neighbours that have a value
    item_ratings = movie_rating.loc[:, item]
    neighbour_ratings = item_ratings[item_ratings.index.isin(neighbours)].dropna()

    # Average rating given by the user
    avg_user = Mean.loc[Mean['user_id'] == user, 'rating'].values[0]

    # Similarity between the user and each contributing neighbour; indexing with the
    # Index object keeps the result a Series even for a single neighbour
    weights = similarity_with_movie.loc[user, neighbour_ratings.index]

    num = (neighbour_ratings * weights).sum()
    den = weights.sum()
    # NOTE(review): the denominator is the plain sum of similarities, as in the original
    # code; the textbook formula uses the sum of absolute similarities -- confirm intent.
    if den == 0:
        return avg_user
    return avg_user + (num / den)

In [25]:
# Predict the rating of user_id 2 for item_id 271
# (stated to be a movie this user has not watched -- TODO confirm against xrating_matrix)
score = Score_unwatched_movie(user=2, item=271)
print('Predicted Score/Rating: ',score)

Predicted Score/Rating:  3.543652791412089


In [26]:
# One comma-joined string of rated item ids per user.
# The string cast is applied to a temporary copy only: overwriting avg_rating.item_id
# with strings would make this cell non-idempotent and would break later merges with
# `item`, whose item_id column is read from file as integers.
Movie_user = (
    avg_rating
    .assign(item_id=avg_rating['item_id'].astype(str))
    .groupby(by='user_id')['item_id']
    .apply(','.join)
)

In [27]:
def UB_CF(user):
    """Recommend up to five movie titles for `user` using user-based collaborative filtering.

    Candidates are the movies rated by the user's nearest neighbours that the user has
    not rated. Each candidate is scored as the user's mean rating plus the
    similarity-weighted average of the neighbours' normalized ratings, and the titles
    of the five highest-scoring candidates are returned, best first.

    Reads the module-level objects ``xrating_matrix``, ``top_30``, ``Movie_user``,
    ``movie_rating``, ``Mean``, ``similarity_with_movie`` and ``item``.

    Parameters
    ----------
    user : user id.

    Returns
    -------
    list of str
        Movie titles; an empty list when there is no candidate movie.
    """
    # Item ids (as strings, to match Movie_user) the user has already rated
    user_rows = xrating_matrix[xrating_matrix.index == user]
    watched_by_user = set(map(str, xrating_matrix.columns[user_rows.notna().any()]))

    # Ids of the user's nearest neighbours; ravel() always yields a flat list
    # NOTE(review): neighbours come from top_30 (user-mean fill) while the ratings and
    # similarities below come from the movie-mean-filled matrices, unlike
    # Score_unwatched_movie which uses top_30_movie -- confirm this mix is intended.
    neighbours = top_30[top_30.index == user].values.ravel().tolist()

    # Movie_user holds one comma-joined string of item ids per user
    neighbour_items = ','.join(Movie_user[Movie_user.index.isin(neighbours)].values).split(',')

    # Sorted so that the candidate order (and therefore tie-breaking) is deterministic
    under_consideration = sorted(map(int, set(neighbour_items) - watched_by_user))
    if not under_consideration:
        return []

    # Loop-invariant lookups, hoisted out of the per-candidate loop
    avg_user = Mean.loc[Mean['user_id'] == user, 'rating'].values[0]
    neighbour_block = movie_rating[movie_rating.index.isin(neighbours)]
    user_similarity = similarity_with_movie.loc[user]

    score = []
    for item_id in under_consideration:
        # Normalized ratings of the neighbours for this candidate and the matching similarities
        neighbour_ratings = neighbour_block[item_id].dropna()
        weights = user_similarity.loc[neighbour_ratings.index]

        num = (neighbour_ratings * weights).sum()
        den = weights.sum()
        # Fall back to the user's mean when the similarities cancel out
        score.append(avg_user if den == 0 else avg_user + (num / den))

    movies_rec = pd.DataFrame({'item_id': under_consideration, 'score': score})
    # mergesort is stable, so equal scores keep ascending item_id order
    top_5_rec = movies_rec.sort_values(by='score', ascending=False, kind='mergesort').head()
    movies = top_5_rec.merge(item, how='inner', on='item_id')
    return movies.movie_title.values.tolist()

In [28]:
# Ask for a user id interactively, then list the titles recommended for that user
userID = int(input("Enter User ID: "))
recommendations = UB_CF(userID)
print()
print(f"The recommendations for User ID {userID} are: ")
print()
for title in recommendations:
    print(title)

Enter User ID: 3

The recommendations for User ID 3 are: 

Close Shave, A (1995)
Casablanca (1942)
Star Wars (1977)
Wallace & Gromit: The Best of Aardman Animation (1996)
Usual Suspects, The (1995)
