## Collaborative filtering (item × item similarity)

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies = pd.read_csv('movies.csv')
links = pd.read_csv('links.csv')
rating = pd.read_csv('25Kratings.csv')

In [3]:
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [4]:
links.head(2)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0


In [5]:
rating.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817


In [6]:
def extract_year(text):
    """Return the release year found at the end of a movie title.

    The last whitespace-separated token of ``text`` is taken, surrounding
    parentheses are stripped, and the remainder is parsed as an integer,
    e.g. ``"Toy Story (1995)"`` -> ``1995``.

    Args:
        text: Movie title, expected to end in ``"(YYYY)"``.

    Returns:
        int: The parsed year, or ``0`` when the title is empty or its last
        token is not a plain decimal number.
    """
    tokens = str(text).split()
    if not tokens:
        # Empty / whitespace-only title: there is no last token to inspect.
        return 0
    candidate = tokens[-1].strip('()')
    # isdecimal() only accepts characters int() can parse; isnumeric() also
    # accepts e.g. superscript digits, which would make int() raise.
    if candidate.isdecimal():
        return int(candidate)
    return 0

In [7]:
movies['year'] = movies['title'].apply(extract_year)

In [8]:
df = movies[movies['year'] >= 1994].copy()

In [9]:
df['title'] = df['title'].apply(lambda x:" ".join(x.split()[0:-1]))

In [10]:
titles = df.title.sort_values(ascending=True)

### merge

In [11]:
df_movies = df.merge(links,on='movieId')

In [12]:
df2 = df_movies.merge(rating,on='movieId')

In [13]:
a = pd.DataFrame(df2.groupby(['imdbId'])['rating'].agg(['mean','count']))

In [14]:
df_movies = df_movies.merge(a,on='imdbId')

#### Filter by popularity

In [15]:
df_movies.shape

(22353, 8)

In [16]:
df_movies.head()

Unnamed: 0,movieId,title,genres,year,imdbId,tmdbId,mean,count
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0,114709,862.0,3.898784,8798
1,2,Jumanji,Adventure|Children|Fantasy,1995.0,113497,8844.0,3.262764,3741
2,3,Grumpier Old Men,Comedy|Romance,1995.0,113228,15602.0,3.13991,1769
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995.0,114885,31357.0,2.912929,379
4,5,Father of the Bride Part II,Comedy,1995.0,113041,11862.0,3.092593,1782


In [17]:
# m: 80th percentile of the per-movie rating counts; movies with fewer
#    ratings than this are dropped below.
# C: mean of the per-movie mean ratings, computed before the filter is
#    applied. Both names are read as globals by rate_it.
# Note: this cell reassigns df_movies, so re-running it recomputes m and C
# on the already-filtered frame.
m = df_movies['count'].quantile(q=0.80)
C = df_movies['mean'].mean()
print(m)
df_movies = df_movies.loc[df_movies['count'].ge(m)].copy()
def rate_it(row, min_votes=None, prior_mean=None):
    """Blend a movie's own mean rating with a baseline mean, weighted by votes.

    Computes ``v/(v+m) * R + m/(v+m) * C`` where ``v`` is the movie's rating
    count and ``R`` its mean rating. The more ratings a movie has relative to
    ``m``, the closer the score is to its own mean; with few ratings the score
    is pulled towards ``C``.

    Args:
        row: Mapping-like record with ``'count'`` and ``'mean'`` entries
            (e.g. a DataFrame row passed by ``df.apply(rate_it, axis=1)``).
        min_votes: Value used for ``m``. Defaults to the notebook-level
            variable ``m`` when omitted.
        prior_mean: Value used for ``C``. Defaults to the notebook-level
            variable ``C`` when omitted.

    Returns:
        The weighted score as a float.
    """
    v = row['count']
    R = row['mean']
    # Fall back to the globals only when no explicit value is given, so the
    # single-argument call used with DataFrame.apply keeps working.
    votes_floor = m if min_votes is None else min_votes
    baseline = C if prior_mean is None else prior_mean

    return (v/(v+votes_floor)*R) + (votes_floor/(v+votes_floor)*baseline)

45.0


In [18]:
df_movies['overall_score'] = df_movies.apply(rate_it,axis=1)

In [19]:
popular = df_movies.sort_values(by='overall_score',ascending=False)

In [20]:
popular = popular[popular['overall_score'] > 2.5]

In [21]:
popular.reset_index(inplace=True)

In [22]:
popular = popular.drop(['index'],axis=1)

In [23]:
# Format IMDb ids as 'tt' followed by the number left-padded with zeros to
# 7 characters (e.g. 111161 -> 'tt0111161'). Values that already carry the
# 'tt' prefix are left untouched, so re-running this cell does not produce
# 'tttt...' ids.
popular['imdbId'] = popular['imdbId'].apply(
    lambda x: x if str(x).startswith('tt') else 'tt{:0>7}'.format(x)
)

In [24]:
popular.shape

(4314, 9)

In [25]:
popular.head()

Unnamed: 0,movieId,title,genres,year,imdbId,tmdbId,mean,count,overall_score
0,318,"Shawshank Redemption, The",Crime|Drama,1994.0,tt0111161,278.0,4.42049,12401,4.415754
1,50,"Usual Suspects, The",Crime|Mystery|Thriller,1995.0,tt0114814,629.0,4.298884,8510,4.292633
2,159817,Planet Earth,Documentary,2006.0,tt0795176,192040.0,4.471042,259,4.269666
3,2959,Fight Club,Action|Crime|Drama|Thriller,1999.0,tt0137523,550.0,4.227595,9016,4.222048
4,171011,Planet Earth II,Documentary,2016.0,tt5491994,420714.0,4.52439,164,4.219992


In [26]:
main_df = popular.merge(rating,on='movieId')

In [27]:
main_df

Unnamed: 0,movieId,title,genres,year,imdbId,tmdbId,mean,count,overall_score,userId,rating,timestamp
0,318,"Shawshank Redemption, The",Crime|Drama,1994.0,tt0111161,278.0,4.420490,12401,4.415754,2,5.0,1141417181
1,318,"Shawshank Redemption, The",Crime|Drama,1994.0,tt0111161,278.0,4.420490,12401,4.415754,3,4.0,1439472424
2,318,"Shawshank Redemption, The",Crime|Drama,1994.0,tt0111161,278.0,4.420490,12401,4.415754,6,5.0,945142558
3,318,"Shawshank Redemption, The",Crime|Drama,1994.0,tt0111161,278.0,4.420490,12401,4.415754,8,1.0,890489849
4,318,"Shawshank Redemption, The",Crime|Drama,1994.0,tt0111161,278.0,4.420490,12401,4.415754,10,4.5,1227571320
...,...,...,...,...,...,...,...,...,...,...,...,...
2342379,4735,Ghosts of Mars,Horror|Sci-Fi|Thriller,2001.0,tt0228333,10016.0,2.376147,218,2.501819,24784,2.5,1467644320
2342380,4735,Ghosts of Mars,Horror|Sci-Fi|Thriller,2001.0,tt0228333,10016.0,2.376147,218,2.501819,24808,1.0,1059188416
2342381,4735,Ghosts of Mars,Horror|Sci-Fi|Thriller,2001.0,tt0228333,10016.0,2.376147,218,2.501819,24814,1.0,1014520320
2342382,4735,Ghosts of Mars,Horror|Sci-Fi|Thriller,2001.0,tt0228333,10016.0,2.376147,218,2.501819,24921,1.0,1217354884


### pivot table

In [28]:
# User x title rating matrix: one row per userId, one column per title.
# pivot_table aggregates with the mean by default, so all ratings that share
# a (userId, title) pair are averaged into one cell.
# NOTE(review): the pivot key is the title, not movieId, so distinct movies
# with the same title (after the year was stripped) would share a column --
# likely why the similarity matrix has fewer columns than `popular` has rows;
# confirm.
# Movies a user has not rated become 0 via fillna.
utr = (
    main_df
    .pivot_table(values='rating', index='userId', columns='title')
    .fillna(0)
)
utr

title,'71,'Til There Was You,(500) Days of Summer,10 Cloverfield Lane,10 Items or Less,10 Things I Hate About You,"10,000 BC",100 Girls,101 Dalmatians,101 Reykjavik (101 Reykjavík),...,Zodiac,Zombieland,Zoolander,Zoolander 2,Zootopia,[REC],[REC]²,eXistenZ,xXx,xXx: Return of Xander Cage
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,3.5,0.0,0.0,0.0,0.0,0.0,3.5,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
def standardize(row):
    """Mean-center a vector and scale it by its value range.

    Returns ``(row - mean) / (max - min)``. When every value is identical the
    range is 0; the centered values (all zeros) are returned as-is instead of
    dividing 0 by 0, which would yield NaN.

    Args:
        row: A pandas Series (or any object supporting ``mean``/``max``/``min``
            and element-wise arithmetic). Despite the name, ``utr.apply``
            passes one column at a time by default.

    Returns:
        An object of the same shape as ``row`` holding the scaled values.
    """
    centered = row - row.mean()
    spread = row.max() - row.min()
    if spread == 0:
        # Constant input: nothing to scale, and dividing would produce NaN.
        return centered
    return centered / spread

ratings_std = utr.apply(standardize)
item_similarity = cosine_similarity(ratings_std.T)

In [30]:
item_similarity_df = pd.DataFrame(item_similarity, index=utr.columns, columns=utr.columns)

In [31]:
item_similarity_df.shape

(4276, 4276)

In [32]:
# item_similarity_df.shape
# import pickle
# pickle.dump(item_similarity_df,open('collab_similarities.pkl','wb'))

In [33]:
item_similarity_df.head(3)

title,'71,'Til There Was You,(500) Days of Summer,10 Cloverfield Lane,10 Items or Less,10 Things I Hate About You,"10,000 BC",100 Girls,101 Dalmatians,101 Reykjavik (101 Reykjavík),...,Zodiac,Zombieland,Zoolander,Zoolander 2,Zootopia,[REC],[REC]²,eXistenZ,xXx,xXx: Return of Xander Cage
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71,1.0,-0.001993,0.063166,0.115704,0.021625,0.02234,0.040141,0.009775,0.0174,0.04251,...,0.095222,0.05025,0.035868,0.035852,0.060145,0.096808,0.041983,0.048294,0.035951,0.028031
'Til There Was You,-0.001993,1.0,-0.009515,-0.004287,-0.001579,0.041384,-0.004966,-0.001984,0.053567,0.002148,...,-0.005151,-0.003408,0.02528,0.001484,-0.008,-0.00479,-0.002223,-0.000323,0.031893,0.004445
(500) Days of Summer,0.063166,-0.009515,1.0,0.16175,0.044202,0.18874,0.093289,0.071779,0.052278,0.026307,...,0.254996,0.286386,0.157897,0.075971,0.189518,0.102659,0.069118,0.056036,0.072983,0.028557


In [34]:
def get_similar_movies(movie_name, user_rating, top_n=10, neutral_rating=2.5,
                       similarity_df=None):
    """Return the movies most similar to ``movie_name``, weighted by a rating.

    Each similarity to ``movie_name`` is multiplied by
    ``user_rating - neutral_rating``: a rating above the neutral point gives
    similar movies a positive score, a rating below it gives them a negative
    score. The result still contains ``movie_name`` itself.

    Args:
        movie_name: Column label to look up in the similarity matrix.
        user_rating: The rating the user gave ``movie_name``.
        top_n: Number of highest-scoring entries to return (default 10).
        neutral_rating: Rating treated as neither like nor dislike
            (default 2.5).
        similarity_df: Square item-by-item similarity DataFrame. Defaults to
            the notebook-level ``item_similarity_df`` when omitted.

    Returns:
        pandas.Series of at most ``top_n`` scores, sorted descending and
        indexed by movie title.

    Raises:
        KeyError: If ``movie_name`` is not a column of the similarity matrix.
    """
    scores_source = item_similarity_df if similarity_df is None else similarity_df
    if movie_name not in scores_source.columns:
        raise KeyError(f"{movie_name!r} is not a column of the similarity matrix")
    similar_score = scores_source[movie_name] * (user_rating - neutral_rating)
    similar_score = similar_score.sort_values(ascending=False)
    return similar_score.iloc[0:top_n]

print(get_similar_movies('Batman v Superman: Dawn of Justice',4))

title
Batman v Superman: Dawn of Justice    1.500000
Suicide Squad                         0.742481
X-Men: Apocalypse                     0.740088
Justice League                        0.718205
Captain America: Civil War            0.713911
Man of Steel                          0.701350
Ant-Man                               0.667340
The Amazing Spider-Man 2              0.649054
Avengers: Age of Ultron               0.647965
Thor: The Dark World                  0.635644
Name: Batman v Superman: Dawn of Justice, dtype: float64


In [35]:
#search movies
titles2 = utr.columns.sort_values(ascending=True)
def search(name, titles=None):
    """Print every title that contains ``name``, ignoring case.

    Matches are collected fresh on every call, so results from an earlier
    search are never printed again.

    Args:
        name: Substring to look for.
        titles: Iterable of title strings to search. Defaults to the
            notebook-level ``titles2`` when omitted.

    Returns:
        None. Matching titles are printed one per line in sorted order.
    """
    candidates = titles2 if titles is None else titles
    needle = name.lower()
    matches = sorted(title for title in candidates if needle in title.lower())
    for title in matches:
        print(title)
search("dark knight")

Dark Knight Rises, The
Dark Knight, The


In [36]:
popular[popular['title'] == 'Dark Knight, The']

Unnamed: 0,movieId,title,genres,year,imdbId,tmdbId,mean,count,overall_score
14,58559,"Dark Knight, The",Action|Crime|Drama|IMAX,2008.0,tt0468569,155.0,4.148275,6262,4.140872


#### Recommendations based on a user's ratings

In [37]:
#user
# (title, rating) pairs supplied by the user
user_rated = [("Iron Man 2",2),("Interstellar",5),("Skyfall",3)]

# One row of weighted similarity scores per rated movie; columns are the
# union of the titles returned for each movie (missing entries are NaN).
# Built in a single constructor call because DataFrame.append was removed in
# pandas 2.0. The loop variable is named `user_score` so it does not overwrite
# the `rating` DataFrame loaded at the top of the notebook.
similar_movies = pd.DataFrame(
    [get_similar_movies(movie, user_score) for movie, user_score in user_rated]
).reset_index(drop=True)

# Sum the scores per title across all rated movies (NaN is skipped) and show
# the 15 highest totals. Movies the user already rated are not excluded.
similar_movies.sum().sort_values(ascending=False).iloc[0:15]

Interstellar               2.500000
Dark Knight Rises, The     1.488104
The Martian                1.474169
Inception                  1.413751
Ex Machina                 1.335698
Edge of Tomorrow           1.320976
The Imitation Game         1.319143
Django Unchained           1.284251
Guardians of the Galaxy    1.256665
Mad Max: Fury Road         1.248506
Skyfall                    0.500000
Quantum of Solace          0.230978
Spectre                    0.229533
Avengers, The              0.229417
Casino Royale              0.228142
dtype: float64