In [441]:
import pandas as pd
import numpy as np

In [442]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [443]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [444]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [445]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [446]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,15,339,sandra 'boring' bullock,1138537770
1,15,1955,dentist,1193435061
2,15,7478,Cambodia,1170560997
3,15,32892,Russian,1170626366
4,15,34162,forgettable,1141391765


In [447]:
dataset = movies.merge(ratings,on='movieId', how = 'inner')

In [448]:
dataset.drop(['timestamp'],axis=1, inplace=True)
dataset.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,3.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,9,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,13,5.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,3.0


In [449]:
dataset.describe()

Unnamed: 0,movieId,userId,rating
count,100004.0,100004.0,100004.0
mean,12548.664363,347.01131,3.543608
std,26369.198969,195.163838,1.058064
min,1.0,1.0,0.5
25%,1028.0,182.0,3.0
50%,2406.5,367.0,4.0
75%,5418.0,520.0,4.0
max,163949.0,671.0,5.0


### Now, at this point, I am trying to get some possible high rated movies from different genres that I might have seen and liked or disliked so that I can give my own ratings, at least, to some number of movies in the dataset we loaded.

In [450]:
# let's see the our genre category

genres_list = list(dataset['genres'])
all_list = []

for each in genres_list:
    for genre in each.split('|'):
        all_list.append(genre)

print(set(all_list))

{'Romance', 'IMAX', 'Drama', 'Fantasy', 'Adventure', 'Comedy', 'Crime', 'Thriller', 'Horror', 'Mystery', 'Sci-Fi', 'Documentary', 'War', 'Children', 'Western', '(no genres listed)', 'Musical', 'Animation', 'Film-Noir', 'Action'}


In [451]:
stats = pd.DataFrame(dataset.groupby('title')['rating'].sum())
stats_count = pd.DataFrame(dataset.groupby('title')['rating'].count())
stats_count = list(stats_count['rating'])
stats['number of rates'] = stats_count
stats.sort_values(by=['number of rates'], inplace=True, ascending=False)

In [452]:
# average rate across all movies (C)
C = stats['rating'].mean()
print('Average rate in the dataset {}'.format(C))

Average rate in the dataset 39.09697705207414


In [453]:
# getting 90th quantile (m)
m = stats['number of rates'].quantile(0.90)

In [512]:
q_movies = stats.copy().loc[stats['number of rates'] >= m]

def weighted_rating(x, m=m, C=C):
    v = x['number of rates']
    R = x['rating']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

q_movies['score'] = q_movies.apply(weighted_rating, axis=1)
q_movies.head(20)

Unnamed: 0_level_0,rating,number of rates,score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Forrest Gump (1994),1382.5,341,1280.561559
Pulp Fiction (1994),1379.0,324,1272.416805
"Shawshank Redemption, The (1994)",1395.5,311,1283.466712
"Silence of the Lambs, The (1991)",1258.0,304,1155.20095
Star Wars: Episode IV - A New Hope (1977),1228.5,291,1124.100989
Jurassic Park (1993),1015.5,274,924.972567
"Matrix, The (1999)",1083.5,259,981.607022
Toy Story (1995),956.5,247,863.091692
Schindler's List (1993),1050.0,244,945.936454
Terminator 2: Judgment Day (1991),949.5,237,853.306473


### After rating the top ten movies which were rated by the biggest number of voters, I can include my own ratings to the dataset then I will go for the analysis and try to see the similar users' taste as mine

In [455]:
# reading my own ratings
my_ratings = pd.read_excel('my ratings.xlsx')
my_ratings

Unnamed: 0,title,rating,userId
0,Forrest Gump (1994),3,9999
1,Pulp Fiction (1994),2,9999
2,"Shawshank Redemption, The (1994)",4,9999
3,"Silence of the Lambs, The (1991)",3,9999
4,Star Wars: Episode IV - A New Hope (1977),3,9999
5,Jurassic Park (1993),4,9999
6,"Matrix, The (1999)",5,9999
7,Toy Story (1995),5,9999
8,Schindler's List (1993),4,9999
9,Terminator 2: Judgment Day (1991),5,9999


In [456]:
# dropping the uncessary fields and add my ratings to the dataset
dataset.drop(['movieId','genres'], axis=1, inplace=True)
frames = [dataset,my_ratings]
dataset = pd.concat(frames)
dataset.tail()

Unnamed: 0,rating,title,userId
5,4.0,Jurassic Park (1993),9999
6,5.0,"Matrix, The (1999)",9999
7,5.0,Toy Story (1995),9999
8,4.0,Schindler's List (1993),9999
9,5.0,Terminator 2: Judgment Day (1991),9999


In [457]:
# Transforming the dataframe to UserId x Movies format form Movies x UserId
userRates = dataset.pivot_table(index='userId', columns='title', values='rating')
userRates.index = userRates.index.astype('int')
userRates.head()
#userRates.replace(np.nan,'no vote', inplace= True)

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (1964),Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [458]:
my_movies = userRates[userRates.index == 9999].dropna(axis = 1)
listofmovies = list(my_movies.columns)
userRatings = userRates.copy()
userRates = userRates[listofmovies]
userRates.dropna(inplace=True)
userRates.head(2)

title,Forrest Gump (1994),Jurassic Park (1993),"Matrix, The (1999)",Pulp Fiction (1994),Schindler's List (1993),"Shawshank Redemption, The (1994)","Silence of the Lambs, The (1991)",Star Wars: Episode IV - A New Hope (1977),Terminator 2: Judgment Day (1991),Toy Story (1995)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
15,1.0,3.0,5.0,5.0,4.0,2.0,5.0,5.0,4.0,2.0
23,4.5,3.5,4.0,4.5,3.5,5.0,4.5,4.5,3.5,3.0


In [459]:
from math import sqrt

def distance_sim(prefs,person1,person2):
    rates_1 = list(prefs.loc[person1])
    rates_2 = list(prefs.loc[person2])
    sum_of_squares=sum([pow(rates_1[item]-rates_2[item],2)
                      for item in range(0,len(list(prefs.columns)))])
    return 1/(1+sqrt(sum_of_squares))

In [460]:
score_by_user = []
for user in list(userRates.index):
    score_by_user.append(distance_sim(userRates,9999,user))

In [461]:
userRates['Score'] = score_by_user

In [462]:
userRates.drop([9999],axis=0, inplace=True)
userRates.sort_values(by='Score',ascending = False, inplace=True)
userRates

title,Forrest Gump (1994),Jurassic Park (1993),"Matrix, The (1999)",Pulp Fiction (1994),Schindler's List (1993),"Shawshank Redemption, The (1994)","Silence of the Lambs, The (1991)",Star Wars: Episode IV - A New Hope (1977),Terminator 2: Judgment Day (1991),Toy Story (1995),Score
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
580,3.5,3.0,4.5,4.5,4.0,4.0,4.0,4.0,4.5,4.0,0.231662
311,5.0,4.5,4.0,3.0,5.0,4.5,2.0,4.0,4.5,3.0,0.2124
463,3.0,4.0,5.0,3.0,5.0,5.0,5.0,5.0,4.0,3.0,0.2
212,4.0,3.0,5.0,4.0,5.0,4.5,4.0,4.0,3.0,3.0,0.19405
461,4.0,5.0,4.5,4.5,4.0,5.0,5.0,4.5,5.0,3.5,0.190744
654,4.0,4.5,5.0,5.0,5.0,5.0,4.5,5.0,5.0,5.0,0.188638
596,5.0,4.0,4.5,5.0,4.5,5.0,4.0,3.0,4.0,3.5,0.187613
562,4.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,4.5,0.181818
442,5.0,4.5,4.5,4.5,4.5,4.5,5.0,5.0,4.0,4.0,0.17826
23,4.5,3.5,4.0,4.5,3.5,5.0,4.5,4.5,3.5,3.0,0.176563


In [485]:
recommendation = pd.DataFrame()
for user in list(userRates.index):
    recommendation = recommendation.append(userRatings[userRatings.index == user])
recommendation

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (1964),Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
580,,,,,,,,,,,...,,,,1.5,,0.5,,3.5,,
311,,,,,,,,,,,...,,,,,,,,,,
463,,,,,,,,,,,...,,,,,,,,,,
212,,,,,,,,,,3.5,...,,,,,,2.0,,,,
461,,,,,,,,,,,...,,,,,,,,,,
654,,,,,,,,4.0,,,...,,,,,,,,4.5,,
596,,,,,,,,,,,...,,,,,,1.5,,,,
562,,,,,,,,,,,...,,,,,,,,,,
442,,,,,,,,,,,...,,,,,,,,,,
23,,,,,,,,,,,...,,,,4.0,,,,,,


In [508]:
recommendation.replace(np.nan,0,inplace=True)
recommendation.sum(axis=1)
recommendation = recommendation.transpose()
recommendation.sort_values(by='Total Score', ascending= False)

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014),Total Score
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Total Score,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,25.0,...,0.0,0.0,33.0,0.0,19.0,0.0,44.0,0.0,0.0,105287.0
73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,...,0.0,0.0,0.0,0.0,1.5,0.0,0.0,0.0,0.0,5432.5
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,1.0,0.0,4.0,0.0,0.0,4457.0
452,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,4273.5
30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3806.5
472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,3143.5
509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,2.0,0.0,0.0,3107.5
311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3063.5
580,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.5,0.0,0.5,0.0,3.5,0.0,0.0,3017.5
212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2723.5


In [509]:
recommendation

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014),Total Score
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
580,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.5,0.0,0.5,0.0,3.5,0.0,0.0,3017.5
311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3063.5
463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1622.0
212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2723.5
461,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1976.0
654,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,2547.0
596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.5,0.0,0.0,0.0,0.0,1766.0
562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1016.5
442,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,950.5
23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,2637.5


In [510]:
top_movie_recc = recommendation[recommendation.index == 'Total Score'].transpose()
top_movie_recc.sort_values(by='Total Score', ascending= False, inplace= True)

In [514]:
movies_watched = list(my_ratings['title'])
top_movie_recc.drop(movies_watched,axis=0, inplace=True)

In [515]:
top_movie_recc

userId,Total Score
title,Unnamed: 1_level_1
Total Score,105287.0
"Godfather, The (1972)",202.0
Star Wars: Episode V - The Empire Strikes Back (1980),187.0
American Beauty (1999),179.0
Saving Private Ryan (1998),176.0
Gladiator (2000),170.0
Braveheart (1995),169.0
"Godfather: Part II, The (1974)",168.0
"Sixth Sense, The (1999)",167.0
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981),166.0
