# BlockCluster EDA | Kyle McLester
The data for this project is found on [Kaggle](https://www.kaggle.com/rounakbanik/the-movies-dataset). Our goal for this portion of the project is to perform a basic exploratory analysis of the data and report our findings

## Importing Dependencies

In [1]:
from networkx.algorithms.distance_measures import radius
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import operator
import os
import numpy as np
from pathlib import Path
import plotly.express as go
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# set app threads to 16 (max of my system)
os.environ["NUMEXPR_MAX_THREADS"] = "16"
os.environ["NUMEXPR_NUM_THREADS"] = "16"
pd.options.mode.chained_assignment = None
# force matplotlib graphs to be same shape
plt.rcParams.update({"figure.autolayout": True})

## Loading Data

In [2]:
links = pd.read_csv('data/movielens/100k/links.csv', sep=',', encoding='latin-1')
movies = pd.read_csv('data/movielens/100k/movies.csv', sep=',', encoding='latin-1')
ratings = pd.read_csv('data/movielens/100k/ratings.csv', sep=',', encoding='latin-1', usecols=['userId','movieId','rating'])
tags = pd.read_csv('data/movielens/100k/tags.csv', sep=',', encoding='latin-1', usecols=['userId','movieId','tag'])

## Exploring Data

In [3]:
print(5*'*'+' COLUMN LABELS '+5*'*')
print('links: {}'.format(list(links.columns)))
print('movies: {}'.format(list(movies.columns)))
print('ratings: {}'.format(list(ratings.columns)))
print('tags: {}'.format(list(tags.columns)))

***** COLUMN LABELS *****
links: ['movieId', 'imdbId', 'tmdbId']
movies: ['movieId', 'title', 'genres']
ratings: ['userId', 'movieId', 'rating']
tags: ['userId', 'movieId', 'tag']


In [4]:
print(5*'*'+' DATA SHAPE '+5*'*')
print('links: {}'.format(links.shape))
print('movies: {}'.format(movies.shape))
print('ratings: {}'.format(ratings.shape))
print('tags: {}'.format(tags.shape))

***** DATA SHAPE *****
links: (9742, 3)
movies: (9742, 3)
ratings: (100836, 3)
tags: (3683, 3)


In [5]:
print(5*'*'+' RATING SUMMARY STATS '+5*'*')
ratings['rating'].describe()

***** RATING SUMMARY STATS *****


count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

In [6]:
print(5*'*'+' NULL VALUE COUNTS '+5*'*')
print('links\n{}\n'.format(links.isnull().sum()))
print('movies\n{}\n'.format(movies.isnull().sum()))
print('ratings\n{}\n'.format(ratings.isnull().sum()))
print('tags\n{}\n'.format(tags.isnull().sum()))

***** NULL VALUE COUNTS *****
links
movieId    0
imdbId     0
tmdbId     8
dtype: int64

movies
movieId    0
title      0
genres     0
dtype: int64

ratings
userId     0
movieId    0
rating     0
dtype: int64

tags
userId     0
movieId    0
tag        0
dtype: int64



In [7]:
print(5*'*'+' REMOVE ROWS W/NULL VALUES ' + 5*'*')
links = links.dropna()
links.isnull().sum()

***** REMOVE ROWS W/NULL VALUES *****


movieId    0
imdbId     0
tmdbId     0
dtype: int64

In [8]:
print(5*'*'+' FIRST 5 ROWS '+5*'*')
print('links\n{}\n'.format(links.head()))
print('movies\n{}\n'.format(movies.head()))
print('ratings\n{}\n'.format(ratings.head()))
print('tags\n{}\n'.format(tags.head()))

***** FIRST 5 ROWS *****
links
   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0

movies
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

ratings
   userId  movieId  rating
0       1        1     4.0
1       1        3     4.0
2       1        6     4.0
3       1       47     5.0
4       1       50     5.0

tags
   userId  movieId   

In [3]:
# break up movies['genre'] into a string
movies['genres'] = movies['genres'].str.split('|')
movies['genres'] = movies['genres'].fillna('').astype('str')

In [10]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"['Adventure', 'Animation', 'Children', 'Comedy..."
1,2,Jumanji (1995),"['Adventure', 'Children', 'Fantasy']"
2,3,Grumpier Old Men (1995),"['Comedy', 'Romance']"
3,4,Waiting to Exhale (1995),"['Comedy', 'Drama', 'Romance']"
4,5,Father of the Bride Part II (1995),['Comedy']


In [4]:
# extract year from title column
movies['year'] = [re.search('(\d){4}', x) for x in movies['title']]
movies = movies.dropna()
movies['year'] = [re.search('(\d){4}', x).group() for x in movies['title']]
movies = movies.replace(to_replace='\((.*)\)', value='', regex=True)
movies = movies.replace(to_replace=' +$', value='', regex=True)
movies['year'] = movies['year'].astype('int')

In [12]:
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",1995
1,2,Jumanji,"['Adventure', 'Children', 'Fantasy']",1995
2,3,Grumpier Old Men,"['Comedy', 'Romance']",1995
3,4,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",1995
4,5,Father of the Bride Part II,['Comedy'],1995


In [5]:
# remove title that were missing a year or are recorded after this study
movies = movies[movies['year']<=2018]

In [14]:
# data shapes post processing
print(5*'*'+' DATA SHAPE '+5*'*')
print('links: {}'.format(links.shape))
print('movies: {}'.format(movies.shape))
print('ratings: {}'.format(ratings.shape))
print('tags: {}'.format(tags.shape))

***** DATA SHAPE *****
links: (9734, 3)
movies: (9724, 4)
ratings: (100836, 3)
tags: (3683, 3)


In [6]:
rating_options = [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]
rating_count = [ratings[ratings['rating']==x]['rating'].count() for x in rating_options]
rating_counts = pd.DataFrame(columns=['rating','count'])
rating_counts['rating'] = rating_options
rating_counts['count'] = rating_count

In [10]:
import plotly.express as px
fig = px.bar(data_frame=rating_counts, x='rating', y='count', width=800, height=400)
fig.show()

In [7]:
import ast
genres = []
z=[ast.literal_eval(x) for x in movies['genres']]
for i in z:
    for j in i:
        genres.append(j)
genres = pd.DataFrame(data=genres)
x = pd.unique(genres[0])
y = [genres.value_counts()[x] for x in range(0,len(genres.value_counts()))]

In [8]:
xy = pd.DataFrame(columns=['genre','count'])
xy['genre'] = x
xy['count'] = y
xy[xy['genre']==''] = 'null'

In [11]:
fig = px.bar(data_frame=xy, x='genre', y='count', width=800, height=400)
fig.show()

## Recommendation based on Genre

In [12]:
tf = TfidfVectorizer(analyzer='word',
                     ngram_range=(1,2),
                     min_df=0,
                     stop_words='english')

In [13]:
tfidf_matrix = tf.fit_transform(movies['genres'])
print('tfidf_matrix\nshape: {}'.format(tfidf_matrix.shape))

tfidf_matrix
shape: (9724, 174)


In [20]:
for i in tfidf_matrix:
    print(i)

  (0, 63)	0.4051560427399779
  (0, 47)	0.3681858417114528
  (0, 34)	0.3836987462715231
  (0, 18)	0.40089749128201696
  (0, 108)	0.3026200066389389
  (0, 59)	0.1675715931646092
  (0, 46)	0.31620562407044245
  (0, 33)	0.3233369703441734
  (0, 17)	0.2610600318872138
  (0, 51)	0.5796215265019781
  (0, 19)	0.533786479964968
  (0, 108)	0.36562038360426763
  (0, 46)	0.38203429725120497
  (0, 17)	0.31540832366786353
  (0, 68)	0.7696177592874737
  (0, 157)	0.524259020251557
  (0, 59)	0.3644735714344812
  (0, 103)	0.5645118365307613
  (0, 62)	0.5418641154358464
  (0, 96)	0.290412420355974
  (0, 157)	0.4522391997863829
  (0, 59)	0.3144041970126223
  (0, 59)	1.0
  (0, 84)	0.604512249650911
  (0, 5)	0.5454112466253515
  (0, 165)	0.3148326802893081
  (0, 73)	0.3691173554790952
  (0, 0)	0.31894242957396657
  (0, 68)	0.7696177592874737
  (0, 157)	0.524259020251557
  (0, 59)	0.3644735714344812
  (0, 19)	0.7329628013211978
  (0, 46)	0.5245860268555241
  (0, 17)	0.43309933307189175
  (0, 0)	1.0
  (0, 30)

In [14]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim[:4, :4]

array([[1.        , 0.31378594, 0.06107542, 0.05268521],
       [0.31378594, 1.        , 0.        , 0.        ],
       [0.06107542, 0.        , 1.        , 0.3516825 ],
       [0.05268521, 0.        , 0.3516825 , 1.        ]])

In [22]:
print(cosine_sim[:,:])

[[1.         0.31378594 0.06107542 ... 0.         0.16119834 0.16757159]
 [0.31378594 1.         0.         ... 0.         0.         0.        ]
 [0.06107542 0.         1.         ... 0.         0.         0.36447357]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.16119834 0.         0.         ... 0.         1.         0.        ]
 [0.16757159 0.         0.36447357 ... 0.         0.         1.        ]]


In [15]:
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

In [16]:
def genre_recommendations(title, num_titles):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_titles+1]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [17]:
genre_recommendations('Tomb Raider', 10)

1516               Flight of the Navigator
4274    Journey to the Center of the Earth
5004                             Explorers
5396                            D.A.R.Y.L.
6436                       Last Mimzy, The
6862                         City of Ember
9016        Doctor Who: The Waters of Mars
1506                       Black Hole, The
1554            Return from Witch Mountain
2803                            Titan A.E.
Name: title, dtype: object

In [80]:
genre_recommendations('Ant-Man and the Wasp', 10)

2861                                        Space Cowboys
3069                                           Innerspace
3927                        Adventures of Pluto Nash, The
5259                                Last Starfighter, The
6754    Indiana Jones and the Kingdom of the Crystal S...
7052                                     Land of the Lost
8691                                             Deadpool
9731                                              Gintama
560                                            Barbarella
969                                    Back to the Future
Name: title, dtype: object

## Recommendation based on Title

In [81]:
tf = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     min_df=0,
                     stop_words='english')

In [82]:
tfidf_matrix =tf.fit_transform(movies['title'])
tfidf_matrix.shape

(9724, 15855)

In [83]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim[:4, :4]

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [84]:
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

In [85]:
def title_recommendations(title, num_titles):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_titles+1]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [86]:
title_recommendations('Tomb Raider', 10)

9433                         Rogue One: A Star Wars Story
567                                                  Solo
6823                            Star Wars: The Clone Wars
9645                             Star Wars: The Last Jedi
8908                        The Star Wars Holiday Special
7367    Empire of Dreams: The Story of the 'Star Wars'...
1979            Star Wars: Episode I - The Phantom Menace
898        Star Wars: Episode V - The Empire Strikes Back
1570                                           L.A. Story
2227                                     Story of Us, The
Name: title, dtype: object

In [87]:
title_recommendations('Toy Story', 10)

2355           Toy Story 2
7355           Toy Story 3
3595              Toy, The
1570            L.A. Story
2227      Story of Us, The
4156       Story of O, The
4089          Toy Soldiers
3187            Love Story
2110    Christmas Story, A
4047           Ghost Story
Name: title, dtype: object

## Average Rating Given by Each User

In [88]:
merged_df = ratings.join(movies.set_index('movieId'), on='movieId')

In [89]:
merged_df[merged_df['userId']==1]

Unnamed: 0,userId,movieId,rating,title,genres,year
0,1,1,4.0,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",1995.0
1,1,3,4.0,Grumpier Old Men,"['Comedy', 'Romance']",1995.0
2,1,6,4.0,Heat,"['Action', 'Crime', 'Thriller']",1995.0
3,1,47,5.0,Seven,"['Mystery', 'Thriller']",1995.0
4,1,50,5.0,"Usual Suspects, The","['Crime', 'Mystery', 'Thriller']",1995.0
...,...,...,...,...,...,...
227,1,3744,4.0,Shaft,"['Action', 'Crime', 'Thriller']",2000.0
228,1,3793,5.0,X-Men,"['Action', 'Adventure', 'Sci-Fi']",2000.0
229,1,3809,4.0,What About Bob?,['Comedy'],1991.0
230,1,4006,4.0,Transformers: The Movie,"['Adventure', 'Animation', 'Children', 'Sci-Fi']",1986.0


In [90]:
user_avg_ratings = pd.DataFrame(columns=['userId','avg_rating'])
userId = []
avg_rating = []
for user in merged_df['userId'].unique():
    userId.append(user)
    avg_rating.append(merged_df[merged_df['userId']==user]['rating'].sum()/len(merged_df[merged_df['userId']==user]))
user_avg_ratings['userId'] = userId
user_avg_ratings['avg_rating'] = avg_rating

In [91]:
user_avg_ratings = user_avg_ratings.set_index('userId')

In [134]:
user_avg_ratings

Unnamed: 0_level_0,avg_rating
userId,Unnamed: 1_level_1
1,4.366379
2,3.948276
3,2.435897
4,3.555556
5,3.636364
...,...
606,3.657399
607,3.786096
608,3.134176
609,3.270270


In [92]:
user_avg_ratings.describe()

Unnamed: 0,avg_rating
count,610.0
mean,3.657222
std,0.480635
min,1.275
25%,3.36
50%,3.694385
75%,3.9975
max,5.0


## Average Rating for Each Movie (Naive)

In [94]:
avg_movie_ratings = pd.DataFrame(columns=['movieId','avg_rating'])
movieId = []
avg_rating = []
vote_count = []
for movie in merged_df['movieId'].unique():
    movieId.append(movie)
    avg_rating.append(merged_df[merged_df['movieId']==movie]['rating'].sum()/len(merged_df[merged_df['movieId']==movie]))
    vote_count.append(merged_df[merged_df['movieId']==movie]['rating'].count())
avg_movie_ratings['movieId'] = movieId
avg_movie_ratings['avg_rating'] = avg_rating
avg_movie_ratings['vote_count'] = vote_count

In [95]:
avg_movie_ratings = avg_movie_ratings.join(movies.set_index('movieId'), on='movieId')
avg_movie_ratings = avg_movie_ratings.drop(['genres','year'], axis=1)
avg_movie_ratings = avg_movie_ratings.dropna()

In [96]:
avg_movie_ratings.head()

Unnamed: 0,movieId,avg_rating,vote_count,title
0,1,3.92093,215,Toy Story
1,3,3.259615,52,Grumpier Old Men
2,6,3.946078,102,Heat
3,47,3.975369,203,Seven
4,50,4.237745,204,"Usual Suspects, The"


In [97]:
# 10 lowest rated movies
avg_movie_ratings.sort_values(by=['avg_rating'])[:10]

Unnamed: 0,movieId,avg_rating,vote_count,title
8941,7114,0.5,1,"Collector, The"
8859,122246,0.5,1,Tooth Fairy 2
8531,8632,0.5,1,Secret Society
7822,82095,0.5,1,Skyline
9298,136297,0.5,1,Mortal Kombat: The Journey Begins
6513,54934,0.5,1,"Brothers Solomon, The"
6518,67799,0.5,1,The Butterfly Effect 3: Revelations
9610,61818,0.5,1,"Crow, The: Wicked Prayer"
6535,134528,0.5,1,Aloha
6539,138798,0.5,1,Joe Dirt 2: Beautiful Loser


In [98]:
# 10 highest rated movies
avg_movie_ratings.sort_values(by=['avg_rating'], ascending=False)[:10]

Unnamed: 0,movieId,avg_rating,vote_count,title
4275,4788,5.0,1,Moscow Does Not Believe in Tears
8685,120130,5.0,1,Into the Forest of Fireflies' Light
7102,6021,5.0,1,"American Friend, The"
5572,5607,5.0,1,Son of the Bride
4277,5888,5.0,1,Brother
8557,71268,5.0,1,Tyler Perry's I Can Do Bad All by Myself
8558,93320,5.0,1,Trailer Park Boys
4278,5889,5.0,1,"Cruel Romance, A"
8580,467,5.0,1,Live Nude Girls
8581,876,5.0,1,Supercop 2


In [99]:
avg_movie_ratings['avg_rating'].describe()

count    9706.000000
mean        3.261957
std         0.869360
min         0.500000
25%         2.800000
50%         3.416667
75%         3.909773
max         5.000000
Name: avg_rating, dtype: float64

## IMDB Weighted Rating

In [100]:
C = avg_movie_ratings['avg_rating'].mean()
m = avg_movie_ratings['vote_count'].quantile(0.9)
avg_copy = avg_movie_ratings.copy().loc[avg_movie_ratings['vote_count']>=m]

In [101]:
avg_copy

Unnamed: 0,movieId,avg_rating,vote_count,title
0,1,3.920930,215,Toy Story
1,3,3.259615,52,Grumpier Old Men
2,6,3.946078,102,Heat
3,47,3.975369,203,Seven
4,50,4.237745,204,"Usual Suspects, The"
...,...,...,...,...
3350,8807,3.455882,34,Harold and Kumar Go to White Castle
3384,45720,3.558824,34,"Devil Wears Prada, The"
3490,94959,3.775862,29,Moonrise Kingdom
3498,1172,4.161765,34,Cinema Paradiso


In [102]:
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['avg_rating']
    return (v/(v+m)*R) + (m/(m/v)*C)

In [103]:
avg_copy['score'] = avg_copy.apply(weighted_rating, axis=1)

In [104]:
avg_copy.head()

Unnamed: 0,movieId,avg_rating,vote_count,title,score
0,1,3.92093,215,Toy Story,704.804189
1,3,3.259615,52,Grumpier Old Men,171.767325
2,6,3.946078,102,Heat,335.839752
3,47,3.975369,203,Seven,665.685932
4,50,4.237745,204,"Usual Suspects, The",669.181617


In [105]:
avg_copy.to_csv('weighted_ratings.csv',sep=',',encoding='utf-8', index=False)

In [106]:
avg_copy.sort_values(by='score',ascending=False, inplace=True)
avg_copy.head(20)

Unnamed: 0,movieId,avg_rating,vote_count,title,score
20,356,4.164134,329,Forrest Gump,1077.032111
232,318,4.429022,317,"Shawshank Redemption, The",1038.12171
16,296,4.197068,307,Pulp Fiction,1005.278531
34,593,4.16129,279,"Silence of the Lambs, The",913.880073
166,2571,4.192446,278,"Matrix, The",910.64531
15,260,4.231076,251,Star Wars: Episode IV - A New Hope,822.571308
26,480,3.75,238,Jurassic Park,779.71365
7,110,4.031646,237,Braveheart,776.703087
478,589,3.970982,224,Terminator 2: Judgment Day,734.222154
28,527,4.225,220,Schindler's List,721.39366


## Creating User Profiles

In [107]:
profiles = {}

for user in merged_df['userId'].unique():
    username = 'user_' + str(user)
    
    temp = merged_df[merged_df['userId']==user]
    
    titles = temp['title']
    ratings = temp['rating']
    ratings = list(ratings)
    
    movie_list = {}
    for i, title in enumerate(titles):
        movie_list[title] = ratings[i]
        
    profiles[username] = movie_list

In [108]:
import json

In [109]:
with open('profiles.json','w') as outfile:
    json.dump(profiles, outfile,indent=4)

## Get Common Movies

In [136]:
def get_common_movies(user_A, user_B):
    return [movie for movie in profiles[user_A] if movie in profiles[user_B]]

In [111]:
# number of available user profiles
len(profiles)

610

In [112]:
get_common_movies('user_234','user_610')

['Toy Story',
 'Batman Forever',
 'Dumb & Dumber',
 'Star Wars: Episode IV - A New Hope',
 'Ace Ventura: Pet Detective',
 'Speed',
 'Jurassic Park',
 'Batman',
 'Wizard of Oz, The',
 'Die Hard',
 'E.T. the Extra-Terrestrial',
 'Star Wars: Episode V - The Empire Strikes Back',
 'Raiders of the Lost Ark',
 'Aliens',
 'Star Wars: Episode VI - Return of the Jedi',
 'Alien',
 'Army of Darkness',
 'Indiana Jones and the Last Crusade',
 'Face/Off',
 'As Good as It Gets',
 'Labyrinth',
 'Lethal Weapon',
 'Saving Private Ryan',
 'Dark Crystal, The',
 'NeverEnding Story, The',
 "Bug's Life, A",
 'Matrix, The',
 'Star Wars: Episode I - The Phantom Menace',
 'RoboCop',
 'Toy Story 2',
 'Beach, The',
 'Teenage Mutant Ninja Turtles',
 'Flintstones in Viva Rock Vegas, The',
 'Shanghai Noon',
 'Starman',
 'Chicken Run',
 'X-Men',
 "Charlie's Angels",
 'Cast Away',
 'Memento',
 'Spy Kids',
 'Mummy Returns, The',
 'Shrek',
 'Jurassic Park III']

## Compare User Ratings

In [113]:
# get reviews from common movies
def get_reviews(user_A, user_B):
    common_movies = get_common_movies(user_A, user_B)
    return [(profiles[user_A][movie], profiles[user_B][movie]) for movie in common_movies]

In [114]:
get_reviews('user_70','user_391')

[(4.0, 4.0),
 (4.5, 5.0),
 (4.0, 5.0),
 (5.0, 5.0),
 (5.0, 5.0),
 (5.0, 5.0),
 (4.5, 5.0),
 (5.0, 5.0),
 (5.0, 5.0),
 (5.0, 4.0),
 (5.0, 4.0),
 (4.5, 4.0),
 (4.0, 4.0),
 (3.5, 4.0),
 (4.5, 3.0),
 (4.0, 4.0),
 (4.0, 4.0),
 (3.5, 4.0),
 (4.0, 4.0),
 (4.5, 4.0)]

In [115]:
# get reviews from common movies
def get_reviews_detailed(user_A, user_B):
    common_movies = get_common_movies(user_A, user_B)
    reviews = [(profiles[user_A][movie], profiles[user_B][movie]) for movie in common_movies]
    result = zip(common_movies, reviews)
    return set(result)

In [116]:
get_reviews_detailed('user_70','user_391')

{('American Beauty', (4.0, 4.0)),
 ('Apollo 13', (4.0, 4.0)),
 ('Back to the Future', (4.0, 4.0)),
 ('Beautiful Mind, A', (4.5, 4.0)),
 ('Breakfast Club, The', (4.0, 4.0)),
 ('Cinema Paradiso', (5.0, 5.0)),
 ('Dead Poets Society', (5.0, 4.0)),
 ('E.T. the Extra-Terrestrial', (4.5, 5.0)),
 ("Ferris Bueller's Day Off", (3.5, 4.0)),
 ('Forrest Gump', (4.0, 5.0)),
 ('Godfather, The', (5.0, 5.0)),
 ('Good Will Hunting', (4.5, 3.0)),
 ("It's a Wonderful Life", (5.0, 5.0)),
 ("One Flew Over the Cuckoo's Nest", (5.0, 5.0)),
 ("Schindler's List", (5.0, 5.0)),
 ('Shawshank Redemption, The', (4.5, 5.0)),
 ('Shrek', (4.0, 4.0)),
 ('Stand by Me', (4.5, 4.0)),
 ('Star Wars: Episode VI - Return of the Jedi', (5.0, 4.0)),
 ('Truman Show, The', (3.5, 4.0))}

## Collaborative Filtering

In [117]:
def euclidean_distance(points):
    squared_diff = [(point[0] - point[1]) ** 2 for point in points]
    summed_squared_diffs = sum(squared_diff)
    distance = np.sqrt(summed_squared_diffs)
    return distance

In [118]:
def similarity(reviews):
    return 1/(1+euclidean_distance(reviews))

In [119]:
def get_user_similarity(user_A, user_B):
    reviews = get_reviews(user_A, user_B)
    return similarity(reviews)

In [120]:
get_user_similarity('user_1','user_86')

0.21239994067041815

In [121]:
def recommend_movies(user, num_suggestions):
    similarity_scores = [(get_user_similarity(user, other), other) for other in profiles if other != user]
    # get sim scores for all users
    similarity_scores.sort()
    similarity_scores.reverse()
    # similarity_scores = similarity_scores[0:num_suggestions]
    
    recommendations = {}
    for similarity, other in similarity_scores:
        reviewed = profiles[other]
        for movie in reviewed:
            if movie not in profiles[user]:
                weight = similarity * reviewed[movie]
                if movie in recommendations:
                    sim, weights = recommendations[movie]
                    recommendations[movie] = (sim + similarity, weights + [weight])
                else:
                    recommendations[movie] = (similarity, [weight])
                    
    for recommendation in recommendations:
        similarity, movie = recommendations[recommendation]
        recommendations[recommendation] = sum(movie) / similarity
        
    sorted_recommendations = sorted(recommendations.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_recommendations[:num_suggestions]

In [122]:
recommend_movies('user_203', 10)

[('Jonah Who Will Be 25 in the Year 2000', 5.000000000000001),
 ('Come and See', 5.000000000000001),
 ('Story of Women', 5.0),
 ('Passenger, The', 5.0),
 ('Marriage of Maria Braun, The', 5.0),
 ('61*', 5.0),
 ('Thin Line Between Love and Hate, A', 5.0),
 ('Animals are Beautiful People', 5.0),
 ('Woman Is a Woman, A', 5.0),
 ('Thousand Clowns, A', 5.0)]

In [123]:
recommend_movies('user_120', 10)

[('Particle Fever', 5.000000000000001),
 ('Heidi Fleiss: Hollywood Madam', 5.000000000000001),
 ('Lamerica', 5.0),
 ('Entertaining Angels: The Dorothy Day Story', 5.0),
 ('Story of Women', 5.0),
 ('Passenger, The', 5.0),
 ('Marriage of Maria Braun, The', 5.0),
 ('One I Love, The', 5.0),
 ('Laggies', 5.0),
 ('Delirium', 5.0)]

In [124]:
merged_df = merged_df.dropna(axis=0)

In [125]:
merged_df.head()

Unnamed: 0,userId,movieId,rating,title,genres,year
0,1,1,4.0,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",1995.0
1,1,3,4.0,Grumpier Old Men,"['Comedy', 'Romance']",1995.0
2,1,6,4.0,Heat,"['Action', 'Crime', 'Thriller']",1995.0
3,1,47,5.0,Seven,"['Mystery', 'Thriller']",1995.0
4,1,50,5.0,"Usual Suspects, The","['Crime', 'Mystery', 'Thriller']",1995.0


In [126]:
merged_df['year'] = [int(x) for x in merged_df['year']]

In [127]:
merged_df.head()

Unnamed: 0,userId,movieId,rating,title,genres,year
0,1,1,4.0,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",1995
1,1,3,4.0,Grumpier Old Men,"['Comedy', 'Romance']",1995
2,1,6,4.0,Heat,"['Action', 'Crime', 'Thriller']",1995
3,1,47,5.0,Seven,"['Mystery', 'Thriller']",1995
4,1,50,5.0,"Usual Suspects, The","['Crime', 'Mystery', 'Thriller']",1995


## Movies Produced per Year

In [128]:
import plotly.express as px

In [129]:
merged_df = merged_df[merged_df['year']>=1915]

In [132]:
fig = px.bar(merged_df, merged_df['year'].unique(), merged_df['year'].value_counts(), title='Movies Produced by Year', labels=dict(x='Year',y='Movies Produced'))

In [133]:
fig.show()

## PCA and KMeans Clustering (DROPPED)

In [63]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
merged_df['genres_cat'] = encoder.fit_transform(merged_df['genres'])
merged_df['title_cat'] = encoder.fit_transform(merged_df['title'])
merged_df.head()

Unnamed: 0,userId,movieId,rating,title,genres,year,genres_cat,title_cat
0,1,1,4.0,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",1995,353,8554
1,1,3,4.0,Grumpier Old Men,"['Comedy', 'Romance']",1995,735,3526
2,1,6,4.0,Heat,"['Action', 'Crime', 'Thriller']",1995,260,3699
3,1,47,5.0,Seven,"['Mystery', 'Thriller']",1995,936,7242
4,1,50,5.0,"Usual Suspects, The","['Crime', 'Mystery', 'Thriller']",1995,788,8793


## Predict Movie Rating for Given User

In [64]:
def predict_rating(user, title):
    similarity_scores = [(get_user_similarity(user, other), other) for other in profiles if other != user]
    # get sim scores for all users
    similarity_scores.sort()
    similarity_scores.reverse()
    
    recommendations = {}
    for similarity, other in similarity_scores:
        reviewed = profiles[other]
        for movie in reviewed:
            weight = similarity * reviewed[movie]
            if movie in recommendations:
                sim, weights = recommendations[movie]
                recommendations[movie] = (sim + similarity, weights + [weight])
            else:
                recommendations[movie] = (similarity, [weight])
                    
    for recommendation in recommendations:
        similarity, movie = recommendations[recommendation]
        recommendations[recommendation] = sum(movie) / similarity

    return recommendations[title]

In [65]:
predict_rating('user_1','Toy Story')

3.9972001085979056

## Find Accuracy of Recommendations

In [244]:
users = profiles.keys()
test_df = pd.DataFrame(users, columns=['userId'])
subset = test_df.sample(n=10)

In [245]:
subset

Unnamed: 0,userId
87,user_88
50,user_51
168,user_169
104,user_105
34,user_35
473,user_474
35,user_36
590,user_591
321,user_322
24,user_25


In [246]:
def accuracy(correct, total_items):
    return (correct/total_items)*100

In [247]:
title = []
predicted_rating = []
actual_rating = []
label = []

In [248]:
for user in subset['userId']:
    for movie in profiles[user]:
        try:
            predicted_value = predict_rating(user,movie)
            predicted_rating.append(predicted_value)
            title.append(movie)
            actual_value = profiles[user][movie]
            actual_rating.append(actual_value)
            if abs(actual_value - predicted_value) <= 1:
                label.append('correct')
            else:
                label.append('incorrect')
        except:
            predicted_rating.pop()
            title.pop()
            actual_rating.pop()
            label.pop()
        

In [249]:
predicted_labels = pd.DataFrame(columns=['title','predicted_rating'])
predicted_labels['title'] = title
predicted_labels['predicted_rating'] = predicted_rating
predicted_labels['actual_rating'] = actual_rating
predicted_labels['label'] = label

In [250]:
correct, incorrect = predicted_labels['label'].value_counts()

In [251]:
print(f'Accuracy: {accuracy(correct, len(predicted_labels)):.2f}%')

Accuracy: 76.83%
