In [3]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.decomposition import TruncatedSVD

In [6]:
# Load the movie catalogue and the user ratings from CSV files located in
# the notebook's working directory (paths are relative).
movie_df, rating_df = (
    pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv')
)

In [7]:
# Preview the first five catalogue rows to check the loaded columns.
movie_df.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Preview the first five rating rows to check the loaded columns.
rating_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
# Attach the catalogue columns to every rating by joining on 'movieId'.
# The join is an inner join (pandas' default), so a rating whose movieId has
# no catalogue entry is dropped.
combine_movie_rating = rating_df.merge(movie_df, how='inner', on='movieId')
combine_movie_rating.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [10]:
# Columns that the later cells visible in this notebook never read.
columns = ['timestamp', 'genres']

# NOTE: the frame is rebound to its own name, so re-running this cell without
# re-running the merge cell raises KeyError (the columns are already gone).
combine_movie_rating = combine_movie_rating.drop(columns=columns)

In [11]:
# Discard rating rows whose 'title' is missing.
combine_movie_rating = combine_movie_rating.dropna(subset=['title'])

# One row per title with the number of non-null ratings it received.
# Output columns, in order: 'title', 'totalRatingCount'.
movie_ratingCount = (
    combine_movie_rating
    .groupby('title')['rating']
    .count()
    .rename('totalRatingCount')
    .reset_index()
)

In [12]:
# Annotate every rating row with how many ratings its title received.
# NOTE(review): 'totalRatingCount' is not read by the pivot below, and no
# popularity filter is applied in the cells visible here. Confirm whether a
# minimum-count threshold was intended.
rating_with_totalRatingCount = pd.merge(
    combine_movie_rating,
    movie_ratingCount,
    how='left',
    on='title',
)

# pivot() requires unique (index, columns) pairs, so keep only the first
# rating for each (userId, title) combination.
user_rating = rating_with_totalRatingCount.drop_duplicates(subset=['userId', 'title'])

# User x title rating matrix; a title the user never rated is stored as 0.
movie_user_rating_pivot = (
    user_rating
    .pivot(index='userId', columns='title', values='rating')
    .fillna(0)
)

In [13]:
# Transpose the user x title matrix so each row is one title and each
# column is one user.
X = np.transpose(movie_user_rating_pivot.values)

In [14]:
# Project every title (row of X) onto 12 latent components.
# random_state is fixed so the decomposition is repeatable across runs.
SVD = TruncatedSVD(
    n_components=12,
    random_state=17,
)
matrix = SVD.fit_transform(X)

# Expect (number of titles, 12).
matrix.shape

(9719, 12)

In [17]:
# Pearson correlation between every pair of titles in the reduced space.
# Each row of `matrix` is treated as one variable (rowvar=True is numpy's
# default), so the result is square with one row and column per title.
corr = np.corrcoef(matrix, rowvar=True)
corr.shape

(9719, 9719)

In [18]:
# Titles in pivot-column order; position i here corresponds to row i of X.
movie_title = movie_user_rating_pivot.columns
movie_title_list = movie_title.tolist()

# Row position of the seed title; raises ValueError if the title is absent.
coffey_hands = movie_title_list.index("Guardians of the Galaxy (2014)")

In [19]:
# Correlation of the seed title with every title (one row of `corr`).
corr_coffey_hands = corr[coffey_hands]

# Titles whose correlation with the seed is at least 0.9.
# NOTE: the seed title itself is part of the result, as the recorded
# output shows.
movie_title[corr_coffey_hands >= 0.9].tolist()

['Amazing Spider-Man, The (2012)',
 'Ant-Man (2015)',
 'Avatar (2009)',
 'Avengers, The (2012)',
 'Avengers: Age of Ultron (2015)',
 'Big Hero 6 (2014)',
 'Brave (2012)',
 'Captain America: The First Avenger (2011)',
 'Captain America: The Winter Soldier (2014)',
 'Cloudy with a Chance of Meatballs (2009)',
 'Dark Knight Rises, The (2012)',
 'Deadpool (2016)',
 'Deadpool 2 (2018)',
 'Despicable Me (2010)',
 'District 9 (2009)',
 'Django Unchained (2012)',
 'Doctor Strange (2016)',
 'Edge of Tomorrow (2014)',
 "Ender's Game (2013)",
 'Grand Budapest Hotel, The (2014)',
 'Gravity (2013)',
 'Guardians of the Galaxy (2014)',
 'Guardians of the Galaxy 2 (2017)',
 'Harry Potter and the Deathly Hallows: Part 1 (2010)',
 'Harry Potter and the Deathly Hallows: Part 2 (2011)',
 'Hobbit: An Unexpected Journey, The (2012)',
 'Hobbit: The Desolation of Smaug, The (2013)',
 'How to Train Your Dragon (2010)',
 'Hugo (2011)',
 'Inside Out (2015)',
 'Interstellar (2014)',
 'Iron Man (2008)',
 'Iron Man