# Recommenders Assignments

## 1. Content-Based Filtering: Cosine Similarity

In [2]:
# 1. Read in the data from the first tab of the Movie Ratings spreadsheet, which contains genre labels
import pandas as pd

# Sheet 0 (the default) is the movies tab: one row per movie, one 0/1 column per genre.
df = pd.read_excel('../Data/Movie_Ratings.xlsx', sheet_name=0)
df.head(5)

Unnamed: 0,Movie_ID,Movie_Title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [3]:
# 2. Remove the non-genre columns of the movie genre data
# Key each row by its title, then discard the ID so only the genre flags remain.
movies = df.set_index('Movie_Title').drop(columns='Movie_ID')
movies


Unnamed: 0_level_0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
Movie_Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
GoldenEye (1995),1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
Four Rooms (1995),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
Get Shorty (1995),1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
Copycat (1995),0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Mat' i syn (1997),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
B. Monkey (1998),0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
Sliding Doors (1998),0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
You So Crazy (1994),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
# Clear the 'Movie_Title' label that set_index() left on the index (mutates `movies` in place).
movies.index.name = None

In [5]:
# Inspect the row labels: the movie titles now act as the index.
movies.index

Index(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)',
       'Get Shorty (1995)', 'Copycat (1995)',
       'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
       'Twelve Monkeys (1995)', 'Babe (1995)', 'Dead Man Walking (1995)',
       'Richard III (1995)',
       ...
       'Mirage (1995)', 'Mamma Roma (1962)', 'Sunchaser, The (1996)',
       'War at Home, The (1996)', 'Sweet Nothing (1995)', 'Mat' i syn (1997)',
       'B. Monkey (1998)', 'Sliding Doors (1998)', 'You So Crazy (1994)',
       'Scream of Stone (Schrei aus Stein) (1991)'],
      dtype='object', length=1682)

In [6]:
# 3. Calculate the cosine similarity of Toy Story (1995) and Get Shorty (1995)
from sklearn.metrics.pairwise import cosine_similarity

# Passing both rows at once yields the 2x2 pairwise matrix: the diagonal is each movie
# against itself, the off-diagonal entry is the Toy Story / Get Shorty similarity.
cosine_similarity(movies.loc[['Toy Story (1995)', 'Get Shorty (1995)']])

array([[1.        , 0.33333333],
       [0.33333333, 1.        ]])

In [7]:
# 4. Calculate the cosine similarity of Toy Story (1995) compared with all the movies
def get_similar_movies(df, movie_name):
    """Rank every row of ``df`` by cosine similarity to the row labelled ``movie_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix with one row per movie; the index holds the movie labels.
    movie_name : str
        Index label of the reference movie.

    Returns
    -------
    pandas.DataFrame or str
        A single-column frame named ``movie_name``, indexed like ``df`` and sorted
        from most to least similar. If ``movie_name`` is not in ``df.index`` a
        message string is returned instead (kept for backward compatibility).

    Notes
    -----
    If ``movie_name`` occurs more than once in the index, the first matching row
    is used as the reference.
    """
    # Validate first so an unknown title costs nothing.
    if movie_name not in df.index:
        return f'{movie_name} can not found'

    # Compare only the reference row against the catalogue (1 x N) instead of
    # building the full N x N matrix and discarding all but one column.
    reference = df.loc[[movie_name]].iloc[[0]]
    scores = cosine_similarity(reference, df)[0]

    similarity_df = pd.DataFrame({movie_name: scores}, index=df.index)
    return similarity_df.sort_values(by=movie_name, ascending=False)

In [8]:
# Rank the whole catalogue against Toy Story, most similar first.
get_similar_movies(movies, 'Toy Story (1995)')

Unnamed: 0,Toy Story (1995)
Toy Story (1995),1.000000
Aladdin and the King of Thieves (1996),1.000000
"Goofy Movie, A (1995)",0.866025
Aladdin (1992),0.866025
Home Alone (1990),0.816497
...,...
Rough Magic (1995),0.000000
Nothing Personal (1995),0.000000
Four Rooms (1995),0.000000
GoldenEye (1995),0.000000


In [9]:
# 5. Return the top 5 movies with the highest cosine similarity
# First five rows of the ranking; note that the query movie itself (similarity 1.0) is among them.
get_similar_movies(movies, 'Toy Story (1995)').head(5)

Unnamed: 0,Toy Story (1995)
Toy Story (1995),1.0
Aladdin and the King of Thieves (1996),1.0
"Goofy Movie, A (1995)",0.866025
Aladdin (1992),0.866025
Home Alone (1990),0.816497


## 2. User-Item Matrix

In [10]:
# 1. Read in the movies, users and ratings tabs of the spreadsheet into three DataFrames
# The movies tab is already loaded as `df`; take an independent copy rather than re-reading the file.
movies_df = df.copy(deep=True)

In [11]:
# The users tab is the second sheet (zero-based position 1).
users = pd.read_excel(io='../Data/Movie_Ratings.xlsx', sheet_name=1)
users.head(5)

Unnamed: 0,User_ID,Age,Gender,Occupation
0,1,24,M,technician
1,2,53,F,other
2,3,23,M,writer
3,4,24,M,technician
4,5,33,F,other


In [12]:
# The ratings tab is the third sheet (zero-based position 2): one row per (user, movie, rating).
ratings = pd.read_excel(io='../Data/Movie_Ratings.xlsx', sheet_name=2)
ratings.head(5)

Unnamed: 0,User_ID,Movie_ID,Rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [13]:
# 2. Use .pivot to restructure the ratings data into a user-item matrix
# Sanity-check the ID and rating ranges before pivoting (the pivot itself happens in the next cell).
ratings.describe()

Unnamed: 0,User_ID,Movie_ID,Rating
count,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986
std,266.61442,330.798356,1.125674
min,1.0,1.0,1.0
25%,254.0,175.0,3.0
50%,447.0,322.0,4.0
75%,682.0,631.0,4.0
max,943.0,1682.0,5.0


In [14]:
# Reshape the long ratings table into a wide matrix: one row per user, one column per movie.
X = ratings.pivot(index='User_ID', columns='Movie_ID', values='Rating')
# User/movie pairs without a rating come out of the pivot as NaN; fill them with 3.
X = X.fillna(3)
X

Movie_ID,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
2,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
3,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
4,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
5,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,5.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
940,3.0,3.0,3.0,2.0,3.0,3.0,4.0,5.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
941,5.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
942,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0


## 3. TruncatedSVD

In [15]:
# 1. Apply TruncatedSVD to the user-item matrix from the last assignment
from sklearn.decomposition import TruncatedSVD
# The default 'randomized' solver is stochastic; pin random_state so that a
# Restart & Run All reproduces the same factors.
svd = TruncatedSVD(n_components=2, random_state=42)
svd.fit(X)

0,1,2
,n_components,2
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,
,tol,0.0


In [16]:
# 2. View the user-item matrix and the shape of the user-item matrix
# Plain-text dump of the user-item matrix followed by its (rows, columns) shape.
print(X, X.shape, sep='\n')

Movie_ID  1     2     3     4     5     6     7     8     9     10    ...  \
User_ID                                                               ...   
1          5.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...   
2          4.0   3.0   3.0   3.0   3.0   3.0   3.0   3.0   3.0   2.0  ...   
3          3.0   3.0   3.0   3.0   3.0   3.0   3.0   3.0   3.0   3.0  ...   
4          3.0   3.0   3.0   3.0   3.0   3.0   3.0   3.0   3.0   3.0  ...   
5          4.0   3.0   3.0   3.0   3.0   3.0   3.0   3.0   3.0   3.0  ...   
...        ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
939        3.0   3.0   3.0   3.0   3.0   3.0   3.0   3.0   5.0   3.0  ...   
940        3.0   3.0   3.0   2.0   3.0   3.0   4.0   5.0   3.0   3.0  ...   
941        5.0   3.0   3.0   3.0   3.0   3.0   4.0   3.0   3.0   3.0  ...   
942        3.0   3.0   3.0   3.0   3.0   3.0   3.0   3.0   3.0   3.0  ...   
943        3.0   5.0   3.0   3.0   3.0   3.0   3.0   3.0   3.0   3.0  ...   

In [17]:
# 3. View the U matrix and the shape of the U matrix
# Project each user's rating row onto the fitted components (one output row per user).
U = svd.transform(X)
print(U, U.shape, sep='\n')

[[127.32561834   7.17304653]
 [124.14575385  -2.1738839 ]
 [122.72783502  -2.53303269]
 ...
 [123.59193451  -2.36152685]
 [125.55804029   1.7140485 ]
 [124.8229789    2.27926259]]
(943, 2)


In [20]:
# Attach the user IDs from X so each latent-space row can be traced back to its user.
U_df = pd.DataFrame(data=U, index=X.index)
U_df.head(5)

Unnamed: 0_level_0,0,1
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,127.325618,7.173047
2,124.145754,-2.173884
3,122.727835,-2.533033
4,123.802128,-2.822315
5,122.566026,0.754504


In [19]:
# optional : view sigma

# One singular value per fitted component.
sigma = svd.singular_values_
print(sigma, sigma.shape, sep='\n')

[3822.1590609   100.21021733]
(2,)


In [18]:
# 4. View the VT matrix and the shape of the VT matrix
# Component matrix: one row per component, one column per movie in X.
VT = svd.components_
print(VT, VT.shape, sep='\n')

[[ 0.02751476  0.02434468  0.02413305 ...  0.0240913   0.02409975
   0.02409975]
 [ 0.06807071  0.00836915 -0.00701416 ... -0.01008334 -0.01038314
  -0.01038314]]
(2, 1682)


In [22]:
# Label the component loadings with the Movie_ID columns of X.
VT_df = pd.DataFrame(data=VT, columns=X.columns)
VT_df.head(5)

Movie_ID,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
0,0.027515,0.024345,0.024133,0.0251,0.024327,0.024231,0.026796,0.025988,0.026408,0.024741,...,0.0241,0.024109,0.0241,0.024091,0.0241,0.024083,0.0241,0.024091,0.0241,0.0241
1,0.068071,0.008369,-0.007014,0.051881,-0.004402,-0.009123,0.06183,0.073482,0.052807,0.007769,...,-0.010383,-0.009653,-0.010383,-0.0106,-0.010383,-0.009784,-0.010383,-0.010083,-0.010383,-0.010383


## 4. Choosing the Number of Components

In [12]:
# 1. Fit a TruncatedSVD model with 500 components


In [13]:
# 2. Plot the cumulative explained variance ratios


In [14]:
# 3. Suggest a “good” number of components that best captures the info in the data set


In [15]:
# 4. Fit another TruncatedSVD model with the “good” number of components


## 5. Making Collaborative Filtering Recommendations

In [16]:
# 1. View the new user vector
import pandas as pd
import numpy as np

# read in the movies tab of the spreadsheet
# NOTE: this rebinds `movies`, which in section 1 held the genre matrix indexed by title.
movies = pd.read_excel('../Data/Movie_Ratings.xlsx')

# read in the ratings tab
ratings = pd.read_excel('../Data/Movie_Ratings.xlsx', sheet_name=2)

# use .pivot to restructure the ratings data into a user-item matrix
X = (ratings.pivot(index='User_ID', columns='Movie_ID', values='Rating').fillna(3))

# create a vector of NaN values, one slot per movie column of X
# (np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0)
vector_length = X.shape[1]
new_user = np.full(vector_length, np.nan)

# populate with 5 ratings
# NOTE: `ratings` is rebound here from the ratings DataFrame to a plain list.
ratings = [5, 3, 5, 5, 4]
# zero-based offsets into the movie columns of X
positions_to_populate = [11, 14, 28, 63, 66]
new_user[positions_to_populate] = ratings

# view as a DataFrame
# assumes the movies tab lists titles in the same order as X's Movie_ID columns — TODO confirm
new_user_df = pd.DataFrame([new_user], columns=movies.Movie_Title)
new_user_df

Movie_Title,Toy Story (1995),GoldenEye (1995),Four Rooms (1995),Get Shorty (1995),Copycat (1995),Shanghai Triad (Yao a yao yao dao waipo qiao) (1995),Twelve Monkeys (1995),Babe (1995),Dead Man Walking (1995),Richard III (1995),...,Mirage (1995),Mamma Roma (1962),"Sunchaser, The (1996)","War at Home, The (1996)",Sweet Nothing (1995),Mat' i syn (1997),B. Monkey (1998),Sliding Doors (1998),You So Crazy (1994),Scream of Stone (Schrei aus Stein) (1991)
0,,,,,,,,,,,...,,,,,,,,,,


In [17]:
# view only the movies that the new user rated
# Flip to one row per movie, then keep only the rows that carry a rating.
new_user_df.transpose().dropna()

Unnamed: 0_level_0,0
Movie_Title,Unnamed: 1_level_1
"Usual Suspects, The (1995)",5.0
Mr. Holland's Opus (1995),3.0
Batman Forever (1995),5.0
"Shawshank Redemption, The (1994)",5.0
Ace Ventura: Pet Detective (1994),4.0


In [18]:
# 2. Transform the user into the latent space using .transform()


In [19]:
# 3. Reconstruct the user-item matrix for the user using np.dot()


In [20]:
# 4. Make 10 movie recommendations for the user


In [21]:
# 5. Review the movies and determine if they make sense


In [22]:
# 6. Optional: Try playing around with the number of components to see the difference in movie recommendations
