In [None]:
#only run if not already in environment
!pip install surprise

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(1)

# Workshop Functions
import sys
sys.path.append('..')
from WKDSS420_functions import * 

In [None]:
from surprise import Dataset
from surprise import Reader

Scikit-surprise information can be found here: https://pypi.org/project/scikit-surprise/

In [None]:
# Load the combined MovieLens ratings table.
# item_id corresponds to a unique movie title.
df = pd.read_csv('MovieLensCombined.csv')
df.head(n=5)

### Data Exploration

In [None]:
# Structure of the frame: row count, column dtypes and non-null counts
# (there are about 100K observations).
df.info()
# info() does not report any statistics, so show the rating summary explicitly;
# the mean rating is relatively high, at about 3.5.
df['rating'].describe()

In [None]:
# Drop the leftover CSV index column. Reassigning with errors='ignore' keeps this
# cell re-runnable: an in-place drop raises KeyError the second time it is executed.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# user_id, item_id and timestamp are identifiers, not quantities: store them as strings.
# NOTE: every later lookup by user_id / item_id must therefore use string keys.
df = df.astype({'user_id': str, 'item_id': str, 'timestamp': str})
df.info()

In [None]:
# Number of unique users
df['user_id'].nunique()

In [None]:
# Number of unique movies (items)
df['item_id'].nunique()

In [None]:
# Distribution of each movie's average rating.
# - Spikes appear around the whole numbers, which are the rating options offered to users.
# - There is a large spike at 3.0: when a movie is neither particularly good nor bad,
#   a rater (the author included) tends to split the difference and give it a 3.
# - There is also a large spike at 1.0, i.e. badly rated movies.
mean_ratings = df.groupby(by='title')['rating'].mean()
plt.hist(x=mean_ratings, bins=70)
plt.show()

In [None]:
# Sort the per-title mean ratings in descending order so the highest rated
# movies are listed first (the result is displayed, not stored).
mean_ratings.sort_values(ascending=False)

You'll notice that the highest rated movies are relatively unknown or limited release.  That's because these were probably rated by just one or a handful of viewers who all loved the movie.  But such a small number of raters for a movie is not representative of the general public.  

To see a better effect, try grouping by the count instead of the mean

In [None]:
# Rank titles by how many ratings they received. The titles at the top are much
# closer to what we think of as generally popular movies.
num_of_ratings = (
    df.groupby('title')['rating']
    .count()
    .sort_values(ascending=False)
)
num_of_ratings.head()

In [None]:
# More advanced: show both the count and the mean per title, sorted by count.
# The top rows are again the movies we would think of as generally popular.
(
    df.groupby('title')['rating']
    .agg(['count', 'mean'])
    .sort_values(by='count', ascending=False)
)

In [None]:
# OPTIONAL: as more people rate a movie, the distribution of its ratings starts
# to look roughly bell-shaped. Example: ratings of one heavily rated title.
df.loc[df['title'] == 'Contact (1997)', 'rating'].hist(bins=5)

In [None]:
# Keep the per-title rating count and mean in a dataframe for later use,
# ordered from the most-rated title to the least-rated one.
df2 = (
    df.groupby('title')['rating']
    .agg(num_of_ratings='count', mean_rating='mean')
    .sort_values(by='num_of_ratings', ascending=False)
)
df2.head()

In [None]:
# The more obscure movies have only 1 rating; look one up by title.
df2.at['They Made Me a Criminal (1939)', 'num_of_ratings']

In [None]:
# Which movies have just one rating?
df2[df2['num_of_ratings'] < 2]

In [None]:
# How many movies have just one rating (134 in this dataset)?
# This will affect our predictions later on.
df2.loc[df2['num_of_ratings'] < 2, 'num_of_ratings'].count()

In [None]:
# Histogram of how many ratings each movie received
df2['num_of_ratings'].hist(bins=70)
plt.show()

This plot shows the number of ratings that each movie got.  Most movies have just a few ratings.  There are a lot of movies that got only a handful of reviewers.  Most movies are not blockbusters, and so there are only a few that are in the large count range on the right hand side of the histogram.  

Finally, let's check the relationship between average rating and the number of ratings

In [None]:
# Average rating (x) against number of ratings (y), one point per movie,
# with two annotations marking the regions discussed in the text.
ax = sns.scatterplot(data=df2, x='mean_rating', y='num_of_ratings')
ax.text(x=1.5, y=500, s='More popular movies,\nRated by many', color='red', fontsize=14)
ax.text(x=5, y=5, s='Less popular movie, \nRated by few similar viewers ', color='red', fontsize=14)

We see that there is some relationship between the two variables: in general, as the number of ratings goes up, the average rating goes up too (a positive trend).  

Most of the movies with more than 500 ratings sit above the 3.5 mean rating, while movies with very few reviewers span the gamut from 1 to 5 stars.  

### Convert data to scikit-surprise format

In [None]:
# Reminder of the available columns before building the Surprise dataset
df.head(n=5)

In [None]:
from surprise import Dataset, Reader

# Ratings in this dataset run from 1 to 5.
reader = Reader(rating_scale=(1, 5))

# From the library author: "The columns must correspond to user id, item id and
# ratings (in that order)."
data = Dataset.load_from_df(df.loc[:, ["user_id", "item_id", "rating"]], reader)

### Cosine Similarity
Let's do the cosine similarity to find recommendations from users that are similar to a target user. This is a **memory-based** collaborative filter which we discussed in the lecture.  

We will use K-Nearest Neighbors to find the *K* users that are closest to the target user, as determined by the cosine similarity between them; similar users have a smaller angle between the vectors representing them.

In [None]:
from surprise import KNNWithMeans

# Cosine similarity, computed between users.
#   user_based=True  -> similarities between users (user-based collaborative filtering)
#   user_based=False -> similarities between items (item-based collaborative filtering;
#                       this is still collaborative filtering, not content-based filtering)
sim_options = dict(name="cosine", user_based=True)
algo = KNNWithMeans(sim_options=sim_options)

In [None]:
from surprise.model_selection import train_test_split
# Hold out 25% of the ratings for evaluation.
# NOTE(review): no random_state is passed, so the split is not pinned by this call;
# presumably it relies on np.random.seed(1) from the setup cell — confirm.
trainset, testset = train_test_split(data, test_size=0.25)

# Fit the KNN model configured above on the training portion only.
algo.fit(trainset)

In [None]:
# Predict the rating user '2' would give item '19'.
# The ids must be strings: user_id and item_id were cast to str before the dataset
# was built, so the integers 2 and 19 are unknown raw ids and Surprise would fall
# back to its default estimate instead of a real neighbourhood prediction.
prediction = algo.predict('2', '19')  # (user_id, item_id)
prediction.est

In [None]:
from surprise import accuracy

# Predict every held-out rating, then report the root-mean-square error.
predictions = algo.test(testset)
accuracy.rmse(predictions, verbose=True)

In [None]:
# Build the top-3 recommendations from the test-set predictions.
# get_top_n comes from the star import of WKDSS420_functions; judging by how top_n
# is used below, it returns a mapping keyed by user id — TODO confirm its exact shape.
top_n = get_top_n(predictions, n=3)

### Exploring One User's Recommendation

In [None]:
# Get the top 3 movie recommendations for user 1
# (the key is the string '1' because user ids were cast to str)
top_n['1']

In [None]:
# The movies user 1 has rated, highest ratings first, indexed by item_id.
df_1 = (
    df.loc[df['user_id'] == '1', ['item_id', 'title', 'rating']]
    .sort_values(by='rating', ascending=False)
    .set_index('item_id')
)
df_1.head(10)

#### Full Recommendation List

In [None]:
# Print the recommended items for each user (one header line per user, then one
# line per recommended item). This prints for every user in top_n, so the output is long.
# NOTE(review): the inner loop assumes the first element of each entry exposes both
# `.name` and positional `[0]` (e.g. a pandas Series). That depends on what get_top_n
# in WKDSS420_functions returns; if entries were plain (item_id, estimate) tuples with
# a string id, `.name` would raise AttributeError — confirm against the helper.
for uid, user_ratings in top_n.items():
    print(f"For User ID: {uid}")
    for title, rating in user_ratings:
        print(f"  ID/Title: [{title.name}] {title[0]}")

### What Happened???
User 1 has rated many popular movies very high.  So why is the recommender suggesting movies that very few people have watched? 

Remember that there were a lot of mostly unknown movies with just 1 or 2 reviewers who rated them 5 stars?  These are affecting the movie recommendations.  

***Curiosity Points (15 points)*** In the original dataframe **df** filter out the movies that have just a few reviews.  Play around with the threshold until you start getting good recommendations.  

#### Plotting cosine similarities

In [None]:
# Blank ratings matrix: one row per user (3 users), one column per movie slot (1682).
# A cell stays 0 if the movie was not rated; otherwise it will hold the user's rating.
recArr = np.zeros(shape=(3, 1682))

In [None]:
# (item_id, rating) pairs for users '1', '2' and '3', one array per user.
u1, u2, u3 = (
    np.array(df.loc[df['user_id'] == uid, ['item_id', 'rating']])
    for uid in ('1', '2', '3')
)

In [None]:
# (number of movies rated by user 1, 2 columns: item_id and rating)
u1.shape

In [None]:
# Fill one row of recArr per user: the column for a movie holds that user's rating,
# and unrated movies stay 0.
# MovieLens item ids are 1-based (1..1682) while recArr has columns 0..1681, so the
# id is shifted by one. Indexing with the raw id leaves column 0 unused and goes
# out of bounds for item 1682.
for row, user_ratings in enumerate([u1, u2, u3]):
    for item_id, rating in user_ratings:
        recArr[row, int(item_id) - 1] = rating

recArr[:, 0:10]

In [None]:
# Vector length of each user's rating row, and dot products of user 0 with users 1 and 2.
mags = np.linalg.norm(recArr, axis=1)
dots01 = np.dot(recArr[0, :], recArr[1, :])
dots02 = np.dot(recArr[0, :], recArr[2, :])

# Cosine similarity between user 0 and users 1 / 2 (names kept for compatibility;
# these values are cosines, not angles).
Angle01 = dots01 / (mags[0] * mags[1])
Angle02 = dots02 / (mags[0] * mags[2])  # fixed: previously reused dots01
print(mags)
print(Angle01, Angle02)

# The angle in radians is the arccosine of the cosine similarity (dividing by pi does
# not convert anything). clip guards against tiny floating-point overshoot past +/-1.
Angle01Rad = np.arccos(np.clip(Angle01, -1.0, 1.0))
Angle02Rad = np.arccos(np.clip(Angle02, -1.0, 1.0))

# 2-D coordinates of users 1 and 2, measured from user 0 taken as the x-axis direction.
u01Coord = mags[1] * np.array([np.cos(Angle01Rad), np.sin(Angle01Rad)])
print(u01Coord)

u02Coord = mags[2] * np.array([np.cos(Angle02Rad), np.sin(Angle02Rad)])
print(u02Coord)

In [None]:
# We have the vector magnitudes and angles - let's plot.
# User 0 is the reference direction, so its vector lies exactly on the x-axis
# (y component 0; the previous value of 1 tilted the reference slightly).
V = np.array([[mags[0], 0], u01Coord, u02Coord])
origin = np.array([[0, 0, 0], [0, 0, 0]])  # all three arrows start at the origin

# One arrow per user: red = user 0, blue = user 1, green = user 2.
# NOTE(review): scale=151 looks hand-tuned to these magnitudes — confirm or derive from mags.
plt.quiver(*origin, V[:, 0], V[:, 1], color=['r', 'b', 'g'], scale=151)

#### Running a Grid Search 
A grid search allows you to specify various options for your hyper-parameters of your recommender algorithm.  The **gridsearch** will then go through each option and you can retrieve the best option.  

In [None]:
from surprise import KNNWithMeans

# Show the class documentation to see which hyper-parameters can go in the grid.
help(KNNWithMeans)

In [None]:
# Hyper-parameter options for the KNN grid search.
# "msd" = Mean Squared Difference similarity (a similarity measure, not an error metric).
param_grid = dict(
    k=[3, 5, 10, 20, 40],
    sim_options=dict(
        name=['msd', 'cosine'],
        user_based=[True, False],
    ),
)

In [None]:
from surprise.model_selection import GridSearchCV

# Try every combination in param_grid, scoring each by RMSE and MAE.
# cv=None leaves the cross-validation scheme at the library default.
gs = GridSearchCV(
    KNNWithMeans,
    param_grid,
    measures=["rmse", "mae"],
    cv=None,
)
gs.fit(data)

# One row per parameter combination; show which combinations were evaluated.
results_df = pd.DataFrame(gs.cv_results)
results_df['params']

In [None]:
# Best score per measure, then the parameter combination that achieved it
print(gs.best_score, gs.best_params, sep='\n')

### Recommender System with Matrix Factorization
Here we will practice how to use Matrix Factorization with Singular Value Decomposition (SVD).  

An excellent description of SVD can be found here if you wish to explore: https://towardsdatascience.com/understanding-singular-value-decomposition-and-its-application-in-data-science-388a54be95d

In [None]:
from surprise.model_selection import train_test_split
# Draw a fresh 75/25 train/test split for the SVD model. This rebinds trainset and
# testset, so it is not the same split the KNN model above was evaluated on.
trainset, testset = train_test_split(data, test_size=0.25)

In [None]:
from surprise import SVD, accuracy
# Use only 2 latent factors instead of the library default.
# Note: this rebinds `algo`, replacing the KNN model fitted earlier.
algo = SVD(n_factors=2) # default is 100
algo.fit(trainset)

In [None]:
# Learned latent-factor matrices of the fitted SVD model: pu (users) and qi (items).
user_factors, item_factors = algo.pu, algo.qi

In [None]:
# Predict every rating in the held-out test set with the SVD model
# (this overwrites the KNN predictions stored under the same name).
predictions = algo.test(testset)

In [None]:
from surprise import accuracy
# Root-mean-square error of the SVD predictions on the test set
accuracy.rmse(predictions)

**Online User Results** For reference, this website (https://surpriselib.com/) shows the results for different combinations of parameters for this dataset ("Movielens 100k")

In [None]:
# Build the top-6 recommendations from the SVD test-set predictions.
# get_top_n comes from the star import of WKDSS420_functions — TODO confirm its return shape.
top_n = get_top_n(predictions, n=6)

In [None]:
# Recommendations for user '1' (string key, because user ids were cast to str)
top_n['1']

####  Grid search
Let's do a grid search with various parameters to see which are best for this dataset.  You can set any of the parameters for the SVD function described here (https://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD) in the **param_grid** dict

In [None]:
from surprise.model_selection import GridSearchCV

# Candidate values for the number of epochs, the learning rate and the number
# of latent factors of the SVD model.
param_grid = {
    "n_epochs": [5, 10, 15, 20],
    "lr_all": [0.001, 0.002, 0.005, 0.010],
    "n_factors": [2, 10, 100],
}
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=["rmse", "mae"],
    cv=None,
)
gs.fit(data)

# Best score found for each measure (RMSE and MAE)
print(gs.best_score)

# Combination of parameters that gave the best score for each measure
print(gs.best_params)