In [None]:
# Make the notebook echo every bare expression in a cell, not only the last one.
# NOTE: this is set on the InteractiveShell class, so it applies to all cells
# executed afterwards in this kernel.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Quick check of the "all" interactivity setting: both bare expressions
# below should be echoed when this cell runs.
a, b = 2, 3
a
b

## User & Item based Collaborative Filtering

### Load data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the ratings extract; the file is read as ISO-8859-1 (Latin-1) text.
ratings = pd.read_csv(
    "3 - ratings_sub.csv",
    encoding="ISO-8859-1",
)

In [3]:
ratings.shape

(487469, 7)

In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year
0,3218,3889,1.0,1172532894,Highlander: Endgame (Highlander IV) (2000),Action|Adventure|Fantasy,2000.0
1,3663,3889,1.0,1044474348,Highlander: Endgame (Highlander IV) (2000),Action|Adventure|Fantasy,2000.0
2,3704,3889,3.0,971391538,Highlander: Endgame (Highlander IV) (2000),Action|Adventure|Fantasy,2000.0
3,8877,3889,1.0,1050744366,Highlander: Endgame (Highlander IV) (2000),Action|Adventure|Fantasy,2000.0
4,9599,3889,0.5,1378056755,Highlander: Endgame (Highlander IV) (2000),Action|Adventure|Fantasy,2000.0


In [5]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 487469 entries, 0 to 487468
Data columns (total 7 columns):
userId       487469 non-null int64
movieId      487469 non-null int64
rating       487469 non-null float64
timestamp    487469 non-null int64
title        487469 non-null object
genres       487469 non-null object
year         487469 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 26.0+ MB


In [6]:
# The ids are labels, not quantities: store both id columns as strings.
for id_column in ("userId", "movieId"):
    ratings[id_column] = ratings[id_column].astype(str)

In [7]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres', 'year'], dtype='object')

### Data Exploration & Transformation

<b> Find the top 10 most popular movies watched </b>


In [8]:
# Number of distinct users present in the ratings table
n_unique_users = ratings["userId"].unique().size
print("total unique users - ", n_unique_users)

total unique users -  2827


<b> Q: Which users have watched (rated) the largest number of movies? </b>

In [9]:
# Users with the largest number of rated movies
# (value_counts returns counts sorted in descending order)
ratings_per_user = ratings["userId"].value_counts()
ratings_per_user.head()

37629    200
61382    200
44530    200
88164    200
65117    200
Name: userId, dtype: int64

### Transforming data to surprise format

In [10]:
from surprise import Dataset,Reader

# The data uses a half-star scale: ratings.head() above already shows a 0.5
# rating, so a lower bound of 1 would misdescribe the data and keep estimates
# from ever reaching the lowest observed ratings.
# NOTE(review): upper bound of 5 assumed from the original setting — confirm
# with ratings["rating"].max().
reader = Reader(rating_scale=(0.5, 5))

ModuleNotFoundError: No module named 'surprise'

In [None]:
# Build the Surprise dataset from (user, item, rating) columns.
# Note that the movie *title* (not movieId) is used as the raw item id, so
# item ids seen later in predictions are title strings.
data = Dataset.load_from_df(ratings[['userId', 'title', 'rating']], reader)

In [None]:
data

In [None]:
from surprise.model_selection import train_test_split

# Hold out 25% of the ratings for evaluation; the fixed seed makes the split
# repeatable across runs.
trainset, testset = train_test_split(
    data,
    test_size=0.25,
    random_state=123,
)

# to build on full data
#trainset = data.build_full_trainset()

In [None]:
type(trainset)

### Making sense of trainset 

Points to Note:
    
    1) Trainset is no longer a pandas dataframe. Rather, it is a specific datatype defined by the Surprise library
    2) UserId and ItemId in the pandas dataframe can contain any value (string, integer, etc.). However, Trainset converts these raw ids into numeric indexes called "inner ids"
    3) Methods are provided to convert a raw id to an inner id and vice versa

In [None]:
# user item rating data can be obtained as follows
user_records = trainset.ur
type(user_records)

In [None]:
# Peek at the first few inner user ids only; printing every key would emit
# one output line per user in the trainset and bury the rest of the notebook.
list(user_records)[:10]

In [None]:
user_records[0]

In [None]:
# However the ids are the inner ids and not the raw ids
# raw ids can be obtained as follows
# (0 and 1066 are example inner ids for a user and an item respectively)

print(trainset.to_raw_uid(0))
print(trainset.to_raw_iid(1066))

In [None]:
user_records[0]

<b> In Class Assignment </b>

Confirm the raw to internal id mapping with original data, for a given user/item combination (uid - 0 & iid - 1066)


### Training the model

In [None]:
from surprise import KNNWithMeans
from surprise import accuracy
from surprise import Prediction

In [None]:
# Item-based neighbourhood model: similarities are computed between items
# (user_based=False) using Pearson correlation, with up to 51 neighbours.
item_sim_options = {'name': 'pearson', 'user_based': False}
algo = KNNWithMeans(k=51, sim_options=item_sim_options)
algo.fit(trainset)


### Find K most similar items

<b> In-class assignment </b>

Which movies are most similar to Finding Nemo? (Hint: Use the <b>get_neighbors</b> method of the algo object)

### Evaluating Model Performance

In [None]:
len(testset)

In [None]:
testset[0:5]

In [None]:
# Evaluate on test set: one prediction per held-out (user, item, rating) triple
test_pred = algo.test(testset)

# compute RMSE between the estimates and the true held-out ratings
accuracy.rmse(test_pred)

In [None]:
# View a particular prediction
test_pred[12]

# To access a particular value, say estimate simply mention test_pred[12].est

In [None]:
test_pred[12].details["actual_k"]

In [None]:
# Convert the list of predictions to a dataframe (one row per prediction),
# then lift the "was_impossible" flag out of each row's details dict into
# its own column.
test_pred_df = pd.DataFrame(test_pred)
test_pred_df["was_impossible"] = test_pred_df["details"].map(
    lambda details: details["was_impossible"]
)

In [None]:
test_pred_df.loc[test_pred_df.was_impossible].tail(5)

<b> In class assignment </b>

What does <i>"was_impossible": True</i> indicate?  

For how many cases in the test set is the prediction flagged as "was_impossible"? And what could be the reasons for it?

### Predictions

In [None]:
# Make a prediction for a single (user, item) pair.
# Raw ids are passed here: userId as a string and the movie title as the item id,
# matching the columns the dataset was built from.
algo.predict(uid="41891",iid="Wrong Trousers, The (1993)")

### Generating top n recommendations

In [None]:
# Candidate pairs for recommendation: (user, item) combinations that do not
# appear in the trainset (see Surprise's Trainset.build_anti_testset).
testset_new = trainset.build_anti_testset()

In [None]:
len(testset_new)

In [None]:
testset_new[0:5]

In [None]:
# Score only the first 10,000 candidate pairs to keep the run time short.
# NOTE(review): this is a truncation, not a sample — the recommendations below
# only cover whichever users happen to fall in this slice; confirm that is intended.
predictions = algo.test(testset_new[0:10000])

In [None]:
# Keep just the user, the item and the estimated rating from each prediction.
prediction_rows = [(pred.uid, pred.iid, pred.est) for pred in predictions]
predictions_df = pd.DataFrame(prediction_rows)

In [None]:
# Name the columns, then order rows so that each user's highest estimates
# come first (both sort keys descending).
predictions_df.columns = ["userId", "movie_name", "est_rating"]
predictions_df = predictions_df.sort_values(
    by=["userId", "est_rating"],
    ascending=False,
)

In [None]:
predictions_df.head(10)

In [None]:
# Top 10 per user: relies on predictions_df already being sorted by
# est_rating (descending) within each user, so head(10) picks the best ones.
top_10_recos = predictions_df.groupby("userId").head(10).reset_index(drop=True)

In [None]:
top_10_recos

## SVD Based Recommendation

In [None]:
# Keep only frequently rated ("popular") movies: titles with more than
# MIN_RATINGS_PER_MOVIE ratings. (The threshold actually applied here is 200.)
MIN_RATINGS_PER_MOVIE = 200
movie_count = ratings["title"].value_counts(ascending=False)
pop_movie = movie_count[movie_count > MIN_RATINGS_PER_MOVIE].index
len(pop_movie)


In [None]:
# Restrict the ratings table to the popular titles selected above.
# Note: this rebinds `ratings`, so the unfiltered frame is no longer available.
is_popular_title = ratings["title"].isin(pop_movie)
ratings = ratings[is_popular_title]
ratings.shape

In [None]:
from surprise import Dataset,Reader

# The data uses a half-star scale: ratings.head() earlier in the notebook shows
# a 0.5 rating, so the lower bound must be 0.5 rather than 1.
# NOTE(review): upper bound of 5 kept from the original setting — confirm with
# ratings["rating"].max().
reader = Reader(rating_scale=(0.5, 5))

# Rebuild the Surprise dataset from the filtered (popular titles only) ratings;
# the movie title is used as the raw item id.
data = Dataset.load_from_df(ratings[['userId', 'title', 'rating']], reader)

In [None]:
ratings.shape

In [None]:
from surprise.model_selection import train_test_split

# 75/25 train/test split of the filtered data, seeded for repeatability.
trainset, testset = train_test_split(
    data,
    test_size=0.25,
    random_state=123,
)

# to build on full data
#trainset = data.build_full_trainset()

In [None]:
from surprise import SVD
from surprise import accuracy

In [None]:
# Unbiased matrix factorisation with 50 latent factors.
# SVD initialises its factors randomly, so a fixed random_state is needed for
# the fitted model (and the RMSE below) to be reproducible; 123 matches the
# seed used for the train/test split.
svd_model = SVD(n_factors=50, biased=False, random_state=123)
svd_model.fit(trainset)

In [None]:
# Predict the held-out ratings with the SVD model.
# Note: this rebinds test_pred, replacing the KNN predictions computed earlier.
test_pred = svd_model.test(testset)

In [None]:
# Keep just the user, the item and the estimated rating from each prediction.
svd_rows = [(pred.uid, pred.iid, pred.est) for pred in test_pred]
test_pred_df = pd.DataFrame(svd_rows)

In [None]:
test_pred_df.head()

In [None]:
# Name the columns, then order rows so that each user's highest estimates
# come first (both sort keys descending).
test_pred_df.columns = ["userId", "movie_name", "est_rating"]
test_pred_df = test_pred_df.sort_values(
    by=["userId", "est_rating"],
    ascending=False,
)

In [None]:
test_pred_df.head()

In [None]:
# Top 10 estimates per user, relying on the sort applied to test_pred_df above.
# NOTE(review): test_pred_df comes from svd_model.test(testset), i.e. held-out
# pairs the user has already rated, not unseen items. For true recommendations
# the anti-testset would be scored instead, as in the KNN section.
top_10_recos = test_pred_df.groupby("userId").head(10).reset_index(drop=True)

In [None]:
top_10_recos.head(30)

In [None]:

# compute RMSE of the SVD model on the held-out test set
# (test_pred holds the svd_model.test(testset) predictions at this point)
accuracy.rmse(test_pred)