## Collaborative Filtering

In [31]:
import pandas as pd
import numpy as np
import scipy
import math
import scipy.stats
from sklearn.metrics import mean_squared_error
import textract
import os
import pickle
from pathlib import Path
import numpy as np
import random
import pandas as pd
from itertools import combinations
from datetime import datetime
root = Path(".")

### Reading the movie-rating dataset and loading the item similarity matrix

In [32]:
my_path = root / "Pickled_files" / "item_similarity_matrix"
dbfile = open(my_path, 'rb')     
item_sim = pickle.load(dbfile)
dbfile.close()

In [33]:
rating_data = pd.read_csv('ml-1m/ratings.dat', header=None, sep='::')
rating_data.columns = ['UserID', 'ItemID', 'Rating', 'Timestamp']
rating_data.drop(columns=['Timestamp'], axis=1, inplace=True)

  rating_data = pd.read_csv('ml-1m/ratings.dat', header=None, sep='::')


In [34]:
rating_data

Unnamed: 0,UserID,ItemID,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


In [35]:
item_cnt = max(rating_data['ItemID']) + 1
users_cnt = max(rating_data['UserID']) + 1
item_cnt, users_cnt

(3953, 6041)

In [36]:
rating_data.shape

(1000209, 3)

### Splitting the dataset into test and train

In [37]:
rating_train = rating_data.sample(frac=0.8, random_state=200)
rating_test = rating_data.drop(rating_train.index)

### Populating the rating matrix 

In [38]:
matrix = np.zeros(shape=(users_cnt, item_cnt))
matrix.shape

(6041, 3953)

In [39]:
for row in rating_train.itertuples():
    matrix[row.UserID][row.ItemID] = row.Rating
matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 3., 0., ..., 0., 0., 0.]])

In [40]:
colab_baseline = matrix.copy()

In [41]:
item_sim.shape

(3953, 3953)

### Calculating User, Item and Global Average for baseline prediction

In [42]:
algo_stats = list()
actual_by_pred = dict()
global_avg , cnt = 0 , 0
item_avg = [0] * item_cnt
user_avg = [0] * users_cnt
item_sz = [0] * item_cnt
user_siz = [0] * users_cnt

In [43]:
for users in range(1 , users_cnt) : 
    for items in range(1 , item_cnt) : 
        if matrix[users][items] > 0 : 
            global_avg += matrix[users][items]
            cnt += 1
            item_avg[items] += matrix[users][items]
            item_sz[items] += 1
            user_avg[users] += matrix[users][items]
            user_siz[users] += 1

In [44]:
global_avg = global_avg / cnt

We are using Item-Item based collaborative filtering for our prediction.
Here, we explore the relationship between the pair of items (the user who bought Y, also bought Z). We find the missing rating with the help of the ratings given to the other items by the user.
</br> </br>
We get the top K similar items for every item which will help us in predicting the rating a user might give for any particular Item.

In [45]:
def get_sim_items(itemID) : 
    sim_userID = list()
    for item in range(1 , item_sim.shape[0]) : 
        if item == itemID : 
            continue
        sim_userID.append((item_sim[itemID][item] , item))
    sim_userID.sort(key=lambda y: -y[0])
    return sim_userID

In [46]:
sim_top_k = dict()
for i in range(1 , item_cnt) : 
    sim_top_k[i] = get_sim_items(i)

## Prediction using collaborative filtering
we use the items (already rated by the user) that are most similar to the missing item to generate rating. We hence try to generate predictions based on the ratings of similar products. We compute this using a formula which computes rating for a particular item using weighted sum of the ratings of the other similar products.

$rating(U,Ii) = \tfrac{\sum_{j}^{}rating(U,Ij)*S(Ii,Ij)}{\sum_{j}^{}S(Ii,Ij)}$

In [47]:
def predict(user_id , item_id , K) : 
    sim_items = sim_top_k[item_id]
    pred_rating , sum_sim = 0 , 0
    taken = 0
    for (item_s , itemID) in sim_items :
        if item_s < 0 or math.isnan(item_s): 
            continue
        if taken == K :
            break
        if matrix[user_id][itemID] > 0 : 
            taken += 1
            sum_sim += item_s   
            pred_rating += item_s * matrix[user_id][itemID]
    if sum_sim == 0 : 
        return sum_sim
    a = pred_rating / sum_sim
    return pred_rating / sum_sim

#### Combining baseline with collaborative filtering
For baseline approach to handle strict and lineant raters we have prediction : </br>
$U_{global} + U_{bias} + Item_{bias}$

where $U_{global}$ = global average rating </br>
$U_{bias}$ = user average - global average </br>
$Item_{bias}$ = item average - global average </br>

In combination with collaborative filtering we have predicted rating as average of colab filtering rating and baseline approach

In [48]:
def predict_baseline(user_id , item_id) : 
    if user_siz[user_id] == 0 :
        user_bias = -global_avg
    else : 
        user_bias = (user_avg[user_id] / user_siz[user_id]) - global_avg
    if item_sz[item_id] == 0 : 
        item_bias = -global_avg
    else :
        item_bias = (item_avg[item_id] / item_sz[item_id]) - global_avg
    return global_avg + user_bias + item_bias

In [50]:
start_prediction_time = datetime.now()
for user in range(1 , users_cnt) :  
    unrated_items = np.where(matrix[user] == 0)[0]
    for item in unrated_items:
        if item == 0 : 
            continue
        colab_rat = predict(user , item , 5)
        matrix[user][item] = colab_rat
        colab_baseline[user][item] = (colab_rat + predict_baseline(user, item)) / 2
end_prediction_time = datetime.now()
print("Time for prediction" , (end_prediction_time - start_prediction_time).total_seconds())

Time for prediction 235.235813


In [51]:
def rmse_matrix(mat, test_data):
    y_actual = list(test_data.Rating)
    y_pred = list()
    y_pred_baseline = list()
    y_pred_comb = list()
    for id , row in test_data.iterrows() : 
        uid , itid = row['UserID'] , row['ItemID']
        y_pred.append(mat[uid][itid])
        baseline = predict_baseline(uid , itid)
        y_pred_baseline.append(baseline)
        y_pred_comb.append((mat[uid][itid] + baseline) / 2)
    return (mean_squared_error(y_actual , y_pred) , mean_squared_error(y_actual , y_pred_comb))

In [52]:
rmse_matrix(matrix , rating_test)

(1.1557051129397395, 0.9212280220671454)

In [53]:
my_path = root / "Pickled_files" / "pred_matrix_collab"
dbfile = open(my_path, 'wb')     
pickle.dump(matrix,dbfile)
dbfile.close()

In [55]:
my_path = root / "Pickled_files" / "pred_basline"
dbfile = open(my_path, 'wb')     
pickle.dump(colab_baseline,dbfile)
dbfile.close()