## 2. Rating Prediction

Now as the dataset has been prepared, it can now be used.

In [1]:
import math
import time
import pickle

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity

import utils as UT

#np.random.seed(40)

In [2]:
DF = pd.read_pickle("DF.pickle")
with open("indices.pickle", "rb") as f:
    uid_idx, pid_idx, uid_set, pid_set = pickle.load(f)

In [3]:
# compute the total number of users
m = len(uid_set)
n = len(pid_set)

user_ratings = UT.prepare_table(DF, (m, n), uid_idx, pid_idx)
#user_ratings = sparse.csr_matrix(user_ratings) # convert to sparse matrix so saves memory
#bitmask = np.random.rand(m, n) > 0.8
bitmask = UT.generate_bitmask(user_ratings, 0.8)
train = user_ratings.copy()
train[bitmask] = 0

prepared 18.36 seconds


In [4]:
with open("user_ratings.pickle", "wb") as f:
    pickle.dump([user_ratings], f)

### 2a. Item-based Collaborative Filtering
This similarity function can take many forms, such as correlation between ratings or cosine of those rating vectors

In [5]:
tr2 = train.view()
tr2_transposed = np.transpose(tr2)

review_similarities = cosine_similarity(tr2_transposed)

In [6]:
I = np.identity(n)
review_similarities = review_similarities - I
rs_transposed = np.transpose(review_similarities)

prediction_sum = np.matmul(train, rs_transposed)
weight_sum = np.sum(rs_transposed, axis=0, keepdims=True)
weight_sum[weight_sum == 0] = np.infty

In [7]:
predictions = np.divide(prediction_sum, weight_sum)

(DO NOT USE. This is a non-vectorized way, which is more accurate, but extremely slow)
```
start = time.time()
predictions = UT.my_cosine_similarity(train)
end = time.time()
print(end-start)
```

In [8]:
mask = train != 0
predictions[mask] = 0

row_min = np.min(predictions, axis=1, keepdims=True)
row_max = np.max(predictions, axis=1, keepdims=True)
scaled_pred = (predictions-row_min) / (row_max+0.01-row_min) * 5

predictions = scaled_pred + train

In [9]:
test_mask = np.nonzero(user_ratings - train) # area of test
Y = user_ratings[test_mask]
Yhat = predictions[test_mask]

In [10]:
num_data = np.shape(test_mask)[1]
#num_data = np.count_nonzero(Y)
print(num_data)
mse = mean_squared_error(Y, Yhat)
rmse = math.sqrt(mse/num_data)
rse = math.sqrt(mse/(num_data-2))
#mae=mean_absolute_error(Y, Yhat)
mae = np.mean(Y - Yhat)

176703


In [11]:
print("Mean Absolute Error (MAE): %.2f"%(mae))
print("Root Mean Square Error (RMSE): %.2f"%rmse)


Mean Absolute Error (MAE): 4.06
Root Mean Square Error (RMSE): 0.01


#### *continues in *`part2b.ipynb`