In [None]:
import pandas as pd
import numpy as np
from tqdm import trange

In [None]:
# Load the MovieLens 100K ratings: one tab-separated (user, item, rating, timestamp) row per rating.
# A literal '\t' separator (instead of the two-character regex r'\t') keeps pandas on its
# default C parser and avoids the ParserWarning about falling back to the Python engine.
df = pd.read_csv('https://files.grouplens.org/datasets/movielens/ml-100k/u.data', sep='\t',
                 names=['user_id', 'item_id', 'rating', 'timestamp'])

# User x item rating matrix; (user, item) pairs that have no rating become NaN.
r = df.pivot(index='user_id', columns='item_id', values='rating').values

  return func(*args, **kwargs)


In [None]:
# Dimensions of the pivoted rating matrix: rows are user_ids, columns are item_ids.
r.shape

(943, 1682)

In [None]:
# Count how many entries of the rating matrix are actual ratings (i.e. not NaN).
observed_rows, _ = np.where(~np.isnan(r))
print(observed_rows.size)

100000


In [None]:
# Hold out a random subset of the observed ratings as the test set.
SEED = 42       # fixed seed so the split (and every later np.random draw) is reproducible
N_TEST = 10000  # number of ratings moved into the test set

np.random.seed(SEED)

# Coordinates of every observed (non-NaN) rating.
irow, jcol = np.where(~np.isnan(r))

# Sample from however many ratings actually exist rather than a hard-coded 100_000.
idx = np.random.choice(len(irow), N_TEST, replace=False)
test_irow = irow[idx]
test_jcol = jcol[idx]

# Mask the held-out ratings in a copy, so the original matrix r keeps the ground truth.
r_copy = r.copy()
r_copy[test_irow, test_jcol] = np.nan


In [None]:
# Report how many ratings are left for training after the test split, and the test-set size.
n_train_initial = np.nonzero(~np.isnan(r_copy))[0].size
n_test_ratings = test_irow.shape[0]

print("The size of the initial train data: ", n_train_initial)
print("The size of the test data: ", n_test_ratings)

The size of the initial train data:  90000
The size of the test data:  10000


In [None]:
# Hold out a second random subset, drawn from the remaining training ratings, for validation.
N_VAL = 10000  # number of ratings moved into the validation set

# Coordinates of the ratings that are still observed after the test split.
irow2, jcol2 = np.where(~np.isnan(r_copy))

# Sample from the actual number of remaining ratings instead of a hard-coded 90_000.
idx2 = np.random.choice(len(irow2), N_VAL, replace=False)
val_irow = irow2[idx2]
val_jcol = jcol2[idx2]

# Mask the validation ratings in a copy; r_copy itself still holds train + validation ratings.
r_train = r_copy.copy()
r_train[val_irow, val_jcol] = np.nan

In [None]:
# Report the final training-set size and the validation-set size.
n_train_final = np.nonzero(~np.isnan(r_train))[0].size
n_val_ratings = val_irow.shape[0]

print("The size of the final train data: ", n_train_final)
print("The size of the validation data: ", n_val_ratings)

The size of the final train data:  80000
The size of the validation data:  10000


In [None]:
def model(r, lamb=0.1, alpha=0.0001, n_epochs=1500):
    """Fit a bias-only rating model ``r[i, j] ~ b_user[i] + b_item[j]`` with SGD.

    Only the non-NaN entries of ``r`` are used for training. Each epoch visits
    every observed rating once, in row-major order, and updates the two biases
    involved in that rating.

    Args:
        r: 2-D float array of ratings (rows = users, columns = items) with NaN
            marking missing ratings. It is not modified.
        lamb: L2 regularisation strength applied to both bias vectors.
        alpha: SGD learning rate.
        n_epochs: Number of full passes over the observed ratings.

    Returns:
        Tuple ``(b_user, b_item, total_e)`` where ``b_user`` has one bias per
        row of ``r``, ``b_item`` has one bias per column, and ``total_e`` is the
        sum of squared errors accumulated during the last epoch (each error is
        measured just before the update it triggers). ``total_e`` is ``0.0``
        when ``n_epochs`` is 0.
    """
    # Biases start as uniform random values in [0, 1).
    b_user = np.random.rand(r.shape[0])
    b_item = np.random.rand(r.shape[1])

    # Coordinates and values of the observed ratings, looked up once up front.
    row, col = np.nonzero(~np.isnan(r))
    ratings = r[row, col]

    total_e = 0.0  # defined even if the epoch loop never runs
    with trange(n_epochs) as epochs:
        for _ in epochs:
            total_e = 0.0
            for i, j, r_ij in zip(row, col, ratings):
                # Prediction error for r_ij under the current biases.
                e = r_ij - (b_user[i] + b_item[j])

                # Gradient descent on 0.5 * (e**2 + lamb * b**2): the gradient with
                # respect to each bias is -(e - lamb * b), so stepping against it
                # means *adding* alpha * (e - lamb * b). The "+=" sign is correct.
                b_user[i] += alpha * (e - lamb * b_user[i])
                b_item[j] += alpha * (e - lamb * b_item[j])

                total_e += e ** 2

            epochs.set_description(f'Total Square Error: {total_e:.2f}')
    return b_user, b_item, total_e

In [None]:
# Train one bias model per regularisation strength. The three result lists are
# index-aligned with all_lamb, so entry k belongs to the model trained with all_lamb[k].
b_user_all = []
b_item_all = []
final_error_all = []
all_lamb = [1, 0.1, 0.01, 0.001, 0.0001]

for lamb in all_lamb:
    b_user, b_item, final_error = model(r_train, lamb=lamb)
    b_user_all.append(b_user)
    b_item_all.append(b_item)
    final_error_all.append(final_error)
    print('Final error of model with lambda = ', lamb, ': ', final_error)


Total Square Error: 186130.74: 100%|██████████| 1500/1500 [05:40<00:00,  4.40it/s]


Final error of model with lambda =  1 :  186130.73876717358


Total Square Error: 69582.93: 100%|██████████| 1500/1500 [05:20<00:00,  4.69it/s]


Final error of model with lambda =  0.1 :  69582.9346164018


Total Square Error: 66953.74: 100%|██████████| 1500/1500 [05:18<00:00,  4.70it/s]


Final error of model with lambda =  0.01 :  66953.74193198138


Total Square Error: 66927.34: 100%|██████████| 1500/1500 [05:19<00:00,  4.69it/s]


Final error of model with lambda =  0.001 :  66927.34118567134


Total Square Error: 66901.98: 100%|██████████| 1500/1500 [05:22<00:00,  4.65it/s]

Final error of model with lambda =  0.0001 :  66901.98487933981





In [None]:
# Sum of squared prediction errors on the validation ratings, one entry per trained model.
# The true ratings are read from the original matrix r because they are masked (NaN) in r_train.
val_truth = r[val_irow, val_jcol]

val_error = []
for b_user_k, b_item_k in zip(b_user_all, b_item_all):
    # Bias-model prediction for every validation (user, item) pair at once.
    val_pred = b_user_k[val_irow] + b_item_k[val_jcol]
    val_error.append(np.sum((val_truth - val_pred) ** 2))


In [None]:
# Pick the model with the lowest validation error.
# argmin looks at every entry; a scan that only compares neighbouring errors would report
# the end of the last descending run instead of the global minimum (e.g. for the errors
# [1, 5, 4, 3, 2] it would pick index 4 although index 0 is the smallest).
best_model = int(np.argmin(val_error))

print("The model with best parameters are: Model ", best_model)
print("Lambda: ", all_lamb[best_model])
  

The model with best parameters are: Model  4
Lambda:  0.0001


In [None]:
# Evaluate the selected model on the held-out test ratings (sum of squared errors).
# True ratings come from the original matrix r, where the test entries are not masked.
chosen_b_user = b_user_all[best_model]
chosen_b_item = b_item_all[best_model]

test_error = 0
for u, m in zip(test_irow, test_jcol):
    residual = r[u, m] - (chosen_b_user[u] + chosen_b_item[m])
    test_error += residual ** 2

print(f'The final test error:  {test_error:.2f}')

The final test error:  9130.91
