In [14]:
import pandas as pd

### Import Data

In [15]:
df_train = pd.read_csv('training.txt', header=None, names=['user_id', 'item_id', 'rating'])
df_test = pd.read_csv('testing.txt', header=None, names=['user_id', 'item_id', 'rating'])

In [16]:
with open('item_tag.txt', 'r') as f:
    lines = f.readlines()

df_tag = pd.DataFrame({'item_id': [int(line.split(':')[0]) for line in lines],
                   'categories': [[int(x) for x in line.split(':')[1].split(',')] for line in lines]})


In [17]:
df_tag_dupes = df_tag['item_id'].duplicated()
df_tag_dupes.sum() # no duplicates of ids in the df_tag data

0

In [18]:
df_train = df_train.sort_values(by='user_id')
df_test = df_test.sort_values(by='user_id')
df_tag = df_tag.sort_values(by='item_id')

### Checking if users in test are not in train

In [19]:
train_user_id = df_train['user_id'].drop_duplicates()
test_user_id = df_test['user_id'].drop_duplicates()

In [20]:
train_user_id.shape

(162541,)

In [21]:
check = test_user_id.isin(train_user_id)

In [22]:
test_user_id.shape[0] == test_user_id[check].shape[0] #in further parts I also found that there are items in the test set that aren't present in the train set

True

all user_ids in test set are in training set

### Checking ratings

In [23]:
df_train['rating'].drop_duplicates().sort_values()

3876688     0.5
17166485    1.0
16312916    1.5
11389711    2.0
2446325     2.5
1398466     3.0
9461865     3.5
0           4.0
17127033    4.5
9470443     5.0
Name: rating, dtype: float64

In [24]:
from surprise import Reader, Dataset

In [25]:
reader = Reader(rating_scale=(1, 5))
data_train = Dataset.load_from_df(df_train, reader)

### Testing base methods

In [26]:
from surprise import BaselineOnly, SVD, SVDpp, NMF
from surprise.model_selection import cross_validate

In [27]:
# baseline method
baseline = BaselineOnly()
cross_validate(baseline, data_train, measures=["RMSE"], cv=3, verbose=True)

Estimating biases using als...


KeyboardInterrupt: 

the baseline methods takes a long time just to train even using cloud computing thus it is unwise to use the whole dataset for experimentation. I will select a subset of the dataset that is reprentative of the population

The steps to ensure this is:
I use Random User Sampling. This technique is based on the paper On Sampling Collaborative Filtering Datasets by Noveen Sachdeva et al. The researches state

" In Random User Sampling, we retain users from D at random.
To be more specific, we iteratively preserve all the interactions
for a random user until we have retained 𝑝% of the original interactions."

D is the 
user_id, item_id, rating table

### Chosing sample of data for experimentation

In [28]:
import numpy as np

In [29]:
train_user_id_set = set(train_user_id.copy().reset_index(drop=True))

In [30]:
user_to_review_count = df_train.groupby('user_id').size()

In [31]:
review_count = df_train.shape[0]

In [32]:
user_to_review_count_hash = {}

In [33]:
for uid, val in enumerate(user_to_review_count):
    user_to_review_count_hash[uid] = val

In [34]:
import random

In [35]:
percentage_retained = 0.30 #choosing p as 30%
current_review_count = 0
chosen_user_ids = []
while current_review_count < review_count * percentage_retained:
    chosen_user_id = random.sample(train_user_id_set,1)[0]
    train_user_id_set.remove(chosen_user_id)
    chosen_user_ids.append(chosen_user_id)
    current_review_count += user_to_review_count_hash[chosen_user_id]

since Python 3.9 and will be removed in a subsequent version.
  chosen_user_id = random.sample(train_user_id_set,1)[0]


In [36]:
df_train_sample = df_train[df_train['user_id'].isin(set(chosen_user_ids))]

In [37]:
current_review_count / review_count

0.3000205114911854

In [38]:
current_review_count

5999974

In [39]:
df_train_sample.to_csv('sample_train.csv')

### Testing base methods on sample

In [40]:
df_train_sample = pd.read_csv('sample_train.csv')

In [41]:
df_train_sample = df_train_sample[['user_id','item_id','rating']]

In [42]:
data_train_sample = Dataset.load_from_df(df_train_sample, reader)

In [43]:
import time

In [44]:
# baseline method
start_time = time.time()
baseline = BaselineOnly()
cross_validate(baseline, data_train_sample, measures=["RMSE"], cv=3, verbose=True)
end_time = time.time()

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE of algorithm BaselineOnly on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8649  0.8645  0.8642  0.8646  0.0003  
Fit time          16.27   16.61   15.37   16.08   0.53    
Test time         84.33   39.21   39.47   54.34   21.21   


In [45]:
time.strftime('%H:%M:%S', time.gmtime(end_time - start_time))

'00:04:16'

In [46]:
# SVD Method
start_time_svd = time.time()
svd = SVD()
cross_validate(svd, data_train_sample, measures=["RMSE"], cv=3, verbose=True)
end_time_svd = time.time()

Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8203  0.8190  0.8195  0.8196  0.0005  
Fit time          52.59   51.35   51.76   51.90   0.51    
Test time         53.32   46.91   35.84   45.36   7.22    


In [47]:
time.strftime('%H:%M:%S', time.gmtime(end_time_svd - start_time_svd))

'00:05:42'

In [48]:
# SVDpp Method
start_time_svdpp = time.time()
svdpp = SVDpp()
cross_validate(svdpp, data_train_sample, measures=["RMSE"], cv=3, verbose=True)
end_time_svdpp = time.time()

Evaluating RMSE of algorithm SVDpp on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8148  0.8123  0.8133  0.8135  0.0010  
Fit time          3135.24 2997.95 3003.61 3045.60 63.43   
Test time         929.61  960.22  851.17  913.67  45.92   


In [49]:
time.strftime('%H:%M:%S', time.gmtime(end_time_svdpp - start_time_svdpp))

'03:18:44'

In [50]:
# nmf Method
start_time_nmf = time.time()
nmf = NMF()
cross_validate(nmf, data_train_sample, measures=["RMSE"], cv=3, verbose=True)
end_time_nmf = time.time()

Evaluating RMSE of algorithm NMF on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8770  0.8772  0.8768  0.8770  0.0002  
Fit time          102.28  100.11  94.79   99.06   3.15    
Test time         58.45   49.03   46.59   51.36   5.11    


In [51]:
time.strftime('%H:%M:%S', time.gmtime(end_time_nmf - start_time_nmf))

'00:08:15'

best RMSE in order
- SVD++
- SVD
- Baseline
- NMF

to proceed with hyperparameter tuning I chose SVD instead of the SVD++ because of SVD++'s high train time