In [70]:
from surprise import SVD, KNNBaseline
from surprise import BaselineOnly, accuracy
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate, GridSearchCV, train_test_split

In [96]:
# Each line of the ratings file is comma-separated and lists, in order:
# item id, user id, rating, timestamp.
reader = Reader(
    sep=',',
    line_format='item user rating timestamp',
)

In [97]:
# Parse the scraped Amazon ratings CSV with the column layout declared by `reader`.
data = Dataset.load_from_file(
    file_path="./src/data/AmazonScraped/AmazonProductRatings.csv",
    reader=reader,
)

In [98]:
# Sanity check: show the first five parsed rows.
# Each tuple reads (user id, item id, rating, timestamp).
data.raw_ratings[:5]

[('A334KVYM5MMQMR', 'B000K2PJ4K', 5.0, '1390780800'),
 ('A9UJRNHH1B6O', 'B000K2PJ4K', 1.0, '1390435200'),
 ('A3KAX7T0ZEGVOV', 'B000K2PJ4K', 4.0, '1389571200'),
 ('ANICWNHH5DH02', 'B000K2PJ4K', 5.0, '1389312000'),
 ('A2UKWG2OM70X63', 'B000K2PJ4K', 5.0, '1389225600')]

In [99]:
# Reference point for the later models: cross-validate the bias-only baseline
# and print the per-fold RMSE / MAE table.
cross_validate(algo=BaselineOnly(), data=data, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9641  0.9529  0.9579  0.9430  0.9667  0.9569  0.0084  
MAE (testset)     0.6909  0.6934  0.6870  0.6810  0.6983  0.6901  0.0058  
Fit time          0.04    0.03    0.03    0.04    0.03    0.03    0.00    
Test time         0.02    0.02    0.02    0.02    0.02    0.02    0.00    


{'test_rmse': array([0.96408385, 0.95293315, 0.95788205, 0.94303284, 0.96671792]),
 'test_mae': array([0.69089427, 0.69341445, 0.6869999 , 0.68101249, 0.69826283]),
 'fit_time': (0.040314674377441406,
  0.03023529052734375,
  0.031742095947265625,
  0.03510546684265137,
  0.03283095359802246),
 'test_time': (0.018078088760375977,
  0.016308307647705078,
  0.016082763671875,
  0.018182992935180664,
  0.016124725341796875)}

In [100]:
# Exhaustive 3-fold search over a small SVD grid (2 x 2 x 2 = 8 combinations).
param_grid = {
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Report the best RMSE score first, then the parameter combination that produced it.
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.9936455466244322
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [101]:
# Hold out 25% of the ratings for evaluation.
# The split is shuffled, so the seed is pinned: without it every run evaluates
# on a different hold-out set and the reported RMSE is not comparable between runs.
trainset, testset = train_test_split(data, test_size=.25, random_state=42)

In [102]:
# Build SVD from the grid-search winner rather than re-typing its values by hand,
# so this cell cannot drift out of sync when the grid or the data changes.
algo = SVD(**gs.best_params['rmse'])

# Train on the training split, then predict every rating in the held-out split.
algo.fit(trainset)
predictions = algo.test(testset)

# RMSE on the held-out predictions (printed, and also returned as the cell value).
accuracy.rmse(predictions)

RMSE: 0.9808


0.9808300472990928

## Get Related Products:

In [103]:
# Item-item neighbourhood model (user_based=False) using the pearson_baseline
# similarity, fitted on every available rating rather than on a train split.
# Note: this rebinds `trainset` and `algo` from the SVD evaluation cells.
sim_options = dict(name='pearson_baseline', user_based=False)
trainset = data.build_full_trainset()
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7f005507cf90>

In [111]:
# Translate the raw product id into the trainset's internal item index,
# which is what the neighbour lookup expects.
inner_id = algo.trainset.to_inner_iid(riid='B000K2PJ4K')

In [106]:
# Ask the fitted item-item model for the 10 nearest neighbours of the product
# (returned as inner item ids).
prod_neighbors_iid = algo.get_neighbors(iid=inner_id, k=10)

In [107]:
# Inner item ids of the 10 neighbours.
# NOTE(review): the recorded result is just 1..10 in ascending order. That is the
# order you would get if every similarity to this item were tied (e.g. all zero),
# so inspect the item's similarity row before trusting these as "related" — TODO confirm.
prod_neighbors_iid

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [108]:
# Map the neighbours' inner item ids back to raw product ids.
prod_neighbors = [
    algo.trainset.to_raw_iid(neighbor_iid)
    for neighbor_iid in prod_neighbors_iid
]

In [109]:
# Related products: raw product ids of the neighbours of 'B000K2PJ4K',
# in the order returned by the neighbour lookup.
prod_neighbors

['B000KPIHQ4',
 'B000V0IBDM',
 'B000YFSR5G',
 'B000YFSR4W',
 'B0012DR1LU',
 'B0017LD0BM',
 'B0017LGD34',
 'B001IKJOLW',
 'B001LNSY2Q',
 'B0058YEJ5K']

In [140]:
from functools import reduce
from itertools import accumulate

In [170]:
def hash_uuid(id, n):
    """Map a hyphen-separated id string to an integer bucket modulo ``n``.

    The segments between hyphens are folded left to right. At each step the
    running text and the next segment are both replaced by the hex dump of
    their UTF-8 bytes and concatenated, so earlier segments are re-encoded on
    every later step. The final text is read as a base-10 integer and reduced
    modulo ``n``.

    Caveats:
        * An id without any hyphen is never hex-encoded; it is parsed as-is.
        * Because the text is read as base 10, the result modulo ``2**k`` is
          fixed by the last ``k`` digits alone, i.e. by the final few
          characters of ``id``.

    Args:
        id: Hyphen-separated identifier, e.g. a canonical UUID string.
        n: Modulus (number of buckets).

    Returns:
        The folded text parsed as a decimal integer, modulo ``n``.

    Raises:
        ValueError: If the folded text is not a decimal literal. This happens
            for a hyphen-less id containing letters, or for characters whose
            hex code contains a letter.
    """
    first_segment, *remaining_segments = id.split("-")
    folded = first_segment
    for segment in remaining_segments:
        folded = folded.encode("utf-8").hex() + segment.encode("utf-8").hex()
    return int(folded) % n

In [171]:
# Example: bucket a sample UUID into one of 256 slots.
hash_uuid("dcc549d8-bd0c-49c2-bff8-f6fa80fb7857", 256)

113

In [135]:
# Scratch check: the hyphen-separated segments that hash_uuid folds over.
"dcc549d8-bd0c-49c2-bff8-f6fa80fb7857".split("-")

['dcc549d8', 'bd0c', '49c2', 'bff8', 'f6fa80fb7857']