<a href="https://colab.research.google.com/github/julx134/Amazon-Recommendation-System/blob/main/RS_Surprise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Set-up code
!pip install pandas
!pip install scipy
!pip install sklearn
!pip install openpyxl
!pip install numpy
!pip install scikit-surprise


In [None]:
#import dependencies
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import cross_validate
from surprise import KNNBasic
import os
import csv
import pandas as pd
from collections import defaultdict
from surprise.model_selection import KFold
import matplotlib.pyplot as plt
from sklearn import preprocessing

In [None]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute per-user precision@k and recall@k.

    Args:
        predictions: iterable of 5-item records ``(uid, iid, true_r, est, details)``.
            Only the user id, the true rating and the estimated rating are used.
        k: how many of each user's highest-estimated items are inspected.
        threshold: a rating (true or estimated) at or above this value counts as
            relevant / recommended respectively.

    Returns:
        ``(precisions, recalls)``: two dicts keyed by user id. A metric whose
        denominator is zero is reported as 0.
    """
    # Bucket (estimated, true) rating pairs by user.
    ratings_by_user = defaultdict(list)
    for prediction in predictions:
        uid, _iid, actual, estimated, _details = prediction
        ratings_by_user[uid].append((estimated, actual))

    precisions, recalls = {}, {}
    for uid, pairs in ratings_by_user.items():
        # Highest estimated rating first; only the head of this ranking is "recommended".
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items are counted over ALL of the user's rated items, not just the top k.
        relevant_total = sum((actual >= threshold) for (_, actual) in ranked)
        recommended_in_top = sum((estimated >= threshold) for (estimated, _) in top_k)
        hits = sum(
            ((actual >= threshold) and (estimated >= threshold))
            for (estimated, actual) in top_k
        )

        # Precision@k: share of recommended items that are relevant (0 when undefined).
        if recommended_in_top != 0:
            precisions[uid] = hits / recommended_in_top
        else:
            precisions[uid] = 0

        # Recall@k: share of relevant items that were recommended (0 when undefined).
        if relevant_total != 0:
            recalls[uid] = hits / relevant_total
        else:
            recalls[uid] = 0

    return precisions, recalls

In [None]:
def trainMLDataset():
  """Cross-validate item-based KNN (cosine similarity) on the built-in "ml-100k" dataset.

  Runs 5-fold cross-validation of ``KNNBasic`` measuring RMSE and MAE. With
  ``verbose=True`` the per-fold results are printed by ``cross_validate`` itself;
  nothing is returned.
  """
  # Load Surprise's built-in MovieLens 100k ratings.
  data = Dataset.load_builtin("ml-100k")

  # set-up options for RS algorithm
  sim_options = {
      "name": "cosine",
      "user_based": False,  # compute similarities between items
  }
  #initialize RS algorithm to be IBCF
  algo = KNNBasic(sim_options=sim_options)

  # cross_validate fits the algorithm on each fold itself, so no full trainset
  # needs to be built (the previous build_full_trainset() result was never used).
  cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)


In [None]:
def trainCustomDataset(path, num_folds):
  """Cross-validate item-based KNN (cosine similarity) on a ratings CSV file.

  The CSV is read positionally: column 0 is the user id, column 2 the item id
  and column 4 the rating. Repeated (user, item) pairs are collapsed to the mean
  of their ratings before the data is handed to Surprise.

  Args:
      path: path to the CSV file; ``~`` is expanded.
      num_folds: number of cross-validation folds (passed as ``cv``).

  Raises:
      ValueError: if a rating cell cannot be parsed as a number.
      IndexError: if a non-empty row has fewer than five columns.
  """
  # path to custom dataset
  file_path = os.path.expanduser(path)

  #convert csv to dictionary
  rating_dict = {'user_id':[], 'item_id':[], 'rating':[]}
  # newline='' is what the csv module expects for files it parses itself.
  with open(file_path, 'r', newline='') as dataset:
      for line in csv.reader(dataset):
          if not line:
              continue  # csv.reader yields [] for blank lines; skip them
          rating_dict['user_id'].append(line[0])
          rating_dict['item_id'].append(line[2])
          # csv.reader yields strings; convert so the mean below averages numbers
          # instead of operating on text.
          rating_dict['rating'].append(float(line[4]))

  #convert dictionary to dataframe
  rating_df = pd.DataFrame.from_dict(rating_dict)

  #group duplicate values into one rating
  rating_df = rating_df.groupby(['user_id', 'item_id']).agg({'rating':'mean'}).reset_index()

  #define surprise reader object
  reader = Reader(rating_scale=(1,5))

  #convert dataframe into surprise dataset object
  data = Dataset.load_from_df(rating_df[['user_id', 'item_id', 'rating']], reader)

  # We'll use the item-based collaborative filtering algorithm
  sim_options = {
      "name": "cosine",
      "user_based": False,  # compute similarities between items
  }
  #define IBCFRS
  algo = KNNBasic(sim_options=sim_options)

  # Run num_folds-fold cross-validation and print results
  print(cross_validate(algo, data, measures=["RMSE", "MAE"], cv=num_folds, verbose=True))

In [None]:
def preprocessCustomDataset(path):
  """Load a ratings CSV, average duplicate (user, item) ratings and print a preview.

  The CSV is read positionally: column 0 is the user id, column 2 the item id
  and column 4 the rating. The first rows of the de-duplicated frame are
  printed; nothing is returned.

  Args:
      path: path to the CSV file; ``~`` is expanded.

  Raises:
      ValueError: if a rating cell cannot be parsed as a number.
      IndexError: if a non-empty row has fewer than five columns.
  """
  file_path = os.path.expanduser(path)

  #convert csv to dictionary
  rating_dict = {'user_id':[], 'item_id':[], 'rating':[]}
  # newline='' is what the csv module expects for files it parses itself.
  with open(file_path, 'r', newline='') as dataset:
      for line in csv.reader(dataset):
          if not line:
              continue  # csv.reader yields [] for blank lines; skip them
          rating_dict['user_id'].append(line[0])
          rating_dict['item_id'].append(line[2])
          # csv.reader yields strings; convert so the mean below averages numbers
          # instead of operating on text.
          rating_dict['rating'].append(float(line[4]))

  #convert dictionary to dataframe
  rating_df = pd.DataFrame.from_dict(rating_dict)

  #group duplicate values into one rating
  rating_df = rating_df.groupby(['user_id', 'item_id']).agg({'rating':'mean'}).reset_index()

  print(rating_df.head())

# Preview the de-duplicated ratings of both CSV datasets, one after the other.
for dataset_path in (
    "/content/dataset/amazon_appliance_100k.csv",
    "/content/dataset/ml_100k.csv",
):
    preprocessCustomDataset(dataset_path)

                user_id     item_id  rating
0  A00032921HLX2KJJVXRS  B0045LLC7K     5.0
1  A00086729ZDSXGG2E481  B00E1IUTOY     1.0
2  A00222906VX8GH7X6J6B  B001BOBZSK     5.0
3  A0096681Y127OL1H8W3U  B0014CN8Y8     5.0
4  A0122375SQ8Z42DUL03J  B001TH7H04     5.0
  user_id item_id  rating
0       1       1     5.0
1       1      10     3.0
2       1     100     5.0
3       1     101     2.0
4       1     102     2.0


In [None]:
def precision_recall_ML_dataset():
  """Print mean precision@5 and recall@5 per fold for item-based KNN on "ml-100k".

  Uses 5 KFold splits of Surprise's built-in "ml-100k" dataset. For every split
  the model is re-fitted on the training part, evaluated on the test part, and
  the per-user metrics (relevance threshold 4) are averaged over all users.
  """
  ratings = Dataset.load_builtin("ml-100k")
  splitter = KFold(n_splits=5)

  # Item-based collaborative filtering: cosine similarity computed between items.
  item_knn = KNNBasic(sim_options={"name": "cosine", "user_based": False})

  for fold_train, fold_test in splitter.split(ratings):
      item_knn.fit(fold_train)
      fold_predictions = item_knn.test(fold_test)
      precisions, recalls = precision_recall_at_k(fold_predictions, k=5, threshold=4)

      # Average the per-user metrics over all users of this fold.
      mean_precision = sum(precisions.values()) / len(precisions)
      print("Precision score:", mean_precision)
      mean_recall = sum(recalls.values()) / len(recalls)
      print("Recall score:", mean_recall)


In [None]:
def precision_recall_custom_dataset(path, num_folds, k=None, threshold=4):
  """Print mean precision@k and recall@k per fold for item-based KNN on a ratings CSV.

  The CSV is read positionally: column 0 is the user id, column 2 the item id
  and column 4 the rating. Repeated (user, item) pairs are collapsed to the mean
  of their ratings before the data is handed to Surprise.

  Args:
      path: path to the CSV file; ``~`` is expanded.
      num_folds: number of KFold splits.
      k: cutoff for precision/recall@k. ``None`` (the default) keeps the
          historical behaviour of using ``num_folds`` as the cutoff; pass an
          explicit value to decouple the cutoff from the number of folds.
      threshold: rating at or above which an item counts as relevant/recommended.

  Raises:
      ValueError: if a rating cell cannot be parsed as a number.
      IndexError: if a non-empty row has fewer than five columns.
  """
  file_path = os.path.expanduser(path)

  #convert csv to dictionary
  rating_dict = {'user_id':[], 'item_id':[], 'rating':[]}
  # newline='' is what the csv module expects for files it parses itself.
  with open(file_path, 'r', newline='') as dataset:
      for line in csv.reader(dataset):
          if not line:
              continue  # csv.reader yields [] for blank lines; skip them
          rating_dict['user_id'].append(line[0])
          rating_dict['item_id'].append(line[2])
          # csv.reader yields strings; convert so the mean below averages numbers
          # instead of operating on text.
          rating_dict['rating'].append(float(line[4]))

  #convert dictionary to dataframe
  rating_df = pd.DataFrame.from_dict(rating_dict)

  #group duplicate values into one rating
  rating_df = rating_df.groupby(['user_id', 'item_id']).agg({'rating':'mean'}).reset_index()

  #define surprise reader object
  reader = Reader(rating_scale=(1,5))

  #convert dataframe into surprise dataset object
  data = Dataset.load_from_df(rating_df[['user_id', 'item_id', 'rating']], reader)

  # We'll use the item-based collaborative filtering algorithm
  sim_options = {
      "name": "cosine",
      "user_based": False,  # compute similarities between items
  }
  #define IBCFRS
  algo = KNNBasic(sim_options=sim_options)

  # The top-k cutoff is unrelated to the fold count; it only falls back to
  # num_folds so that existing two-argument calls produce the same numbers.
  top_k = num_folds if k is None else k

  kf = KFold(n_splits=num_folds)
  for trainset, testset in kf.split(data):
      algo.fit(trainset)
      predictions = algo.test(testset)
      precisions, recalls = precision_recall_at_k(predictions, k=top_k, threshold=threshold)

      # Precision and recall can then be averaged over all users
      print("Precision score:",sum(prec for prec in precisions.values()) / len(precisions))
      print("Recall score:",sum(rec for rec in recalls.values()) / len(recalls))


In [None]:
# 5-fold cross-validation (RMSE/MAE) of the item-based KNN model on the ml_100k.csv ratings file.
trainCustomDataset("/content/dataset/ml_100k.csv", 5)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0270  1.0280  1.0226  1.0320  1.0225  1.0264  0.0036  
MAE (testset)     0.8124  0.8122  0.8078  0.8158  0.8113  0.8119  0.0026  
Fit time          0.92    1.20    0.94    1.05    1.16    1.05    0.11    
Test time         5.27    4.15    5.17    4.57    5.53    4.94    0.50    
{'test_rmse': array([1.02702017, 1.0279798 , 1.02256041, 1.03203439, 1.02249292]), 'test_mae': array([0.81240652, 0.81219612, 0.80775548, 0.81581501, 0.81126849]), 'fit_time': (0.9194674

In [None]:
# 10-fold cross-validation (RMSE/MAE) of the item-based KNN model on the ml_100k.csv ratings file.
trainCustomDataset("/content/dataset/ml_100k.csv", 10)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    1.0144  1.0179  1.0185  1.0175  1.0257  1.0114 

In [None]:
# 5-fold cross-validation (RMSE/MAE) of the item-based KNN model on the amazon_appliance_100k.csv ratings file.
# NOTE(review): the saved output of this cell reports RMSE/MAE values many orders of magnitude
# above the 1-5 rating scale — presumably corrupted rating values reach the model; verify the loader.
trainCustomDataset("/content/dataset/amazon_appliance_100k.csv",5)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    8295803689138500937505844787937280.0000110036010752289918458834716721152.0000856653451815609882031237605995763738869760.00006755324362888809349586104614912.000056294369690741196989126110501748604928.0000171341950919579134520399185600960761167872.0000342655751141584753034387294637158760972288.0000
MAE (testset)     58841406912037152655205325602816.0000856235868333774684685185581056.00006076320530282922764350562578004250198016.000047916127610231052812381323264.00003

In [None]:
# 10-fold cross-validation (RMSE/MAE) of the item-based KNN model on the amazon_appliance_100k.csv ratings file.
# NOTE(review): the saved output of this cell shows a fold RMSE of roughly 1.5e32, far outside
# the 1-5 rating scale — presumably corrupted rating values reach the model; verify the loader.
trainCustomDataset("/content/dataset/amazon_appliance_100k.csv", 10)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    1.6653  154793831028124979792832305823744.00001

In [None]:
# 5-fold cross-validation (RMSE/MAE) of the item-based KNN model on the Appliances_subset_surprise.csv ratings file.
# NOTE(review): the saved output shows RMSE of about 21.5 and 13.5 on folds 4 and 5, which is not
# possible for predictions and ratings that both lie in 1-5 — presumably bad rating values; verify.
trainCustomDataset("/content/dataset/Appliances_subset_surprise.csv", 5)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5681  1.6800  1.5672  21.5365 13.4507 7.9605  8.1931  
MAE (testset)     0.9603  0.9640  0.9543  1.1965  1.0703  1.0291  0.0941  
Fit time          0.02    0.04    0.03    0.03    0.03    0.03    0.01    
Test time         0.11    0.09    0.09    0.11    0.24    0.13    0.05    
{'test_rmse': array([ 1.56810258,  1.67996581,  1.56717633, 21.53654016, 13.4506578 ]), 'test_mae': array([0.96025131, 0.96401855, 0.9542648 , 1.19646364, 1.07026445]), 'fit_time': (0.02

In [None]:
# 10-fold cross-validation (RMSE/MAE) of the item-based KNN model on the Appliances_subset_surprise.csv ratings file.
trainCustomDataset("/content/dataset/Appliances_subset_surprise.csv", 10)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    1.6311  1.6262  1.5613  1.6134  1.5604  1.6804 

In [None]:
# Report the platform, Python and Surprise versions this notebook was run with.
import platform
print(platform.platform())

import sys
print("Python", sys.version)

import surprise
print("surprise", surprise.__version__)

Linux-5.10.147+-x86_64-with-glibc2.29
Python 3.8.10 (default, Nov 14 2022, 12:59:47) 
[GCC 9.4.0]
surprise 1.1.3
