## Rating Prediction Model

In [None]:
import random
from sklearn import linear_model
from matplotlib import pyplot as plt
from collections import defaultdict
import gzip
import urllib

In [None]:
# Open the gzipped TSV in text mode. The handle is left open on purpose:
# the row-parsing cell continues reading from where this cell stops.
path = "/content/amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz"
f = gzip.open(path, 'rt', encoding="utf8")

# The first line of the file carries the tab-separated column names.
header = f.readline().strip().split('\t')

In [None]:
# Display the parsed column names so the schema is visible before rows are loaded.
header

['marketplace',
 'customer_id',
 'review_id',
 'product_id',
 'product_parent',
 'product_title',
 'product_category',
 'star_rating',
 'helpful_votes',
 'total_votes',
 'vine',
 'verified_purchase',
 'review_headline',
 'review_body',
 'review_date']

In [None]:
# Parse every remaining row of the TSV into a dict keyed by column name,
# keeping only the first review seen for each (customer, product) pair and
# converting the numeric columns to int.
dataset = []

pairsSeen = set()

try:
  for line in f:
    fields = line.strip().split('\t')
    d = dict(zip(header, fields))

    ui = (d['customer_id'], d['product_id'])
    if ui in pairsSeen:
        print("Skipping duplicate user/item:", ui)
        continue
    pairsSeen.add(ui)
    d['star_rating'] = int(d['star_rating'])
    d['helpful_votes'] = int(d['helpful_votes'])
    d['total_votes'] = int(d['total_votes'])
    dataset.append(d) # dataset will have unique user/item pair
finally:
  # Release the gzip handle once it has been consumed. As a side benefit,
  # re-running this cell without re-opening the file now fails loudly
  # (closed-file error) instead of silently rebuilding an empty dataset
  # from an exhausted handle.
  f.close()


Skipping duplicate user/item: ('46953315', 'B00QM3CNN6')
Skipping duplicate user/item: ('31616428', 'B0026RB0G8')
Skipping duplicate user/item: ('47240912', 'B008I653SC')
Skipping duplicate user/item: ('14503091', 'B003FRMRC4')
Skipping duplicate user/item: ('38538360', 'B00HVLUR86')
Skipping duplicate user/item: ('43448024', 'B00HVLUR86')
Skipping duplicate user/item: ('51525270', 'B00HVLUR86')
Skipping duplicate user/item: ('20652160', 'B004OU2IQG')
Skipping duplicate user/item: ('10964440', 'B00HVLUR86')
Skipping duplicate user/item: ('20043677', 'B00HVLUR86')
Skipping duplicate user/item: ('44796499', 'B00HVLUSGM')
Skipping duplicate user/item: ('29066899', 'B0002CZSYO')
Skipping duplicate user/item: ('10385056', 'B004OU2IQG')
Skipping duplicate user/item: ('1658551', 'B00HVLURL8')
Skipping duplicate user/item: ('907433', 'B00N9Q2E5G')
Skipping duplicate user/item: ('39412969', 'B00HVLUR86')
Skipping duplicate user/item: ('4901688', 'B00HVLUR86')
Skipping duplicate user/item: ('234

In [None]:
# Inspect the first parsed review to check the field names and the int-converted columns.
dataset[0]

{'marketplace': 'US',
 'customer_id': '45610553',
 'review_id': 'RMDCHWD0Y5OZ9',
 'product_id': 'B00HH62VB6',
 'product_parent': '618218723',
 'product_title': 'AGPtek® 10 Isolated Output 9V 12V 18V Guitar Pedal Board Power Supply Effect Pedals with Isolated Short Cricuit / Overcurrent Protection',
 'product_category': 'Musical Instruments',
 'star_rating': 3,
 'helpful_votes': 0,
 'total_votes': 1,
 'vine': 'N',
 'verified_purchase': 'N',
 'review_headline': 'Three Stars',
 'review_body': 'Works very good, but induces ALOT of noise.',
 'review_date': '2015-08-31'}

In [None]:
# Hold out the last 10% of the de-duplicated reviews (in file order) for testing.
splitIndex = int(len(dataset) * 0.9)
dataTrain, dataTest = dataset[:splitIndex], dataset[splitIndex:]

#### Jaccard Similarity

In [None]:
# Lookup tables used by the similarity and prediction functions.
usersPerItem = defaultdict(set)     # item -> set of users who rated it
itemsPerUser = defaultdict(set)     # user -> set of items they rated
itemNames = {}                      # item -> product title
ratingDict = {}                     # (user, item) -> star rating
reviewsPerUser = defaultdict(list)  # user -> list of review bodies

# The interaction sets are built from the training split only.
for review in dataTrain:
    reviewer, product = review["customer_id"], review["product_id"]
    usersPerItem[product].add(reviewer)
    itemsPerUser[reviewer].add(product)

# Titles, ratings and review texts are indexed over the whole dataset.
for review in dataset:
    reviewer, product = review["customer_id"], review["product_id"]
    itemNames[product] = review["product_title"]
    ratingDict[(reviewer, product)] = review["star_rating"]
    reviewsPerUser[reviewer].append(review["review_body"])


In [None]:
# Mean rating given by each user, over the items they rated in the training split.
userAverages = {
    reviewer: sum(ratingDict[(reviewer, product)] for product in products) / len(products)
    for reviewer, products in itemsPerUser.items()
}

# Mean rating received by each item, over the users who rated it in the training split.
itemAverages = {
    product: sum(ratingDict[(reviewer, product)] for reviewer in reviewers) / len(reviewers)
    for product, reviewers in usersPerItem.items()
}

# Global mean rating of the training split.
ratingMean = sum(review['star_rating'] for review in dataTrain) / len(dataTrain)

In [None]:
def Jaccard(s1, s2):
    """Return the Jaccard similarity |s1 ∩ s2| / |s1 ∪ s2| of two sets.

    Args:
        s1: first set.
        s2: second set.

    Returns:
        A float in [0, 1]. Two empty sets have an empty union; that case is
        reported as 0.0 instead of raising ZeroDivisionError.
    """
    denom = len(s1.union(s2))
    if denom == 0:
        # Nothing in either set: treat as "no overlap" rather than dividing by zero.
        return 0.0
    numer = len(s1.intersection(s2))
    return numer/denom

In [None]:
def mostSimilar(i, N):  #item i
  """Return the N items most similar to item i.

  Similarity is the Jaccard similarity between the set of users who rated
  item i and the set of users who rated each other item in usersPerItem.

  Args:
    i: ID of the query item.
    N: maximum number of neighbours to return.

  Returns:
    A list of (similarity, item_id) tuples sorted in descending order,
    at most N long. The query item itself is excluded.
  """
  # Use .get() instead of usersPerItem[i]: indexing a defaultdict with an
  # unknown item would insert an empty entry, after which the
  # `item not in usersPerItem` guards elsewhere no longer detect the item
  # as unseen.
  users = usersPerItem.get(i, set())
  similarity = [
      (Jaccard(users, usersPerItem[j]), j)
      for j in usersPerItem
      if j != i
  ]
  similarity.sort(reverse = True)
  return similarity[:N]

In [None]:
# Product ID of the item whose nearest neighbours are looked up with mostSimilar.
query = 'B00KCHRKD6'

In [None]:
# Retrieve the 10 items most similar (Jaccard similarity over their rater sets) to item 'B00KCHRKD6'

ms = mostSimilar(query, 10)

#### Rating prediction model based on similarity

In [None]:
def MSE(y, ypred):
    """Return the mean of the squared differences between paired values.

    The two sequences are paired positionally with zip, so pairing stops at
    the shorter one. Because the difference is squared, the order of the two
    arguments does not affect the result.
    """
    squared_errors = []
    for truth, guess in zip(y, ypred):
        squared_errors.append((guess - truth) ** 2)
    return sum(squared_errors) / len(squared_errors)

In [None]:
def predictRating(user,item):
    """Predict the rating `user` would give `item` from item-item similarity.

    The prediction is the item's average rating plus a Jaccard-weighted
    average of how far the user's ratings of their other items deviate from
    those items' averages.

    Args:
        user: customer ID.
        item: product ID.

    Returns:
        ratingMean if the item has no known raters; itemAverages[item] if
        none of the user's other items has non-zero similarity to it;
        otherwise the similarity-weighted prediction.
    """
    # .get() instead of indexing: usersPerItem / itemsPerUser are defaultdicts,
    # and indexing them with an unseen key would insert an empty set as a side
    # effect (which later breaks the per-user / per-item average computation).
    item_users = usersPerItem.get(item)
    if not item_users:
        # Unknown item, or an entry with no raters: fall back to the global mean.
        return ratingMean

    total_sim = []
    weighted_rating = []

    for j in itemsPerUser.get(user, ()):
        if j == item:
            continue
        sim = Jaccard(item_users, usersPerItem[j])
        weighted_rating.append((ratingDict[(user, j)] - itemAverages[j]) * sim)
        total_sim.append(sim)

    sim_sum = sum(total_sim)
    if sim_sum == 0:
        # No informative neighbours: the item's own average is the best guess.
        return itemAverages[item]

    return itemAverages[item] + (sum(weighted_rating)/sim_sum)


In [None]:
# Constant baseline: one copy of the test-set mean rating per test review.
alwaysPredictMean = [
    sum(review["star_rating"] for review in dataTest) / len(dataTest)
] * len(dataTest)

In [None]:
# Score every held-out review with the similarity-based predictor.
simPredictions = [
    predictRating(review["customer_id"], review["product_id"])
    for review in dataTest
]

In [None]:
# Ground-truth star ratings of the held-out reviews, in the same order as the predictions.
labels = [review["star_rating"] for review in dataTest]

In [None]:
# Test-set MSE of the similarity-based predictions. The arguments are passed as
# (predictions, labels) while MSE is declared as (y, ypred); the result is the
# same either way because the differences are squared.
MSE(simPredictions, labels)

#### Time-weighted collaborative filtering

In [None]:
# Re-display the first review; its 'review_date' field is the input to the time-decay weighting.
dataset[0]

{'marketplace': 'US',
 'customer_id': '45610553',
 'review_id': 'RMDCHWD0Y5OZ9',
 'product_id': 'B00HH62VB6',
 'product_parent': '618218723',
 'product_title': 'AGPtek® 10 Isolated Output 9V 12V 18V Guitar Pedal Board Power Supply Effect Pedals with Isolated Short Cricuit / Overcurrent Protection',
 'product_category': 'Musical Instruments',
 'star_rating': 3,
 'helpful_votes': 0,
 'total_votes': 1,
 'vine': 'N',
 'verified_purchase': 'N',
 'review_headline': 'Three Stars',
 'review_body': 'Works very good, but induces ALOT of noise.',
 'review_date': '2015-08-31'}

In [None]:
from datetime import datetime
import math

# (customer_id, product_id) -> review date as a POSIX timestamp in seconds.
timestamps_dict = {}
for review in dataset:
    reviewed_at = datetime.strptime(review["review_date"], '%Y-%m-%d')
    timestamps_dict[(review["customer_id"], review["product_id"])] = reviewed_at.timestamp()

def decay_function(u, i, j, lambda_value, time_unit_seconds=1):
    """Exponential time-decay weight between two of user u's reviews.

    Computes exp(-lambda_value * dt), where dt is the absolute gap between
    the timestamps of u's reviews of items i and j.

    Args:
        u: customer ID.
        i: product ID of the target item.
        j: product ID of the item being compared against.
        lambda_value: decay rate per time unit.
        time_unit_seconds: length of one time unit in seconds. The default
            of 1 keeps the original behaviour (rate per second). Pass 86400
            to express lambda_value per day.

    Returns:
        A float in [0, 1]; 1.0 when both reviews share the same timestamp.

    Raises:
        KeyError: if either (u, i) or (u, j) is missing from timestamps_dict.
    """
    object_seconds = timestamps_dict[(u, j)]
    target_seconds = timestamps_dict[(u, i)]
    # NOTE: timestamps are in seconds, so with the default unit a rate such as
    # 0.1 makes exp() underflow to 0.0 for reviews a day (86400 s) or more apart.
    time_diff = abs(target_seconds - object_seconds) / time_unit_seconds
    return math.exp(-lambda_value * time_diff)

In [None]:
def predictRating_temporal(user,item, lambda_value=0.1):
    """Predict a rating from item-item similarity, weighted by time decay.

    Works like the plain similarity predictor, except that each neighbour's
    Jaccard similarity is multiplied by decay_function(user, item, j,
    lambda_value), so that reviews written closer in time to the target
    review contribute more.

    Args:
        user: customer ID.
        item: product ID.
        lambda_value: decay rate passed to decay_function. Defaults to 0.1,
            the value that was previously hard-coded.

    Returns:
        ratingMean if the item has no known raters; itemAverages[item] if
        the combined neighbour weights sum to zero; otherwise the weighted
        prediction.

    NOTE(review): decay_function works on timestamps in seconds, so a rate of
    0.1 drives the weight to 0.0 for reviews a day or more apart — confirm the
    intended time unit for lambda_value.
    """
    # .get() instead of indexing: usersPerItem / itemsPerUser are defaultdicts,
    # and indexing them with an unseen key would insert an empty set as a side
    # effect.
    item_users = usersPerItem.get(item)
    if not item_users:
        return ratingMean

    total_sim = []
    weighted_rating = []

    for j in itemsPerUser.get(user, ()):
        if j == item:
            continue
        sim = Jaccard(item_users, usersPerItem[j])
        # Evaluate the decay once per neighbour; it is used in both sums.
        decay = decay_function(user, item, j, lambda_value)
        weighted_rating.append((ratingDict[(user, j)] - itemAverages[j]) * sim * decay)
        total_sim.append(sim * decay)

    weight_sum = sum(total_sim)
    if weight_sum == 0:
        return itemAverages[item]

    return itemAverages[item] + (sum(weighted_rating)/weight_sum)

In [None]:
# Score every held-out review with the time-weighted predictor.
Predictions_temporal = [
    predictRating_temporal(review["customer_id"], review["product_id"])
    for review in dataTest
]

In [None]:
# Ground-truth star ratings of the held-out reviews, in the same order as the predictions.
labels = [review["star_rating"] for review in dataTest]

In [None]:
# Mean squared error of the time-weighted predictions against the held-out ratings.
itsMSE = MSE(Predictions_temporal, labels)

In [None]:
# Display the time-weighted model's test MSE.
itsMSE

1.6993689339769356