In [1]:
#### To measure all running time
# https://github.com/cpcloud/ipython-autotime

%load_ext autotime

In [2]:
import gzip
from collections import defaultdict
import scipy
import scipy.optimize
import numpy
import random
import pandas as pd
import json
import numpy as np
import time
import csv
from collections import Counter


time: 298 ms


- Download dataset from https://drive.google.com/drive/folders/1dnCnSqniJMDFGw8VIiKG5S-_hJmGBJqt

In [4]:
# colnames=['user_id', 'product_id', 'rating'] 
# rating_df = pd.read_csv(path, names=colnames, header=None,  compression='gzip')

time: 189 µs


In [5]:
def parse(path):
    for line in gzip.open(path, 'r'):
        yield json.loads(line)

time: 527 µs


### For 5-core ( start ) review 

In [6]:
DATA_DIR = './Dataset/'
fn_5core = 'reviews_Clothing_Shoes_and_Jewelry_5_2.json.gz'
path = DATA_DIR + fn_5core
print(path)

BATCH_SIZE = 100000

i = 0

dataset = []
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list) 

for line in parse(path):
    d = dict()
    d['user_id'] = line['reviewerID']
    d['product_id'] = line['asin']
    d['rating'] = int(line['overall'])
    dataset.append(d)
    i += 1
    if i > BATCH_SIZE:
        break
    
for d in dataset:
    user,item = d['user_id'], d['product_id']
    reviewsPerUser[user].append(d)
    reviewsPerItem[item].append(d)
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)

dataset[10]

./Dataset/reviews_Clothing_Shoes_and_Jewelry_5_2.json.gz


{'user_id': 'AX1QE6IR7CHXM', 'product_id': '0000031887', 'rating': 5}

time: 1.36 s


In [7]:
N = len(dataset)
nUsers = len(reviewsPerUser)
nItems = len(reviewsPerItem)

#Getting a list of keys
users = list(reviewsPerUser.keys())
items = list(reviewsPerItem.keys())

#This is equivalent to our Rating Mean from week 1
alpha = sum([d['rating'] for d in dataset]) / len(dataset)

#Create another two defaultdict's, this time being float types because they are prediction based
userBiases = defaultdict(float)
itemBiases = defaultdict(float)

def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

time: 7.55 ms


In [8]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer / denom

time: 462 µs


In [9]:
def predictRating(user,item):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['product_id']
        if i2 == item: continue
        ratings.append(d['rating'])
        similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return sum(weightedRatings) / sum(similarities)
    else:
        # User hasn't rated any similar items
        return alpha

time: 1.53 ms


In [10]:
class Logger():
    def __init__(self):
        self.STATUS = 'OFF'
        self.START_TIME = None
        self.END_TIME = None
        self.EXECUTION_TIME = None
        self.LOGS = []
        self.MODEL = None
        self.SCORE = None
        self.STAT = None
        
    def start(self, model=None, stat=None, score=None):
        self.START_TIME = time.time()
        self.STATUS = 'ON'
        if model:
            self.MODEL = model
            self.LOGS.append("Model: {m}".format(m=model))
        if stat:
            self.STAT = stat
            self.LOGS.append("Statistic: {s}".format(s=stat))
        if score:
            self.SCORE = score
            self.LOGS.append("Score: {s}".format(s=score))
        
    def end(self, display=True, score=None):
        if self.STATUS == 'OFF':
            print("No timer started.")
        else:
            self.END_TIME = time.time()
            self.EXECUTION_TIME = self.END_TIME - self.START_TIME
            self.LOGS.append("Time: {t}".format(t=self.EXECUTION_TIME))
            if score:
                self.SCORE = score
                self.LOGS.append("Score: {s}".format(s=score))
            if display == True:
                self.getStats(last=False)
            else:
                r = self.LOGS
                self.tearDown()
                return r
            self.tearDown()
    
    def tearDown(self):
        self.STATUS = 'OFF'
        self.LOGS = []
        
    def getStats(self, show=True, last=True):
        if show == True:   
            if last == True:
                print("STATUS: {v}".format(v=self.STATUS))
                print("START_TIME: {v}".format(v=self.START_TIME))
                print("END_TIME: {v}".format(v=self.END_TIME))
                print("EXECUTION_TIME: {v}".format(v=self.EXECUTION_TIME))
                print("MODEL: {v}".format(v=self.MODEL))
                print("STAT: {v}".format(v=self.STAT))
                print("SCORE: {v}".format(v=self.SCORE))
            else:
                for l in self.LOGS:
                    print(l)
        else:
            return self.MODEL, self.STAT, self.SCORE, self.EXECUTION_TIME

        
timer = Logger()

time: 3.33 ms


In [11]:
labels = [d['rating'] for d in dataset]

time: 6.42 ms


In [12]:
# baseline
alwaysPredictMean = [alpha for d in dataset]
labels = [d['rating'] for d in dataset]
MSE(alwaysPredictMean, labels)

cfPredictions = [predictRating(d['user_id'], d['product_id']) for d in dataset] 

print(MSE(alwaysPredictMean, labels))
print(MSE(cfPredictions, labels))
print(MSE(alwaysPredictMean, labels), MSE(cfPredictions, labels)) 
print()
timer.start(model='Baseline', stat='MSE', score=MSE(alwaysPredictMean, labels))
alwaysPredictMean = [alpha for d in dataset]
timer.end()
print()

1.1630374059367858
1.40054874327075
1.1630374059367858 1.40054874327075

Model: Baseline
Statistic: MSE
Score: 1.1630374059367858
Time: 0.003266572952270508

time: 1.56 s


### Heuristic 

- Heuristic analysis is an expert based analysis that determines the susceptibility of a system towards particular threat/risk using various decision rules or weighing methods. MultiCriteria analysis (MCA) is one of the means of weighing.

- https://en.wikipedia.org/wiki/Heuristic_analysis

In [13]:
score = MSE(cfPredictions, labels)
cfPredictions = [predictRating(d['user_id'], d['product_id']) for d in dataset]

print("Mode: Weighted Ratings Heuristic")
print("Score by MSE: ", score)

Mode: Weighted Ratings Heuristic
Score by MSE:  1.40054874327075
time: 1.47 s


In [14]:
def mostSimilar(item, n):
    similarities = []
    users = usersPerItem[item]
    for i2 in usersPerItem:
        if i2 == item: continue
        sim = Jaccard(users, usersPerItem[i2])
        similarities.append([sim,i2])
    similarities.sort(reverse=True)
    return similarities[:n]

def mostSimilarFast(item, n):
    similarities = []
    users = usersPerItem[item]
    candidateItems = set()
    for u in users:
        candidateItems = candidateItems.union(itemsPerUser[u])
    for i2 in candidateItems:
        if i2 == item: continue
        sim = Jaccard(users, usersPerItem[i2])
        similarities.append([sim, i2])
    similarities.sort(reverse=True)
    return similarities[:n]

# Test Params
n = 10 
idx = 101 
query = dataset[idx]['product_id']

print("Index: {i}".format(i=idx))
print("ProductID: {q}".format(q=query))
print("Number Matches: {i}".format(i=n))

Index: 101
ProductID: B00004SR8Z
Number Matches: 10
time: 4.72 ms


In [15]:
timer.start(model='Most Similar', stat='Jaccard Similarity')
sims1 = mostSimilar(query, n)
timer.end(display=True)
sims1

Model: Most Similar
Statistic: Jaccard Similarity
Time: 0.03053903579711914


[[0.0392156862745098, 'B001J4HQ76'],
 [0.038461538461538464, 'B000J46QHS'],
 [0.038461538461538464, 'B0001XLSWA'],
 [0.037037037037037035, 'B0036VNOG2'],
 [0.037037037037037035, 'B0029F1X3W'],
 [0.037037037037037035, 'B001T0IM5U'],
 [0.037037037037037035, 'B000OVJY7A'],
 [0.037037037037037035, 'B000FVY4JW'],
 [0.03571428571428571, 'B0038P22QE'],
 [0.03571428571428571, 'B001GXH2JM']]

time: 33.3 ms


In [16]:
timer.start(model='Most Similar Optimized', stat='Jaccard Similarity')
sims2 = mostSimilarFast(query, n)
timer.end(display=True)
sims2

Model: Most Similar Optimized
Statistic: Jaccard Similarity
Time: 0.00029969215393066406


[[0.0392156862745098, 'B001J4HQ76'],
 [0.038461538461538464, 'B000J46QHS'],
 [0.038461538461538464, 'B0001XLSWA'],
 [0.037037037037037035, 'B0036VNOG2'],
 [0.037037037037037035, 'B0029F1X3W'],
 [0.037037037037037035, 'B001T0IM5U'],
 [0.037037037037037035, 'B000OVJY7A'],
 [0.037037037037037035, 'B000FVY4JW'],
 [0.03571428571428571, 'B0038P22QE'],
 [0.03571428571428571, 'B001GXH2JM']]

time: 3.47 ms


In [17]:
df = pd.DataFrame(dataset)
X = df[['user_id', 'product_id']]
y = df[['rating']]
df.head()

Unnamed: 0,user_id,product_id,rating
0,A1KLRMWW2FWPL4,31887,5
1,A2G5TCU2WDFZ65,31887,5
2,A1RLQXYNCMWRWN,31887,5
3,A8U3FAMSJVHS5,31887,5
4,A3GEOILWLK86XM,31887,5


time: 85.3 ms


# Collaberative filtering 

* Product Similarity recommedation
* User Similarity recomendation

This model uses historical user/item ratings that are similar to predict ratings.

### Documention for Surprise
- http://surpriselib.com/

In [18]:
# !pip install surprise

time: 201 µs


In [19]:
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
import os
from surprise.model_selection import train_test_split

#Reading the dataset
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df,reader)

time: 253 ms


In [20]:
#Splitting the dataset
trainset, testset = train_test_split(data, 
                                     test_size=0.3,
                                     random_state=11, 
                                     shuffle=True)

time: 151 ms


### Use user_based true/false to switch between user-based or item-based collaborative filtering

In [21]:
timer.start(model='Product KNN', stat='MSE')
algo = KNNWithMeans(k=5, sim_options={'name': 'cosine', 'user_based': False})
algo.fit(trainset)
test_pred = algo.test(testset)
acc = accuracy.mse(test_pred, verbose=False)
timer.end()
print("Score: ", acc)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Model: Product KNN
Statistic: MSE
Time: 1.8810961246490479
Score:  1.2468510963333113
time: 1.88 s


In [22]:
timer.start(model='User KNN', stat='MSE')
algo = KNNWithMeans(k=5, sim_options={'name': 'cosine', 'user_based': True})
algo.fit(trainset)
test_pred = algo.test(testset)
acc = accuracy.mse(test_pred, verbose=False)
timer.end()
print("Score: ", acc)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Model: User KNN
Statistic: MSE
Time: 31.48060369491577
Score:  1.4815124763082748
time: 31.5 s
