In [1]:
import gzip
from collections import defaultdict
import scipy
import scipy.optimize
import numpy
import random
import pandas as pd
import json
import numpy as np
import time
import csv
from collections import Counter


In [None]:
DATA_DIR = 
fn_5core = 'Clothing_Shoes_and_Jewelry_5.json.gz'
fn_meta = 'meta_Clothing_Shoes_and_Jewelry.json.gz'
fn_reviews = 'reviews_Clothing_Shoes_and_Jewelry_5.json.gz'
fn_ratings = 'Clothing_Shoes_and_Jewelry.csv.gz'
sample = 'sample_data_1M.json.gz'
path = DATA_DIR + fn_ratings
ENV = 'local'


In [604]:
def parse(path):
    for line in gzip.open(path, 'r'):
        yield json.loads(line)

In [605]:
BATCH_SIZE = 100000
i = 0
for line in parse(path):
    d = dict()
    d['user_id'] = line['reviewerID']
    d['product_id'] = line['asin']
    d['rating'] = int(line['overall'])
    dataset.append(d)
    i += 1
    if i > BATCH_SIZE:
        break
    
for d in dataset:
    user,item = d['user_id'], d['product_id']
    reviewsPerUser[user].append(d)
    reviewsPerItem[item].append(d)
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)

dataset[0]

{'user_id': 'A2KNKGKEDKKVYK', 'product_id': 'B008M61DU0', 'rating': 5}

In [78]:
N = len(dataset)
nUsers = len(reviewsPerUser)
nItems = len(reviewsPerItem)

#Getting a list of keys
users = list(reviewsPerUser.keys())
items = list(reviewsPerItem.keys())

#This is equivalent to our Rating Mean from week 1
alpha = sum([d['rating'] for d in dataset]) / len(dataset)

#Create another two defaultdict's, this time being float types because they are prediction based
userBiases = defaultdict(float)
itemBiases = defaultdict(float)

def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [79]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer / denom

In [80]:
def predictRating(user,item):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['product_id']
        if i2 == item: continue
        ratings.append(d['rating'])
        similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return sum(weightedRatings) / sum(similarities)
    else:
        # User hasn't rated any similar items
        return alpha

In [383]:
class Logger():
    def __init__(self):
        self.STATUS = 'OFF'
        self.START_TIME = None
        self.END_TIME = None
        self.EXECUTION_TIME = None
        self.LOGS = []
        self.MODEL = None
        self.SCORE = None
        self.STAT = None
        
    def start(self, model=None, stat=None, score=None):
        self.START_TIME = time.time()
        self.STATUS = 'ON'
        if model:
            self.MODEL = model
            self.LOGS.append("Model: {m}".format(m=model))
        if stat:
            self.STAT = stat
            self.LOGS.append("Statistic: {s}".format(s=stat))
        if score:
            self.SCORE = score
            self.LOGS.append("Score: {s}".format(s=score))
        
    def end(self, display=True, score=None):
        if self.STATUS == 'OFF':
            print("No timer started.")
        else:
            self.END_TIME = time.time()
            self.EXECUTION_TIME = self.END_TIME - self.START_TIME
            self.LOGS.append("Time: {t}".format(t=self.EXECUTION_TIME))
            if score:
                self.SCORE = score
                self.LOGS.append("Score: {s}".format(s=score))
            if display == True:
                self.getStats(last=False)
            else:
                r = self.LOGS
                self.tearDown()
                return r
            self.tearDown()
    
    def tearDown(self):
        self.STATUS = 'OFF'
        self.LOGS = []
        
    def getStats(self, show=True, last=True):
        if show == True:   
            if last == True:
                print("STATUS: {v}".format(v=self.STATUS))
                print("START_TIME: {v}".format(v=self.START_TIME))
                print("END_TIME: {v}".format(v=self.END_TIME))
                print("EXECUTION_TIME: {v}".format(v=self.EXECUTION_TIME))
                print("MODEL: {v}".format(v=self.MODEL))
                print("STAT: {v}".format(v=self.STAT))
                print("SCORE: {v}".format(v=self.SCORE))
            else:
                for l in self.LOGS:
                    print(l)
        else:
            return self.MODEL, self.STAT, self.SCORE, self.EXECUTION_TIME

        
timer = Logger()

In [384]:
labels = [d['rating'] for d in dataset]

In [197]:
# baseline
timer.start(model='Baseline', stat='MSE', score=MSE(alwaysPredictMean, labels))
alwaysPredictMean = [alpha for d in dataset]
timer.end()

Model: baseline
Statistic: MSE
Score: 1.1630374059367858
Time: 0.004736900329589844


In [198]:
# heuristic 
timer.start(model='Weighted Ratings Heuristic', stat='MSE', score=MSE(cfPredictions, labels))
cfPredictions = [predictRating(d['user_id'], d['product_id']) for d in dataset]
timer.end()

Model: Collaborative Filtering Heuristic
Statistic: MSE
Score: 1.40054874327075
Time: 1.8000781536102295


In [385]:
def mostSimilar(item, n):
    similarities = []
    users = usersPerItem[item]
    for i2 in usersPerItem:
        if i2 == item: continue
        sim = Jaccard(users, usersPerItem[i2])
        similarities.append([sim,i2])
    similarities.sort(reverse=True)
    return similarities[:n]

def mostSimilarFast(item, n):
    similarities = []
    users = usersPerItem[item]
    candidateItems = set()
    for u in users:
        candidateItems = candidateItems.union(itemsPerUser[u])
    for i2 in candidateItems:
        if i2 == item: continue
        sim = Jaccard(users, usersPerItem[i2])
        similarities.append([sim, i2])
    similarities.sort(reverse=True)
    return similarities[:n]

# Test Params
n = 10 
idx = 101 
query = dataset[idx]['product_id']
print("Index: {i}".format(i=idx))
print("ProductID: {q}".format(q=query))
print("Number Matches: {i}".format(i=n))

Index: 101
ProductID: B00004SR8Z
Number Matches: 10


In [433]:
timer.start(model='Most Similar', stat='Jaccard Similarity')
sims1 = mostSimilar(query, n)
timer.end(display=True)
sims1

Model: Most Similar
Statistic: Jaccard Similarity
Time: 0.023540973663330078


[[0.0392156862745098, 'B001J4HQ76'],
 [0.038461538461538464, 'B000J46QHS'],
 [0.038461538461538464, 'B0001XLSWA'],
 [0.037037037037037035, 'B0036VNOG2'],
 [0.037037037037037035, 'B0029F1X3W'],
 [0.037037037037037035, 'B001T0IM5U'],
 [0.037037037037037035, 'B000OVJY7A'],
 [0.037037037037037035, 'B000FVY4JW'],
 [0.03571428571428571, 'B0038P22QE'],
 [0.03571428571428571, 'B001GXH2JM']]

In [434]:
timer.start(model='Most Similar Optimized', stat='Jaccard Similarity')
sims2 = mostSimilarFast(query, n)
timer.end(display=True)
sims2

Model: Most Similar Optimized
Statistic: Jaccard Similarity
Time: 0.0006649494171142578


[[0.0392156862745098, 'B001J4HQ76'],
 [0.038461538461538464, 'B000J46QHS'],
 [0.038461538461538464, 'B0001XLSWA'],
 [0.037037037037037035, 'B0036VNOG2'],
 [0.037037037037037035, 'B0029F1X3W'],
 [0.037037037037037035, 'B001T0IM5U'],
 [0.037037037037037035, 'B000OVJY7A'],
 [0.037037037037037035, 'B000FVY4JW'],
 [0.03571428571428571, 'B0038P22QE'],
 [0.03571428571428571, 'B001GXH2JM']]

In [440]:
df = pd.DataFrame(dataset)
X = df[['user_id', 'product_id']]
y = df[['rating']]
df.head()

Unnamed: 0,user_id,product_id,rating
0,A1KLRMWW2FWPL4,31887,5
1,A2G5TCU2WDFZ65,31887,5
2,A1RLQXYNCMWRWN,31887,5
3,A8U3FAMSJVHS5,31887,5
4,A3GEOILWLK86XM,31887,5


# Collaberative filtering 

* Product Similarity recommedation
* User Similarity recomendation

This model uses historical user/item ratings that are similar to predict ratings.

In [573]:
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
import os
from surprise.model_selection import train_test_split

#Reading the dataset
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df,reader)

In [596]:
#Splitting the dataset
trainset, testset = train_test_split(data, test_size=0.3,random_state=11, shuffle=True)

In [598]:
# Use user_based true/false to switch between user-based or item-based collaborative filtering
timer.start(model='Product KNN', stat='MSE')
algo = KNNWithMeans(k=5, sim_options={'name': 'cosine', 'user_based': False})
algo.fit(trainset)
test_pred = algo.test(testset)
acc = accuracy.mse(test_pred, verbose=False)
timer.end()
print("Score: ", acc)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Model: Product KNN
Statistic: MSE
Time: 5.375831842422485
Score:  1.2468510963333113


In [599]:
timer.start(model='User KNN', stat='MSE')
algo = KNNWithMeans(k=5, sim_options={'name': 'cosine', 'user_based': True})
algo.fit(trainset)
test_pred = algo.test(testset)
acc = accuracy.mse(test_pred, verbose=False)
timer.end()
print("Score: ", acc)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Model: User KNN
Statistic: MSE
Time: 83.10871982574463
Score:  1.4815124763082748


## Script to create batch files with 1M records

In [739]:
item2indx = dict()
user2indx = dict()
review2indx = dict()
metadata = {}
user_counts = Counter()
item_counts = Counter()
review_counts = Counter()

idx = 0
idxs = 0
BATCH_SIZE=1000000
nBatches = 24
cBatch = 0
maxid = idx + BATCH_SIZE
f = gzip.open(path, 'rt', encoding="utf8")
header = ['userID', 'itemID', 'rating']

for line in f:
    fields = line.strip().split(',')
    d = dict(zip(header, fields))

    user, item, rating = d['itemID'], d['userID'], int(d['rating'][0])

    if user not in user2indx:
        user2indx[user] = len(user2indx)
    if item not in item2indx:
        item2indx[item] = len(item2indx)

    userid, itemid = user2indx[user], item2indx[item]
    user_counts[userid] += 1
    item_counts[itemid] += 1
    review_counts[rating] += 1

    if itemid < minItemId:
        minItemId = itemid
    if itemid > maxItemId:
        maxItemId = itemid

    reviewIdxs.append([userid, itemid, rating])
    idx += 1

    if idx >= maxid:
        try:
            with open(DATA_DIR + 'Clothing_Shoes_and_Jewelry/items/' + str(minItemId) + "_" + str(maxItemId) + ".csv", 'w') as batch2:
                batch_writer2 = csv.writer(batch2)   
                print(minItemId, maxItemId)
                batch_writer2.writerows(reviewIdxs)
            
        except csv.Error as e:
            print(e)

        finally:
            batch2.close()
            reviewIdxs = []
            maxid = idx + BATCH_SIZE
            minItemId = 9999999999999999
            maxItemId = 0        
            cBatch += 1
            if maxid >= 32292099 + 100000:
                maxid = 32292099
                cBatch = 23
                print('maxbatch')
            if cBatch >= nBatches:
                break  

0 5969
5956 14606
11094 18900
18900 28606
28604 41390
41382 56692
56692 73750
73747 91267
91263 109065
109013 128510
128508 148390
148390 168555
168555 188580
188579 208764
208764 229280
229279 250356
250356 268629
240952 286129
286129 304123
304123 325093
325089 345807
345807 366751
0 420880
14022 609843


In [740]:
# Example data sets
pd.read_csv(DATA_DIR + 'Clothing_Shoes_and_Jewelry/items/14022_609843.csv', header=None, names=['userID', 'itemID', 'rating'])


Unnamed: 0,userID,itemID,rating
0,3044901,14022,4
1,1811413,14022,5
2,4609280,14022,5
3,10070935,14022,5
4,4153187,14022,5
...,...,...,...
999995,10376925,61101,1
999996,1947088,61101,5
999997,10376926,61101,2
999998,6603819,61101,2


In [747]:
user2indx
user_counts
item_counts
len(item2indx)
print('i size: {}'.format(len(item_counts)))
print('u size: {}'.format(len(user_counts)))
print('r size: {}'.format(len(review_counts)))
print('most common: {}'.format(item_counts.most_common(10)))
indx2item = {indx:itm for itm,indx in item2indx.items()}
index2user = {indx:itm for itm,indx in user2indx.items()}
with open(DATA_DIR + 'Clothing_Shoes_and_Jewelry/items/id_to_item.json', 'w') as f:
    json.dump(indx2item, f)

{0: 'A2IC3NZN488KWK',
 1: 'A3OT9BYASFGU2X',
 2: 'A28GK1G2KDXHRP',
 3: 'A3NFXFEKW8OK0E',
 4: 'A3I6G5TKBVJEK9',
 5: 'A1A7Y1M8AJWNZ8',
 6: 'A30FG02C424EJ5',
 7: 'ADQQYU1UCDEWB',
 8: 'A39YL2NXZORK56',
 9: 'A2PRY50ZESF1MH',
 10: 'A2G9GWQEWWNQUB',
 11: 'A3RGH15H17SM1Z',
 12: 'A20QJNRKLJVP1E',
 13: 'A1G26EYQGW3YF1',
 14: 'A2JGAZF2Y2BDU6',
 15: 'A3NI5OGW35SLY2',
 16: 'A1OPRA4NE56EV6',
 17: 'A3M6UXIK7XTA7A',
 18: 'A3I3B5OSB80ZXC',
 19: 'A62O7C5RQB353',
 20: 'A8MZ2YP8UJA9Q',
 21: 'AL3IEZLLIAGNH',
 22: 'A22ZX01TPWQY4G',
 23: 'A1YIEW86G14BHP',
 24: 'A3A96RTGZTWKWG',
 25: 'AA7PNT2OPS3RP',
 26: 'A3LOIIIW4G3TL7',
 27: 'A25QEMMPTX5D5D',
 28: 'AZI75OKBKZ98R',
 29: 'A1NZRLAQC3XB32',
 30: 'A16DSXRAN5QK94',
 31: 'A3GWE80SUGORJD',
 32: 'A2NLY1TJ8TYV6D',
 33: 'A2GU5SHR2DC29H',
 34: 'A2FHBHNKHRDS72',
 35: 'A1EAXUN286CZ5Z',
 36: 'A1OMRK7B7SN83C',
 37: 'A3TCVTOI95UQUT',
 38: 'A17S6SU1PHP443',
 39: 'A3IUKE1LC890XR',
 40: 'A2R2AOAQ0E7CCN',
 41: 'A3FLZJDY64B3NS',
 42: 'A2CPOWZUTJWXJ7',
 43: 'ASR774IIR8VN7',
 44: 

In [89]:
new_df1=dfs.head(10000)
ratings_matrix = new_df1.pivot_table(values='rating', index='user_id', columns='product_id', fill_value=0)
ratings_matrix

product_id,0000031887,0123456479,1608299953,1617160377,B00001W0KA,B00001WRHJ,B00004SR8W,B00004SR8Z,B00004SR9P,B00004U1J2,...,B0006Z3HMW,B0006ZJMIK,B00073ZLIG,B00074HDAO,B00074KYC8,B00074MEYE,B00075J1K8,B00075ZWR4,B00075ZXLE,B00075ZYRW
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00635603LUUJQPQWSJW1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0148968UM59JS3Y8D1M,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A02155413BVL8D0G7X6DN,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A02324053VU4N09WNQTBN,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A02755422E9NI29TCQ5W3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZXS6P5QWNMLC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AZYGUNBHBC9RI,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AZYP4FQ2L2C4O,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AZYZQ7I9L7G3G,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Notes / Scratch code

In [443]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)


In [559]:
# metadata = {}
# metadata['X_train'] = {
#     "users": X_train['user_id'].count(),
#     "items": X_train['product_id'].count()
# }
# metadata['y_train'] = {
#     "ratings": y_train['rating'].count()
# }
# metadata['X_valid'] = {
#     "users": X_valid['user_id'].count(),
#     "items": X_valid['product_id'].count()
# }
# metadata['y_valid'] = {
#     "ratings": y_valid['rating'].count()
# }
# metadata['X_test'] = {
#     "users": X_test['user_id'].count(),
#     "items": X_test['product_id'].count()
# }
# metadata['y_test'] = {
#     "ratings": y_test['rating'].count()
# }
# metadata

def load(data):
    """
    takes the training file no and return training and test data
    Ex. fileno = 1 for u1.base and u1.test
        fileno = 5 for u5.base and u5.test
    """
#     dataset = filename
#     df = pd.DataFrame(dataset)
#     X = df[['user_id', 'product_id']]
#     y = df[['rating']]
#     df.head()
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
#     X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

    # userid songid user-rating
#     basedir = "../dataset/ml-100k/u%s." % (fileno)
#     with open(basedir + 'base') as f:
#         training = np.loadtxt(f)
#     with open(basedir + 'test') as f:
#         test = np.loadtxt(f)    
#     with open('../dataset/ml-100k/u.info') as f:
#         metafile = np.genfromtxt(f)
#     metafile = np.delete(metafile, len(metafile[0]) - 1, 1)

    metadata = {}
    metadata['users'] = data['user_id'].nunique() 
    metadata['items'] = data['product_id'].nunique() 
    metadata['ratings'] = data['rating'].nunique() 
    return data, metadata

data, metadata = load(df)


In [561]:
def constructRatingMatrix(data, metadata):
    user = int(metadata['users'])
    item = int(metadata['items'])
    # ratingMatrix = np.zeros((metadata['users'], metadata['items']))
    ratingMatrix = np.zeros((user, item))
    for i in data:
        
        ratingMatrix[int(i[0])-1][int(i[1])-1] = i[2] 
    return ratingMatrix



ratingMatrix = constructRatingMatrix(data, metadata)
ratingMatrix

ValueError: invalid literal for int() with base 10: 'u'

In [499]:
d = np.array(X_train)
for i in d:
#     print(i-2)
    print(i[0])
    print(i[1])
    break

A36V3SNW7TPKR9
B0001MQ60A


In [556]:
arr
# dataArray
ratingMatrix.shape

(29261, 7249)

In [538]:
timer.start()
def makeDic(data):
    dataDic = {}
    for i in data:
        dataDic.setdefault(i[0], {}).update({i[1]: i[2]})
    return dataDic

dataArray = np.array(df)
dataDic = makeDic(dataArray)
timer.end()
len(dataDic)

Time: 0.1729869842529297


33593

In [541]:
timer.start()
def makeArray(dic):
    arr = []
    for user,items in dic.items():
        for k,v in items.items():
            row = []
            row.append(user)
            row.append(k)
            row.append(v)
            arr.append(row)
    return arr

arr = makeArray(dataDic)
timer.end()
len(arr)

Time: 0.09032797813415527


100001

In [90]:
le = preprocessing.LabelEncoder()
le.fit(dfs['user_id'])
usersEnc = le.transform(dfs['user_id'])
le.classes_

le2 = preprocessing.LabelEncoder()
le2.fit(dfs['product_id'])
productsEnc = le2.transform(dfs['product_id'])
le2.classes_

le3 = preprocessing.LabelEncoder()
le3.fit(dfs['rating'])
ratingsEnc = le3.transform(dfs['rating'])
le3.classes_

array([1, 2, 3, 4, 5])

In [91]:
mat = np.array([usersEnc, productsEnc, ratingsEnc])
mat = mat.T
print(mat.shape)
mat

(50001, 3)


array([[ 3724,     0,     4],
       [ 9682,     0,     4],
       [ 5062,     0,     4],
       ...,
       [20419,  3261,     4],
       [19721,  3261,     4],
       [17681,  3261,     1]])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

simmat = cosine_similarity(mat).flatten()
simmat

In [64]:
X = ratings_matrix.T
X

user_id,A00635603LUUJQPQWSJW1,A04559521ZZJ87IMRVPME,A1001UEHDLCC1Z,A100WFKYVRPVX7,A1027WM3PXKVMR,A103HHE0H07OYZ,A103TW8KWPKZT2,A103VMN9OENIN6,A1059SSXUZZS1S,A107C6XHEWELEO,...,AZLS3FBNC8A6P,AZLYAJ8XE443K,AZNS7TH82KH9K,AZPHYNPEZDMIO,AZPQZEN7242O7,AZQSQSF2QI02F,AZTF7EHUO6GDK,AZUNMKF75X0WY,AZXS6P5QWNMLC,AZZTOUKVTUMVM
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000031887,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0123456479,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1608299953,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1617160377,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B00001W0KA,0,0,0,0,0,0,0,0,4,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B0001YS56Q,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B0001YSBEW,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B0001YWBG6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,5,0,0,0
B0001ZNZF6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
#Decomposing the Matrix
from sklearn.decomposition import TruncatedSVD
SVD = TruncatedSVD(n_components=10)
decomposed_matrix = SVD.fit_transform(X)
decomposed_matrix.shape

(253, 10)

In [66]:
#Correlation Matrix

correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape

(253, 253)

In [71]:
# trainset, testset = train_test_split(data, test_size=0.3,random_state=10)
X.index[75]

i = "B0001ZNZF6"

product_names = list(X.index)
product_ID = product_names.index(i)
product_ID


251

In [72]:
correlation_product_ID = correlation_matrix[product_ID]
correlation_product_ID.shape

(253,)

In [73]:
Recommend = list(X.index[correlation_product_ID > 0.65])

# Removes the item already bought by the customer
Recommend.remove(i) 

Recommend[0:24]

['0123456479',
 'B00001W0KA',
 'B00005TQI7',
 'B000086211',
 'B00008ID39',
 'B0000ANDEI',
 'B0000AUTD6',
 'B0000AWHQ4',
 'B0000B3DUK',
 'B0000DYNCD',
 'B0000DYNCP',
 'B0000UIUNE',
 'B0000ZD54S',
 'B00011VJ7Q',
 'B00018CAUY',
 'B00019MH42',
 'B0001DYW9G',
 'B0001XLSWA',
 'B0001XVUFA']

In [618]:
# Use user_based true/false to switch between user-based or item-based collaborative filtering
algo = KNNWithMeans(k=5, sim_options={'name': 'cosine', 'user_based': True})
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1a348d2d68>

In [619]:
# run the trained model against the testset
test_pred = algo.test(testset)

print("User-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)

User-based Model : Test Set
RMSE: 1.1584


1.1584034894080402

In [616]:
# run the trained model against the testset
test_pred = algo.test(testset)

print("User-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)

Item-based Model : Test Set
RMSE: 1.1350


1.1349773546536244

In [607]:
# get RMSE
print("Item-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)

Item-based Model : Test Set
RMSE: 1.1375


1.1375371403440921

In [614]:
def cosine(n_x, yr, min_support):
    # sum (r_xy * r_x'y) for common ys
#     cdef np.ndarray[np.double_t, ndim=2] prods
#     # number of common ys
#     cdef np.ndarray[np.int_t, ndim=2] freq
#     # sum (r_xy ^ 2) for common ys
#     cdef np.ndarray[np.double_t, ndim=2] sqi
#     # sum (r_x'y ^ 2) for common ys
#     cdef np.ndarray[np.double_t, ndim=2] sqj
#     # the similarity matrix
#     cdef np.ndarray[np.double_t, ndim=2] sim

#     cdef int xi, xj
#     cdef double ri, rj
#     cdef int min_sprt = min_support

    prods = np.zeros((n_x, n_x), np.double)
    freq = np.zeros((n_x, n_x), np.int)
    sqi = np.zeros((n_x, n_x), np.double)
    sqj = np.zeros((n_x, n_x), np.double)
    sim = np.zeros((n_x, n_x), np.double)

    for y, y_ratings in iteritems(yr):
        for xi, ri in y_ratings:
            for xj, rj in y_ratings:
                freq[xi, xj] += 1
                prods[xi, xj] += ri * rj
                sqi[xi, xj] += ri**2
                sqj[xi, xj] += rj**2

    for xi in range(n_x):
        sim[xi, xi] = 1
        for xj in range(xi + 1, n_x):
            if freq[xi, xj] < min_sprt:
                sim[xi, xj] = 0
            else:
                denum = np.sqrt(sqi[xi, xj] * sqj[xi, xj])
                sim[xi, xj] = prods[xi, xj] / denum

            sim[xj, xi] = sim[xi, xj]

    return sim
    

In [506]:
from sklearn.metrics.pairwise import cosine_similarity

def ww_sim(s1, s2):
    """Calculate topn most similar words to word"""
#     indx = tok2indx[word]
    s1list = []
    for x in s1:
        indx = tok2indx[x]
        s1list.append(indx)
        
    s2list = []

    for y in s2:
        indy = tok2indx[y]
        s2list.append(indy)
#     if isinstance(mat, sparse.csr_matrix):
#         v1 = mat.getrow(indx)
#     else:
#         v1 = mat[indx:indx+1, :]
    sims = cosine_similarity(np.array(s1list).reshape(-1, 1), np.array(s2list).reshape(-1, 1))
#     sindxs = np.argsort(-sims)
#     sim_word_scores = [(indx2tok[sindx], sims[sindx]) for sindx in sindxs[0:topn]]
    return sims

In [300]:
p_c = cooc/cooc.sum(axis=0).sum()
p_c_w = cooc
for x in range(len(p_c_w)):
    p_c_w[x,:] = p_c_w[x,:]/p_c_w[x,:].sum()

p_c_w

  after removing the cwd from sys.path.


array([[0.89130435, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.84375   , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan],
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan],
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan]])

In [302]:
item2indx = dict()
item_counts = Counter()

for token in df:
    item = token[0]
    item_counts[item] += 1
    if item not in item2indx:
        item2indx[item] = len(item2indx)
indx2item = {indx:tok for tok,indx in item2indx.items()}
print('vocabulary size: {}'.format(len(item_counts)))
print('most common: {}'.format(item_counts.most_common(10)))

vocabulary size: 8
most common: [('0000031887', 23), ('B00004SR8Z', 17), ('1617160377', 16), ('1608299953', 13), ('B00004SR8W', 12), ('B00001W0KA', 8), ('0123456479', 6), ('B00001WRHJ', 6)]


In [305]:
back_window = 2
front_window = 2
skipgram_counts = Counter()

for ifw, fw in enumerate(df):
        icw_min = max(0, ifw - back_window)
        icw_max = min(len(filtered_words) - 1, ifw + front_window)
        icws = [ii for ii in range(icw_min, icw_max + 1) if ii != ifw]
        
        for d in reviewsPerUser[user]:
        i2 = d['product_id']
        if i2 == item: continue
        ratings.append(d['star_rating'])
        similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return sum(weightedRatings) / sum(similarities)
    else:
        # User hasn't rated any similar items
        return ratingMean
    
        for icw in icws:
            skipgram = (filtered_words[ifw], filtered_words[icw])
            skipgram_counts[skipgram] += 1

        
print('number of skipgrams: {}'.format(len(skipgram_counts)))
print('most common: {}'.format(skipgram_counts.most_common(10)))

[('0000031887', 23),
 ('B00004SR8Z', 17),
 ('1617160377', 16),
 ('1608299953', 13),
 ('B00004SR8W', 12),
 ('B00001W0KA', 8),
 ('0123456479', 6),
 ('B00001WRHJ', 6)]

In [342]:
from pyspark.sql import *
from pyspark import SparkContext,SparkConf

def create_sc(pyFiles):
    sc_conf = SparkConf()
    sc_conf.setAppName("RecomenderAmazon")
    sc_conf.set('spark.executor.memory', '3g')
    sc_conf.set('spark.executor.cores', '1')
    sc_conf.set('spark.cores.max', '4')
    sc_conf.set('spark.default.parallelism','10')
    sc_conf.set('spark.logConf', True)
    print(sc_conf.getAll())

    sc = SparkContext(conf=sc_conf,pyFiles=pyFiles)

    return sc 

sc = create_sc(pyFiles=[])

sqlContext = SQLContext(sc)


dict_items([('spark.app.name', 'RecomenderAmazon'), ('spark.executor.memory', '3g'), ('spark.executor.cores', '1'), ('spark.cores.max', '4'), ('spark.default.parallelism', '10'), ('spark.logConf', 'True')])


In [362]:
df

[['0000031887', 'A1KLRMWW2FWPL4', 5.0],
 ['0000031887', 'A2G5TCU2WDFZ65', 5.0],
 ['0000031887', 'A1RLQXYNCMWRWN', 5.0],
 ['0000031887', 'A8U3FAMSJVHS5', 5.0],
 ['0000031887', 'A3GEOILWLK86XM', 5.0],
 ['0000031887', 'A27UF1MSF3DB2', 4.0],
 ['0000031887', 'A16GFPNVF4Y816', 5.0],
 ['0000031887', 'A2M2APVYIB2U6K', 5.0],
 ['0000031887', 'A1NJ71X3YPQNQ9', 4.0],
 ['0000031887', 'A3EERSWHAI6SO', 5.0],
 ['0000031887', 'AX1QE6IR7CHXM', 5.0],
 ['0000031887', 'A2A2WZYLU528RO', 5.0],
 ['0000031887', 'A391EXIT5TFP72', 5.0],
 ['0000031887', 'A34ATJR9KFIXL9', 5.0],
 ['0000031887', 'AJ6B83I4YJHYW', 5.0],
 ['0000031887', 'A26A4KKLAVTMCC', 3.0],
 ['0000031887', 'A1MXJVYXE2QU6H', 5.0],
 ['0000031887', 'A2XJ13PIXVJFJH', 1.0],
 ['0000031887', 'A287XY94U7JDM8', 5.0],
 ['0000031887', 'A1JR9KKF6UKUWW', 5.0],
 ['0000031887', 'A2SX7X8QGQVCJU', 5.0],
 ['0000031887', 'A2Q6UWR5PBHTJS', 5.0],
 ['0000031887', 'AJZPHJR944IBB', 4.0],
 ['0123456479', 'A2WNN1DQVL4LH5', 5.0],
 ['0123456479', 'A1ZPOCG2ST2CY3', 5.0],
 ['012

In [363]:
# df=sqlContext.read.parquet(parquet_path)

# RDDin
# RDD=RDDin.map(lambda v:np.array(np.insert(v,0,1),dtype=np.float64))
# df.to_parquet('df.parquet.gzip',
#               compression='gzip')  

# df.append(['0123456479', 'A1KLRMWW2FWPL4', 5.0])
# df.append(['1608299953', 'A1KLRMWW2FWPL4', 4.0])
# df.append(['1617160377', 'A2G5TCU2WDFZ65', 5.0])
sdf = sqlContext.createDataFrame(df, ("item", "user", "rating"))
sdf.show()

+----------+--------------+------+
|      item|          user|rating|
+----------+--------------+------+
|0000031887|A1KLRMWW2FWPL4|   5.0|
|0000031887|A2G5TCU2WDFZ65|   5.0|
|0000031887|A1RLQXYNCMWRWN|   5.0|
|0000031887| A8U3FAMSJVHS5|   5.0|
|0000031887|A3GEOILWLK86XM|   5.0|
|0000031887| A27UF1MSF3DB2|   4.0|
|0000031887|A16GFPNVF4Y816|   5.0|
|0000031887|A2M2APVYIB2U6K|   5.0|
|0000031887|A1NJ71X3YPQNQ9|   4.0|
|0000031887| A3EERSWHAI6SO|   5.0|
|0000031887| AX1QE6IR7CHXM|   5.0|
|0000031887|A2A2WZYLU528RO|   5.0|
|0000031887|A391EXIT5TFP72|   5.0|
|0000031887|A34ATJR9KFIXL9|   5.0|
|0000031887| AJ6B83I4YJHYW|   5.0|
|0000031887|A26A4KKLAVTMCC|   3.0|
|0000031887|A1MXJVYXE2QU6H|   5.0|
|0000031887|A2XJ13PIXVJFJH|   1.0|
|0000031887|A287XY94U7JDM8|   5.0|
|0000031887|A1JR9KKF6UKUWW|   5.0|
+----------+--------------+------+
only showing top 20 rows



In [367]:
sqlContext.registerDataFrameAsTable(sdf,'ratings')

Query="""
SELECT item, count(item) as count 
FROM ratings 
GROUP BY item
ORDER BY count desc
"""
counts=sqlContext.sql(Query)
counts.show()

+----------+-----+
|      item|count|
+----------+-----+
|0000031887|   23|
|1617160377|   17|
|B00004SR8Z|   17|
|1608299953|   14|
|B00004SR8W|   12|
|B00001W0KA|    8|
|0123456479|    7|
|B00001WRHJ|    6|
+----------+-----+



In [366]:
Query2="""
SELECT user, count(user) as count 
FROM ratings 
GROUP BY user
ORDER BY count desc
"""
counts2=sqlContext.sql(Query2)
counts2.show()

+--------------+-----+
|          user|count|
+--------------+-----+
|A1KLRMWW2FWPL4|    3|
|A3U6J0DLLDEWM2|    2|
|A2G5TCU2WDFZ65|    2|
| ACVB24PN5KS0A|    1|
|A1JR9KKF6UKUWW|    1|
|A2TYOUN7A2UVAU|    1|
|A3I1BJIFFM4S21|    1|
|A11EKRR0LDL893|    1|
|A287XY94U7JDM8|    1|
|A2JCJJNY43QQIV|    1|
|A16GFPNVF4Y816|    1|
|A3D5B2CBFNJB0K|    1|
|A1UWE64CPO197G|    1|
|A3RB1JA72Z0TUN|    1|
|A2BN9DIBP9A0XG|    1|
|A13XISH6C2F7L9|    1|
|A1GNYV0RA0EQSS|    1|
|A11Z8RVUCDYJAC|    1|
| AZAC8O310IK4E|    1|
|A3GEOILWLK86XM|    1|
+--------------+-----+
only showing top 20 rows



In [393]:
Query3="""
SELECT item, rating from ratings 
where user = 'A1KLRMWW2FWPL4' and item <>'0000031887'
"""
counts3=sqlContext.sql(Query3)
counts3.show()

Query3="""
SELECT user, rating from ratings 
where item in ('0123456479', '1608299953')
"""
counts3=sqlContext.sql(Query3)
counts3.show()

# ratings.append(d['star_rating'])
#         similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))

+----------+------+
|      item|rating|
+----------+------+
|0123456479|   5.0|
|1608299953|   4.0|
+----------+------+

+--------------+------+
|          user|rating|
+--------------+------+
|A2WNN1DQVL4LH5|   5.0|
|A1ZPOCG2ST2CY3|   5.0|
|A1WDJIFL2F4QJA|   5.0|
|A3RB1JA72Z0TUN|   3.0|
|A1JC50F14SLAEV|   3.0|
| A8WD29DKN8ZC3|   4.0|
|A1F7YU6O5RU432|   5.0|
|A3INPLAFCMRI3I|   4.0|
|A3NHUQ33CFH3VM|   4.0|
|A253TILLU81VZK|   5.0|
|A28QH3KX709FFQ|   1.0|
|A1R377IPZOKLMM|   5.0|
|A3U6J0DLLDEWM2|   5.0|
|A1ZU55TM45Y2R8|   4.0|
| A1NIGUK4M2ST8|   4.0|
| AQY5XBYSENNZQ|   4.0|
|A1GNYV0RA0EQSS|   5.0|
|A3D5B2CBFNJB0K|   5.0|
|A35V32HZEGZH04|   5.0|
|A1KLRMWW2FWPL4|   5.0|
+--------------+------+
only showing top 20 rows



In [413]:
import numpy as np
from numpy import linalg as LA

def outerProduct(X):
    """Computer outer product and indicate which locations in matrix are undefined"""
    O=np.outer(X,X)
    N=1-np.isnan(O)
    return (O,N)

def sumWithNan(M1,M2):
    """Add two pairs of (matrix,count)"""
    (X1,N1)=M1
    (X2,N2)=M2
    N=N1+N2
    X=np.nansum(np.dstack((X1,X2)),axis=2)
    return (X,N)


def HW_func(S,N):
    E=      np.ones([365]) # E is the sum of the vectors
    NE=     np.ones([365]) # NE is the number of not-nan antries for each coordinate of the vectors
    Mean=   np.ones([365]) # Mean is the Mean vector (ignoring nans)
    O=      np.ones([365,365]) # O is the sum of the outer products
    NO=     np.ones([365,365]) # NO is the number of non-nans in the outer product.
    return  E,NE,Mean,O,NO

In [419]:
def computeCov(RDDin):
    """computeCov recieves as input an RDD of np arrays, all of the same length, 
    and computes the covariance matrix for that set of vectors"""
    RDD=RDDin.map(lambda v:np.array(np.insert(v,0,1),dtype=np.float64)) # insert a 1 at the beginning of each vector so that the same 
                                           #calculation also yields the mean vector
    OuterRDD=RDD.map(outerProduct)   # separating the map and the reduce does not matter because of Spark uses lazy execution.
    (S,N)=OuterRDD.reduce(sumWithNan)

    E,NE,Mean,O,NO=HW_func(S,N)

    Cov=O/NO - np.outer(Mean,Mean)
    # Output also the diagnal which is the variance for each day
    Var=np.array([Cov[i,i] for i in range(Cov.shape[0])])
    return {'E':E,'NE':NE,'O':O,'NO':NO,'Cov':Cov,'Mean':Mean,'Var':Var}

RDD=sc.parallelize(df)
RDD = RDD.map(lambda x: x[2])
OUT=computeCov(RDD)

eigval,eigvec=LA.eig(OUT['Cov'])
print('eigval=',eigval)
print('eigvec=',eigvec)

eigval= [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 

In [None]:
def computeOverAllDist(rdd0):
    UnDef=np.array(rdd0.map(lambda row:sum(np.isnan(row))).sample(False,0.01).collect())
    flat=rdd0.flatMap(lambda v:list(v)).filter(lambda x: not np.isnan(x)).cache()
    count,S1,S2=flat.map(lambda x: np.float64([1,x,x**2]))\
                  .reduce(lambda x,y: x+y)
    mean=S1/count
    std=np.sqrt(S2/count-mean**2)
    Vals=flat.sample(False,0.0001).collect()
    SortedVals=np.array(sorted(Vals))
    low100,high100=find_percentiles(SortedVals,100)
    low1000,high1000=find_percentiles(SortedVals,1000)
    return {'UnDef':UnDef,\
          'mean':mean,\
          'std':std,\
          'SortedVals':SortedVals,\
          'low100':low100,\
          'high100':high100,\
          'low1000':low100,\
          'high1000':high1000
          }



In [407]:

# sdf.select('rating').filter("user == 'A1KLRMWW2FWPL4'").groupBy('user').show()
def packArray(a):
    """
    pack a numpy array into a bytearray that can be stored as a single 
    field in a spark DataFrame

    :param a: a numpy ndarray 
    :returns: a bytearray
    :rtype:

    """
    if type(a)!=np.ndarray:
        raise Exception("input to packArray should be numpy.ndarray. It is instead "+str(type(a)))
    return bytearray(a.tobytes())

In [352]:
# from numpy_pack import packArray,unpackArray
def unpackArray(x,data_type=np.int16):
    """
    unpack a bytearray into a numpy.ndarray

    :param x: a bytearray
    :param data_type: The dtype of the array. This is important because if determines how many bytes go into each entry in the array.
    :returns: a numpy array
    :rtype: a numpy ndarray of dtype data_type.

    """
    return np.frombuffer(x,dtype=data_type)

data=mdf.rdd.map(lambda row: unpackArray(row['Values'],np.float16))


ModuleNotFoundError: No module named 'numpy_pack'