In [1]:
#### To measure all running time
# https://github.com/cpcloud/ipython-autotime

%load_ext autotime

In [2]:
import gzip
from collections import defaultdict
import scipy
import scipy.optimize
import numpy
import random
import pandas as pd
import json
import numpy as np
import time
import csv
from collections import Counter


time: 661 ms


In [3]:
DATA_DIR = './Dataset/'

# fn_meta = 'meta_Clothing_Shoes_and_Jewelry.json.gz'
# fn_reviews = 'reviews_Clothing_Shoes_and_Jewelry_5.json.gz'
# sample = 'sample_data_1M.json.gz'

# download file from http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Clothing_Shoes_and_Jewelry.csv
# and run gzip to compress
fn_ratings = 'Clothing_Shoes_and_Jewelry.csv.gz'

path = DATA_DIR + fn_ratings

ENV = 'local'


time: 817 µs


In [4]:
# colnames=['user_id', 'product_id', 'rating'] 
# rating_df = pd.read_csv(path, names=colnames, header=None,  compression='gzip')

time: 788 µs


In [6]:
def parse(path):
    for line in gzip.open(path, 'r'):
        yield json.loads(line)

time: 1.97 ms


### For 5-core ( start ) review 

In [7]:
DATA_DIR = './Dataset/'
fn_5core = 'Clothing_Shoes_and_Jewelry_5.json.gz'
path = DATA_DIR + fn_5core
print(path)

BATCH_SIZE = 100000

i = 0

dataset = []
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list) 

for line in parse(path):
    d = dict()
    d['user_id'] = line['reviewerID']
    d['product_id'] = line['asin']
    d['rating'] = int(line['overall'])
    dataset.append(d)
    i += 1
    if i > BATCH_SIZE:
        break
    
for d in dataset:
    user,item = d['user_id'], d['product_id']
    reviewsPerUser[user].append(d)
    reviewsPerItem[item].append(d)
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)

dataset[10]

./Dataset/Clothing_Shoes_and_Jewelry_5.json.gz


{'user_id': 'AZI75OKBKZ98R', 'product_id': '0871167042', 'rating': 5}

time: 1.18 s


In [8]:
N = len(dataset)
nUsers = len(reviewsPerUser)
nItems = len(reviewsPerItem)

#Getting a list of keys
users = list(reviewsPerUser.keys())
items = list(reviewsPerItem.keys())

#This is equivalent to our Rating Mean from week 1
alpha = sum([d['rating'] for d in dataset]) / len(dataset)

#Create another two defaultdict's, this time being float types because they are prediction based
userBiases = defaultdict(float)
itemBiases = defaultdict(float)

def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

time: 6.81 ms


In [9]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer / denom

time: 2.24 ms


In [10]:
def predictRating(user,item):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['product_id']
        if i2 == item: continue
        ratings.append(d['rating'])
        similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return sum(weightedRatings) / sum(similarities)
    else:
        # User hasn't rated any similar items
        return alpha

time: 5.12 ms


In [11]:
class Logger():
    def __init__(self):
        self.STATUS = 'OFF'
        self.START_TIME = None
        self.END_TIME = None
        self.EXECUTION_TIME = None
        self.LOGS = []
        self.MODEL = None
        self.SCORE = None
        self.STAT = None
        
    def start(self, model=None, stat=None, score=None):
        self.START_TIME = time.time()
        self.STATUS = 'ON'
        if model:
            self.MODEL = model
            self.LOGS.append("Model: {m}".format(m=model))
        if stat:
            self.STAT = stat
            self.LOGS.append("Statistic: {s}".format(s=stat))
        if score:
            self.SCORE = score
            self.LOGS.append("Score: {s}".format(s=score))
        
    def end(self, display=True, score=None):
        if self.STATUS == 'OFF':
            print("No timer started.")
        else:
            self.END_TIME = time.time()
            self.EXECUTION_TIME = self.END_TIME - self.START_TIME
            self.LOGS.append("Time: {t}".format(t=self.EXECUTION_TIME))
            if score:
                self.SCORE = score
                self.LOGS.append("Score: {s}".format(s=score))
            if display == True:
                self.getStats(last=False)
            else:
                r = self.LOGS
                self.tearDown()
                return r
            self.tearDown()
    
    def tearDown(self):
        self.STATUS = 'OFF'
        self.LOGS = []
        
    def getStats(self, show=True, last=True):
        if show == True:   
            if last == True:
                print("STATUS: {v}".format(v=self.STATUS))
                print("START_TIME: {v}".format(v=self.START_TIME))
                print("END_TIME: {v}".format(v=self.END_TIME))
                print("EXECUTION_TIME: {v}".format(v=self.EXECUTION_TIME))
                print("MODEL: {v}".format(v=self.MODEL))
                print("STAT: {v}".format(v=self.STAT))
                print("SCORE: {v}".format(v=self.SCORE))
            else:
                for l in self.LOGS:
                    print(l)
        else:
            return self.MODEL, self.STAT, self.SCORE, self.EXECUTION_TIME

        
timer = Logger()

time: 7.29 ms


In [12]:
labels = [d['rating'] for d in dataset]

time: 17.1 ms


In [20]:
# baseline
alwaysPredictMean = [alpha for d in dataset]
labels = [d['rating'] for d in dataset]
MSE(alwaysPredictMean, labels)

cfPredictions = [predictRating(d['user_id'], d['product_id']) for d in dataset] 

print(MSE(alwaysPredictMean, labels))
print(MSE(cfPredictions, labels))
print(MSE(alwaysPredictMean, labels), MSE(cfPredictions, labels)) 
print()
timer.start(model='Baseline', stat='MSE', score=MSE(alwaysPredictMean, labels))
alwaysPredictMean = [alpha for d in dataset]
timer.end()
print()

1.2160867173437198
0.9359891765231052
1.2160867173437198 0.9359891765231052

Model: Baseline
Statistic: MSE
Score: 1.2160867173437198
Time: 0.0025463104248046875

time: 13.8 s


### Heuristic 

- Heuristic analysis is an expert based analysis that determines the susceptibility of a system towards particular threat/risk using various decision rules or weighing methods. MultiCriteria analysis (MCA) is one of the means of weighing.

- https://en.wikipedia.org/wiki/Heuristic_analysis

In [21]:
score=MSE(cfPredictions, labels)
cfPredictions = [predictRating(d['user_id'], d['product_id']) for d in dataset]
print("Mode: Weighted Ratings Heuristic")
print("Score by MSE: ", score)

Mode: Weighted Ratings Heuristic
Score by MSE:  0.9359891765231052
time: 13.8 s


In [22]:
def mostSimilar(item, n):
    similarities = []
    users = usersPerItem[item]
    for i2 in usersPerItem:
        if i2 == item: continue
        sim = Jaccard(users, usersPerItem[i2])
        similarities.append([sim,i2])
    similarities.sort(reverse=True)
    return similarities[:n]

def mostSimilarFast(item, n):
    similarities = []
    users = usersPerItem[item]
    candidateItems = set()
    for u in users:
        candidateItems = candidateItems.union(itemsPerUser[u])
    for i2 in candidateItems:
        if i2 == item: continue
        sim = Jaccard(users, usersPerItem[i2])
        similarities.append([sim, i2])
    similarities.sort(reverse=True)
    return similarities[:n]

# Test Params
n = 10 
idx = 101 
query = dataset[idx]['product_id']

print("Index: {i}".format(i=idx))
print("ProductID: {q}".format(q=query))
print("Number Matches: {i}".format(i=n))

Index: 101
ProductID: 3979050432
Number Matches: 10
time: 3.89 ms


In [23]:
timer.start(model='Most Similar', stat='Jaccard Similarity')
sims1 = mostSimilar(query, n)
timer.end(display=True)
sims1

Model: Most Similar
Statistic: Jaccard Similarity
Time: 0.025446176528930664


[[0.009174311926605505, 'B00001T38Y'],
 [0.003003003003003003, 'B0000WL1Q0'],
 [0.002680965147453083, 'B0000WL0XY'],
 [0.0010706638115631692, 'B0000DCS5T'],
 [0.0008880994671403197, 'B0001YR54E'],
 [0.0005558643690939411, 'B00009ZM7Z'],
 [0.0005518763796909492, 'B0000CBALZ'],
 [0.00014490653528474135, 'B00028AZ6E'],
 [0.00014490653528474135, 'B0001YRFS0'],
 [0.0, 'B0002NZ898']]

time: 30.5 ms


In [24]:
timer.start(model='Most Similar Optimized', stat='Jaccard Similarity')
sims2 = mostSimilarFast(query, n)
timer.end(display=True)
sims2

Model: Most Similar Optimized
Statistic: Jaccard Similarity
Time: 0.004246950149536133


[[0.009174311926605505, 'B00001T38Y'],
 [0.003003003003003003, 'B0000WL1Q0'],
 [0.002680965147453083, 'B0000WL0XY'],
 [0.0010706638115631692, 'B0000DCS5T'],
 [0.0008880994671403197, 'B0001YR54E'],
 [0.0005558643690939411, 'B00009ZM7Z'],
 [0.0005518763796909492, 'B0000CBALZ'],
 [0.00014490653528474135, 'B00028AZ6E'],
 [0.00014490653528474135, 'B0001YRFS0']]

time: 13.6 ms


In [25]:
df = pd.DataFrame(dataset)
X = df[['user_id', 'product_id']]
y = df[['rating']]
df.head()

Unnamed: 0,user_id,product_id,rating
0,A2IC3NZN488KWK,871167042,5
1,A30FG02C424EJ5,871167042,5
2,A2G9GWQEWWNQUB,871167042,5
3,A3NI5OGW35SLY2,871167042,5
4,A1OPRA4NE56EV6,871167042,5


time: 95.7 ms


# Collaberative filtering 

* Product Similarity recommedation
* User Similarity recomendation

This model uses historical user/item ratings that are similar to predict ratings.

In [26]:
# !pip install surprise

time: 1.1 ms


In [27]:
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
import os
from surprise.model_selection import train_test_split

#Reading the dataset
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df,reader)

time: 388 ms


In [28]:
#Splitting the dataset
trainset, testset = train_test_split(data, 
                                     test_size=0.3,
                                     random_state=11, 
                                     shuffle=True)

time: 142 ms


### Use user_based true/false to switch between user-based or item-based collaborative filtering

In [29]:
timer.start(model='Product KNN', stat='MSE')
algo = KNNWithMeans(k=5, sim_options={'name': 'cosine', 'user_based': False})
algo.fit(trainset)
test_pred = algo.test(testset)
acc = accuracy.mse(test_pred, verbose=False)
timer.end()
print("Score: ", acc)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Model: Product KNN
Statistic: MSE
Time: 0.31182003021240234
Score:  0.9860290835457611
time: 316 ms


In [None]:
timer.start(model='User KNN', stat='MSE')
algo = KNNWithMeans(k=5, sim_options={'name': 'cosine', 'user_based': True})
algo.fit(trainset)
test_pred = algo.test(testset)
acc = accuracy.mse(test_pred, verbose=False)
timer.end()
print("Score: ", acc)

### Script to create batch files with 1M records

In [None]:
item2indx = dict()
user2indx = dict()
review2indx = dict()
metadata = {}
user_counts = Counter()
item_counts = Counter()
review_counts = Counter()

idx = 0
idxs = 0
BATCH_SIZE=1000000
nBatches = 24
cBatch = 0
maxid = idx + BATCH_SIZE
f = gzip.open(path, 'rt', encoding="utf8")
header = ['userID', 'itemID', 'rating']

for line in f:
    fields = line.strip().split(',')
    d = dict(zip(header, fields))

    user, item, rating = d['itemID'], d['userID'], int(d['rating'][0])

    if user not in user2indx:
        user2indx[user] = len(user2indx)
    if item not in item2indx:
        item2indx[item] = len(item2indx)

    userid, itemid = user2indx[user], item2indx[item]
    user_counts[userid] += 1
    item_counts[itemid] += 1
    review_counts[rating] += 1

    if itemid < minItemId:
        minItemId = itemid
    if itemid > maxItemId:
        maxItemId = itemid

    reviewIdxs.append([userid, itemid, rating])
    idx += 1

    if idx >= maxid:
        try:
            with open(DATA_DIR + 'Clothing_Shoes_and_Jewelry/items/' + str(minItemId) + "_" + str(maxItemId) + ".csv", 'w') as batch2:
                batch_writer2 = csv.writer(batch2)   
                print(minItemId, maxItemId)
                batch_writer2.writerows(reviewIdxs)
            
        except csv.Error as e:
            print(e)

        finally:
            batch2.close()
            reviewIdxs = []
            maxid = idx + BATCH_SIZE
            minItemId = 9999999999999999
            maxItemId = 0        
            cBatch += 1
            if maxid >= 32292099 + 100000:
                maxid = 32292099
                cBatch = 23
                print('maxbatch')
            if cBatch >= nBatches:
                break  

In [None]:
# Example data sets
pd.read_csv(DATA_DIR + 'Clothing_Shoes_and_Jewelry/items/14022_609843.csv', header=None, names=['userID', 'itemID', 'rating'])


In [None]:
user2indx
user_counts
item_counts
len(item2indx)
print('i size: {}'.format(len(item_counts)))
print('u size: {}'.format(len(user_counts)))
print('r size: {}'.format(len(review_counts)))
print('most common: {}'.format(item_counts.most_common(10)))
indx2item = {indx:itm for itm,indx in item2indx.items()}
index2user = {indx:itm for itm,indx in user2indx.items()}
with open(DATA_DIR + 'Clothing_Shoes_and_Jewelry/items/id_to_item.json', 'w') as f:
    json.dump(indx2item, f)

In [None]:
new_df1=dfs.head(10000)
ratings_matrix = new_df1.pivot_table(values='rating', index='user_id', columns='product_id', fill_value=0)
ratings_matrix

# Notes / Scratch code

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)


In [None]:
# metadata = {}
# metadata['X_train'] = {
#     "users": X_train['user_id'].count(),
#     "items": X_train['product_id'].count()
# }
# metadata['y_train'] = {
#     "ratings": y_train['rating'].count()
# }
# metadata['X_valid'] = {
#     "users": X_valid['user_id'].count(),
#     "items": X_valid['product_id'].count()
# }
# metadata['y_valid'] = {
#     "ratings": y_valid['rating'].count()
# }
# metadata['X_test'] = {
#     "users": X_test['user_id'].count(),
#     "items": X_test['product_id'].count()
# }
# metadata['y_test'] = {
#     "ratings": y_test['rating'].count()
# }
# metadata

def load(data):
    """
    takes the training file no and return training and test data
    Ex. fileno = 1 for u1.base and u1.test
        fileno = 5 for u5.base and u5.test
    """
#     dataset = filename
#     df = pd.DataFrame(dataset)
#     X = df[['user_id', 'product_id']]
#     y = df[['rating']]
#     df.head()
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
#     X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

    # userid songid user-rating
#     basedir = "../dataset/ml-100k/u%s." % (fileno)
#     with open(basedir + 'base') as f:
#         training = np.loadtxt(f)
#     with open(basedir + 'test') as f:
#         test = np.loadtxt(f)    
#     with open('../dataset/ml-100k/u.info') as f:
#         metafile = np.genfromtxt(f)
#     metafile = np.delete(metafile, len(metafile[0]) - 1, 1)

    metadata = {}
    metadata['users'] = data['user_id'].nunique() 
    metadata['items'] = data['product_id'].nunique() 
    metadata['ratings'] = data['rating'].nunique() 
    return data, metadata

data, metadata = load(df)


In [None]:
def constructRatingMatrix(data, metadata):
    user = int(metadata['users'])
    item = int(metadata['items'])
    # ratingMatrix = np.zeros((metadata['users'], metadata['items']))
    ratingMatrix = np.zeros((user, item))
    for i in data:
        
        ratingMatrix[int(i[0])-1][int(i[1])-1] = i[2] 
    return ratingMatrix



ratingMatrix = constructRatingMatrix(data, metadata)
ratingMatrix

In [None]:
d = np.array(X_train)
for i in d:
#     print(i-2)
    print(i[0])
    print(i[1])
    break

In [None]:
arr
# dataArray
ratingMatrix.shape

In [None]:
timer.start()
def makeDic(data):
    dataDic = {}
    for i in data:
        dataDic.setdefault(i[0], {}).update({i[1]: i[2]})
    return dataDic

dataArray = np.array(df)
dataDic = makeDic(dataArray)
timer.end()
len(dataDic)

In [None]:
timer.start()
def makeArray(dic):
    arr = []
    for user,items in dic.items():
        for k,v in items.items():
            row = []
            row.append(user)
            row.append(k)
            row.append(v)
            arr.append(row)
    return arr

arr = makeArray(dataDic)
timer.end()
len(arr)

In [None]:
le = preprocessing.LabelEncoder()
le.fit(dfs['user_id'])
usersEnc = le.transform(dfs['user_id'])
le.classes_

le2 = preprocessing.LabelEncoder()
le2.fit(dfs['product_id'])
productsEnc = le2.transform(dfs['product_id'])
le2.classes_

le3 = preprocessing.LabelEncoder()
le3.fit(dfs['rating'])
ratingsEnc = le3.transform(dfs['rating'])
le3.classes_

In [None]:
mat = np.array([usersEnc, productsEnc, ratingsEnc])
mat = mat.T
print(mat.shape)
mat

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

simmat = cosine_similarity(mat).flatten()
simmat

In [None]:
X = ratings_matrix.T
X

In [None]:
#Decomposing the Matrix
from sklearn.decomposition import TruncatedSVD
SVD = TruncatedSVD(n_components=10)
decomposed_matrix = SVD.fit_transform(X)
decomposed_matrix.shape

In [None]:
#Correlation Matrix

correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape

In [None]:
# trainset, testset = train_test_split(data, test_size=0.3,random_state=10)
X.index[75]

i = "B0001ZNZF6"

product_names = list(X.index)
product_ID = product_names.index(i)
product_ID


In [None]:
correlation_product_ID = correlation_matrix[product_ID]
correlation_product_ID.shape

In [None]:
Recommend = list(X.index[correlation_product_ID > 0.65])

# Removes the item already bought by the customer
Recommend.remove(i) 

Recommend[0:24]

In [None]:
# Use user_based true/false to switch between user-based or item-based collaborative filtering
algo = KNNWithMeans(k=5, sim_options={'name': 'cosine', 'user_based': True})
algo.fit(trainset)

In [None]:
# run the trained model against the testset
test_pred = algo.test(testset)

print("User-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)

In [None]:
# run the trained model against the testset
test_pred = algo.test(testset)

print("User-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)

In [None]:
# get RMSE
print("Item-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)

In [None]:
def cosine(n_x, yr, min_support):
    # sum (r_xy * r_x'y) for common ys
#     cdef np.ndarray[np.double_t, ndim=2] prods
#     # number of common ys
#     cdef np.ndarray[np.int_t, ndim=2] freq
#     # sum (r_xy ^ 2) for common ys
#     cdef np.ndarray[np.double_t, ndim=2] sqi
#     # sum (r_x'y ^ 2) for common ys
#     cdef np.ndarray[np.double_t, ndim=2] sqj
#     # the similarity matrix
#     cdef np.ndarray[np.double_t, ndim=2] sim

#     cdef int xi, xj
#     cdef double ri, rj
#     cdef int min_sprt = min_support

    prods = np.zeros((n_x, n_x), np.double)
    freq = np.zeros((n_x, n_x), np.int)
    sqi = np.zeros((n_x, n_x), np.double)
    sqj = np.zeros((n_x, n_x), np.double)
    sim = np.zeros((n_x, n_x), np.double)

    for y, y_ratings in iteritems(yr):
        for xi, ri in y_ratings:
            for xj, rj in y_ratings:
                freq[xi, xj] += 1
                prods[xi, xj] += ri * rj
                sqi[xi, xj] += ri**2
                sqj[xi, xj] += rj**2

    for xi in range(n_x):
        sim[xi, xi] = 1
        for xj in range(xi + 1, n_x):
            if freq[xi, xj] < min_sprt:
                sim[xi, xj] = 0
            else:
                denum = np.sqrt(sqi[xi, xj] * sqj[xi, xj])
                sim[xi, xj] = prods[xi, xj] / denum

            sim[xj, xi] = sim[xi, xj]

    return sim
    

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def ww_sim(s1, s2):
    """Calculate topn most similar words to word"""
#     indx = tok2indx[word]
    s1list = []
    for x in s1:
        indx = tok2indx[x]
        s1list.append(indx)
        
    s2list = []

    for y in s2:
        indy = tok2indx[y]
        s2list.append(indy)
#     if isinstance(mat, sparse.csr_matrix):
#         v1 = mat.getrow(indx)
#     else:
#         v1 = mat[indx:indx+1, :]
    sims = cosine_similarity(np.array(s1list).reshape(-1, 1), np.array(s2list).reshape(-1, 1))
#     sindxs = np.argsort(-sims)
#     sim_word_scores = [(indx2tok[sindx], sims[sindx]) for sindx in sindxs[0:topn]]
    return sims

In [None]:
p_c = cooc/cooc.sum(axis=0).sum()
p_c_w = cooc
for x in range(len(p_c_w)):
    p_c_w[x,:] = p_c_w[x,:]/p_c_w[x,:].sum()

p_c_w

In [None]:
item2indx = dict()
item_counts = Counter()

for token in df:
    item = token[0]
    item_counts[item] += 1
    if item not in item2indx:
        item2indx[item] = len(item2indx)
indx2item = {indx:tok for tok,indx in item2indx.items()}
print('vocabulary size: {}'.format(len(item_counts)))
print('most common: {}'.format(item_counts.most_common(10)))

In [None]:
back_window = 2
front_window = 2
skipgram_counts = Counter()

for ifw, fw in enumerate(df):
        icw_min = max(0, ifw - back_window)
        icw_max = min(len(filtered_words) - 1, ifw + front_window)
        icws = [ii for ii in range(icw_min, icw_max + 1) if ii != ifw]
        
        for d in reviewsPerUser[user]:
        i2 = d['product_id']
        if i2 == item: continue
        ratings.append(d['star_rating'])
        similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return sum(weightedRatings) / sum(similarities)
    else:
        # User hasn't rated any similar items
        return ratingMean
    
        for icw in icws:
            skipgram = (filtered_words[ifw], filtered_words[icw])
            skipgram_counts[skipgram] += 1

        
print('number of skipgrams: {}'.format(len(skipgram_counts)))
print('most common: {}'.format(skipgram_counts.most_common(10)))

In [None]:
from pyspark.sql import *
from pyspark import SparkContext,SparkConf

def create_sc(pyFiles):
    sc_conf = SparkConf()
    sc_conf.setAppName("RecomenderAmazon")
    sc_conf.set('spark.executor.memory', '3g')
    sc_conf.set('spark.executor.cores', '1')
    sc_conf.set('spark.cores.max', '4')
    sc_conf.set('spark.default.parallelism','10')
    sc_conf.set('spark.logConf', True)
    print(sc_conf.getAll())

    sc = SparkContext(conf=sc_conf,pyFiles=pyFiles)

    return sc 

sc = create_sc(pyFiles=[])

sqlContext = SQLContext(sc)


In [None]:
df

In [None]:
# df=sqlContext.read.parquet(parquet_path)

# RDDin
# RDD=RDDin.map(lambda v:np.array(np.insert(v,0,1),dtype=np.float64))
# df.to_parquet('df.parquet.gzip',
#               compression='gzip')  

# df.append(['0123456479', 'A1KLRMWW2FWPL4', 5.0])
# df.append(['1608299953', 'A1KLRMWW2FWPL4', 4.0])
# df.append(['1617160377', 'A2G5TCU2WDFZ65', 5.0])
sdf = sqlContext.createDataFrame(df, ("item", "user", "rating"))
sdf.show()

In [None]:
sqlContext.registerDataFrameAsTable(sdf,'ratings')

Query="""
SELECT item, count(item) as count 
FROM ratings 
GROUP BY item
ORDER BY count desc
"""
counts=sqlContext.sql(Query)
counts.show()

In [None]:
Query2="""
SELECT user, count(user) as count 
FROM ratings 
GROUP BY user
ORDER BY count desc
"""
counts2=sqlContext.sql(Query2)
counts2.show()

In [None]:
Query3="""
SELECT item, rating from ratings 
where user = 'A1KLRMWW2FWPL4' and item <>'0000031887'
"""
counts3=sqlContext.sql(Query3)
counts3.show()

Query3="""
SELECT user, rating from ratings 
where item in ('0123456479', '1608299953')
"""
counts3=sqlContext.sql(Query3)
counts3.show()

# ratings.append(d['star_rating'])
#         similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))

In [None]:
import numpy as np
from numpy import linalg as LA

def outerProduct(X):
    """Computer outer product and indicate which locations in matrix are undefined"""
    O=np.outer(X,X)
    N=1-np.isnan(O)
    return (O,N)

def sumWithNan(M1,M2):
    """Add two pairs of (matrix,count)"""
    (X1,N1)=M1
    (X2,N2)=M2
    N=N1+N2
    X=np.nansum(np.dstack((X1,X2)),axis=2)
    return (X,N)


def HW_func(S,N):
    E=      np.ones([365]) # E is the sum of the vectors
    NE=     np.ones([365]) # NE is the number of not-nan antries for each coordinate of the vectors
    Mean=   np.ones([365]) # Mean is the Mean vector (ignoring nans)
    O=      np.ones([365,365]) # O is the sum of the outer products
    NO=     np.ones([365,365]) # NO is the number of non-nans in the outer product.
    return  E,NE,Mean,O,NO

In [None]:
def computeCov(RDDin):
    """computeCov recieves as input an RDD of np arrays, all of the same length, 
    and computes the covariance matrix for that set of vectors"""
    RDD=RDDin.map(lambda v:np.array(np.insert(v,0,1),dtype=np.float64)) # insert a 1 at the beginning of each vector so that the same 
                                           #calculation also yields the mean vector
    OuterRDD=RDD.map(outerProduct)   # separating the map and the reduce does not matter because of Spark uses lazy execution.
    (S,N)=OuterRDD.reduce(sumWithNan)

    E,NE,Mean,O,NO=HW_func(S,N)

    Cov=O/NO - np.outer(Mean,Mean)
    # Output also the diagnal which is the variance for each day
    Var=np.array([Cov[i,i] for i in range(Cov.shape[0])])
    return {'E':E,'NE':NE,'O':O,'NO':NO,'Cov':Cov,'Mean':Mean,'Var':Var}

RDD=sc.parallelize(df)
RDD = RDD.map(lambda x: x[2])
OUT=computeCov(RDD)

eigval,eigvec=LA.eig(OUT['Cov'])
print('eigval=',eigval)
print('eigvec=',eigvec)

In [None]:
def computeOverAllDist(rdd0):
    UnDef=np.array(rdd0.map(lambda row:sum(np.isnan(row))).sample(False,0.01).collect())
    flat=rdd0.flatMap(lambda v:list(v)).filter(lambda x: not np.isnan(x)).cache()
    count,S1,S2=flat.map(lambda x: np.float64([1,x,x**2]))\
                  .reduce(lambda x,y: x+y)
    mean=S1/count
    std=np.sqrt(S2/count-mean**2)
    Vals=flat.sample(False,0.0001).collect()
    SortedVals=np.array(sorted(Vals))
    low100,high100=find_percentiles(SortedVals,100)
    low1000,high1000=find_percentiles(SortedVals,1000)
    return {'UnDef':UnDef,\
          'mean':mean,\
          'std':std,\
          'SortedVals':SortedVals,\
          'low100':low100,\
          'high100':high100,\
          'low1000':low100,\
          'high1000':high1000
          }



In [None]:

# sdf.select('rating').filter("user == 'A1KLRMWW2FWPL4'").groupBy('user').show()
def packArray(a):
    """
    pack a numpy array into a bytearray that can be stored as a single 
    field in a spark DataFrame

    :param a: a numpy ndarray 
    :returns: a bytearray
    :rtype:

    """
    if type(a)!=np.ndarray:
        raise Exception("input to packArray should be numpy.ndarray. It is instead "+str(type(a)))
    return bytearray(a.tobytes())

In [None]:
# from numpy_pack import packArray,unpackArray
def unpackArray(x,data_type=np.int16):
    """
    unpack a bytearray into a numpy.ndarray

    :param x: a bytearray
    :param data_type: The dtype of the array. This is important because if determines how many bytes go into each entry in the array.
    :returns: a numpy array
    :rtype: a numpy ndarray of dtype data_type.

    """
    return np.frombuffer(x,dtype=data_type)

data=mdf.rdd.map(lambda row: unpackArray(row['Values'],np.float16))
