### Helper Code

In [1]:
import gzip
from collections import defaultdict
from multiprocessing import Lock, Process, Queue, current_process, Pool
import random
from tqdm import tqdm, tnrange, tqdm_notebook
import time
import sys
#import queue # imported for using queue.Empty exception

In [2]:
def readGz(f):
  for l in gzip.open(f):
    yield eval(l)

# Tasks: Purchase Prediction

1. Although we have built a validation set, it only consists of positive samples. For this task we also need examples of user/item pairs that weren’t purchased. Build such a set by randomly sampling users and items until you have 100,000 non-purchased user/item pairs. This random sample combined with your 100,000 validation reviews now corresponds to the complete validation set for the purchase prediction task. Evaluate the performance (accuracy) of the baseline model on the validation set you have built (1 mark).

In [3]:
#generate list of possible choices to pick from for users\
allUsers = set([])
allItems = set([])

for l in readGz("train.json.gz"):
    user,item = l['reviewerID'],l['itemID']
    allUsers.add(user)
    allItems.add(item)


In [4]:
# optionally save it to disk (Also practicing parallelism 
# here, it will be useful later on)
persist = True
processes = []

if persist:
    def save_items():
        itemsSet = open("items_set.txt", 'w')
        for l in allItems:
            itemsSet.write(l + '\n')
        itemsSet.close()
        
    def save_users():
        usersSet = open("users_set.txt", 'w')
        for l in allUsers:
            usersSet.write(l + '\n')
        usersSet.close()
        
    def parallel_execute():
        p = Process(target=save_items)
        q = Process(target=save_users)
        processes.append(p)
        processes.append(q)
        p.start()
        q.start()
        p.join() # comment this out to allow other cells to run
        q.join() # ''
        #p.close()
        #q.close()
    parallel_execute()
    
uniqueUsers = list(allUsers)
uniqueItems = list(allItems)
print('Done') # should let me run other cells while jobs finish

Done


In [5]:
# load sets of data in memory
train_200k = []
purchases_200k = []
pbar = tqdm_notebook(total=200000, desc='Progress:')

for l in readGz("train.json.gz"):
    train_200k.append(l)
    user,item = l['reviewerID'],l['itemID']
    # appending as tuple to allow hash collision detection
    # later on down the line
    purchases_200k.append((user,item))
    pbar.update(1)
pbar.close()

ImportError: IntProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [None]:
validation_negatives = []
purchases_to_generate = range(100000)

pool = Pool(processes=3)
#progress = tqdm_notebook(total=len(purchases_to_generate), desc='Progress:')

def generate_fake_purchase(counter):
    while True:
        unique = True
        #if (counter % 1000) == 0:
        #    print(str(int(counter/1000)) + '% complete')
        randreviewer = random.choice(uniqueUsers)
        randitem = random.choice(uniqueItems)  
        randtuple = tuple([randreviewer, randitem]) # hash friendly format

        if randtuple in purchases_200k:
            unique = False
            #print('pair already exists in purchases_200k')
        elif randtuple in validation_negatives:
            unique = False
            #print('pair already exists in validation_negs')
        else:
            #progress.update(1)
            return randtuple


validation_negatives = pool.imap(generate_fake_purchase, purchases_to_generate)
pool.close() #no more work
pool.join()
#progress.close()

In [None]:
# Now we create the final validation set consisting of 
# 100k positive + 100k negative values & clean up vars
train_100k = train_200k[:100000]
validation_positives = purchases_200k[100000:]
v_set = validation_positives + validation_negatives
v_negs = open("validation_negatives.txt", 'w')
for l in validation_negatives:
    v_negs.write(str(l) + '\n')
v_negs.close()

In [None]:
#Load presaved data
import re
train_100k = train_200k[:100000]
validation_positives = purchases_200k[100000:]

v_negs = []
for l in open("validation_negatives.txt"):
    obj = l.split('\'')[1::2]
    v_negs.append(tuple(obj))
validation_negatives = v_negs

In [None]:
v_set = validation_positives + validation_negatives

In [None]:
### Would-purchase baseline: just rank which items are popular

items_purchased = defaultdict(int)
total_purchases = 0

for l in train_100k:
    item = l['itemID']
    items_purchased[item] += 1
    total_purchases += 1

mostPopular = [(items_purchased[x], x) for x in items_purchased]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > total_purchases/2: break

predictions = []
for u,i in v_set:
    if i in return1:
        predictions.append(1)
    else:
        predictions.append(0)

In [None]:
# Evaluate accuracy
# Everything < 100k should be 1 and everything above that should be a 0
counter = 0
for i in predictions[:100000]:
    if i == 1:
        counter +=1
for i in predictions[100000:]:
    if i == 0:
        counter +=1
accuracy = counter/200000
print("Accuracy: " + str(accuracy*100) + "%")

2. The existing ‘purchase prediction’ baseline just returns True if the item in question is ‘popular,’ using a threshold of the 50th percentile of popularity (totalPurchases/2). Assuming that the ‘non-purchased’ test examples are a random sample of user-purchase pairs, is this particular threshold value the best? If not, see if you can find a better one (and report its performance), or if so, explain why it is the best (1 mark).

In [None]:
### Would-purchase baseline: just rank which items are popular
def run_baseline(denominator):
    items_purchased = defaultdict(int)
    total_purchases = 0

    for l in train_100k:
        item = l['itemID']
        items_purchased[item] += 1
        total_purchases += 1

    mostPopular = [(items_purchased[x], x) for x in items_purchased]
    mostPopular.sort()
    mostPopular.reverse()

    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > total_purchases/denominator: break

    predictions = []
    for u,i in v_set:
        if i in return1:
            predictions.append(1)
        else:
            predictions.append(0)

    # Evaluate accuracy
    # Everything < 100k should be 1 and everything above that should be a 0
    counter = 0
    for i in predictions[:100000]:
        if i == 1:
            counter +=1
    for i in predictions[100000:]:
        if i == 0:
            counter +=1
    accuracy = counter/200000
    return accuracy
    #print("denominator = " + str(denominator) + " accuracy: " + str(accuracy*100) + "%")

baseline_accuracy = run_baseline(2) # original value, 50th percentile
with Pool(processes=3) as pool: #start 3 worker processes
    accuracies = pool.map(run_baseline, range(1,90))
#print(accuracies)

In [None]:
count = 1
flag = False
for i in accuracies:
    if i > baseline_accuracy:
        flag = True
        print('denom: ' + str(count) + ' accuracy: ' + str(i*100) + '%')
if flag == False:
    print('None of the trials beat the baseline')      

There is no threshhold value that outperforms the original 50th percentile metric. It appears to perform the best because it has the highest accuracy when considering that particular subset.

3. Users may tend to repeatedly purchase items of the same type. Build a baseline that returns ‘True’ if a user has purchased an item of the same category before (at least one category in common), or zero otherwise (1 mark).

In [None]:
results = set([])

for l in train_100k:
    user = l['reviewerID']
    category = l['categoryID']
    obj = tuple([user,category])
    results.add(obj)
    
user_categories = list(results)    
#print(user_categories[:3])

validation_positives = []
for l in train_200k[100000:]:
    user = l['reviewerID']
    category = l['categoryID']
    obj = tuple([user,category])
    validation_positives.append(obj)

#print(validation_positives[:3])
validation_negatives = []
while len(validation_negatives) < 100000:
    tup = random.choice(user_categories)
    user = tup[0]
    category = random.randrange(1,5)
    obj = tuple([user,category])
    #print('generated')
    if obj not in validation_positives:
        validation_negatives.append(obj)
        #print('added')
        #print(obj)
#print(validation_negatives[:3]) 

In [None]:
counter = 0
predictions_pos = []
for l in validation_positives:
    if l in user_categories:
        predictions_pos.append(1)
        counter +=1
    else:
        predictions.append(0)
predictions_neg = []
for l in validation_negatives:
    if l in user_categories:
        predictions_neg.append(1)
    else:
        predictions_neg.append(0)
        counter +=1

In [None]:
print('Accuracy of predictor: ' + str(counter/200000*100) + '%')

4. To run our model on the test set, we’ll have to use the files ‘pairs Purchase.txt’ to find the review- erID/itemID pairs about which we have to make predictions. Using that data, run the above model and upload your solution to Kaggle. Tell us your Kaggle user name (1 mark). If you’ve already uploaded a better solution to Kaggle, that’s fine too!

# Kaggle Username: https://www.kaggle.com/kodingkirby

In [None]:
pairs_Purchase_list = []
for l in open("pairs_Purchase.txt"):
    if l.startswith("reviewerID"):
        #header
        continue
    u,i = l.strip().split('-')
    obj = tuple([u,i])
    pairs_Purchase_list.append(obj)

In [None]:
user_cat_tup_array = [] # [('UserID', 'category'), ]
pbar = tqdm_notebook(total=28000, desc='Progress:')

for l in pairs_Purchase_list: # [(userID,itemID)]
    u_id = l[0]
    i_id = l[1]
    #ui_tup = tuple([u_id,i_id])

    for k in train_200k:
        if i_id == k['itemID']:
            c_id = k['categoryID']
            break
        else:
            c_id = 0 # if category is unknown, assume the most popular
                    # which is women (0)
                
    user_cat_tup_array.append(tuple([u_id, c_id]))
    pbar.update(1)
pbar.close()
print('Generated user_cat_tup_array [(userID, category),]')

In [None]:
counter = 0
predictions = []
pbar = tqdm_notebook(total=28000, desc='Progress:')

for t in user_cat_tup_array: # 28k times
    if t in user_categories:
        predictions.append(1)
        counter +=1
    else:
        predictions.append(0)
    pbar.update(1)
pbar.close()

In [None]:
zipped = list(zip(pairs_Purchase_list, predictions))
file = open('q4_results.txt', 'w')
file.write('reviewerID-itemID,prediction\n')
for l in zipped:
    file.write(str(l[0][0]) + '-' + str(l[0][1]) + ',' + str(l[1]) + '\n')
file.close()
print('done')