### Helper Code

In [8]:
import gzip
from collections import defaultdict
from multiprocessing import Lock, Process, Queue, current_process, Pool
import random
from tqdm import tqdm, tnrange, tqdm_notebook
import time
import sys
#import queue # imported for using queue.Empty exception

In [9]:
import numpy as np
import scipy as sp
import sklearn
import matplotlib
import math

In [10]:
def readGz(f):
  """Lazily yield one parsed record per line of a gzip-compressed file.

  Args:
    f: Path (or file object) accepted by ``gzip.open``.

  Yields:
    Whatever Python object each line evaluates to.
  """
  # The context manager closes the gzip handle as soon as the generator is
  # exhausted or closed, instead of leaving that to garbage collection.
  with gzip.open(f) as fh:
    for l in fh:
      # SECURITY: eval() executes arbitrary code found in the file. Only use
      # this on trusted data; if every line is a plain Python literal,
      # ast.literal_eval would be a safer parser.
      yield eval(l)

# Tasks: Purchase Prediction

In [11]:
# Collect every distinct reviewer ID and item ID that appears in the
# training file; these sets are the pools of possible users and items.
allUsers, allItems = set(), set()

for l in readGz("train.json.gz"):
    user = l['reviewerID']
    item = l['itemID']
    allUsers.add(user)
    allItems.add(item)
 

In [12]:
# Optionally persist the user/item ID sets to disk, one ID per line.
# The two files are written by two child processes (practice for the
# parallel work later in the notebook). Set ``persist = True`` to enable.
persist = False
processes = []

if persist:
    def save_items():
        """Write every ID in ``allItems`` to items_set.txt, one per line."""
        # ``with`` closes (and flushes) the file even if a write raises.
        with open("items_set.txt", 'w') as itemsSet:
            for l in allItems:
                itemsSet.write(l + '\n')

    def save_users():
        """Write every ID in ``allUsers`` to users_set.txt, one per line."""
        with open("users_set.txt", 'w') as usersSet:
            for l in allUsers:
                usersSet.write(l + '\n')

    def parallel_execute():
        """Start one process per writer, record both in ``processes``, and wait."""
        # NOTE(review): the children must be able to see save_items/save_users
        # and the ID sets; that presumably relies on the "fork" start method --
        # confirm before running on a platform that uses "spawn".
        p = Process(target=save_items)
        q = Process(target=save_users)
        processes.append(p)
        processes.append(q)
        p.start()
        q.start()
        # join() blocks this cell until both writers have exited. Remove the
        # two join() calls to let the cell return while the writes continue.
        p.join()
        q.join()
    parallel_execute()

uniqueUsers = list(allUsers)
uniqueItems = list(allItems)
# With the join() calls above in place, this prints only after any
# requested save has finished.
print('Done')

Done


In [13]:
# Read the whole training file into memory: the full records, plus a
# parallel list of (reviewerID, itemID) tuples in the same order.
# Tuples are hashable, so the pairs can later be used as set members or
# dictionary keys.
train_200k = list(readGz("train.json.gz"))
purchases_200k = [
    (record['reviewerID'], record['itemID'])
    for record in train_200k
]

In [28]:
# Split the loaded data into a training part and a validation part, then
# load the pre-saved negative validation pairs from disk.
#
# The records and their (user, item) pairs are shuffled TOGETHER so that
# index j of train_200k and index j of purchases_200k keep describing the
# same purchase. Shuffling the two lists independently would let purchases
# from the first 100000 training records reappear among the validation
# positives taken from position 100000 onward (train/validation leakage).
shuffled = list(zip(train_200k, purchases_200k))
random.shuffle(shuffled)
# Slice assignment keeps the original in-place shuffle of both lists.
train_200k[:] = [record for record, pair in shuffled]
purchases_200k[:] = [pair for record, pair in shuffled]

train_100k = train_200k[:100000]
validation_positives = purchases_200k[100000:]

v_negs = []
# ``with`` closes the file even if parsing a line fails.
with open("validation_negatives.txt") as negatives_file:
    for l in negatives_file:
        # Splitting on single quotes and keeping the odd-indexed pieces
        # extracts the text between each pair of quotes. Presumably every
        # line is the repr of a (user, item) tuple -- TODO confirm format.
        obj = l.split('\'')[1::2]
        if not obj:
            # A line without quoted tokens (e.g. a blank line) would become
            # an empty tuple that cannot be unpacked as (user, item).
            continue
        v_negs.append(tuple(obj))
validation_negatives = v_negs

In [29]:
# Full validation set: all positive pairs first, followed by the negatives.
v_set = list(validation_positives)
v_set.extend(validation_negatives)

In [30]:
### Would-purchase baseline: predict 1 for the most popular items that
### together cover just over half of all training purchases, else 0.

# Number of training purchases per item.
items_purchased = defaultdict(int)
for l in train_100k:
    item = l['itemID']
    items_purchased[item] += 1
total_purchases = len(train_100k)

# (count, item) tuples, most purchased first. Every item appears once, so
# no two tuples are equal and the descending order is unambiguous.
mostPopular = sorted(
    ((n_bought, x) for x, n_bought in items_purchased.items()),
    reverse=True,
)

# Walk down the ranking until the running purchase count passes 50%.
return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > total_purchases/2:
        break

# One 0/1 prediction per validation pair, in v_set order.
predictions = []
for u, i in v_set:
    predictions.append(1 if i in return1 else 0)

In [27]:
# Read the (user, item) pairs to predict from pairs_Purchase.txt.
# Each data line is split on '-' into exactly two parts; the line that
# starts with "reviewerID" is the header and is skipped.
pairs_Purchase_list = []
# ``with`` closes the file even if a malformed line raises.
with open("pairs_Purchase.txt") as pairs_file:
    for l in pairs_file:
        if l.startswith("reviewerID"):
            #header
            continue
        u,i = l.strip().split('-')
        obj = (u, i)
        pairs_Purchase_list.append(obj)

In [None]:
# Translate every (userID, itemID) pair to predict into a
# (userID, categoryID) pair, using the item's category from the training data.
user_cat_tup_array = [] # [('UserID', 'category'), ]

# Build the item -> category index once, so each pair costs one dictionary
# lookup instead of a linear scan over all of train_200k.
item_to_category = {}
for k in train_200k:
    # Keep the category of the FIRST record seen for an item, matching a
    # first-match scan through train_200k.
    if k['itemID'] not in item_to_category:
        item_to_category[k['itemID']] = k['categoryID']

for l in pairs_Purchase_list: # [(userID,itemID)]
    u_id = l[0]
    i_id = l[1]
    # Items absent from the training data fall back to category 0, which
    # the original author's note describes as the most popular (women).
    c_id = item_to_category.get(i_id, 0)
    user_cat_tup_array.append((u_id, c_id))
print('Generated user_cat_tup_array [(userID, category),]')

In [None]:
# Predict 1 when the (userID, categoryID) pair is found in user_categories,
# otherwise 0. ``counter`` ends up holding the number of positive predictions.
# NOTE(review): user_categories is not defined in the visible part of this
# notebook; presumably a collection of (userID, categoryID) tuples. The
# membership test is only cheap if it is a set or dict -- TODO confirm.
counter = 0
predictions = []
# Size the progress bar from the data instead of a hard-coded row count.
pbar = tqdm_notebook(total=len(user_cat_tup_array), desc='Progress:')

try:
    for t in user_cat_tup_array:
        if t in user_categories:
            predictions.append(1)
            counter +=1
        else:
            predictions.append(0)
        pbar.update(1)
finally:
    # Close the bar even if the loop raises part-way through.
    pbar.close()

In [None]:
# Write one "reviewerID-itemID,prediction" row per pair to q4_results.txt.

# zip() silently drops the tail of the longer list. ``predictions`` is also
# assigned by the popularity-baseline cell, so a stale value from out-of-order
# execution would otherwise produce a truncated or misaligned file.
if len(pairs_Purchase_list) != len(predictions):
    raise ValueError(
        'pairs_Purchase_list has %d rows but predictions has %d'
        % (len(pairs_Purchase_list), len(predictions))
    )

zipped = list(zip(pairs_Purchase_list, predictions))
# ``with`` flushes and closes the file even if a write fails.
with open('q4_results.txt', 'w') as file:
    file.write('reviewerID-itemID,prediction\n')
    for l in zipped:
        # l is ((userID, itemID), prediction)
        file.write(str(l[0][0]) + '-' + str(l[0][1]) + ',' + str(l[1]) + '\n')
print('done')