In [1]:
import gzip
from collections import defaultdict
import random
import math
import statistics
def readCSV(path):
  f = gzip.open(path, 'rt')
  f.readline()
  for l in f:
    yield l.strip().split(',')

In [2]:
userIDs = {}
itemIDs = {}
trainSet = []
validationSet = []
testSet = []
interactions = []
bookCount = defaultdict(int)
totalRead = 0
iterations = 0 
for user,book,rating in readCSV("train_Interactions.csv.gz"):
    interactions.append((user,book,rating))

In [3]:
random.shuffle(interactions)
trainSet = interactions[0:180000]
validation = interactions[180000:190000]
testSet = interactions[190000:200000]
#Count number of interactions for each book
usersPerItem = defaultdict(set) # U_i from class slides
itemsPerUser = defaultdict(set) # I_u from class slides
for user,book,_ in trainSet:
    totalRead += 1
    bookCount[book] += 1
    usersPerItem[book].add(user)
    itemsPerUser[user].add(book)
items = list(usersPerItem.keys())

In [4]:
len(interactions)

200000

In [5]:
# Create negative samples for validation set.
validationSet = [] #(user_id, book_id, bought/ not bought)
for user,book,d in validation:
    validationSet.append((user,book,1))
    bookNeg = random.choice(items) # negative sample
    while bookNeg in itemsPerUser[user]:
        bookNeg = random.choice(items)
    validationSet.append((user,bookNeg,0))  

In [6]:
usersPerBook = defaultdict(set)
booksPerUser = defaultdict(set)
for d in trainSet:
    user, book = d[0], d[1]
    usersPerBook[book].add(user)
    booksPerUser[user].add(book)

def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer / denom

def mostSimilarFast(i, j ):
    similarities = []
    books = booksPerUser[i] #the games that the user i played in the train set
    users = usersPerBook[j] #the game j that users played in train set
    for b in books:
        similarities.append(Jaccard(users, usersPerBook[b]))
    return similarities

def cosine(s1, s2):
    product = len(s1.intersection(s2))
    denom = math.sqrt(len(s1)) * math.sqrt(len(s2))
    return product/denom

def mostSimilarFast2(i, j ):
    similarities = []
    books = booksPerUser[i] #the games that the user i played in the train set
    users = usersPerBook[j] #the game j that users played in train set
    for b in books:
        similarities.append(cosine(users, usersPerBook[b]))
    return similarities

def euclidean_distance(s1, s2):
    count = 0
    for i in s1:
        if i not in s2:
            count = count + 1
    for i in s2:
        if i not in s1:
            count = count + 1
    return count

def mostSimilarFast3(i, j ):
    similarities = []
    books = booksPerUser[i] #the games that the user i played in the train set
    users = usersPerBook[j] #the game j that users played in train set
    for b in books:
        similarities.append(euclidean_distance(users, usersPerBook[b]))
    return similarities

In [7]:
#create a new file for is_game_valSet
test = open("validation.txt", 'w')
test.write("userID-bookID,prediction\n")
for d in validationSet:
    test.write(d[0] + '-' + d[1] + '\n')

In [33]:
#popularity for each book
bookCount = defaultdict(int)
totalbought = 0

for user,book,_ in trainSet:
    bookCount[book] += 1
    totalbought += 1

mostPopular = [(bookCount[x], x) for x in bookCount]
mostPopular.sort()
mostPopular.reverse()
returnPopular = set()
returnPopularCnt = set()
count = 0
#Extract books that account for 40% of all interactions
for ic, i in mostPopular:
  count += ic
  returnPopular.add(i)
  returnPopularCnt.add((ic,i))
  if count > totalRead*0.65: break

## Baseline popularity model

In [34]:
predictions_lst = []
count = 0
labels = []
for sample in validationSet:
    user = sample[0]
    book = sample[1]
    if book in returnPopular:
        predictions_lst.append(1)
    else:
        predictions_lst.append(0)
    labels.append(sample[2])

In [35]:
#accuracy
count = 0
for i in range(len(predictions_lst)):
    if predictions_lst[i] == validationSet[i][2]:
        count += 1
accuracy = count/len(predictions_lst)
accuracy

0.6538

In [11]:
#popularity       
popular = {}


predictions = open("predictions_bought.txt", 'w')
for l in open("validation.txt"):
    if l.startswith("userID"):
        #header
        predictions.write(l)
        continue
    u,b = l.strip().split('-')
    
    for num, book in returnPopularCnt:  
        if num > 10 and b == book:
            popular[b] = 1
            break
        popular[b] = 0

## Jaccard similarity model

In [12]:
predictions_lst = []
count = 0
predictions = open("predictions_bought.txt", 'w')
for l in open("validation.txt"):
    if l.startswith("userID"):
        #header
        predictions.write(l)
        continue
    u,b = l.strip().split('-')
    
    sim1 = mostSimilarFast(u, b)

    if len(sim1) == 0:
        predictions.write(u + '-' + b + ",0\n")
        predictions_lst.append(0)
        count +=1
    elif statistics.mean(sim1) > 0.002 or popular[b] == 1:
        predictions.write(u + '-' + b + ",1\n")
        predictions_lst.append(1)
        count +=1
    elif statistics.mean(sim1) <= 0.002 or popular[b] == 0:
        predictions.write(u + '-' + b + ",0\n")
        predictions_lst.append(0)
        count+=1
    else:
        predictions.write(u + '-' + b + ",0\n")
        predictions_lst.append(0)
        count+=1

predictions.close()

In [13]:
#accuracy
count = 0
for i in range(len(predictions_lst)):
    if predictions_lst[i] == validationSet[i][2]:
        count += 1
accuracy = count/len(predictions_lst)
accuracy

0.683735392948493

## Cosine similarity model

In [14]:
predictions_lst = []
count = 0
predictions = open("predictions_bought.txt", 'w')
for l in open("validation.txt"):
    if l.startswith("userID"):
        #header
        predictions.write(l)
        continue
    u,b = l.strip().split('-')
    
    sim2 = mostSimilarFast2(u, b)

    if len(sim2) == 0:
        predictions.write(u + '-' + b + ",0\n")
        predictions_lst.append(0)
        count +=1
    elif statistics.mean(sim2) > 0.004 or popular[b] == 1:
        predictions.write(u + '-' + b + ",1\n")
        predictions_lst.append(1)
        count +=1
    elif statistics.mean(sim2) <= 0.004 or popular[b] == 0:
        predictions.write(u + '-' + b + ",0\n")
        predictions_lst.append(0)
        count+=1
    else:
        predictions.write(u + '-' + b + ",0\n")
        predictions_lst.append(0)
        count+=1

predictions.close()

In [15]:
#accuracy
count = 0
for i in range(len(predictions_lst)):
    if predictions_lst[i] == validationSet[i][2]:
        count += 1
accuracy = count/len(predictions_lst)
accuracy

0.6771152013641607

## Euclidean distance similarity

In [31]:
predictions_lst = []
count = 0
predictions = open("predictions_bought.txt", 'w')
for l in open("validation.txt"):
    if l.startswith("userID"):
        #header
        predictions.write(l)
        continue
    u,b = l.strip().split('-')
    
    sim3 = mostSimilarFast3(u, b)

    if len(sim3) == 0:
        predictions.write(u + '-' + b + ",0\n")
        predictions_lst.append(0)
        count +=1
    elif statistics.mean(sim3) > 93:
        predictions.write(u + '-' + b + ",1\n")
        predictions_lst.append(1)
        count +=1
    elif statistics.mean(sim3) <= 93:
        predictions.write(u + '-' + b + ",0\n")
        predictions_lst.append(0)
        count+=1
    else:
        predictions.write(u + '-' + b + ",0\n")
        predictions_lst.append(0)
        count+=1

predictions.close()

In [32]:
#accuracy
count = 0
for i in range(len(predictions_lst)):
    if predictions_lst[i] == validationSet[i][2]:
        count += 1
accuracy = count/len(predictions_lst)
accuracy

0.6145242991122924