## Assignment 1 

#### Read prediction

In [1]:
import numpy
import urllib
import scipy.optimize
import random
import gzip
import csv
from collections import defaultdict
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [2]:
# Open the gzip-compressed interactions CSV in text mode and wrap the handle
# in a csv reader. The handle `f` is not closed here; `reader` is consumed
# lazily when the rows are iterated.
path = "train_Interactions.csv.gz"
f = gzip.open(path, mode="rt", encoding="utf8")
reader = csv.reader(f, delimiter=",")

In [3]:
# Materialise the CSV rows as a list of dicts keyed by the column names
# taken from the first row.
dataset = []
first = True
for line in reader:
    if not first:
        d = dict(zip(header, line))
        # ratings arrive as text; store them as integers
        d["rating"] = int(d["rating"])
        dataset.append(d)
        continue
    # the very first row carries the column names
    header = line
    first = False

In [4]:
# Hold out everything after the first 190,000 interactions for validation.
split_index = 190000
data_train = dataset[:split_index]
data_valid = dataset[split_index:]

In [5]:
# Collect every user id and book id seen in `dataset`, and record which
# books each user has interacted with (encounter order, duplicates kept).
allUserID = [entry["userID"] for entry in dataset]
allBookID = [entry["bookID"] for entry in dataset]

UsersReadBooks = {}
for entry in dataset:
    UsersReadBooks.setdefault(entry["userID"], []).append(entry["bookID"])

# de-duplicated id lists
unique_users = list(set(allUserID))
unique_books = list(set(allBookID))

In [8]:
# Positive validation examples: the [user, book] pairs present in the
# held-out split.
positive_valid = [[entry["userID"], entry["bookID"]] for entry in data_valid]

In [10]:
# Negative validation examples: for each held-out interaction, draw a random
# book that the same user has no recorded interaction with.
negative_valid_dict = {}
for entry in data_valid:
    bid = random.choice(unique_books)
    uid = entry["userID"]
    already_read = UsersReadBooks[uid]
    # rejection sampling: redraw until the book is not in the user's history
    while bid in already_read:
        bid = random.choice(unique_books)
    negative_valid_dict.setdefault(uid, []).append(bid)

# Flatten the per-user samples into [user, book] pairs, grouped by user in
# first-seen order.
negative_valid = [
    [sampled_user, sampled_book]
    for sampled_user, sampled_books in negative_valid_dict.items()
    for sampled_book in sampled_books
]

In [11]:
# Labels line up with the examples: all negatives (label 0) first, then all
# positives (label 1).
y_valid = [0] * len(negative_valid) + [1] * len(positive_valid)
X_valid = negative_valid + positive_valid

#### Book popularity

In [17]:
# Popularity baseline: number of recorded interactions per book.
# NOTE: the counts run over the whole of `dataset`, which includes the rows
# that are also held out as `data_valid`.
bookCount = defaultdict(int)
for interaction in dataset:
    bookCount[interaction["bookID"]] += 1

# one interaction per row
totalRead = len(dataset)

#### Jaccard Similarity

In [22]:
# Index the interactions both ways: the set of users per book and the set of
# books per user. NOTE: both are built from the whole of `dataset`.
TrainUserID = [row["userID"] for row in dataset]
TrainBookID = [row["bookID"] for row in dataset]

UsersPerBook = defaultdict(set)
BooksPerUser = defaultdict(set)
for user_id, book_id in zip(TrainUserID, TrainBookID):
    UsersPerBook[book_id].add(user_id)
    BooksPerUser[user_id].add(book_id)

# de-duplicated id lists
unique_users_train = list(set(TrainUserID))
unique_books_train = list(set(TrainBookID))

In [19]:
# Jaccard Predictor
def Jaccard(s1, s2):
    """Return the Jaccard similarity of two sets.

    The similarity is the size of the intersection divided by the size of
    the union.

    Args:
        s1: First set.
        s2: Second set.

    Returns:
        A float between 0 and 1. Two empty sets have an empty union; they
        are reported as 0.0 rather than raising ZeroDivisionError.
    """
    denom = len(s1.union(s2))
    if denom == 0:
        # nothing to compare: avoid dividing by zero
        return 0.0
    numer = len(s1.intersection(s2))
    return numer / denom

def mostSimilarFast(user, book):
    """Return the mean Jaccard similarity between `book` and the user's books.

    For every book in `BooksPerUser[user]` other than `book` itself, the set
    of users of that book is compared with the set of users of `book` via
    `Jaccard`, and the similarities are averaged.

    Args:
        user: User id used to look up `BooksPerUser`.
        book: Book id used to look up `UsersPerBook`.

    Returns:
        The mean similarity as a float, or 0.0 when the user has no other
        books to compare against (previously this case raised
        UnboundLocalError because the mean was only assigned inside the
        loop).
    """
    target_users = UsersPerBook[book]
    similarities = [
        Jaccard(UsersPerBook[b], target_users)
        for b in BooksPerUser[user]
        if b != book
    ]
    if not similarities:
        # no history to compare against: treat as zero similarity
        return 0.0
    # average once, after all similarities have been collected
    return sum(similarities) / len(similarities)

#### Prediction

In [20]:
# Score every validation pair and group the (score, pair) tuples by user.
# The score is the mean Jaccard similarity multiplied by the book's count.
user_book_sim = defaultdict(list)
for u, b in X_valid:
    score = mostSimilarFast(u, b) * bookCount[b]
    user_book_sim[u].append((score, (u, b)))

# Order each user's candidates from lowest to highest score.
for scored_pairs in user_book_sim.values():
    scored_pairs.sort()

In [None]:
# user_book_sim(X_valid)

In [21]:
# Predict 0 for pairs that fall in the lower-scored half of their user's
# sorted candidate list, and 1 for everything else.
y_pred = []
for user_id, book_id in X_valid:
    pair = (user_id, book_id)
    ranked = user_book_sim[user_id]
    bottom_half = ranked[: len(ranked) // 2]
    in_bottom_half = any(pair in entry for entry in bottom_half)
    y_pred.append(0 if in_bottom_half else 1)

In [None]:
# Fraction of validation pairs labelled correctly. Arguments are passed in
# the documented scikit-learn order: ground truth first, predictions second.
accuracy_score(y_valid, y_pred)

#### predict on the testing set

In [None]:
# Load the [user, book] pairs to predict. Each data line is split on "-"
# into a user id and a book id; a line starting with "userID" is the header
# and is skipped. The file is opened in a `with` block so the handle is
# closed once reading finishes.
X_test = []
with open("pairs_Read.txt") as pairs_file:
    for l in pairs_file:
        if l.startswith("userID"):
            # header
            continue
        u, b = l.strip().split("-")
        X_test.append([u, b])

In [None]:
# Score every test pair (mean Jaccard similarity times the book's count)
# and group the (score, pair) tuples by user.
user_book_sim = defaultdict(list)
for pair in X_test:
    user_id, book_id = pair[0], pair[1]
    weighted_sim = mostSimilarFast(user_id, book_id) * bookCount[book_id]
    user_book_sim[user_id].append((weighted_sim, (user_id, book_id)))

# Sort each user's candidates in ascending score order.
for candidates in user_book_sim.values():
    candidates.sort()

In [None]:
# Predict 0 for test pairs in the lower-scored half of their user's sorted
# candidate list, and 1 otherwise.
y_pred = []
for user_id, book_id in X_test:
    key = (user_id, book_id)
    ranked = user_book_sim[user_id]
    half = len(ranked) // 2
    label = 1
    for entry in ranked[:half]:
        if key in entry:
            label = 0
    y_pred.append(label)

In [None]:
# Write one "<user>-<book>,<prediction>" line per pair in pairs_Read.txt,
# copying the header line through unchanged. Both files are managed by
# `with`, so they are closed even if a line fails to parse.
with open("pairs_Read.txt") as pairs_file, \
        open("predictions_Read.txt", "w") as predictions:
    # index into y_pred; advanced only on data lines so the alignment does
    # not depend on whether (or where) a header line is present
    n = 0
    for l in pairs_file:
        if l.startswith("userID"):
            # header
            predictions.write(l)
            continue
        u, b = l.strip().split("-")
        p = y_pred[n]
        predictions.write(u + "-" + b + "," + str(p) + "\n")
        n += 1