# CSE 258, Fall 2019: Homework 3
**You’ll probably want to implement your solution by modifying the baseline code provided.**   
Files: 
* http://cseweb.ucsd.edu/classes/fa19/cse258-a/files/assignment1.tar.gz   

Kaggle:
* https://inclass.kaggle.com/c/cse158258-fa19-read-prediction
* (258 only) https://inclass.kaggle.com/c/cse258-fa19-rating-prediction

# Tasks (Read prediction)   
Since we don’t have access to the test labels, we’ll need to simulate validation/test sets of our own.    
So, let’s split the training data (‘train Interactions.csv.gz’) as follows:
1. Reviews 1-190,000 for training
2. Reviews 190,001-200,000 for validation
3. Upload to Kaggle for testing only when you have a good model on the validation set. This will save you time (since Kaggle can take several minutes to return results), and prevent you from exceeding your daily submission limit.

In [1]:
import gzip
from collections import defaultdict
import random
import numpy as np
import matplotlib.pyplot as plt
import random
import scipy
from sklearn.linear_model import LogisticRegression

In [2]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)
    
def readCSV(path):
    f = gzip.open(path, 'rt')
    header = f.readline()
    for l in f:
        yield l.strip().split(',')
        
def accuracy(predictions, labels):
    predictions, labels = np.array(predictions), np.array(labels)
    return sum(predictions == labels) / len(predictions)

def most_popular_percentile(mostPopular, percentile):
    return1 = set()
    count = 0
    for b_count, b in mostPopular:
        count += b_count
        return1.add(b)
        if count > percentile * totalRead: break
    return return1

def cosine_sim(s1,s2):
    numer = len(s1.intersection(s2))
    denom = len(s1) * len(s2) + 10**(-8)
    return numer / denom 
    
def best_cosine(user, book):
    users = usersPerBook[book]
    b_mark = bookPerUser[user] # Books that user has read
    angels = []
    for book2 in b_mark:
        if book2 == book:
            continue
        angel = cosine_sim(users, usersPerBook[book2])
        angels.append(angel)
    angels.sort(reverse=True)
    if len(angels) == 0:
        return [0]
    return angels

def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer / denom

def best_jacc(user, book):
    users = usersPerBook[book]
    b_mark = bookPerUser[user]
    similarities = []
    for book2 in b_mark:
        if book2 == book:
            continue
        # compute sim between book and book2
        sim = Jaccard(users, usersPerBook[book2])
        similarities.append(sim)
    similarities.sort(reverse=True)
    return similarities
        

In [463]:
data = [line[:2] + [1] for line in readCSV("train_Interactions.csv.gz")] # 1 is the label saying it is read.
#data = [line for line in readCSV("train_Interactions.csv.gz")]

# Extend validation set

In [655]:
Xy_train = data[:190000]
Xy_valid = data[190000:]
# First get overview of what books each user have read, and what what user a book has been read by.
usersPerBook = defaultdict(set)
bookPerUser = defaultdict(set)
for line in Xy_train:
    userID, bookID, rating = line
    usersPerBook[bookID].add(userID)
    bookPerUser[userID].add(bookID)

# Randomly ad some negative samples to the validation set
negative_samples = []
available_books = usersPerBook.keys()
for user, book, rating in Xy_valid:
    random_book = random.choice(list(available_books))
    while random_book in bookPerUser[user]:
        random_book = random.choice(list(available_books))
    new_data = [user, random_book, 0]
    negative_samples.append(new_data)
Xy_valid += negative_samples # Add the negative data
random.shuffle(Xy_valid)

Xtrain, ytrain = [d[:2] for d in Xy_train], [int(d[2]) for d in Xy_train]
Xvalid, yvalid = [d[:2] for d in Xy_valid], [int(d[2]) for d in Xy_valid]

# Predict

In [665]:
# BASELINE: ACC 0.6576

bookCount = defaultdict(int)
totalRead = 0
for user,book in Xtrain:
    bookCount[book] += 1
    totalRead += 1
mostPopular = [(bookCount[x], x) for x in bookCount]
mostPopular.sort(reverse=True)

top_6 = most_popular_percentile(mostPopular, 0.06)
top_10 = most_popular_percentile(mostPopular, 0.10)
top_30 = most_popular_percentile(mostPopular, 0.30)
mostPopular.sort()
worst30 = most_popular_percentile(mostPopular, 0.30)
worst10 = most_popular_percentile(mostPopular, 0.10)

def predict(user, book):
    global TP, FP, FN, TN
    # Poppularity
    istop6 = book in top_6
    istop10 = book in top_10
    istop30 = book in top_30
    isworst30 = book in worst30
    isworst10 = book in worst10
    
    
    # Jaccard
    jaccard_sims = best_jacc(user, book)
    jacc_avg = sum(jaccard_sims) / (len(jaccard_sims) + 10**(-8))
    
    pred = 0
    if isworst10:
        pred = 0
    elif isworst30:
        if jacc_avg > 0.0025:
            pred = 1
    elif istop10:
        pred = 1
        if jacc_avg < 0.0014:
            pred = 0
    else:
        
        if jacc_avg > 0.0016:
            pred = 1
    return pred        

In [667]:
TP, FP, FN, TN = 0, 0, 0, 0
avgs = []
predictions = []
for (user, book), rating in zip(Xvalid, yvalid):
    p= predict(user, book) 
    predictions.append(p)


print("NUMBER OF ONES PREDICTED",sum(predictions), len(predictions))
yvalid = list(map(lambda x: int(x>0), yvalid))
print("Accuracy: {}".format(accuracy(predictions, yvalid)))

NUMBER OF ONES PREDICTED 9067 20000
Accuracy: 0.6871


# Upload Test Results
## Kaggle Username: kristogj

In [648]:
TP, FP = 0, 0
avgs = []
predictions = open("predictions_Read.txt", 'w')
test_pred = []
for l in open("pairs_Read.txt"):
    if l.startswith("userID"):
        #header
        predictions.write(l)
        continue
    user, book = l.strip().split('-')
    pred = predict(user, book)
    test_pred.append(pred)
    predictions.write(user + '-' + book + ",{}\n".format(pred))
predictions.close()
print(TP, FP)
print(sum(test_pred), len(test_pred))

0 0
9009 20000


In [475]:
sum(avgs)/len(avgs)

0.002162103115097757

# (CSE 258 only) Tasks (Rating prediction)

Let’s start by building our training/validation sets much as we did for the first task. This time building a validation set is more straightforward: you can simply use part of the data for validation, and do not need to randomly sample non-read users/books.

In [39]:
data = [line for line in readCSV("train_Interactions.csv.gz")]
Xy_train, Xy_valid = data[:190000], data[190000:]
Xtrain, ytrain = [x[:2] for x in Xy_train], [int(x[-1]) for x in Xy_train]
Xvalid, yvalid = [x[:2] for x in Xy_valid], [int(x[-1]) for x in Xy_valid]

## Task 9
Fit a predictor of the form

$rating(user, item) = \alpha + \beta_{user} + \beta_{item}$


by fitting the mean and the two bias terms as described in the lecture notes. Use a regularization
parameter of λ = 1. Report the MSE on the validation set (1 mark).

In [40]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [41]:
reviewsPerUser = defaultdict(list)
reviewsPerBook = defaultdict(list)

for user, book, rating in Xy_train:
    rating = int(rating)
    reviewsPerUser[user].append((book,rating))
    reviewsPerBook[book].append((user,rating))

N = len(ytrain)

In [42]:
usersPerBook = defaultdict(set)
bookPerUser = defaultdict(set)
for line in Xy_train:
    userID, bookID, rating = line
    usersPerBook[bookID].add(userID)
    bookPerUser[userID].add(bookID)

In [43]:
def prediction(user, book):
    return alpha + userBiases[user] + bookBiases[book]

In [44]:
import random
lamb = 0.000001
userBiases = defaultdict(float)
bookBiases = defaultdict(float)
for user in bookPerUser.keys():
    userBiases[user] = random.random() - 0.5
for book in usersPerBook.keys():
    bookBiases[book] = random.random() - 0.5

In [45]:
# Alt 1
def cost(lamb, userBiases, bookBiases, X, y):
    predictions = [prediction(user, book) for user, book in X]
    cost = MSE(predictions, y) # Error
    
    # And add regularization
    for u in userBiases:
        cost += lamb*userBiases[u]**2
    for b in bookBiases:
        cost += lamb*bookBiases[b]**2
    return cost

# Train
for x in range(10):
    
    # Calculate alpha
    temp = sum([int(rating) - (userBiases[user] + bookBiases[book]) for user, book, rating in Xy_train])
    alpha = temp / N
    
    # Calculate Beta_user
    for user in userBiases.keys():
        books = bookPerUser[user]
        book_reviews = dict(reviewsPerUser[user])
        temp = sum([book_reviews[book] - (alpha + bookBiases[book]) for book in books])
        temp /= lamb + len(books)
        userBiases[user] = temp
    
        
    # Calculate Beta_book
    for book in bookBiases.keys():
        users = usersPerBook[book]
        users_review = dict(reviewsPerBook[book])
        temp = sum([users_review[user] - (alpha + userBiases[user]) for user in users])
        temp /= lamb + len(users)
        bookBiases[book] = temp
    print(cost(lamb, userBiases, bookBiases, Xtrain, ytrain))
    
    

0.9220745696523946
0.9088652390055492
0.9083792457082627
0.9082930255200622
0.9082698300389569
0.9082649437920396
0.9082653299066792
0.9082667995216558
0.9082681434661425
0.9082691228135656


# JOn

In [56]:
def calculate(lambda_value):
    beta_users = defaultdict(float)
    for user in bookPerUser.keys():
        beta_users[user] = 0
        
    beta_items = defaultdict(float)
    for book in usersPerBook.keys():
        beta_items[book] = 0
    
    def alpha1():
        a = 0
        N = len(Xtrain)
        for X, r in zip(Xtrain, ytrain):
            user, book = X
            r = int(r)
            a += (r - (beta_users[user] + beta_items[book]))
        return a/N
        
    def beta_user(a):
        for X, r in zip(Xtrain, ytrain):
            user, book = X
            r = int(r)
            N = lambda_value + len(reviewsPerUser[user])
            beta_users[user] += (r - (a + beta_items[book])) / N
        
    
    def beta_item(a):
        for X, r in zip(Xtrain, ytrain):
            user, book = X
            r = int(r)
            N = lambda_value + len(reviewsPerBook[book])
            beta_items[book] += (r - (a + beta_users[user]))/ N
    
    last_alpha = 1
    current_alpha = 0    
    while abs(current_alpha - last_alpha)  > 0.001:
        last_alpha = current_alpha
        
        current_alpha = alpha1()
        print(current_alpha)
        for user in bookPerUser.keys():
            beta_users[user] = 0
        beta_user(current_alpha)
        
        for book in usersPerBook.keys():
            beta_items[book] = 0
        beta_item(current_alpha)
        
    
    return current_alpha, beta_users, beta_items

In [59]:
alpha, userBiases, bookBiases = calculate(2.6)

3.897121052631579
3.890945336866959
3.884670470570013
3.8788226299683974
3.8734246396728955
3.8684450920264095
3.8638508425080094
3.859612133751425
3.8557021709354355
3.8520964528826807
3.848772350821494
3.8457088614250576
3.8428864510646696
3.8402869445492813
3.8378934354206065
3.8356902071204293
3.8336626602356247
3.8317972437534804
3.830081389502148
3.8285034494865458
3.82705263604893
3.8257189648549814
3.824493200718295
3.8233668062616704
3.8223318933910373
3.8213811775381554


# ALT 2

In [782]:
# ALT 2
import random
lamb = 1
lr = 1
alpha = ratingMean
userBiases = defaultdict(float)
bookBiases = defaultdict(float)
for user in bookPerUser.keys():
    userBiases[user] = random.random() - 0.5
for book in usersPerBook.keys():
    bookBiases[book] = random.random() - 0.5
    
def derivative(lamb, alpha, userBiases, bookBiases, Xy_train):
    dalpha = 0
    dUserBiases = defaultdict(float)
    dBookBiases = defaultdict(float)
    for user, book, rating in Xy_train:
        rating = int(rating)
        
        # Alpha
        dalpha += (2/N) * (alpha + userBiases[user] + bookBiases[book] - rating)
        
        # User
        books = bookPerUser[user]
        temp = 2*sum([alpha + userBiases[user] + bookBiases[book2] - rating for book2 in books])
        temp += 2* lamb * userBiases[user] # Regularization
        dUserBiases[user] = temp / N # Maybe remove N
        
        # Book
        users = usersPerBook[book]
        temp = 2*sum([alpha + userBiases[user2] + bookBiases[book] - rating for user2 in users])
        temp += 2* lamb * bookBiases[book] # Regularization
        dBookBiases[book] = temp / N # Maybe remove N
        
    return dalpha, dUserBiases,dBookBiases

for x in range(10):
    
    dalpha, dUserBiases, dBookBiases = derivative(lamb, alpha, userBiases, bookBiases, Xy_train)
    
    alpha -= lr * dalpha
    for user in userBiases.keys():
        userBiases[user] -= lr * dUserBiases[user]
    for book in bookBiases.keys():
        bookBiases[book] -= lr * dBookBiases[book]
    
    print(cost(lamb, userBiases, bookBiases, Xtrain, ytrain))
    

1556.0249390559


KeyboardInterrupt: 

In [60]:
predictions = []
for user, book in Xvalid:
    predictions.append(prediction(user, book))

mse = MSE(predictions, yvalid)
print("MSE: {}".format(mse))

MSE: 1.1084461391876665


## Task 10
Report the user and book IDs that have the largest and smallest values of β (1 mark).

In [30]:
# Code here
userBiases_items = list(userBiases.items())
userBiases_items.sort(key=lambda x: x[1], reverse=True)
print("\n Largest user bias: {} \n Smallest user bias: {}".format(userBiases_items[0], userBiases_items[-1]))

bookBiases_items = list(bookBiases.items())
bookBiases_items.sort(key=lambda x: x[1], reverse=True)
print("\n Largest book bias: {} \n Smallest book bias: {}".format(bookBiases_items[0], bookBiases_items[-1]))


 Largest user bias: ('u32162993', 1.5065993547347916) 
 Smallest user bias: ('u47604248', -4.1307309523718345)

 Largest book bias: ('b19925500', 1.6016881017938551) 
 Smallest book bias: ('b84091840', -2.1038598817102456)


## Task 11
Find a better value of λ using your validation set. Report the value you chose, its MSE, and upload your solution to Kaggle by running it on the test data (1 mark).
# Kaggle Username: kristogj

In [79]:
lamb = 0.00002
x, f, d = scipy.optimize.fmin_l_bfgs_b(cost, [alpha] + [0.0]*(nUsers+nBooks),
                             derivative, args = (ytrain, lamb))

I found the right value for lamb by looping over different values of lamb and observing how the mse was chaning. 

In [80]:
predictions = []
for user, book in Xvalid:
    predictions.append(prediction(user, book))

mse = MSE(predictions, yvalid)
print("MSE: {}".format(mse))

MSE: 1.1742062925437837


In [61]:
predictions = open("predictions_Rating.txt", 'w')
for l in open("pairs_Rating.txt"):
    if l.startswith("userID"):
        #header
        predictions.write(l)
        continue
    user, book = l.strip().split('-')
    pred = str(prediction(user, book))
    predictions.write(user + '-' + book + ',' + pred + '\n')
predictions.close()