# CSE 258, Fall 2019: Assignment 1
Files: 
* http://cseweb.ucsd.edu/classes/fa19/cse258-a/files/assignment1.tar.gz   

Kaggle:
* https://inclass.kaggle.com/c/cse158258-fa19-read-prediction
* (258 only) https://inclass.kaggle.com/c/cse258-fa19-rating-prediction

# Tasks (Read prediction)   
Since we don’t have access to the test labels, we’ll need to simulate validation/test sets of our own.    
So, let’s split the training data (`train_Interactions.csv.gz`) as follows:
1. Reviews 1-190,000 for training
2. Reviews 190,001-200,000 for validation
3. Upload to Kaggle for testing only when you have a good model on the validation set. This will save you time (since Kaggle can take several minutes to return results), and prevent you from exceeding your daily submission limit.

# Import Dependencies

In [1]:
import gzip
from collections import defaultdict
import random
import numpy as np
import matplotlib.pyplot as plt
import random
import scipy
from sklearn.linear_model import LogisticRegression

# Helper Methods

In [2]:
def readGz(path):
    """Yield one parsed Python object per line of a gzip-compressed text file.

    Args:
        path: Path of the gzip file; it is opened in text mode.

    Yields:
        The result of evaluating each line as a Python expression.

    The file is opened with a context manager, so the handle is released when
    the generator is exhausted or closed instead of staying open.
    """
    with gzip.open(path, 'rt') as f:
        for l in f:
            # SECURITY: eval() executes whatever expression the line contains.
            # Only feed this trusted data files; ast.literal_eval is the safer
            # choice when the lines are plain literals.
            yield eval(l)
    
def readCSV(path):
    """Yield every data row of a gzip-compressed CSV file as a list of strings.

    Args:
        path: Path of the gzip file; it is opened in text mode.

    Yields:
        The comma-separated fields of each line after the first, with
        surrounding whitespace stripped from the line before splitting.

    The first line is treated as a header and discarded. Fields are split on
    every ',' with no quote handling. The file is closed when the generator is
    exhausted or closed.
    """
    with gzip.open(path, 'rt') as f:
        # Skip the header row; the default keeps an empty file from raising.
        next(f, None)
        for l in f:
            yield l.strip().split(',')
        
def accuracy(predictions, labels):
    """Return the fraction of positions where prediction and label are equal.

    Both arguments are converted to NumPy arrays and compared element-wise;
    the number of matches is divided by the number of predictions.
    """
    predicted = np.array(predictions)
    expected = np.array(labels)
    matches = predicted == expected
    return sum(matches) / len(predicted)

def most_popular_percentile(mostPopular, percentile, total_read=None):
    """Collect books from the front of a ranked list until a share of reads is covered.

    Args:
        mostPopular: Iterable of (read_count, book) pairs, already in the
            order in which books should be taken.
        percentile: Fraction of the total read count to cover (e.g. 0.10).
        total_read: Total number of reads the fraction refers to. When
            omitted, the module-level `totalRead` is used, which is how the
            two-argument calls in this file behave.

    Returns:
        The set of books taken. The book whose count pushes the running total
        strictly above `percentile * total_read` is included.
    """
    if total_read is None:
        # Backward-compatible fallback for existing two-argument callers.
        total_read = totalRead
    threshold = percentile * total_read

    selected = set()
    covered = 0
    for b_count, b in mostPopular:
        covered += b_count
        selected.add(b)
        if covered > threshold:
            break
    return selected

def cosine_sim(s1, s2):
    """Return the cosine similarity of two sets viewed as binary vectors.

    The similarity is |s1 & s2| / sqrt(|s1| * |s2|). The previous version
    divided by |s1| * |s2| without the square root, so two identical sets of
    size n scored 1/n instead of 1.

    Args:
        s1: First set.
        s2: Second set.

    Returns:
        A float in [0, 1]; 0.0 when the sets share no element, which includes
        the case where either set is empty.
    """
    numer = len(s1.intersection(s2))
    if numer == 0:
        # Covers empty inputs too, so the division below never sees a zero.
        return 0.0
    return numer / (len(s1) * len(s2)) ** 0.5
    
def best_cosine(user, book):
    """Return cosine similarities between `book` and the user's other books, highest first.

    The readers of `book` are compared with the readers of every book in
    `bookPerUser[user]` except `book` itself. When there is nothing to compare
    against, the result is `[0]`.
    """
    readers = usersPerBook[book]
    scores = sorted(
        (
            cosine_sim(readers, usersPerBook[other])
            for other in bookPerUser[user]  # books that user has read
            if other != book
        ),
        reverse=True,
    )
    return scores if scores else [0]

def Jaccard(s1, s2):
    """Return the Jaccard similarity |s1 & s2| / |s1 | s2| of two sets.

    Args:
        s1: First set.
        s2: Second set.

    Returns:
        A float in [0, 1]. Two empty sets give 0.0 instead of raising
        ZeroDivisionError, matching how cosine_sim treats empty input.
    """
    union_size = len(s1.union(s2))
    if union_size == 0:
        return 0.0
    return len(s1.intersection(s2)) / union_size

def best_jacc(user, book):
    """Return Jaccard similarities between `book` and the user's other books, highest first.

    The readers of `book` are compared with the readers of every book in
    `bookPerUser[user]` except `book` itself. The list is empty when the user
    has no other books.
    """
    readers = usersPerBook[book]
    scores = [
        Jaccard(readers, usersPerBook[other])
        for other in bookPerUser[user]
        if other != book
    ]
    scores.sort(reverse=True)
    return scores
        

# Load Data

In [4]:
# Keep the first two CSV fields of every row and append the label 1 ("read").
data = [[*fields[:2], 1] for fields in readCSV("train_Interactions.csv.gz")]

# Extend validation set

In [5]:
Xy_train = data[:190000]
Xy_valid = data[190000:]

# Index the training interactions both ways: which users read each book, and
# which books each user read.
usersPerBook = defaultdict(set)
bookPerUser = defaultdict(set)
for userID, bookID, rating in Xy_train:
    usersPerBook[bookID].add(userID)
    bookPerUser[userID].add(bookID)

# Books each user has read anywhere in the labelled data (training AND
# validation slices). Negatives are rejected against this, not against the
# training-only index, so a sampled "unread" pair can never coincide with one
# of the user's positive validation pairs.
booksReadByUser = defaultdict(set)
for userID, bookID, rating in data:
    booksReadByUser[userID].add(bookID)

# Add one randomly drawn negative sample per positive validation pair.
# The candidate list is built once; rebuilding it for every draw made the
# sampling loop needlessly slow.
available_books = list(usersPerBook)
negative_samples = []
for user, book, rating in Xy_valid:
    random_book = random.choice(available_books)
    while random_book in booksReadByUser[user]:
        random_book = random.choice(available_books)
    negative_samples.append([user, random_book, 0])
Xy_valid += negative_samples  # Add the negative data
random.shuffle(Xy_valid)

Xtrain, ytrain = [d[:2] for d in Xy_train], [int(d[2]) for d in Xy_train]
Xvalid, yvalid = [d[:2] for d in Xy_valid], [int(d[2]) for d in Xy_valid]

# Build Model

In [6]:
# BASELINE: ACC 0.6576

# Count how often each book appears in the training pairs.
bookCount = defaultdict(int)
for _user, book in Xtrain:
    bookCount[book] += 1
totalRead = len(Xtrain)

# (count, book) pairs, most-read first.
mostPopular = sorted(
    ((count, book_id) for book_id, count in bookCount.items()),
    reverse=True,
)

# Books taken from the popular end until 6% / 10% / 30% of all reads are covered.
top_6 = most_popular_percentile(mostPopular, 0.06)
top_10 = most_popular_percentile(mostPopular, 0.10)
top_30 = most_popular_percentile(mostPopular, 0.30)

# Re-sort ascending so the same helper walks from the least-read end instead.
mostPopular.sort()
worst30 = most_popular_percentile(mostPopular, 0.30)
worst10 = most_popular_percentile(mostPopular, 0.10)

def predict(user, book):
    """Predict whether `user` has read `book`.

    The decision combines the book's popularity bucket (`worst10`, `worst30`,
    `top_10`) with the mean Jaccard similarity between `book` and the other
    books of the user, as returned by `best_jacc`.

    Args:
        user: User ID.
        book: Book ID.

    Returns:
        1 for "read", 0 for "not read".
    """
    # Books in the least-read bucket are always predicted unread, so the
    # similarity computation is skipped for them.
    if book in worst10:
        return 0

    jaccard_sims = best_jacc(user, book)
    # The small epsilon keeps the mean defined (0.0) when the list is empty.
    jacc_avg = sum(jaccard_sims) / (len(jaccard_sims) + 10**(-8))

    if book in worst30:
        # Rarely read: require the highest similarity.
        return int(jacc_avg > 0.0025)
    if book in top_10:
        # Popular: predicted read unless the similarity is very low.
        return 0 if jacc_avg < 0.0014 else 1
    # Everything in between.
    return int(jacc_avg > 0.0016)

# Validate

In [7]:
TP = FP = FN = TN = 0
avgs = []

# One prediction per validation pair, in the same order as yvalid.
predictions = [predict(user, book) for user, book in Xvalid]

print("NUMBER OF ONES PREDICTED",sum(predictions), len(predictions))

# Collapse the labels to 0/1 before comparing them with the predictions.
yvalid = [int(label > 0) for label in yvalid]
print("Accuracy: {}".format(accuracy(predictions, yvalid)))

NUMBER OF ONES PREDICTED 9131 20000
Accuracy: 0.68575


# Upload Test Results
## Kaggle Username: kristogj

In [9]:
avgs = []
test_pred = []
# Both files are context-managed so they are closed (and the output flushed)
# even if a prediction raises. The input is opened first, so a missing
# pairs file no longer leaves behind a truncated predictions file.
with open("pairs_Read.txt") as pairs, open("predictions_Read.txt", 'w') as predictions:
    for l in pairs:
        if l.startswith("userID"):
            # Header line: copy it through unchanged.
            predictions.write(l)
            continue
        user, book = l.strip().split('-')
        pred = predict(user, book)
        test_pred.append(pred)
        predictions.write(user + '-' + book + ",{}\n".format(pred))
print(sum(test_pred), len(test_pred))

8691 20000


# (CSE 258 only) Tasks (Rating prediction)


## Load Data

In [10]:
# Full rows this time: the last field of each row is parsed as the integer rating.
data = list(readCSV("train_Interactions.csv.gz"))
Xy_train = data[:190000]
Xy_valid = data[190000:]

Xtrain = [row[:2] for row in Xy_train]
ytrain = [int(row[-1]) for row in Xy_train]
Xvalid = [row[:2] for row in Xy_valid]
yvalid = [int(row[-1]) for row in Xy_valid]

## Helper Methods

In [11]:
def MSE(predictions, labels):
    """Return the mean of the squared differences between paired predictions and labels.

    Values are paired with zip, so the comparison stops at the shorter input.
    """
    squared_errors = []
    for predicted, actual in zip(predictions, labels):
        squared_errors.append((predicted - actual) ** 2)
    return sum(squared_errors) / len(squared_errors)

def prediction(user, book):
    """Return the predicted rating alpha + bias(user) + bias(book).

    Args:
        user: User ID.
        book: Book ID.

    Returns:
        The global offset `alpha` plus the user's and the book's bias. An ID
        with no entry in `userBiases` / `bookBiases` contributes 0.0.

    Biases are looked up with .get() rather than by indexing, so predicting
    for an unseen user or book no longer inserts a zero entry into the
    defaultdict bias tables as a side effect.
    """
    return alpha + userBiases.get(user, 0.0) + bookBiases.get(book, 0.0)


## Organize the data into lookup structures for fitting

In [12]:
# Reviews
reviewsPerUser = defaultdict(list)
reviewsPerBook = defaultdict(list)

# Ids
usersPerBook = defaultdict(set)
bookPerUser = defaultdict(set)

for user, book, rating in Xy_train:
    rating = int(rating)
    reviewsPerUser[user].append((book,rating))
    reviewsPerBook[book].append((user,rating))
    usersPerBook[book].add(user)
    bookPerUser[user].add(book)


## Build Model
Fitting a predictor of the form

$rating(user, item) = \alpha + \beta_{user} + \beta_{item}$

In [13]:
# Constant added to the per-user / per-book review count in the bias updates.
lamb = 3.1
# Number of training pairs; alpha is averaged over this many residuals.
N = len(Xtrain)
# Bias tables; an ID that has not been assigned yet reads as 0.0.
userBiases, bookBiases = defaultdict(float), defaultdict(float)

def MSE(predictions, labels):
    """Return the mean squared error over the zipped (prediction, label) pairs."""
    squared_errors = list(
        map(lambda pair: (pair[0] - pair[1]) ** 2, zip(predictions, labels))
    )
    return sum(squared_errors) / len(squared_errors)


last_alpha = 1
alpha = 0
counter = 0  # completed sweeps; only used to print every 10th alpha

# Repeat the three updates (alpha, user biases, book biases) until alpha
# changes by no more than 1e-7 from one sweep to the next.
while abs(alpha - last_alpha) > 1e-7:
    last_alpha = alpha

    # alpha: average of the ratings after subtracting the current biases.
    alpha = 0
    for (user, book), rating in zip(Xtrain, ytrain):
        alpha += int(rating) - (userBiases[user] + bookBiases[book])
    alpha /= N

    # User biases: zero them, then accumulate each residual divided by
    # (lamb + number of reviews by that user).
    for user in bookPerUser:
        userBiases[user] = 0
    for (user, book), rating in zip(Xtrain, ytrain):
        denom = lamb + len(reviewsPerUser[user])
        userBiases[user] += (int(rating) - (alpha + bookBiases[book])) / denom

    # Book biases: same scheme, using the user biases just computed.
    for book in usersPerBook:
        bookBiases[book] = 0
    for (user, book), rating in zip(Xtrain, ytrain):
        denom = lamb + len(reviewsPerBook[book])
        bookBiases[book] += (int(rating) - (alpha + userBiases[user])) / denom

    if counter % 10 == 0:
        print(alpha)
    counter += 1

3.897121052631579
3.84464046269895
3.8242710704646954
3.8167082037656797
3.81394505274029
3.812943912632831
3.8125827792232396
3.812452816999202
3.8124061061685346
3.812389328844536
3.812383305070624
3.812381142706472


# Validate

In [14]:
# Predicted rating for every validation pair, in the same order as yvalid.
predictions = [prediction(user, book) for user, book in Xvalid]

mse = MSE(predictions, yvalid)
print("MSE: {}".format(mse))

MSE: 1.1080403998850257


## Task 10
Report the user and book IDs that have the largest and smallest values of β

In [16]:
# Code here
# (user, bias) pairs ordered from the highest bias to the lowest.
userBiases_items = sorted(userBiases.items(), key=lambda entry: entry[1], reverse=True)
top_user, bottom_user = userBiases_items[0], userBiases_items[-1]
print("\n Largest user bias: {} \n Smallest user bias: {}".format(top_user, bottom_user))

# (book, bias) pairs ordered from the highest bias to the lowest.
bookBiases_items = sorted(bookBiases.items(), key=lambda entry: entry[1], reverse=True)
top_book, bottom_book = bookBiases_items[0], bookBiases_items[-1]
print("\n Largest book bias: {} \n Smallest book bias: {}".format(top_book, bottom_book))


 Largest user bias: ('u81539151', 1.1914740282744452) 
 Smallest user bias: ('u76571258', -3.459309613286763)

 Largest book bias: ('b19925500', 1.0692324925129244) 
 Smallest book bias: ('b84091840', -1.3912266598173963)



# Kaggle Username: kristogj

In [17]:
# Both files are context-managed so they are closed (and the output flushed)
# even if a prediction raises. The input is opened first, so a missing
# pairs file no longer leaves behind a truncated predictions file.
with open("pairs_Rating.txt") as pairs, open("predictions_Rating.txt", 'w') as predictions:
    for l in pairs:
        if l.startswith("userID"):
            # Header line: copy it through unchanged.
            predictions.write(l)
            continue
        user, book = l.strip().split('-')
        pred = str(prediction(user, book))
        predictions.write(user + '-' + book + ',' + pred + '\n')