Loading libraries

In [42]:
import gzip
from collections import defaultdict
import numpy as np
import random
import pandas as pd
import itertools
from sklearn.metrics import confusion_matrix

Defining functions to extract zip, and extract data.

In [43]:
def readGz(path):
    for l in gzip.open(path, "rt"):
        yield eval(l)


def readCSV(path):
    f = gzip.open(path, "rt")
    f.readline()
    for l in f:
        yield l.strip().split(",")

Loading Training data

In [44]:
data = []

for user, book, rating in readCSV("train_Interactions.csv.gz"):
    data.append([user, book, rating])

train_data = data[:190000]
valid_data = data[190000:]

books = set(np.array(data).T[1])
users = set(np.array(data).T[0])

Visaulizing dataset

In [45]:
usersPerBook = defaultdict(set)
booksPerUser = defaultdict(set)

for user, book, r in train_data:
    usersPerBook[book].add(user)
    booksPerUser[user].add(book)

Question 1, Generating negative cases for user in each entry

In [46]:
# Dont run this... everytime.

neg_valid_data = []

for user, _, _ in valid_data:
    unreadbooks = list(books.difference(booksPerUser[user]))
    datum = [user, random.choice(unreadbooks), -1]
    neg_valid_data.append(datum)

valid_data = valid_data + neg_valid_data

Training over training data

In [47]:
bookCount = defaultdict(int)
totalRead = 0

for user, book, _ in train_data:
    bookCount[book] += 1
    totalRead += 1

In [48]:
mostPopular = [(bookCount[x], x) for x in bookCount]
mostPopular.sort()
mostPopular.reverse()

def top_n_books(mostPopular, c):
    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > totalRead * c:
            break
    return return1, count

return1, count = top_n_books(mostPopular, 0.50)

In [49]:
def pred_read(valid_data, return1):
    y_valid_pred = []
    for user, book, _ in valid_data:
        if book in return1:
            y_valid_pred.append(1)
        else:
            y_valid_pred.append(0)
    return y_valid_pred


y_valid = np.array([1] * 10000 + [0] * 10000)
y_valid_pred = pred_read(valid_data, return1)

In [50]:
accuracy = sum(y_valid == y_valid_pred) * 100 / len(y_valid_pred)

print(accuracy)

64.855


Defining function to calculate diagnostic metrics

In [53]:
def performance_metrics(y, y_pred, t):
    accuracy = sum(y_pred == y) / len(y)
    c_m = confusion_matrix(y, y_pred)
    TP = c_m[1][1]
    FP = c_m[0][1]
    FN = c_m[1][0]
    TN = c_m[0][0]
    BER = 1 - 0.5 * (TP / (TP + FN) + TN / (TN + FP))
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    F1 = 2 * precision * recall / (precision + recall)
    return t, accuracy, BER, precision, recall, F1

Question 2, Trying better definitons of popularity by manipulating the propotion of users reading it

In [54]:
criteria = [0.3, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.8, 0.9, 0.95]

top_books = [(top_n_books(mostPopular, c), c) for c in criteria]

y_valid_accuracies = [
    performance_metrics(y_valid, pred_read(valid_data, return1[0]), c)
    for return1, c in top_books
]

In [56]:
pd.DataFrame(
    y_valid_accuracies,
    columns=[
        "Popularity in Training Set",
        "Accuracy",
        "BER",
        "Precision",
        "Recall",
        "F1",
    ],
)

Unnamed: 0,Popularity in Training Set,Accuracy,BER,Precision,Recall,F1
0,0.3,0.60935,0.39065,0.794982,0.2947,0.429999
1,0.4,0.6305,0.3695,0.753792,0.3876,0.511954
2,0.45,0.64095,0.35905,0.737971,0.4371,0.549017
3,0.5,0.64855,0.35145,0.721089,0.4845,0.57958
4,0.55,0.65335,0.34665,0.700641,0.5355,0.60704
5,0.6,0.65315,0.34685,0.680071,0.5784,0.625128
6,0.65,0.64965,0.35035,0.656915,0.6265,0.641347
7,0.7,0.6436,0.3564,0.635268,0.6744,0.654249
8,0.8,0.6213,0.3787,0.593107,0.7727,0.671096
9,0.9,0.57955,0.42045,0.550066,0.874,0.67519


The better threshold is 0.55 as you can see above( has best accuracy), but still we do not have a tremendous improvement. We can look for desiging better recommendation system than simply predicting based on popularity.

Question 3, Implementing Jaccard similarity on the books the user has read before to ascertain whether the user would be reading the new book or not

In [57]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer / denom

In [58]:
def cosine(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1)**0.5 * len(s2)**0.5
    try:
        sim = numer/denom
    except ZeroDivisionError:
        return 0
    return sim

In [59]:
return1, count = top_n_books(mostPopular, 0.70)

def pred_jaccard(threshold):
    y_valid_pred = []
    for user, book, _ in valid_data:
        recommend = 0
        books = booksPerUser[user]
        for each_book in books:
            if each_book == book or each_book not in return1:
                continue
            else:
                users = usersPerBook[each_book]
                sim = Jaccard(users, usersPerBook[book])
                if sim > threshold:
                    recommend = 1
                    break
        y_valid_pred.append(recommend)
    return y_valid_pred

In [60]:
thresholds = [0.0070, 0.0075, 0.0080, 0.0090, 0.0095, 0.01, 0.0125, 0.015]

y_valid_accuracies = [
    performance_metrics(y_valid, pred_jaccard(t), t) for t in thresholds
]

pd.DataFrame(
    y_valid_accuracies,
    columns=["Similarity Threshold", "Accuracy", "BER", "Precision", "Recall", "F1"],
)

Unnamed: 0,Similarity Threshold,Accuracy,BER,Precision,Recall,F1
0,0.007,0.6125,0.3875,0.575707,0.8555,0.688254
1,0.0075,0.61515,0.38485,0.57881,0.8457,0.687254
2,0.008,0.61905,0.38095,0.583106,0.8353,0.686783
3,0.009,0.6219,0.3781,0.588874,0.8077,0.681144
4,0.0095,0.62245,0.37755,0.591675,0.7903,0.676714
5,0.01,0.6243,0.3757,0.595896,0.7724,0.672764
6,0.0125,0.6199,0.3801,0.609139,0.6692,0.637759
7,0.015,0.60405,0.39595,0.618548,0.5429,0.578261


The best of the lot is 0.01 for the similarity threshold, whihc gives the best accuracy

Question 4

The prediction on the Jaccard Similirity based recommender system performs worse than popularity based. Hence we go for a best of both worlds model as the new recommendation system to predict whether user will read a book or not

In [111]:
def pred_combined(threshold, c):
    y_valid_pred = []
    return1, _ = top_n_books(mostPopular, c)
    for user, book, _ in valid_data:
        recommend_sim = 0
        recommend_pop = 0
        similarities = []

        if book in return1:
            # y_valid_pred.append(1)
            recommend_pop = 1

        books = booksPerUser[user]
        for each_book in books:
            if each_book == book or each_book not in return1:
                continue
            else:
                users = usersPerBook[each_book]
                sim = Jaccard(users, usersPerBook[book])
                if sim > threshold:
                    recommend_sim = 1
                    break
                       
        y_valid_pred.append(recommend_sim and recommend_pop)

    return y_valid_pred

In [121]:
thresholds = np.arange(0.008, 0.009, 0.00002)
criteria = np.arange(0.6, 0.8, 0.02)

thresholds_criteria = list(itertools.product(thresholds, criteria))

y_valid_accuracies = [(c,)+performance_metrics(y_valid, pred_combined(t, c), t) for t, c in thresholds_criteria]

performance_matrix = pd.DataFrame(y_valid_accuracies, columns = ['Popularity Threshold','Similarity Threshold','Accuracy','BER','Precision','Recall','F1'])

In [122]:
performance_matrix.sort_values(
    by=["Accuracy", "BER", "Recall"], ascending=[False, True, False]
)[:30]

Unnamed: 0,Popularity Threshold,Similarity Threshold,Accuracy,BER,Precision,Recall,F1
314,0.72,0.00856,0.66115,0.33885,0.673485,0.6256,0.64866
325,0.72,0.00858,0.6611,0.3389,0.67345,0.6255,0.64859
336,0.72,0.0086,0.661,0.339,0.673379,0.6253,0.64845
347,0.72,0.00862,0.661,0.339,0.673379,0.6253,0.64845
149,0.72,0.00826,0.6609,0.3391,0.670916,0.6316,0.650664
193,0.72,0.00834,0.6609,0.3391,0.671755,0.6293,0.649835
204,0.72,0.00836,0.6609,0.3391,0.671755,0.6293,0.649835
138,0.72,0.00824,0.66085,0.33915,0.670844,0.6316,0.650631
6,0.72,0.008,0.66085,0.33915,0.669691,0.6348,0.651779
17,0.72,0.00802,0.66085,0.33915,0.669691,0.6348,0.651779


Combining the popularity and similarity models by doing an AND operation on the two predictions to get more confident predictions. The above matrix shows the top 30 performing matrix for the hyperparameters values. Using 0.7 as popularity threshold and 0.0085 similarity threshold give the best performance in terms of accuracy.

Question 5, With the model and hyperparameters in question 4 we run the prediction on the test data set. The resulting file is uploaded to Kaggle. (Username : mouserat)

In [125]:
return1, _ = top_n_books(mostPopular, 0.72)

predictions = open("predictions_Read.txt", "w")
for l in open("pairs_Read.txt"):
    if l.startswith("userID"):
        # header
        predictions.write(l)
        continue

    u, b = l.strip().split("-")
    recommend_sim = 0
    recommend_pop = 0

    if b in return1:
        recommend_pop = 1
        # predictions.write(u + '-' + b + "," + str(recommend) + "\n")

    bs = booksPerUser[u]
    for each_book in bs:
        if each_book == b or each_book not in return1:
            continue
        else:
            us = usersPerBook[each_book]
            sim = Jaccard(us, usersPerBook[b])
            if sim > 0.0087:
                recommend_sim = 1
                break
    # predictions.write(u + '-' + b + "," + str(recommend) + "\n")
    predictions.write(u + "-" + b + "," + str(recommend_sim and recommend_pop) + "\n")

predictions.close()

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])