In [176]:
import numpy as np
import pandas as pd
import gzip
from collections import defaultdict
from sklearn import linear_model
import csv
import random

In [177]:
##reading data

def readGz(path):
    """Yield one parsed Python object per line of a gzip-compressed text file.

    Parameters
    ----------
    path : str
        Path to a gzip file whose lines each contain a Python expression
        (for example the repr of a dict).

    Yields
    ------
    object
        The result of evaluating each line.

    SECURITY NOTE: lines are parsed with eval(), which executes arbitrary
    code. Only call this on trusted data files. If every line is a plain
    literal, ast.literal_eval is a safer replacement.
    """
    # The context manager closes the file once the generator is exhausted
    # or garbage-collected, instead of leaking the handle.
    with gzip.open(path, 'rt') as f:
        for l in f:
            yield eval(l)

def readCSV(path):
    """Yield (user_id, recipe_id, row) for each data row of a gzipped CSV file.

    Parameters
    ----------
    path : str
        Path to a gzip-compressed CSV file whose header row contains at least
        the columns 'user_id' and 'recipe_id'.

    Yields
    ------
    tuple
        (user_id, recipe_id, row) where `row` is a dict mapping every header
        name to that row's string value.

    An empty file yields nothing.
    """
    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends; the `with` block guarantees the file is closed.
    with gzip.open(path, 'rt', newline='') as f:
        c = csv.reader(f)
        header = next(c, None)
        if header is None:
            # No header row at all: nothing to yield.
            return
        for l in c:
            d = dict(zip(header, l))
            yield d['user_id'], d['recipe_id'], d

In [178]:
##Q1

# Build the train/validation split and sample one negative per positive.
#
# Rows are split in file order: rows 1..399,999 go to the training set, and
# every later row becomes a positive validation example (label 1).
# NOTE: `totalCooked` counts *all* rows, validation rows included, whereas
# `recipeCount` only counts training rows.
random.seed(0)  # make the negative sampling below reproducible

recipeCount = defaultdict(int)
totalCooked = 0
user_cooked = defaultdict(list)

train_set = []
validation_set = []
all_recipes = set()

for user, recipe, _ in readCSV("trainInteractions.csv.gz"):
    totalCooked += 1
    all_recipes.add(recipe)
    user_cooked[user].append(recipe)
    if totalCooked < 400000:
        recipeCount[recipe] += 1
        train_set.append((user, recipe))
    else:
        validation_set.append((user, recipe, 1))

# Sorted so the sampling order does not depend on set iteration order
# (string hashing is randomized between interpreter runs).
all_recipes_list = sorted(all_recipes)

# Per-user set of recipes that must not be drawn as negatives: everything the
# user cooked plus negatives already drawn for them. Sets give O(1) membership
# tests; `user_cooked` itself is left untouched so it stays a true history.
excluded = {user: set(cooked) for user, cooked in user_cooked.items()}

# One negative (label 0) per positive. Positives occupy the first `n_positive`
# slots of `validation_set`; the negatives are appended after them.
n_positive = len(validation_set)
for idx in range(n_positive):
    user = validation_set[idx][0]
    # Rejection sampling: redraw until we hit a recipe this user never cooked.
    while True:
        recipe_ = random.choice(all_recipes_list)
        if recipe_ not in excluded[user]:
            validation_set.append((user, recipe_, 0))
            excluded[user].add(recipe_)
            break


In [180]:
# Popularity baseline: predict "made" for recipes in the most popular set.

# (count, recipe) pairs ordered from most to least popular.
mostPopular = sorted(((recipeCount[x], x) for x in recipeCount), reverse=True)

# Take recipes in popularity order until they cover more than half of
# `totalCooked` interactions.
# NOTE(review): `totalCooked` was incremented for every row read, while
# `recipeCount` only covers training rows, so "half" here is measured against
# the full file rather than the training set -- confirm this is intended.
return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalCooked/2: break

# Score by label instead of by position, so the result does not depend on a
# hard-coded number of positive examples.
TP = sum(1 for _, recipe, label in validation_set
         if label == 1 and recipe in return1)
TN = sum(1 for _, recipe, label in validation_set
         if label == 0 and recipe not in return1)
print("Accuracy on validation set = " + str((TP+TN)/len(validation_set)))

Accuracy on validation set = 0.6739332606673933


In [182]:
##Q2

def choose_threshold(threshold, popular=None, total=None, validation=None):
    """Return the validation accuracy of the popularity model for a cut-off.

    Recipes are taken in popularity order until their cumulative interaction
    count exceeds ``total / threshold``; a (user, recipe) pair is predicted
    "made" when its recipe is in that set.

    Parameters
    ----------
    threshold : float
        Divisor applied to the total interaction count (2 means "recipes
        covering just over half of the interactions"). Must be positive.
    popular : list of (count, recipe), optional
        Recipes sorted from most to least popular. Defaults to the notebook
        global ``mostPopular``.
    total : int, optional
        Total interaction count. Defaults to the notebook global
        ``totalCooked``.
    validation : list of (user, recipe, label), optional
        Labelled pairs, label 1 = made, 0 = not made. Defaults to the notebook
        global ``validation_set``.

    Returns
    -------
    float
        Fraction of validation pairs classified correctly.

    Raises
    ------
    ValueError
        If ``threshold`` is not positive or the validation set is empty.
    """
    if popular is None:
        popular = mostPopular
    if total is None:
        total = totalCooked
    if validation is None:
        validation = validation_set
    if threshold <= 0:
        raise ValueError("threshold must be positive")
    if not validation:
        raise ValueError("validation set is empty")

    cutoff = total / threshold
    popular_recipes = set()
    covered = 0
    for n_interactions, recipe_id in popular:
        covered += n_interactions
        popular_recipes.add(recipe_id)
        if covered > cutoff:
            break

    # A prediction is correct when "recipe is popular" agrees with the label.
    # Scoring by label avoids relying on positives occupying a fixed-size
    # prefix of the validation list.
    correct = sum(
        1 for _, recipe_id, label in validation
        if (recipe_id in popular_recipes) == (label == 1)
    )
    return correct / len(validation)

# Sweep the popularity divisor over 1.50, 1.51, ..., 2.19 and keep the first
# value that achieves the highest validation accuracy.
max_val, choosen_val = 0, 0
for step in range(150, 220):
    candidate = step / 100
    candidate_accuracy = choose_threshold(candidate)
    if candidate_accuracy > max_val:
        max_val, choosen_val = candidate_accuracy, candidate
print(choosen_val, max_val)

2.11 0.6758182418175819


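Using the most popular recipes that cover the top 47% of interactions (divisor 2.11, i.e. 1/2.11 ≈ 0.47) gives a higher validation accuracy of 0.6758, compared with 0.6739 for the 50% baseline.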

In [183]:
##Q3

def Jaccard(set1,set2):
    """Return the Jaccard similarity |set1 & set2| / |set1 | set2|.

    Parameters
    ----------
    set1, set2 : set
        The two sets to compare.

    Returns
    -------
    float
        A value in [0, 1]. Two empty sets have similarity 0.0 rather than
        raising ZeroDivisionError.
    """
    union_size = len(set1.union(set2))
    if union_size == 0:
        return 0.0
    return len(set1.intersection(set2))/union_size

# Index the training interactions both ways.
recipes_per_user = defaultdict(list)
users_per_recipe = defaultdict(list)

for user, recipe in train_set:
    recipes_per_user[user].append(recipe)
    users_per_recipe[recipe].append(user)

# Convert each recipe's user list to a set once, instead of rebuilding the
# sets inside the inner loop for every validation pair.
user_sets = {recipe: set(users) for recipe, users in users_per_recipe.items()}
no_users = set()  # stand-in for recipes that never appear in training

# For each validation pair (u, g): the maximum Jaccard similarity between g
# and any other recipe u made in training, comparing recipes by the sets of
# users who made them. max_values[k] lines up with validation_set[k].
max_values = []
for pair in validation_set:
    user, query_recipe = pair[0], pair[1]
    # .get() avoids inserting empty entries into the defaultdicts.
    query_users = user_sets.get(query_recipe, no_users)
    max_sim = 0
    for recipe in recipes_per_user.get(user, ()):
        if recipe == query_recipe:
            continue
        max_sim = max(max_sim, Jaccard(user_sets[recipe], query_users))
    max_values.append(max_sim)

In [184]:
def jaccard_threshold(threshold, similarities=None, validation=None):
    """Return the validation accuracy of the Jaccard model for a threshold.

    A pair is predicted "made" when its similarity is strictly greater than
    ``threshold`` and "not made" otherwise. A similarity exactly equal to the
    threshold is therefore a negative prediction, rather than being scored as
    wrong for both labels.

    Parameters
    ----------
    threshold : float
        Similarity cut-off.
    similarities : sequence of float, optional
        One similarity score per validation pair. Defaults to the notebook
        global ``max_values``.
    validation : list of (user, recipe, label), optional
        Labelled pairs, label 1 = made, 0 = not made. Defaults to the notebook
        global ``validation_set``.

    Returns
    -------
    float
        Fraction of validation pairs classified correctly.

    Raises
    ------
    ValueError
        If the validation set is empty or the two sequences differ in length.
    """
    if similarities is None:
        similarities = max_values
    if validation is None:
        validation = validation_set
    if not validation:
        raise ValueError("validation set is empty")
    if len(similarities) != len(validation):
        raise ValueError("similarities and validation must have the same length")

    correct = 0
    for similarity, entry in zip(similarities, validation):
        predicted_made = similarity > threshold
        if predicted_made == (entry[2] == 1):
            correct += 1
    return correct/len(validation)

# Sweep the Jaccard cut-off over 0.000, 0.001, ..., 0.099.
# Despite the names, `max_threshold` holds the best accuracy seen so far and
# `threshold` holds the cut-off that produced it.
max_threshold, threshold = 0, 0
for step in range(100):
    candidate = step / 1000
    candidate_accuracy = jaccard_threshold(candidate)
    if candidate_accuracy > max_threshold:
        max_threshold, threshold = candidate_accuracy, candidate
print(max_threshold, threshold)
        

0.594119058809412 0.011


The threshold with the highest validation accuracy is 0.011, and the accuracy at this threshold is 0.59.

In [185]:
##Q4

# Popular set: recipes taken in popularity order until they cover more than
# half of `totalCooked`.
return1 = set()
count = 0
for popularity, recipe_id in mostPopular:
    return1.add(recipe_id)
    count += popularity
    if count > totalCooked/2:
        break

# Combined rule: predict "made" only when the recipe is popular AND its best
# Jaccard similarity reaches 0.011; everything else is predicted "not made".
values = [entry[2] for entry in validation_set]
recipes = [entry[1] for entry in validation_set]
correct = 0
for similarity, recipe_id, label in zip(max_values, recipes, values):
    predicted_made = (similarity >= 0.011) and (recipe_id in return1)
    if predicted_made and label == 1:
        correct += 1
    elif (not predicted_made) and label == 0:
        correct += 1

print("Accuracy for combination of two models is = " + str(correct/len(values)))

Accuracy for combination of two models is = 0.6580534194658053


In [174]:
##Q5

# Retrain on the complete interaction file and write the combined-model
# predictions.
#
# All statistics are rebuilt from scratch. Adding onto the structures left
# over from the earlier cells would count the first part of the file twice
# and inflate `totalCooked`.
# `user_cooked` and `all_recipes` are not updated here because nothing below
# reads them.

# NOTE(review): the validation sweep earlier in the notebook reported 0.011
# as the best cut-off; 0.012 is kept from the original code -- confirm.
JACCARD_THRESHOLD = 0.012

recipeCount = defaultdict(int)
totalCooked = 0
train_set = []
for user, recipe, _ in readCSV("trainInteractions.csv.gz"):
    totalCooked += 1
    recipeCount[recipe] += 1
    train_set.append((user, recipe))

# Sets de-duplicate repeated interactions and can be passed straight to
# Jaccard() without rebuilding them for every query.
recipes_per_user = defaultdict(set)
users_per_recipe = defaultdict(set)
for user, recipe in train_set:
    recipes_per_user[user].add(recipe)
    users_per_recipe[recipe].add(user)

mostPopular = sorted(((recipeCount[x], x) for x in recipeCount), reverse=True)

# Popular set: recipes covering just over half of all interactions.
return1 = set()
count = 0
for ic, recipe in mostPopular:
    count += ic
    return1.add(recipe)
    if count > totalCooked/2: break

no_users = frozenset()  # stand-in for recipes never seen in the data

# `with` closes both files even if a malformed line raises part-way through.
with open("stub_Made.txt") as stub, open("predictions_Made.txt", 'w') as predictions:
    for l in stub:
        if l.startswith("user_id"):
            #header
            predictions.write(l)
            continue
        u,i = l.strip().split('-')
        query_users = users_per_recipe.get(i, no_users)
        max_sim = 0
        for recipe in recipes_per_user.get(u, ()):
            if recipe == i:
                continue
            max_sim = max(max_sim, Jaccard(users_per_recipe[recipe], query_users))
        # Exactly one line is written per pair: the previous if/elif wrote
        # nothing when max_sim equalled the threshold for a popular recipe.
        made = (max_sim > JACCARD_THRESHOLD) and (i in return1)
        predictions.write(u + '-' + i + (",1\n" if made else ",0\n"))

In [175]:
# Popularity-only predictions: label a pair 1 when its recipe is in the
# popular set `return1`, otherwise 0.
# `with` closes both files even if a malformed line raises part-way through.
with open("stub_Made.txt") as stub, open("predictions_Made_q2.txt", 'w') as predictions:
    for l in stub:
        if l.startswith("user_id"):
            #header
            predictions.write(l)
            continue
        u,i = l.strip().split('-')
        label = "1" if i in return1 else "0"
        predictions.write(u + '-' + i + "," + label + "\n")

Using only the popularity model leads to higher accuracy on Kaggle. The Kaggle username is "helizi".