In [2]:
import random
import pandas as pd
import json
import numpy as np
import scipy.sparse as sp
import time
from collections import defaultdict

import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib import rc
rc('figure', figsize=(16, 8), max_open_warning=False)

from surprise import SVD
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import train_test_split

import sys
sys.path.insert(0, '../src')

import preprocess
import evaluation

In [3]:
# Load the paper -> references mapping used by every later cell. Each value is
# iterated as a collection of references (see invert_dict and the split cell).
# Alternative loader, kept for switching to a random subset of the data:
# random_dict = preprocess.create_random_subset_paper_paper_data(debug=False)
random_dict = preprocess.create_paper_paper_dict(debug=True)

In [4]:
# Sanity check: number of keys loaded into random_dict.
len(random_dict)

47018

In [5]:
def invert_dict(d):
    """Invert a mapping of ``key -> iterable of items``.

    Returns a plain ``dict`` that maps every item to the list of keys whose
    iterable contains it. Keys appear in the iteration order of ``d`` and a
    key is listed once per occurrence of the item in its iterable.
    """
    inverse = {}
    for key, items in d.items():
        for item in items:
            # setdefault creates the list the first time an item is seen.
            inverse.setdefault(item, []).append(key)
    return inverse

In [6]:
# Inverse index: reference -> list of random_dict keys whose list contains it.
invert_random_dict = invert_dict(random_dict)

In [7]:
# test_size = 0.25
# total_num_data = len(random_dict)
# print(total_num_data)
# est_num_test_data = int(total_num_data*test_size)

# Split each user's reference list into a train part and a held-out test part.
# - Users with fewer than two references go entirely to train.
# - Otherwise the first reference always stays in train; every later one moves
#   to test unless the user is the only one still listed for that reference
#   in the inverse index, so each reference keeps at least one train entry.
testdic = defaultdict(list)
traindic = defaultdict(list)

# Rebuild the inverse index here because the loop below prunes it in place.
# Without this, re-running the cell would start from the already-pruned index
# and either raise ValueError in list.remove() or produce a different split.
invert_random_dict = invert_dict(random_dict)

for user, refs in random_dict.items():
    if len(refs) < 2:
        # Copy so traindic never aliases the lists stored in random_dict.
        traindic[user] = list(refs)
        continue
    for position, ref in enumerate(refs):
        holders = invert_random_dict[ref]
        if position == 0 or len(holders) < 2:
            traindic[user].append(ref)
        else:
            # Drop this user from the index so the remaining holders of `ref`
            # see an up-to-date count when they are processed.
            holders.remove(user)
            testdic[user].append(ref)
                

In [8]:
# Wrap both splits as Surprise datasets. The train split becomes a Trainset;
# the test split is additionally flattened with build_testset() into the
# form that is later passed to algo.test().
train_data = preprocess.create_surprise_paper_paper_data(traindic)
trainset = train_data.build_full_trainset()

test_data = preprocess.create_surprise_paper_paper_data(testdic)
testset = test_data.build_full_trainset().build_testset()

In [10]:
# Report the split sizes: train entries, test entries, and the test share.
aux = trainset.build_testset()
n_train_entries = len(aux)
n_test_entries = len(testset)
print(n_train_entries)
print(n_test_entries)
print(n_test_entries / (n_train_entries + n_test_entries))

348749
253657
0.42107316328190625


In [12]:
# Fit a matrix-factorisation model without bias terms for 10 epochs.
# random_state pins the random factor initialisation so that a
# Restart & Run All reproduces the same model and the same predictions.
algo = SVD(biased=False, n_epochs=10, random_state=0)
algo.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x105653dd8>

In [13]:
# Score the held-out pairs (estimates left unclipped) and tabulate them.
prediction_columns = ['uid', 'iid', 'rui', 'est', 'details']
predictions = algo.test(testset, clip=False)
df = pd.DataFrame(predictions, columns=prediction_columns)

# Count the rows whose details dict marks the prediction as possible.
feasible_details = {'was_impossible': False}
n_feasible = sum(1 for details in df.details if details == feasible_details)

print(len(df))
print(n_feasible)

253657
253657


In [None]:
# Display the predictions frame (columns: uid, iid, rui, est, details).
df

In [14]:
# Visualize train set data
# Build a sparse users x items matrix with a 1 for every train interaction.
usernum = trainset.n_users
itemnum = trainset.n_items

papermat = sp.dok_matrix((usernum, itemnum), dtype=np.int8)
for inner_uid in range(trainset.n_users):
    # trainset.ur[inner_uid] holds tuples whose first element is the inner item id.
    for user_rating in trainset.ur[inner_uid]:
        inner_iid = user_rating[0]
        papermat[inner_uid, inner_iid] = 1

In [None]:
# Visualize test set data
# Mark every held-out (user, item) pair with -1 in papermat. Pairs whose user
# or item is unknown to the fitted trainset are skipped and counted.
unknown_users = 0
unknown_items = 0
for entry in testset:
    uid, iid = entry[0], entry[1]
    # to_inner_uid / to_inner_iid raise ValueError for ids that are not part of
    # the trainset; catch only that so real errors and Ctrl-C still surface.
    try:
        iuid = algo.trainset.to_inner_uid(uid)
    except ValueError:
        unknown_users += 1
        continue
    try:
        iiid = algo.trainset.to_inner_iid(iid)
    except ValueError:
        unknown_items += 1
        continue
    papermat[iuid, iiid] = -1

# One summary line instead of an uninformative message per skipped pair.
if unknown_users or unknown_items:
    print("skipped test pairs: %d with unknown user, %d with unknown item"
          % (unknown_users, unknown_items))

In [None]:
# Plot the top-left 30x30 corner of the interaction matrix
# (+1 = train entry, -1 = test entry, 0 = no interaction).
# Slice while the matrix is still sparse: densifying the full users x items
# matrix only to display a small window allocates the whole array for nothing.
# NOTE: A now holds only the displayed window, not the full dense matrix.
A = papermat.tocsr()[0:30, 0:30].toarray()
fig, axes = plt.subplots(figsize=(20, 20))
pos = axes.matshow(A, cmap='seismic', vmin=-2., vmax=2.)
axes.set_title('Train (+1) / test (-1) interactions, first 30 users x 30 items')
axes.set_xlabel('inner item id')
axes.set_ylabel('inner user id')
fig.colorbar(pos);