In [16]:
import preprocess
import random
import pandas as pd
import json
import numpy as np
import scipy.sparse as sp
from collections import defaultdict

import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rc
rc('figure', figsize=(16, 8), max_open_warning=False)

from surprise import SVD, SVDpp, NMF
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import train_test_split

# Sparsity

In [2]:
# mydict = preprocess.create_paper_paper_dict()

KeyboardInterrupt: 

In [4]:
# Creat random subset of dictionary, where we only retain references to themselves
random_dict = preprocess.create_random_subset_paper_paper_data(debug=True)

In [None]:
numbering, reverse = dict(), dict()
current_id = 0

for i in random_dict.keys():
    numbering[i] = current_id
    reverse[current_id] = i
    current_id += 1

In [None]:
# Build paper-paper matrix to observe its sparsity
papernum = len(random_dict.keys())
papermat = sp.dok_matrix((papernum,papernum), dtype=np.int8)
for paper_id, ref_ids in random_dict.items():
    for ref in ref_ids:
        papermat[numbering[paper_id], numbering[ref]] = 1

In [None]:
print("sparsity: ",100*papermat.count_nonzero()/papernum**2, "%")
plt.spy(papermat, markersize=1)
plt.show()

# SVD

In [5]:
data = preprocess.create_surprise_paper_paper_data(random_dict)

In [8]:
# split that data to train, test set and then do SVD
trainset, testset = train_test_split(data, test_size=.25)
algo = SVD(biased=False)
algo.fit(trainset)
predictions = algo.test(testset)
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df.rui

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
5       1.0
6       1.0
7       1.0
8       1.0
9       1.0
10      1.0
11      1.0
12      1.0
13      1.0
14      1.0
15      1.0
16      1.0
17      1.0
18      1.0
19      1.0
20      1.0
21      1.0
22      1.0
23      1.0
24      1.0
25      1.0
26      1.0
27      1.0
28      1.0
29      1.0
       ... 
984     1.0
985     1.0
986     1.0
987     1.0
988     1.0
989     1.0
990     1.0
991     1.0
992     1.0
993     1.0
994     1.0
995     1.0
996     1.0
997     1.0
998     1.0
999     1.0
1000    1.0
1001    1.0
1002    1.0
1003    1.0
1004    1.0
1005    1.0
1006    1.0
1007    1.0
1008    1.0
1009    1.0
1010    1.0
1011    1.0
1012    1.0
1013    1.0
Name: rui, Length: 1014, dtype: float64

In [None]:
pu = algo.pu
qi = algo.qi
Abar = np.dot(pu, np.transpose(qi))

In [None]:
fig, axes = plt.subplots()
min_val, max_val = -3, 3
pos = axes.matshow(Abar[0:30, 0:30], cmap=plt.cm.coolwarm)
fig.colorbar(pos)

In [None]:
# Show papers citing itself

In [11]:
data = preprocess.create_paper_paper_dict()

for key, value in data.items():
    if key in value:
        print(key)

In [13]:
#paper_paper_dict = data

itemList, userList, ratingList = [], [], []

all_keys_set = set(data.keys())
for key, value in data.items():
    for paper in value:
        itemList.append(paper)
        userList.append(key)
        ratingList.append(1) # "rating" is always 1 for each citation

#     # JP 03/28/18 First attempt on trying to add some (not all) entries with 0 ratings
#     if add_random_0_entries and len(value)!=0:
#         # create candidate set which does not include references
#         zero_rating_set_for_key = all_keys_set - set(value)
#         # add randomly selected 0 entry, the same number as 1 entries
#         for paper in random.sample(zero_rating_set_for_key, len(value)):
#             itemList.append(paper)
#             userList.append(key)
#             ratingList.append(0) # we add 0 entries (no citation) with certain probability

ratings_dict = {'itemID': itemList, 'userID': userList, 'rating': ratingList}
df = pd.DataFrame(ratings_dict)
df

Unnamed: 0,itemID,rating,userID
0,51c7e02e-f5ed-431a-8cf5-f761f266d4be,1,00127ee2-cb05-48ce-bc49-9de556b93346
1,69b625b9-ebc5-4b60-b385-8a07945f5de9,1,00127ee2-cb05-48ce-bc49-9de556b93346
2,10482dd3-4642-4193-842f-85f3b70fcf65,1,001c58d3-26ad-46b3-ab3a-c1e557d16821
3,3133714c-f979-4d84-9224-97361cf053ab,1,001c58d3-26ad-46b3-ab3a-c1e557d16821
4,3a926fef-7422-4654-8776-8e31b45be563,1,001c58d3-26ad-46b3-ab3a-c1e557d16821
5,52f480e8-85e6-4c01-9e5b-d75eabf6a8ec,1,001c58d3-26ad-46b3-ab3a-c1e557d16821
6,6f52f995-7c4c-4a92-83aa-d1c9fbd2465c,1,001c58d3-26ad-46b3-ab3a-c1e557d16821
7,8bd964d6-c45f-448c-9e65-efe5f98ca0a0,1,001c58d3-26ad-46b3-ab3a-c1e557d16821
8,8fa0a362-6522-48fc-bd5e-24de00ed6511,1,001c58d3-26ad-46b3-ab3a-c1e557d16821
9,9bfa8c24-8fb6-44d9-ba42-38b22f9cf34b,1,001c58d3-26ad-46b3-ab3a-c1e557d16821


In [14]:
df[df["itemID"]==df["userID"]]

Unnamed: 0,itemID,rating,userID


In [17]:
reader = Reader(rating_scale=(0,1)) # JP: rating scale is 0 (not cited) and 1 (cited)
data_surprise = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

In [20]:
data_surprise = data_surprise.build_full_trainset()


AttributeError: 'Trainset' object has no attribute 'build_full_trainset'

In [27]:
for i in range(10):
    print(i, data_surprise.ir[i], '\n')

0 [(0, 2.0), (125329, 2.0), (527503, 2.0), (1209820, 2.0), (1364999, 2.0), (2085853, 2.0), (2156714, 2.0), (2208720, 2.0), (2229945, 2.0), (2260868, 2.0), (2402673, 2.0), (2408845, 2.0), (2429745, 2.0), (2444218, 2.0), (2480256, 2.0)] 

1 [(0, 2.0), (45616, 2.0), (122551, 2.0), (125329, 2.0), (444353, 2.0), (527503, 2.0), (644232, 2.0), (663243, 2.0), (682945, 2.0), (697606, 2.0), (699436, 2.0), (709843, 2.0), (740860, 2.0), (754223, 2.0), (811429, 2.0), (890945, 2.0), (914815, 2.0), (978194, 2.0), (1101908, 2.0), (1155015, 2.0), (1187042, 2.0), (1206720, 2.0), (1209820, 2.0), (1227389, 2.0), (1232264, 2.0), (1250769, 2.0), (1279645, 2.0), (1338156, 2.0), (1338228, 2.0), (1362134, 2.0), (1376019, 2.0), (1398902, 2.0), (1427013, 2.0), (1499231, 2.0), (1659445, 2.0), (1722446, 2.0), (1741061, 2.0), (1875068, 2.0), (2085853, 2.0), (2109932, 2.0), (2134628, 2.0), (2153626, 2.0), (2162108, 2.0), (2165630, 2.0), (2209858, 2.0), (2213626, 2.0), (2260868, 2.0), (2271683, 2.0), (2328423, 2.0), 