In [32]:
import random
import pandas as pd
import json
import numpy as np
import scipy.sparse as sp
import time
from collections import defaultdict

import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib import rc
rc('figure', figsize=(16, 8), max_open_warning=False)

from surprise import SVD
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import train_test_split

import sys
sys.path.insert(0, '../src')

import preprocess
import evaluation

In [33]:
# random_dict = preprocess.create_random_subset_paper_paper_data(debug=False)
random_dict = preprocess.create_paper_paper_dict(debug=True)

In [34]:
len(random_dict)

47018

In [35]:
def invert_dict(d): 
    inverse = dict() 
    for key in d: 
        # Go through the list that is saved in the dict:
        for item in d[key]:
            # Check if in the inverted dict the key exists
            if item not in inverse: 
                # If not create a new list
                inverse[item] = [key] 
            else: 
                inverse[item].append(key) 
    return inverse

In [36]:
invert_random_dict = invert_dict(random_dict)

In [37]:
# test_size = 0.25
# total_num_data = len(random_dict)
# print(total_num_data)
# est_num_test_data = int(total_num_data*test_size)

testdic  = defaultdict(list)
traindic = defaultdict(list)

for user in random_dict:
    if len(random_dict[user]) < 2:
        traindic[user] = random_dict[user]
    else:
        i = 0
        for ref in random_dict[user]:
            i = i+1
            if i < 2:
                traindic[user].append(ref)
            else:
                if len(invert_random_dict[ref]) < 2:
                    traindic[user].append(ref)
                else:
                    invert_random_dict[ref].remove(user)
                    testdic[user].append(ref)
                

In [38]:
len(traindic)

47018

In [39]:
len(testdic)

37516

In [9]:
import codecs
with codecs.open('../dblp-ref'+'/'+'train_must.tsv', "w", "utf8") as o:
    for key, values in traindic.items():
        for value in values:
            o.write("%s\t%s\t%d\n" % (key, value, 1))

In [10]:
with codecs.open('../dblp-ref'+'/'+'test.tsv', "w", "utf8") as o:
    for key, values in testdic.items():
        for value in values:
            o.write("%s\t%s\t%d\n" % (key, value, 1))

In [42]:
import pandas as pd
data = pd.read_table('../dblp-ref/test.tsv',
                             usecols=[0, 1, 2],
                             names=['user', 'ref', 'cites'],
                             na_filter=False)

In [43]:
data

Unnamed: 0,user,ref,cites
0,001eef4f-1d00-4ae6-8b4f-7e66344bbc6e,3c2ddf0a-237b-4d17-8083-c90df5f3514b,1
1,001eef4f-1d00-4ae6-8b4f-7e66344bbc6e,522ce553-29ea-4e0b-9ad3-0ed4eb9de065,1
2,001eef4f-1d00-4ae6-8b4f-7e66344bbc6e,579e5f24-5b13-4e92-b255-0c46d066e306,1
3,001eef4f-1d00-4ae6-8b4f-7e66344bbc6e,80656b4d-b24c-4d92-8753-bdb965bcd50a,1
4,001eef4f-1d00-4ae6-8b4f-7e66344bbc6e,d6e37fb1-5f7e-448e-847b-7d1f1271c574,1
5,00352759-f0a7-4678-82ae-fed68c700da6,7bcea2f7-6006-4a4d-8529-3593fa56d2dc,1
6,00352759-f0a7-4678-82ae-fed68c700da6,c1959b6f-8bab-49a2-ae2f-066c43cfc9c9,1
7,00352759-f0a7-4678-82ae-fed68c700da6,d65d26b4-a0e0-4112-ae22-2f3a4b51d7b3,1
8,013ea675-bb58-42f8-a423-f5534546b2b1,aa699fbf-fabe-40e4-bd68-46eaf333f7b1,1
9,01522369-3b88-4256-99d4-4e0c1de9f1ba,db81cd6e-3a7a-4088-b254-92f924a4e379,1


In [44]:
from sklearn.model_selection import train_test_split, KFold
train, test = train_test_split(data, test_size=0.1, random_state=42)
train, validation = train_test_split(train, test_size=0.11, random_state=42)

In [45]:
train

Unnamed: 0,user,ref,cites
98815,222676c0-4937-40e1-966a-1c712a96af3e,9c79dd78-4bf6-4bf2-aea8-5f60a91cb4ec,1
103456,4e58a299-e096-494a-8106-a7b0ccfec36f,0ed34330-461a-4ed4-96c3-3e626cb37c0b,1
113016,03f3ccb8-b70f-49d4-aeae-cf8799d3a280,caecec1b-5326-469b-bf2f-c4b69aa50fd6,1
168677,42e51b15-3abc-4198-82e6-ab31e1b97a6a,e6b37360-202e-4d47-8e5f-5f02c89c06cc,1
99337,4891c33d-bfd0-4d4a-a990-95c57ba89c91,141a0694-e728-40dd-b665-5bbda2f42130,1
227097,094fee32-586b-4c92-a83d-115d5254f5f3,17d4f04a-3479-4d44-bdae-70b01300775b,1
215728,39fbc733-26b9-4c04-9c3e-6316503d8c06,9ecc2bf7-e62f-4421-b556-27044821e883,1
4469,e9f95bd4-70d9-4667-8aea-59a67c0b0d6d,cd3d6419-2ff3-4474-ab9e-6aef231095e6,1
86439,2726fab1-dc58-4a91-a1f1-695e79ed322c,19f228de-9312-4c5e-bb33-f3b1302d6677,1
103825,6378b5da-5785-4939-9d6a-b22527a6805a,c3f3fc3e-6225-4381-b780-c184080ebc4d,1


In [17]:
#train, validate, test = np.split(data.sample(frac=1), [int(.6*len(data)), int(.8*len(data))])

In [18]:
train

Unnamed: 0,user,ref,cites
26294,da534c99-eef5-463c-8b8e-f0fda6d85a71,2c6b2c8f-2342-4fce-8983-293be67f57c5,1
157801,5e6941b1-8015-4581-ada7-fda8cc1b95e3,44e6257a-e74f-4231-a3d6-167139554cb5,1
164036,fd3d2a25-41f8-4fbb-9d51-6e075346e737,11fd6e50-6f6f-4c83-9a9f-2c59760571e6,1
234825,9026098d-e1d2-4010-95d0-f2b21bd9aac4,75167370-c38e-4e6b-9e6f-adbf9ea4ed35,1
48512,9a28a24b-d24a-4346-8bc2-f86cfce1f1c7,50220aae-5c3c-4217-98f5-1661382f63bf,1
103828,63eeb83a-3101-4c28-a79b-4495ee806e8b,3d4f1ae9-1fb3-412f-aaac-21754f59b91b,1
225260,d7fbcc17-126e-44ed-8af1-c6ee2bee2924,98c3a1b5-9fb9-4342-8db3-cb16a068f344,1
33764,d5bf8690-7090-458b-ad15-b46733d0ee79,98ae0fb7-8106-4ef4-a06f-e1cc6c844db3,1
113504,29a3804d-aaa3-4594-b508-13f2c0a66587,6421aa8b-ed50-4b63-ab20-34e26e0ab8fa,1
156207,faf77a95-fd76-4430-8e0e-7588d602e261,5c56f63c-d859-405d-9976-a8317f112bd0,1


In [19]:
validate

Unnamed: 0,user,ref,cites
125760,ddd5c1d9-9054-4ae7-9c52-5cc3b6616a62,ce8682b5-ce54-45be-a485-d8cc162968c9,1
182997,45a1d56d-1a4a-401d-83b6-297b9192441c,178658bb-4903-4c05-9283-d00a53e9b842,1
190660,43b33d1b-4250-4c2c-b9a1-07e623eae809,ce90adda-7c60-451f-a617-78822d4a5ef4,1
16862,50d50ec5-8acf-40a0-94da-3b68c9f65a9f,45beeef2-a57d-436d-8fc4-56136234b4b9,1
59320,a9889d71-3a6f-4de0-b055-80f05081112f,027cf4d3-ca04-45c6-9da4-92a11e94d22a,1
80063,d4ca604e-a519-488c-8bac-fd6a2bba406f,9ffbfdbb-6704-4c7f-80cd-41a071da684b,1
58151,6db87b8c-4581-4f06-b2cc-a180da2d3600,1b224081-32c6-4847-86c7-9c84f874519b,1
114507,65966528-e907-4ade-a90f-b85195da98b0,19808527-8cf3-4484-8463-a1b5d923d95b,1
244777,c9ec7611-4449-482c-8225-c153ab6a5263,d3e00e7e-1c64-4d7a-b2b2-1ad98ba4c706,1
163572,e11a2e74-5e43-48cb-a4c5-af653f2bf8d4,83513149-87ba-4588-9487-0f2a838a7ae7,1


In [20]:
test

Unnamed: 0,user,ref,cites
4834,fa25d384-98a8-4a75-a469-1a6937122055,3d2891dc-d78e-436a-8c52-09d297f11191,1
102384,113ec50a-7f24-42b0-986e-5834780deb99,3f4116a8-07fc-4fab-a2ec-7d9aa7f68d6e,1
234693,8d6c590c-02e8-4f18-ae24-c9420e96d628,c3162fc6-7e0b-44c4-a55c-46b06354699f,1
113383,1ca76916-242e-4b1a-b1f6-7f481917225e,38eb4311-d3e7-45c2-8d33-1624a267e2b5,1
226710,02ad7bec-7a4c-4492-9382-7c0b001c0811,e0296c28-35a4-41c1-9fd2-58e75d4819be,1
29529,af442558-5dac-419a-9687-51d9dd98bd87,36f339f1-3e52-4417-b4d8-a349c33f8127,1
184478,99561421-96ac-4cb3-9e04-bcde7e63cec0,c1b6b493-01ef-420f-be44-7bacfe34e846,1
65014,d5825074-db3a-4518-82c6-f0f94b88d381,9e7f3a14-e586-4bad-aab9-0c36173e441d,1
86466,29cb5ef5-514b-4e5a-9034-04858e827c7c,86fefda3-efad-4a80-8ac9-d3cdf07b94b1,1
93905,e2615557-541a-4c70-b762-e8dbf7428d15,3d4f1ae9-1fb3-412f-aaac-21754f59b91b,1


In [28]:
validate.to_csv('../dblp-ref/small_validate.tsv',sep='\t',header=False,index=False)

In [29]:
test.to_csv('../dblp-ref/small_test.tsv',sep='\t',header=False,index=False)

In [30]:
train.to_csv('../dblp-ref/small_train.tsv',sep='\t',header=False,index=False)

In [24]:
train_must = pd.read_table('../dblp-ref/train_must.tsv',
                             usecols=[0, 1, 2],
                             names=['user', 'ref', 'cites'],
                             na_filter=False)

In [25]:
concat = pd.concat([train_must,train])

In [26]:
len(concat)

500943

In [31]:
concat.to_csv('../dblp-ref/merged_train.tsv',sep='\t',header=False,index=False)