In [10]:
import json
# Notebook-wide configuration: feature-space sizes and the shared random seed.
N_KEYWORDS = 500     # width of the binary keyword vector built per paper
N_VENUES = 470       # presumably the number of distinct venues (unused in the cells shown) - TODO confirm
N_AUTHORS = 2302     # width of the binary coauthor vector; author ids are indices into it
RANDOM_STATE = 1     # seed handed to stochastic estimators

In [11]:
# JSON text is UTF-8; pass the encoding explicitly so parsing does not depend
# on the platform's locale default (e.g. cp1252 on Windows).
with open("train.json", encoding="utf-8") as f:
    data = json.load(f)
with open("test.json", encoding="utf-8") as f:
    test_data = json.load(f)

# Show one record from each file so the differing schemas are visible.
print("Training data sample:")
print(data["0"])
print("\nTest data sample:")
print(test_data["0"])

Training data sample:
{'venue': '', 'keywords': [64, 1, 322, 134, 136, 396, 270, 144, 476, 481, 165, 39, 361, 43, 177, 308, 310, 118, 187, 127], 'year': 2017, 'author': [1605, 759]}

Test data sample:
{'venue': '', 'keywords': [260, 6, 390, 136, 7, 11, 17, 285, 288, 162, 422, 179, 55, 184, 61, 318, 451, 199, 457, 329, 459, 79, 469, 342, 213, 346, 474, 477, 478, 228, 230, 363, 494, 496, 241, 370, 378], 'year': 2017, 'coauthor': [], 'target': 988}


In [12]:
import random

def split_data(data, val_set_ratio = 0.2, seed = None):
    """
    Splits provided data into a training set and validation set.

    Args:
        data: dict mapping an instance key to its entry.
        val_set_ratio: proportion of keys placed in the validation set; the
            validation size is int(val_set_ratio * len(data)) (truncated),
            everything else goes to training.
        seed: optional seed. When given, a private random.Random(seed) draws
            the sample, so the split is reproducible and the global RNG state
            is left untouched. When None, the module-level random generator
            is used.

    Returns:
        (train_data, val_data): two dicts that together contain every key of
        `data` exactly once, each preserving the key order of `data`.

    Raises:
        ValueError: propagated from random.sample when the computed
            validation size is negative or larger than the number of keys.
    """
    keys = list(data)
    # calculate number of instances for validation set and take a random sample
    val_set_size = int(val_set_ratio * len(keys))
    rng = random if seed is None else random.Random(seed)
    # a set gives O(1) membership tests; a list here made the split quadratic
    val_keys = set(rng.sample(keys, val_set_size))

    # split between validation and training
    train_data = {key: data[key] for key in keys if key not in val_keys}
    val_data = {key: data[key] for key in keys if key in val_keys}
    return train_data, val_data

# Seed the global RNG first so the train/validation split is the same on every
# Restart & Run All; otherwise results are not comparable between runs.
random.seed(RANDOM_STATE)
train_data, val_data = split_data(data)

In [13]:
def _multi_hot(indices, size):
    """Returns a list of length `size` holding 1 at each position present in `indices`, else 0."""
    present = set(indices)  # O(1) membership instead of scanning a list per position
    return [1 if i in present else 0 for i in range(size)]


def process_data(train_data, n_keywords=None, n_authors=None):
    """
    Takes each entry in training data and creates a new entry for each author of the form:
    year, keyword_0, keyword_1, ..., keyword_499, coauthor_0, coauthor_1, ... coauthor_2301, target
    where target is an int in the range [0, 2301] and keyword_x and coauthor_x is in {0, 1} depending on if the
    keyword/coauthor is in the entry
    Eg. Entry = {venue: '', keywords: [0, 3], year: 2011, author: [1, 2]} becomes the following entries:
    [2011,     1, 0, 0, 1, 0, 0, ..., 0,     0, 0, 1, 0, 0, ..., 0,     1]
    [2011,     1, 0, 0, 1, 0, 0, ..., 0,     0, 1, 0, 0, 0, ..., 0,     2]
     year     |        keywords        |    |     coauthors       |   target

    Args:
        train_data: dict mapping a key to an entry dict with at least the
            'keywords', 'year' and 'author' fields.
        n_keywords: width of the keyword vector; defaults to N_KEYWORDS.
        n_authors: width of the coauthor vector; defaults to N_AUTHORS.

    Returns:
        A list of rows (lists). Entries are visited in dict order and, within
        an entry, one row is emitted per author in ascending author id. Ids
        outside [0, width) are silently left out of the binary vectors.
    """
    # resolve the defaults at call time so the notebook-level constants still apply
    if n_keywords is None:
        n_keywords = N_KEYWORDS
    if n_authors is None:
        n_authors = N_AUTHORS

    train_data_processed = []
    for entry in train_data.values():
        # ignores venue at the moment since papers with no venue will all be treated the same which maybe we do not want?
        year = entry['year']
        # sorted so that the rows of one paper come out in ascending target order
        authors = sorted(entry['author'])

        # the keyword vector is identical for every author of the paper, so build it once
        # TODO: find a simpler way to represent (currently a dense list of length n_keywords)
        keyword_list = _multi_hot(entry['keywords'], n_keywords)

        for target in authors:
            # takes all authors other than the target as coauthors
            # TODO: find a simpler way to represent (currently a dense list of length n_authors)
            coauthor = _multi_hot((author for author in authors if author != target), n_authors)

            # concatenate everything and append to processed list
            train_data_processed.append([year] + keyword_list + coauthor + [target])

    return train_data_processed

# Run both splits through the same featurisation, training split first.
train_data_processed, val_data_processed = (
    process_data(split) for split in (train_data, val_data)
)

In [14]:
from sklearn.svm import SVC

In [15]:
# Each processed row is [features..., target]: everything except the last
# element is the feature vector and the last element is the label.
X_train = [row[:-1] for row in train_data_processed]
y_train = [row[-1] for row in train_data_processed]

X_val = [row[:-1] for row in val_data_processed]
y_val = [row[-1] for row in val_data_processed]

In [16]:
# create a SVM classifier and train on training data (takes too long at the moment)
# probability=True is required for the later clf.predict_proba call: SVC only
# provides probability estimates when they are enabled at construction time.
# It is also the only setting under which random_state has any effect.
# Note that it adds internal cross-validation, making fit() slower still.
clf = SVC(C=0.1, probability=True, random_state=RANDOM_STATE)
clf.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# predict probabilities (untested at the moment because training takes too long)
# NOTE(review): scikit-learn's SVC only offers predict_proba when the estimator
# was constructed with probability=True - confirm the `clf` definition enables it.
clf.predict_proba(X_val)