In [1]:
import json
N_KEYWORDS= 500
N_VENUES = 470
N_AUTHORS = 2302
RANDOM_STATE = 1

In [2]:
with open("train.json") as f:
    data = json.load(f)
with open("test.json") as f:
    test_data = json.load(f)

print("Training data sample:")
print(data["0"])
print("\nTest data sample:")
print(test_data["0"])

Training data sample:
{'venue': '', 'keywords': [64, 1, 322, 134, 136, 396, 270, 144, 476, 481, 165, 39, 361, 43, 177, 308, 310, 118, 187, 127], 'year': 2017, 'author': [1605, 759]}

Test data sample:
{'venue': '', 'keywords': [260, 6, 390, 136, 7, 11, 17, 285, 288, 162, 422, 179, 55, 184, 61, 318, 451, 199, 457, 329, 459, 79, 469, 342, 213, 346, 474, 477, 478, 228, 230, 363, 494, 496, 241, 370, 378], 'year': 2017, 'coauthor': [], 'target': 988}


In [4]:
import random

def split_data(data, val_set_ratio = 0.2):
    """
    Splits provided data into a training set and validation set
    Validation set size proportion depends on val_set_ratio, anything not in the validation set is put into training
    """
    keys = [key for key in data.keys()]
    # calculate number of instances for validation set and take a random sample
    val_set_size = int(val_set_ratio * len(keys))
    val_keys = random.sample(keys, val_set_size)
    val_data = {}
    train_data = {}
    # split between validation and training
    for key in keys:
        if key in val_keys:
            val_data[key] = data[key]
        else:
            train_data[key] = data[key]
    return train_data, val_data

train_data, val_data = split_data(data)

# find the max number of authors/keywords for any one paper in the data set
author_counts = []
keyword_counts = []
for key in train_data:
    author_counts.append(len(train_data[key]['author']))
    keyword_counts.append(len(train_data[key]['keywords']))

max_authors, min_authors = max(author_counts), min(author_counts)
max_keywords, min_keywords = max(keyword_counts), min(keyword_counts)
print("Max number of authors for any one paper:", max_authors)
print("Max number of keywords for any one paper:", max_keywords)
print("Min number of authors for any one paper:", min_authors)
print("Min number of keywords for any one paper:", min_keywords)

Max number of authors for any one paper: 9
Max number of keywords for any one paper: 109
Min number of authors for any one paper: 1
Min number of keywords for any one paper: 1


In [73]:
def get_coauthor_matrix(train_data):
    """
    Returns a matrix (2D list) of how many papers each author has written 
    with each other author
    """
    coauthor_matrix = [[0 for i in range(N_AUTHORS)] for j in range(N_AUTHORS)]
    for key in train_data:
        authors = train_data[key]['author']
        for author1 in authors:
            for author2 in authors:
                coauthor_matrix[author1][author2] += 1
    return coauthor_matrix

def get_keyword_matrix(train_data):
    """
    Returns a matrix (2D list) of how many papers each author has written 
    that has each keyword
    """
    keyword_matrix = [[0 for i in range(N_KEYWORDS)] for j in range(N_AUTHORS)]
    for key in train_data:
        keywords = train_data[key]['keywords']
        authors = train_data[key]['author']
        for author in authors:
            for keyword in keywords:
                keyword_matrix[author][keyword] += 1
    return keyword_matrix

def pad(data_list, n, val):
    """
    Pads data_list with val until it is of length n
    If len(data_list) > n, cuts the list so that it is of length n
    """
    n_missing = n - len(data_list)
    if n_missing < 0:
        return data_list[:n]
    return data_list + n_missing*[val]

ca_matrix = get_coauthor_matrix(train_data)
kw_matrix = get_keyword_matrix(train_data)

In [74]:
def process_data(train_data, coauthor_matrix=[], keyword_matrix = []):
    """
    Takes each entry in training data and creates a new entry for each author of the form:
    year, keyword_0, keyword_1, ..., keyword_499, coauthor_0, coauthor_1, ... coauthor_499, target
    where target is an int in the range [0, 2301] and keyword_x and coauthor_x is in {0, 1} depending on if the 
    keyword/coauthor is in the entry
    Eg. Entry = {venue: '', keywords: [0, 3], year: 2011, author: [1, 2]} becomes the following entries:
    [2011,     1, 0, 0, 1, 0, 0, ..., 0,     0, 0, 1, 0, 0, ..., 0,     1]
    [2011,     1, 0, 0, 1, 0, 0, ..., 0,     0, 1, 0, 0, 0, ..., 0,     2]
     year     |        keywords        |    |     coauthors       |   target
    """
    train_data_processed = []
    
    for key in train_data:
        # ignores venue at the moment since papers with no venue will all be treated the same which maybe we do not want?
        # venue = entry['venue']
        entry = train_data[key]
        keywords = entry['keywords']
        year = entry['year']
        authors = entry['author']
        
        # converts keywords into a binary representation
        # TODO: find a simpler way to represent (currently given as an array of length 500)
        keyword_list = [1 if i in keywords else 0 for i in range(N_KEYWORDS)]
        
        for i in range(len(authors)):
            target = authors[i]
            
            # sorts keywords based on the most common keyword associated with the target
            if keyword_matrix:
                keyword_list = sorted(
                    keywords, 
                    key=lambda x: keyword_matrix[target][x]
                )
                keyword_list = pad(keyword_list, max_keywords, keyword_list[0])
            
            # takes all authors other than the target as coauthors
            coauthor = [author for author in authors if author != target]
            
            # sorts coauthors based on the most common author who has written with the target
            if coauthor_matrix:
                coauthor = sorted(
                    coauthor, 
                    key=lambda x: coauthor_matrix[target][x]
                )
                coauthor = pad(coauthor, max_authors, target)
            else:
                # converts coauthors into a binary representation
                # TODO: find a simpler way to represent (currently given as an array of length 2302)
                coauthor = [1 if i in coauthor else 0 for i in range(N_AUTHORS)]
            
            # concatenate everything and append to processed list
            train_data_processed.append([year] + keyword_list + coauthor + [target])
    
    return train_data_processed

def process_data_test(test_data, coauthor_matrix=[], keyword_matrix = []):
    """
    same as above but for the provided test data
    """
    test_data_processed = []
    
    for key in test_data:
        # ignores venue at the moment since papers with no venue will all be treated the same which maybe we do not want?
        # venue = entry['venue']
        entry = test_data[key]
        keywords = entry['keywords']
        year = entry['year']
        coauthor = entry['coauthor']
        target = entry['target']
        
        # converts keywords into a binary representation
        # TODO: find a simpler way to represent (currently given as an array of length 500)
        keyword_list = [1 if i in keywords else 0 for i in range(N_KEYWORDS)]
            
        # sorts keywords based on the most common keyword associated with the target
        if keyword_matrix:
            keyword_list = sorted(
                keywords, 
                key=lambda x: keyword_matrix[target][x]
            )
            keyword_list = pad(keyword_list, max_keywords, keyword_list[0])
            
        # sorts coauthors based on the most common author who has written with the target
        if coauthor_matrix:
            coauthor = sorted(
                coauthor, 
                key=lambda x: coauthor_matrix[target][x]
            )
            coauthor = pad(coauthor, max_authors, target)
        else:
            # converts coauthors into a binary representation
            # TODO: find a simpler way to represent (currently given as an array of length 2302)                coauthor = [1 if i in coauthor else 0 for i in range(N_AUTHORS)]
            coauthor = [1 if i in coauthor else 0 for i in range(N_AUTHORS)]
            
        # concatenate everything and append to processed list
        test_data_processed.append([year] + keyword_list + coauthor + [target])
    
    return test_data_processed

# process data
train_data_processed = process_data(
    train_data, 
    coauthor_matrix=ca_matrix, 
    keyword_matrix=kw_matrix
)
val_data_processed = process_data(
    val_data, 
    coauthor_matrix=ca_matrix, 
    keyword_matrix=kw_matrix
)

In [75]:
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
import time

In [76]:
# separate into features and labels
X_train = [data[:-1] for data in train_data_processed]
y_train = [data[-1] for data in train_data_processed]

X_val = [data[:-1] for data in val_data_processed]
y_val = [data[-1] for data in val_data_processed]

print("Number of training instances:", int(len(X_train)==len(y_train))*len(X_train))
print("Features per instance:", len(X_train[0]))
print("Number of validation instances:", int(len(X_val)==len(y_val))*len(X_val))
print("Features per instance:", len(X_val[0]))

Number of training instances: 38466
Features per instance: 113
Number of validation instances: 9534
Features per instance: 113


In [77]:
start = time.time()

# create a SVM classifier and train on training data (takes too long at the moment)
#clf = SVC(C=0.1, gamma='auto', random_state=RANDOM_STATE)
#clf.fit(X_train, y_train)

# create a nB classifier and train on training data
clf = MultinomialNB()
clf.fit(X_train, y_train)

end = time.time()
print("Time taken for training:", end - start, "seconds")

Time taken for training: 1.6204843521118164 seconds


In [78]:
# predict probabilities (untested at the moment because training takes too long)
y_preds = clf.predict_proba(X_val)

In [79]:
def analyse_preds(instance_probs, target):
    probs = zip(range(N_AUTHORS), instance_probs)
    probs = sorted(probs, key=lambda x: x[1], reverse=True)
    front = probs.pop(0)
    output = []
    while front[0] != target and probs:
        output.append(front)
        front = probs.pop(0)
    output.append(front)
    return output

In [80]:
print(X_val[0])
print(y_val[0])
analyse_preds(y_preds[0], y_val[0])

lens = []
for i in range(len(y_preds)):
    lens.append(len(analyse_preds(y_preds[i], y_val[i])))
print(min(lens), max(lens))

[2000, 486, 209, 119, 110, 336, 135, 82, 189, 476, 492, 390, 146, 474, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 486, 244, 244, 244, 244, 244, 244, 244, 244, 244]
244
1 2302


In [81]:
len_counts = {}
for ln in lens:
    if ln not in len_counts:
        len_counts[ln] = 0
    len_counts[ln] += 1

len_counts

{250: 11,
 23: 31,
 2251: 2,
 1639: 6,
 859: 2,
 1738: 4,
 803: 4,
 2143: 3,
 1763: 4,
 564: 5,
 1470: 4,
 991: 2,
 2228: 4,
 2265: 6,
 1770: 3,
 1975: 3,
 68: 15,
 2115: 8,
 1069: 3,
 1484: 5,
 213: 8,
 1076: 5,
 209: 4,
 2018: 6,
 141: 14,
 57: 14,
 1899: 5,
 76: 11,
 188: 10,
 1378: 2,
 49: 10,
 1904: 7,
 84: 20,
 63: 14,
 10: 25,
 1542: 3,
 365: 5,
 562: 5,
 113: 9,
 161: 12,
 356: 4,
 2: 35,
 25: 20,
 760: 5,
 2019: 10,
 702: 5,
 695: 1,
 2059: 4,
 2284: 7,
 30: 19,
 477: 3,
 906: 3,
 355: 5,
 85: 13,
 496: 5,
 540: 4,
 275: 5,
 1237: 2,
 1528: 4,
 22: 28,
 88: 11,
 246: 9,
 1867: 2,
 59: 11,
 258: 9,
 1660: 4,
 827: 2,
 1417: 2,
 542: 2,
 2255: 6,
 1252: 3,
 599: 5,
 249: 7,
 33: 15,
 1766: 6,
 102: 5,
 367: 6,
 288: 5,
 640: 8,
 1995: 2,
 1787: 5,
 584: 2,
 350: 3,
 861: 3,
 571: 3,
 544: 6,
 1841: 3,
 34: 22,
 2206: 2,
 697: 6,
 2036: 5,
 104: 8,
 1342: 4,
 1891: 3,
 64: 21,
 1196: 5,
 1132: 8,
 1155: 2,
 55: 18,
 254: 15,
 227: 3,
 324: 5,
 208: 8,
 2279: 3,
 1384: 3,
 327: 10

In [82]:
count = 0
for i in range(len(y_preds)):
    target = y_val[i]
    if y_preds[i][target] > 0.1:
        print(i, target, y_preds[i][target])
        count += 1
print(count)

337 148 1.0
502 240 1.0
1017 693 1.0
1041 333 1.0
1124 162 1.0
1195 10 1.0
1641 1664 0.9999990490618959
1894 59 0.9993737096793283
1907 603 1.0
2144 35 1.0
2167 398 1.0
2240 1384 1.0
2421 1563 1.0
2431 86 1.0
2518 0 1.0
2542 450 1.0
2627 603 1.0
2688 1798 1.0
2845 39 1.0
2949 577 1.0
3239 1106 0.9999999999708962
3294 1092 1.0
3320 49 0.5526835090396222
3501 248 0.9948707293924562
3649 143 0.9995850563229302
3680 241 0.9974779807388469
3857 1560 0.9999999963329174
4133 2026 1.0
5103 0 1.0
5424 1067 0.9999998292478413
5582 1175 0.7510491130081105
5588 1188 0.9999997296545554
5898 1040 0.6464789288188814
5959 649 1.0
5963 1734 0.9999846479636931
6172 761 1.0
6173 1618 1.0
6482 710 1.0
6551 53 1.0
7218 2009 1.0
7514 790 1.0
7702 159 1.0
8322 591 0.16166736688936528
8349 1745 1.0
8401 307 0.9018379605315594
8601 170 1.0
8659 0 1.0
8689 171 1.0
8897 106 0.9801004246716898
8985 3 1.0
9232 467 1.0
9400 323 1.0
52


In [83]:
# find the max number of authors/keywords for any one paper in the data set
author_counts = []
keyword_counts = []
for key in train_data:
    author_counts.append(len(data[key]['author']))
    keyword_counts.append(len(data[key]['keywords']))

max_authors = max(author_counts)
max_keywords = max(keyword_counts)
print("Max number of authors for any one paper:", max_authors)
print("Max number of keywords for any one paper:", max_keywords)

# get required matrices
ca_matrix = get_coauthor_matrix(data)
kw_matrix = get_keyword_matrix(data)
# process data
data_processed = process_data(
    data, 
    coauthor_matrix=ca_matrix, 
    keyword_matrix=kw_matrix
)

test_data_processed = process_data_test(
    test_data, 
    coauthor_matrix=ca_matrix, 
    keyword_matrix=kw_matrix
)

# separate into features and labels
X_train = [data[:-1] for data in data_processed]
y_train = [data[-1] for data in data_processed]

X_test = [data[:-1] for data in test_data_processed]
y_test = [data[-1] for data in test_data_processed]

print("Number of training instances:", int(len(X_train)==len(y_train))*len(X_train))
print("Features per instance:", len(X_train[0]))
print("Number of validation instances:", int(len(X_test)==len(y_test))*len(X_test))
print("Features per instance:", len(X_val[0]))

# create a nB classifier and train on training data
clf = MultinomialNB()
clf.fit(X_train, y_train)
# predic
y_preds = clf.predict_proba(X_test)
print("Number of predictions:", len(y_preds))

Max number of authors for any one paper: 9
Max number of keywords for any one paper: 103
Number of training instances: 48000
Features per instance: 113
Number of validation instances: 2000
Features per instance: 113
Number of predictions: 2000


In [85]:
# extract probabilities
output = []
for i in range(len(y_preds)):
    target = y_val[i]
    output.append((i, y_preds[i][target]))

# write to file
f = open("predictions.csv", "w")
f.write("Id,Predicted\n")
for i, prob in output:
    f.write(str(i) + "," + str(prob) + "\n")
f.close()