In [1]:
import torch
from torch.utils import data

import numpy as np
import time, os

import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline

In [2]:
import json

with open('train.json') as fin:
    train_data = json.load(fin)
    
print(len(train_data))

26108


In [3]:
N_KEYWORDS= 500
N_VENUES = 470
N_AUTHORS = 2302
RANDOM_STATE = 1


In [4]:
def process_data(data):
    processed_features = []
    labels = []
    for paper_id, paper_val in data.items():
        venue = paper_val['venue'] if paper_val['venue'] else -1
        year = paper_val['year']
        kw_ids = paper_val['keywords']
        coauthors = paper_val['author']
        for target_author in coauthors:
            feature = np.zeros((1+1+N_KEYWORDS+N_AUTHORS,))
            feature[0] = venue / N_VENUES
            feature[1] = year / 2022
            for kw_id in kw_ids:
                feature[2+kw_id] = 1
            for author_id in coauthors:
                if author_id != target_author:
                    feature[502+author_id] = 1
            processed_features.append(feature)
            labels.append(target_author)
    return processed_features, labels


In [5]:
train_processed = process_data(train_data)
feature_processed, labels_processed = train_processed
print(len(feature_processed))
print(len(labels_processed))

48000
48000


In [6]:
feature_processed = torch.from_numpy(np.array(feature_processed)).float()
labels_processed = torch.from_numpy(np.array(labels_processed)).long()

In [7]:
print(feature_processed[0])
print(len(feature_processed[0]))

tensor([-0.0021,  0.9975,  0.0000,  ...,  0.0000,  0.0000,  0.0000])
2804


In [8]:
print(labels_processed[0])

tensor(1605)


In [9]:
# processedset = []
# for i in range(len(feature_processed)):
#     processedset.append((feature_processed[i], labels_processed[i]))

In [10]:
# print(len(processedset))
# print(processedset[0])

In [11]:
# split into train and val
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(feature_processed, labels_processed, 
                                                  test_size=0.33, random_state=RANDOM_STATE*100)

In [12]:
print(len(X_train), len(y_train))
print(len(X_val), len(y_val))

32160 32160
15840 15840


In [13]:
trainset = []
testset = []

for i in range(len(X_train)):
    trainset.append((X_train[i], y_train[i]))
    
for i in range(len(X_val)):
    testset.append((X_val[i], y_val[i]))

In [14]:
train_loader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=False)

In [15]:
papers, labels = iter(train_loader).next()

In [16]:
print(papers)
print(papers.size())

tensor([[0.0128, 0.9960, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.2532, 0.9921, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0234, 0.9931, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0617, 0.9926, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0213, 0.9946, 1.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.5915, 0.9941, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])
torch.Size([128, 2804])


In [17]:
print(labels)
print(labels.size())

tensor([ 879, 1186, 1853, 1577,  195, 1185,  115,   61,  209,  420, 1055,  647,
         261,  138,  745, 1538, 1816, 1615, 1858,  352, 1583,  668, 2238, 2196,
         209, 1204,  572, 1065,  275, 1821,  421,  438, 2115,  290,  868,  326,
        1450,  287, 1703,  811,  404, 1927, 1821, 1345,  769, 1483, 1096,  303,
        1488,  307,  888, 1587,  898, 1055, 1506, 1326, 2180,  644,   83,  105,
         931,  619, 1267,  374, 1585,  125,  623, 1069,  158, 1034,  528, 2039,
        1083, 2223, 1739,  499, 1851, 1326,  249, 1495, 1184,  864,  106, 1895,
         808, 1998, 1125,  815,  675,  813, 2069, 1948, 2176, 2011,  665,  486,
        1520, 1757, 1367, 1118, 1586,  838, 1456,  736,  365, 1742, 1983, 1934,
        1683,  608, 1204,  630,  561,   18, 1095, 1914, 1055,  858, 1578, 2261,
        1479, 1880, 1429,  444,   88,  192,  711, 1194])
torch.Size([128])


In [18]:
import torch.nn as nn
import torch.nn.functional as F

class LogisticRegressionModel(nn.Module):
    
    def __init__(self, n_features, n_classes):
        super(LogisticRegressionModel, self).__init__()
        
        # Register weight matrix and bias term as model parameters - automatically tracks operations for gradient computation
        self.W = torch.nn.Parameter(torch.nn.init.xavier_uniform_(torch.empty([n_features, n_classes]))) # Weights 
        self.b = torch.nn.Parameter(torch.zeros([n_classes])) # Biases
        
    def forward(self, x):
        """
        Forward pass for logistic regression.
        Input: Tensor x of shape []
        Output: Logits W @ x + b
        """
        batch_size = x.shape[0]
        
        x = x.view(batch_size, -1) # Flatten data, retaining batch size
        out = torch.matmul(x, self.W) + self.b # compute scores 
        return out

    

In [19]:
n_features, n_classes = 1+1+N_KEYWORDS+N_AUTHORS, N_AUTHORS
logistic_regression_model = LogisticRegressionModel(n_features, n_classes)

for p in logistic_regression_model.parameters():
    print(p.shape)

torch.Size([2804, 2302])
torch.Size([2302])


In [20]:
def test(model, criterion, test_loader):
    
    test_loss = 0.
    test_preds, test_labels = list(), list()
    for i, data in enumerate(test_loader):
        x, labels = data
        
        with torch.no_grad():
            logits = model(x)    # Compute scores
            predictions = torch.argmax(logits, dim=1)
            test_loss += criterion(input=logits, target=labels).item()
            test_preds.append(predictions)
            test_labels.append(labels)
            
    test_preds = torch.cat(test_preds)
    test_labels = torch.cat(test_labels)
    
    test_accuracy = torch.eq(test_preds, test_labels).float().mean().item()
    
    print('[TEST] Mean loss {:.4f} | Accuracy {:.4f}'.format(test_loss/len(test_loader), test_accuracy))



def train(model, train_loader, test_loader, optimizer, n_epochs=10):
    """
    Generic training loop for supervised multiclass learning
    """
    LOG_INTERVAL = 250
    running_loss, running_accuracy = list(), list()
    start_time = time.time()
    criterion = torch.nn.CrossEntropyLoss()
    
    for epoch in range(n_epochs): # Loop over training dataset 'n_epochs' times
        epoch_loss = 0.
        
        for i, data in enumerate(train_loader): # Loop over elements in training set
            x, labels = data
            logits = model(x)
            predictions = torch.argmax(logits, dim=1)
            train_acc = torch.mean(torch.eq(predictions, labels).float()).item()
            loss = criterion(input=logits, target=labels)
            
            loss.backward()          # Backward pass (compute parameter gradients)
            optimizer.step()         # Update weight parameter using SGD
            optimizer.zero_grad()    # Reset gradients to zero for next iteration
            
            # ============================================================================
            # You can safely ignore the boilerplate code below - just reports metrics over
            # training and test sets

            running_loss.append(loss.item())
            running_accuracy.append(train_acc)

            epoch_loss += loss.item()

            if i % LOG_INTERVAL == 0:  # Log training stats
                deltaT = time.time() - start_time
                mean_loss = epoch_loss / (i+1)
                print('[TRAIN] Epoch {} [{}/{}]| Mean loss {:.4f} | Train accuracy {:.5f} | Time {:.2f} s'.format(epoch, 
                    i, len(train_loader), mean_loss, train_acc, deltaT))

        print('Epoch complete! Mean loss: {:.4f}'.format(epoch_loss/len(train_loader)))

        test(model, criterion, test_loader)
        
    return running_loss, running_accuracy


In [21]:
# optimizer = torch.optim.SGD(logistic_regression_model.parameters(), lr=1e-2, momentum=0.9)
optimizer = torch.optim.SGD(logistic_regression_model.parameters(), lr=1e-1, momentum=0.9)


lr_loss, lr_acc = train(logistic_regression_model, train_loader, test_loader, optimizer)


[TRAIN] Epoch 0 [0/252]| Mean loss 7.7522 | Train accuracy 0.00000 | Time 0.10 s
[TRAIN] Epoch 0 [250/252]| Mean loss 7.5008 | Train accuracy 0.01562 | Time 22.61 s
Epoch complete! Mean loss: 7.5001
[TEST] Mean loss 7.3265 | Accuracy 0.0196
[TRAIN] Epoch 1 [0/252]| Mean loss 7.1115 | Train accuracy 0.03906 | Time 27.80 s
[TRAIN] Epoch 1 [250/252]| Mean loss 7.0016 | Train accuracy 0.05469 | Time 50.33 s
Epoch complete! Mean loss: 7.0006
[TEST] Mean loss 7.1103 | Accuracy 0.0370
[TRAIN] Epoch 2 [0/252]| Mean loss 6.6240 | Train accuracy 0.10156 | Time 55.51 s
[TRAIN] Epoch 2 [250/252]| Mean loss 6.6195 | Train accuracy 0.05469 | Time 77.97 s
Epoch complete! Mean loss: 6.6187
[TEST] Mean loss 6.9321 | Accuracy 0.0485
[TRAIN] Epoch 3 [0/252]| Mean loss 6.2001 | Train accuracy 0.14062 | Time 83.21 s
[TRAIN] Epoch 3 [250/252]| Mean loss 6.2757 | Train accuracy 0.10156 | Time 105.79 s
Epoch complete! Mean loss: 6.2742
[TEST] Mean loss 6.7732 | Accuracy 0.0580
[TRAIN] Epoch 4 [0/252]| Mean lo

In [24]:
optimizer = torch.optim.SGD(logistic_regression_model.parameters(), lr=2e-1, momentum=0.9)


lr_loss, lr_acc = train(logistic_regression_model, train_loader, test_loader, optimizer)

[TRAIN] Epoch 0 [0/252]| Mean loss 1.8138 | Train accuracy 0.72656 | Time 0.13 s
[TRAIN] Epoch 0 [250/252]| Mean loss 2.0028 | Train accuracy 0.67188 | Time 22.66 s
Epoch complete! Mean loss: 2.0031
[TEST] Mean loss 5.2230 | Accuracy 0.1312
[TRAIN] Epoch 1 [0/252]| Mean loss 1.9688 | Train accuracy 0.74219 | Time 27.84 s
[TRAIN] Epoch 1 [250/252]| Mean loss 1.8728 | Train accuracy 0.64062 | Time 50.30 s
Epoch complete! Mean loss: 1.8723
[TEST] Mean loss 5.1958 | Accuracy 0.1326
[TRAIN] Epoch 2 [0/252]| Mean loss 1.6599 | Train accuracy 0.76562 | Time 55.48 s
[TRAIN] Epoch 2 [250/252]| Mean loss 1.7517 | Train accuracy 0.70312 | Time 77.99 s
Epoch complete! Mean loss: 1.7523
[TEST] Mean loss 5.1735 | Accuracy 0.1341
[TRAIN] Epoch 3 [0/252]| Mean loss 1.5628 | Train accuracy 0.77344 | Time 83.17 s
[TRAIN] Epoch 3 [250/252]| Mean loss 1.6429 | Train accuracy 0.74219 | Time 105.67 s
Epoch complete! Mean loss: 1.6437
[TEST] Mean loss 5.1532 | Accuracy 0.1350
[TRAIN] Epoch 4 [0/252]| Mean lo

In [25]:
optimizer = torch.optim.SGD(logistic_regression_model.parameters(), lr=2e-1, momentum=0.9)


lr_loss, lr_acc = train(logistic_regression_model, train_loader, test_loader, optimizer)

[TRAIN] Epoch 0 [0/252]| Mean loss 1.0186 | Train accuracy 0.92188 | Time 0.13 s
[TRAIN] Epoch 0 [250/252]| Mean loss 1.0939 | Train accuracy 0.89062 | Time 22.71 s
Epoch complete! Mean loss: 1.0940
[TEST] Mean loss 5.0788 | Accuracy 0.1449
[TRAIN] Epoch 1 [0/252]| Mean loss 1.0770 | Train accuracy 0.88281 | Time 27.89 s
[TRAIN] Epoch 1 [250/252]| Mean loss 1.0382 | Train accuracy 0.84375 | Time 50.38 s
Epoch complete! Mean loss: 1.0398
[TEST] Mean loss 5.0735 | Accuracy 0.1453
[TRAIN] Epoch 2 [0/252]| Mean loss 0.9530 | Train accuracy 0.94531 | Time 55.56 s
[TRAIN] Epoch 2 [250/252]| Mean loss 0.9865 | Train accuracy 0.92969 | Time 78.08 s
Epoch complete! Mean loss: 0.9874
[TEST] Mean loss 5.0685 | Accuracy 0.1468
[TRAIN] Epoch 3 [0/252]| Mean loss 0.8784 | Train accuracy 0.96094 | Time 83.25 s
[TRAIN] Epoch 3 [250/252]| Mean loss 0.9382 | Train accuracy 0.92969 | Time 105.69 s
Epoch complete! Mean loss: 0.9377
[TEST] Mean loss 5.0665 | Accuracy 0.1482
[TRAIN] Epoch 4 [0/252]| Mean lo

In [27]:
optimizer = torch.optim.SGD(logistic_regression_model.parameters(), lr=2e-1, momentum=0.9)


lr_loss, lr_acc = train(logistic_regression_model, train_loader, test_loader, optimizer)

[TRAIN] Epoch 0 [0/252]| Mean loss 0.6512 | Train accuracy 0.96875 | Time 0.12 s
[TRAIN] Epoch 0 [250/252]| Mean loss 0.6819 | Train accuracy 0.95312 | Time 22.72 s
Epoch complete! Mean loss: 0.6823
[TEST] Mean loss 5.0627 | Accuracy 0.1566
[TRAIN] Epoch 1 [0/252]| Mean loss 0.6101 | Train accuracy 0.96875 | Time 27.90 s
[TRAIN] Epoch 1 [250/252]| Mean loss 0.6542 | Train accuracy 0.96094 | Time 50.38 s
Epoch complete! Mean loss: 0.6539
[TEST] Mean loss 5.0639 | Accuracy 0.1573
[TRAIN] Epoch 2 [0/252]| Mean loss 0.6255 | Train accuracy 0.96094 | Time 55.55 s
[TRAIN] Epoch 2 [250/252]| Mean loss 0.6281 | Train accuracy 0.95312 | Time 78.02 s
Epoch complete! Mean loss: 0.6281
[TEST] Mean loss 5.0644 | Accuracy 0.1582
[TRAIN] Epoch 3 [0/252]| Mean loss 0.6035 | Train accuracy 0.94531 | Time 83.20 s
[TRAIN] Epoch 3 [250/252]| Mean loss 0.6036 | Train accuracy 0.96875 | Time 105.67 s
Epoch complete! Mean loss: 0.6032
[TEST] Mean loss 5.0643 | Accuracy 0.1597
[TRAIN] Epoch 4 [0/252]| Mean lo

In [35]:
for p in logistic_regression_model.parameters():
    print(p)
    print(p.shape)

Parameter containing:
tensor([[-0.2999, -0.1844,  0.1120,  ..., -0.2517, -0.0123, -0.1958],
        [-0.3949, -0.1703,  0.2840,  ...,  0.1621, -0.0648,  0.0138],
        [ 0.1238,  0.3956, -0.0494,  ..., -0.1504, -0.0709, -0.0695],
        ...,
        [-0.0333,  0.0144, -0.0232,  ..., -0.6263,  0.0055, -0.0317],
        [-0.0207, -0.0183,  0.0287,  ...,  0.0041, -0.1266, -0.0072],
        [-0.0177,  0.0185, -0.0052,  ..., -0.0236, -0.0130, -0.5976]],
       requires_grad=True)
torch.Size([2804, 2302])
Parameter containing:
tensor([-0.3854, -0.1913,  0.2649,  ...,  0.1803, -0.0992,  0.0150],
       requires_grad=True)
torch.Size([2302])


In [28]:
with open('test.json') as fin:
    test_data = json.load(fin)

In [31]:
def process_test_data(test_data):
    processed_features = []
#     labels = []
    for paper_id, paper_val in test_data.items():
        venue = paper_val['venue'] if paper_val['venue'] else -1
        year = paper_val['year']
        kw_ids = paper_val['keywords']
        coauthors = paper_val['coauthor']
        target = paper_val['target']
        feature = np.zeros((1+1+N_KEYWORDS+N_AUTHORS,))
        feature[0] = venue / N_VENUES
        feature[1] = year / 2022
        for kw_id in kw_ids:
            feature[2+kw_id] = 1
        for author_id in coauthors:
            if author_id != target:
                feature[502+author_id] = 1
        
        processed_features.append(feature)
#         labels.append(target_author)
    return processed_features

In [33]:
test_processed = process_test_data(test_data)
print(len(test_processed))


2000


In [34]:
print(test_processed[0])

[-0.00212766  0.9975272   0.         ...  0.          0.
  0.        ]


In [42]:
print(len(test_processed))
print(len(test_processed[0]))

2000
2804


In [49]:
test_tensor = torch.tensor(np.array(test_processed)).float()

In [50]:
predictions = torch.matmul(test_tensor, logistic_regression_model.W) + logistic_regression_model.b

In [51]:
print(predictions)

tensor([[ 0.5472, -1.5553,  1.8692,  ..., -1.4924,  1.1380,  1.6676],
        [ 2.5158, -0.1334, -0.2301,  ..., -0.4333,  0.3597, -1.8677],
        [-1.2355, -1.7690,  0.5425,  ..., -0.4461, -1.2335, -1.3483],
        ...,
        [-0.9730, -3.9031,  1.8500,  ..., -0.4102,  0.2619, -0.3934],
        [ 2.2442, -0.1286, -0.2769,  ...,  0.5490,  1.7294,  1.1731],
        [-2.7417, -1.5671, -0.2898,  ..., -0.0727, -2.3068,  0.8413]],
       grad_fn=<AddBackward0>)


In [59]:
predictions_np = predictions.tolist()

In [60]:
print(predictions.shape)

torch.Size([2000, 2302])


In [61]:
result_pred = []

for paper_id, paper_val in test_data.items():
    paper_id = int(paper_id)
    target = paper_val['target']
    prob = predictions_np[paper_id][target]
    print(paper_id, prob)
    result_pred.append([paper_id, prob])

print(len(result_pred))
print(len(result_pred[0]))

0 5.340229511260986
1 1.9624106884002686
2 4.263720512390137
3 4.271944046020508
4 0.46411752700805664
5 -2.8767380714416504
6 1.908257246017456
7 -1.4947010278701782
8 -1.047459602355957
9 1.1964439153671265
10 2.9688870906829834
11 -2.237889289855957
12 2.3981871604919434
13 8.019248962402344
14 -2.4551541805267334
15 3.596299409866333
16 7.90132474899292
17 4.073276042938232
18 8.261945724487305
19 5.49031400680542
20 -3.2315235137939453
21 -0.8711045384407043
22 2.635655641555786
23 1.6767030954360962
24 1.4345016479492188
25 -3.7307913303375244
26 -2.5175857543945312
27 -2.7298521995544434
28 3.005125045776367
29 -0.7947632074356079
30 5.274263381958008
31 7.646844863891602
32 2.0757951736450195
33 -1.546004295349121
34 1.5865020751953125
35 1.0566571950912476
36 8.098838806152344
37 -0.9434711933135986
38 -1.5629324913024902
39 5.069618225097656
40 4.684459686279297
41 9.150161743164062
42 0.6026682257652283
43 10.052952766418457
44 9.344369888305664
45 3.9054386615753174
46 6.35

1142 -0.41744881868362427
1143 -0.151560440659523
1144 -0.7403749227523804
1145 -0.07366973161697388
1146 -1.6755963563919067
1147 2.653860092163086
1148 7.415691375732422
1149 1.5958095788955688
1150 1.344321370124817
1151 4.4223809242248535
1152 -2.5761220455169678
1153 6.073022365570068
1154 3.419881582260132
1155 0.40688300132751465
1156 -1.1585469245910645
1157 0.3026096224784851
1158 0.09437721967697144
1159 -1.2861428260803223
1160 2.302417516708374
1161 0.5851606130599976
1162 -1.961719274520874
1163 0.7025363445281982
1164 3.3046247959136963
1165 1.0250194072723389
1166 -0.35450467467308044
1167 0.5750939846038818
1168 0.07916945219039917
1169 0.6124905347824097
1170 3.5589640140533447
1171 2.0262904167175293
1172 0.29996997117996216
1173 9.122631072998047
1174 6.023552894592285
1175 5.86480188369751
1176 1.3359277248382568
1177 4.508690357208252
1178 -0.3343908190727234
1179 2.599792718887329
1180 0.8363803029060364
1181 0.6993514895439148
1182 3.6810245513916016
1183 0.66582

In [62]:
import pandas as pd
result_df = pd.DataFrame(columns=['Id','Predicted'], data=result_pred)
print(result_df)

        Id  Predicted
0        0   5.340230
1        1   1.962411
2        2   4.263721
3        3   4.271944
4        4   0.464118
...    ...        ...
1995  1995  -2.118643
1996  1996   1.728356
1997  1997   3.419102
1998  1998   0.892710
1999  1999   3.826965

[2000 rows x 2 columns]


In [63]:
result_df.to_csv('predictions_nn_lr.csv', index=False)