In [1]:
import json
import numpy as np

## Data preprocessing

In [2]:
with open("train.json","r") as f:
    train = json.load(f)

In [124]:
with open("test.json","r") as f:
    test = json.load(f)

### process train data

In [4]:
y = []
for data in train:
    authors = np.array(data['authors'])
    prolific_authors = authors[authors<100]
    label = np.zeros(101)
    if len(prolific_authors) == 0:
        label[-1] = 1.
    else:
        label[prolific_authors] = 1.
    y.append(label)
y = np.array(y)

In [5]:
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [6]:
titles = []
abstracts = []
for data in train:
    title = data['title']
    abstract = data['abstract']
    titles.append(title)
    abstracts.append(abstract)

In [7]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(titles+abstracts)]
d2v = Doc2Vec(documents, vector_size=300, window=2, min_count=1)

In [8]:
doc = []
for data in train:
    title = data['title']
    abstract = data['abstract']
    vec = (d2v.infer_vector(np.asarray(title,dtype='str'))+d2v.infer_vector(np.asarray(abstract,dtype='str')))/2
    doc.append(vec)
doc = np.array(doc)

In [9]:
def pos_embd(length, d, n=10000):
    P = np.zeros((length,d))
    for i in range(length):
        for j in range(int(d/2)):
            P[i, 2*j] = np.sin(i/(n**(2*j/d))) 
            P[i, 2*j+1] = np.cos(i/(n**(2*j/d)))
    return P

In [10]:
venue_embd = pos_embd(466,4)

In [11]:
venue = []
for data in train:
    v = data['venue']
    if v == "":
        venue.append(venue_embd[-1])
    else:
        venue.append(venue_embd[v])
venue = np.array(venue)

In [12]:
cas = {}
for data in train:
    a = np.array(data['authors'])
    pa = a[a<100]
    ca = a[a>=100]
    if len(pa) != 0:
        for author in ca:
            cas[author] = 1

In [13]:
ca_list = list(cas.keys())

In [14]:
len(ca_list)

6575

In [15]:
ca_index = {}
for i, d in enumerate(ca_list):
    ca_index[d] = i

In [16]:
coauthors_train = []
for data in train:
    a = np.array(data['authors'])
    pa = a[a<100]
    ca = a[a>=100]
    temp = np.zeros(6576)
    if len(pa) == 0:
        temp[-1] = 1.
    else:
        for x in ca:
            temp[ca_index[x]] = 1.
    coauthors_train.append(temp)
coauthors_train = np.array(coauthors_train)

In [17]:
x = np.concatenate((doc,venue), axis=1)

In [19]:
x = np.concatenate((x,coauthors_train), axis=1)

In [20]:
from sklearn.preprocessing import StandardScaler

In [21]:
scaler = StandardScaler()
x = scaler.fit_transform(x)

### process test data

In [22]:
doc = []
for data in test:
    title = data['title']
    abstract = data['abstract']

    vec = (d2v.infer_vector(np.asarray(title,dtype='str'))+d2v.infer_vector(np.asarray(abstract,dtype='str')))/2
    doc.append(vec)
doc = np.array(doc)

In [23]:
venue = []
for data in test:
    v = data['venue']
    if v == "":
        venue.append(venue_embd[-1])
    else:
        venue.append(venue_embd[v])
venue = np.array(venue)

In [24]:
coauthors = []
for data in test:
    a = np.array(data['coauthors'])
    temp = np.zeros(6576)
    for i in a:
        if i in ca_index: 
            temp[ca_index[i]] = 1.
    coauthors.append(temp)
coauthors = np.array(coauthors)

In [25]:
test = np.concatenate((doc,venue), axis=1)
test = np.concatenate((test,coauthors), axis=1)

In [26]:
scaler = StandardScaler()
test = scaler.fit_transform(test)

## Train models

In [95]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import f1_score
import pandas as pd
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding, SpatialDropout1D

In [75]:
x.shape

(25793, 6880)

In [76]:
y.shape

(25793, 101)

In [126]:
negative = np.where(y[:,-1] == 1.)[0]
positive = np.where(y[:,-1] != 1.)[0]
negative.shape[0]/x.shape[0]

0.7107742410731593

In [77]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state = 90051)

In [79]:
weights = {}
for i in range(101):
    weights[i] = 1.
weights[100] = 0.3

In [114]:
model = Sequential()
model.add(Dense(1024,input_dim=x.shape[1],activation='tanh'))
model.add(Dropout(0.3))
model.add(Dense(512,activation='tanh'))
model.add(Dropout(0.3))
model.add(Dense(512,activation='tanh'))
model.add(Dropout(0.3))
model.add(Dense(256,activation='tanh'))
model.add(Dropout(0.3))
model.add(Dense(y.shape[1],activation='sigmoid'))
model.summary()
lr_schedule = PolynomialDecay(initial_learning_rate=0.001,decay_steps=150,end_learning_rate=0.00025,power=0.5)
model.compile(loss = 'binary_crossentropy', optimizer = Adam(lr_schedule))
model.fit(x_train, y_train, epochs=100, batch_size=128, class_weight=weights)

Model: "sequential_38"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_104 (Dense)           (None, 1024)              7046144   
                                                                 
 dropout_71 (Dropout)        (None, 1024)              0         
                                                                 
 dense_105 (Dense)           (None, 512)               524800    
                                                                 
 dropout_72 (Dropout)        (None, 512)               0         
                                                                 
 dense_106 (Dense)           (None, 256)               131328    
                                                                 
 dropout_73 (Dropout)        (None, 256)               0         
                                                                 
 dense_107 (Dense)           (None, 101)             

In [70]:
train_pred = model.predict(x_train)
train_pred = np.where(train_pred>0.5,1,0)



In [71]:
f1_score(y_true=y_train, y_pred=train_pred,average='samples')

0.9999704606295579

In [72]:
test_pred = model.predict(x_test)
test_pred = np.where(test_pred>0.5,1,0)



In [73]:
f1_score(y_true=y_test, y_pred=test_pred,average='samples')

0.8944516420601412

In [37]:
result = model.predict(test)



In [38]:
idx = [np.where(p>0.5) for p in result]
df = pd.DataFrame(idx, columns=['Predict'])
df

Unnamed: 0,Predict
0,[92]
1,[2]
2,[31]
3,[23]
4,[32]
...,...
795,[54]
796,[97]
797,"[13, 71]"
798,[71]


In [71]:
df.to_csv('result.csv')