In [1]:
import json
import numpy as np

In [2]:
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import f1_score
import pandas as pd
from tensorflow.keras.optimizers.schedules import PolynomialDecay
import tensorflow as tf

## Data preprocessing

In [3]:
with open("train.json","r") as f:
    train = json.load(f)

In [4]:
with open("test.json","r") as f:
    test = json.load(f)

In [5]:
def get_labels(train):
    labels = []
    for d in train:
        authors = np.array(d['authors'])
        prolific_authors = authors[authors<100]
        label = np.zeros(101)
        if len(prolific_authors) == 0:
            label[-1] = 1.
        else:
            label[prolific_authors] = 1.
        labels.append(label)
    return np.array(labels)

In [6]:
def process_text(train, test):
    titles = []
    abstracts = []
    for d in train:
        titles.append(d['title'])
        abstracts.append(d['abstract'])
    total = titles+abstracts
    # apply Doc2Vec 
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(total)]
    d2v = Doc2Vec(documents, vector_size=200, window=2, min_count=1)
    output_train = []
    for d in train:
        t = d['title']
        a = d['abstract']
        vec = (d2v.infer_vector(np.asarray(t,dtype='str'))+d2v.infer_vector(np.asarray(a,dtype='str')))/2
        output_train.append(vec)
        
    output_test = []
    for d in test:
        t = d['title']
        a = d['abstract']
        vec = (d2v.infer_vector(np.asarray(t,dtype='str'))+d2v.infer_vector(np.asarray(a,dtype='str')))/2
        output_test.append(vec)
    return np.array(output_train), np.array(output_test)

In [None]:
# model = Word2Vec(sentences=words, vector_size=300, min_count=1)
# word_vec = model.wv
# text = []
# for data in train:
#     title = data['title']
#     abstract = data['abstract']
#     total = title + abstract
#     vec = np.zeros(300)
#     for w in total:
#         vec += word_vec[w]
#     vec /= len(total)
#     text.append(vec)
# text = np.array(text)


# vectorizer = TfidfVectorizer()
# text = vectorizer.fit_transform(total)
# vectorizer.get_feature_names_out()


In [7]:
def encode_coauthors(train, test):
    cas = {}
    for d in train:
        a = np.array(d['authors'])
        pa = a[a<100]
        ca = a[a>=100]
        if len(pa) != 0:
            for author in ca:
                cas[author] = 1
    ca_list = list(cas.keys())
    ca_index = {}
    for i, d in enumerate(ca_list):
        ca_index[d] = i
    
    
    
    output_train = []
    for d in train:
        authors = np.array(d['authors'])
        prolific_authors = authors[authors<100]
        coauthors = authors[authors>=100]
        temp = np.zeros(len(ca_list)+1)
        if len(prolific_authors) == 0:
            temp[-1] = 1.
        else:
            for a in coauthors:
                temp[ca_index[a]] = 1.
        output_train.append(temp)
        
    output_test = []
    for d in test:
        a = np.array(d['coauthors'])
        temp = np.zeros(len(ca_list)+1)
        for i in a:
            if i in ca_index: 
                temp[ca_index[i]] = 1.
        output_test.append(temp)

    return np.array(output_train), np.array(output_test)

In [8]:
def get_venue(data):
    output = []
    for d in data:
        v = d['venue']
        if v == "":
            output.append([466])
        else:
            output.append([v])    
    return np.array(output)

In [9]:
y = get_labels(train)

In [10]:
text_train, text_test = process_text(train, test)

In [11]:
venue_train = get_venue(train)
venue_test = get_venue(test)

In [12]:
coauthors_train, coauthors_test = encode_coauthors(train, test)

In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
scaler = StandardScaler()
text_train = scaler.fit_transform(text_train)
venue_train = scaler.fit_transform(venue_train)
text_test = scaler.fit_transform(text_test)
venue_test = scaler.fit_transform(venue_test)


In [15]:
x = np.concatenate((text_train,venue_train), axis=1)
x = np.concatenate((x,coauthors_train), axis=1)

In [16]:
test = np.concatenate((text_test,venue_test), axis=1)
test = np.concatenate((test,coauthors_test), axis=1)

In [17]:
x.shape

(25793, 6777)

In [18]:
y.shape

(25793, 101)

In [19]:
test.shape

(800, 6777)

## Train models

In [21]:
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state = 90051)

In [22]:
weights = {}
weights[i] = 1. for i in range(101)
weights[100] = 0.25

In [23]:
checkpoint_filepath = '/tmp/checkpoint'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

In [31]:
model = Sequential()
model.add(Dense(2048,input_dim=x.shape[1],activation='tanh'))
model.add(Dropout(0.3))
# model.add(Dense(1024,activation='tanh'))
# model.add(Dropout(0.3))
# model.add(Dense(1024,activation='tanh'))
# model.add(Dropout(0.3))
model.add(Dense(512,activation='tanh'))
model.add(Dropout(0.3))
model.add(Dense(256,activation='tanh'))
model.add(Dropout(0.3))
model.add(Dense(y.shape[1],activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = Adam()) 
model.fit(x, y, epochs=150, batch_size=128, class_weight=weights) #,callbacks=[model_checkpoint_callback]

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x12f047190>

In [25]:
train_pred = model.predict(x)
train_pred = np.where(train_pred>0.5,1,0)



In [26]:
f1_score(y_true=y, y_pred=train_pred,average='samples')

0.8435402370152109

In [27]:
result = model.predict(test)



In [32]:
from collections import Counter
ID = []
Predict = []
for i in range(len(result)):
    instance = result[i]
    predict = "-1"
    for j in range(100):
        if instance[j] > 0.5:
            if predict == "-1":
                predict = "{}".format(j)
            else:
                predict += " {}".format(j)
    ID.append(i)
    Predict.append(predict)

result_df = pd.DataFrame({'ID': ID, 'Predict': Predict})
result_df.to_csv('kaggle2.csv', index=False)

In [33]:
count = 0
for i in Predict:
    if i == "-1":
        count += 1
print(count)

614
