In [None]:
import numpy as np
import pandas as pd
import json
import re
import random
import torch

from torch import nn
import torch.nn.functional as F

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

from transformers import BertTokenizer, BertForSequenceClassification

Importing Data

In [None]:
def import_data(path='./arxiv-metadata-oai-snapshot.json'):
    """Lazily yield the raw lines of a JSON-lines metadata file.

    Parameters
    ----------
    path : str, optional
        Location of the file to read. Defaults to the snapshot file expected
        in the current working directory.

    Yields
    ------
    str
        One line of the file at a time, trailing newline included.
    """
    # Pin the encoding so decoding does not depend on the platform's default
    # locale (JSON text is UTF-8).
    with open(path, 'r', encoding='utf-8') as f:
        yield from f


# Creating the generator does not open the file yet; reading starts when the
# first line is requested.
imported = import_data()

In [None]:
# Gather, for every record, the abstract text and the raw category string
# into two parallel lists.
dict_tags = {"abstract": [], "categories": []}
for raw_line in imported:
    record = json.loads(raw_line)
    for field in ("abstract", "categories"):
        dict_tags[field].append(record[field])

In [None]:
# Tabulate the collected fields and keep a seeded random subset of 100k rows.
df = pd.DataFrame.from_records(dict_tags).sample(n=100000, random_state=33)

Generating Arrays

In [None]:
# Distinct category codes occurring in the sampled papers. Each entry of the
# `categories` column is a space-separated string, so split it and flatten
# before de-duplicating. The column is read from the DataFrame `df`:
# `dict_tags['categories']` is a plain Python list and has no `.apply`.
categories = df['categories'].apply(lambda tags: tags.split(' ')).explode().unique()

In [None]:
# Two-way lookup: category code -> column index, and column index -> category code
dict_label2int = {code: idx for idx, code in enumerate(categories)}
dict_int2label = {idx: code for code, idx in dict_label2int.items()}

In [None]:
def gen_array(label):
    """Encode a space-separated category string as a multi-hot row vector.

    Every code in `label` is looked up in the module-level `dict_label2int`;
    the matching columns are set to 1 and all others stay 0. An unknown code
    raises KeyError. The result has shape (1, len(dict_label2int)).
    """
    positions = [dict_label2int[code] for code in label.split(' ')]
    multi_hot = np.zeros((1, len(dict_label2int)))
    multi_hot[0, positions] = 1
    return multi_hot

In [None]:
# One multi-hot row per sampled paper, stacked into a 2-D matrix with one
# column per category. Iterating the `categories` column of `df` (rather than
# the full `dict_tags` list) keeps the rows limited to, and ordered like, the
# papers drawn into the sample.
labels = [gen_array(tag) for tag in df["categories"]]
# float32 halves the memory of NumPy's default float64 and is the dtype the
# network's outputs use.
labe_array = np.concatenate(labels, axis=0).astype(np.float32)

Tokenization

In [None]:
stops = stopwords.words('english')
stop_set = set(stops)  # constant-time membership tests instead of scanning a list


def clean_abstract(text):
    """Lower-case `text`, keep only letters, and drop English stopwords.

    Non-letter characters are replaced by spaces before the stopword filter
    runs, so a stopword that touches punctuation or a line break (for example
    "the," or a word directly followed by a newline) is still recognised and
    removed. The surviving words are joined with single spaces; the result has
    no leading or trailing whitespace.
    """
    letters_only = re.sub(r'[^A-Za-z\s]+', ' ', text.lower())
    # str.split() with no argument splits on any run of whitespace (spaces,
    # tabs, newlines) and never produces empty tokens.
    kept = [word for word in letters_only.split() if word not in stop_set]
    return ' '.join(kept)


#pre-cleaning
# The text comes from the `abstract` column of the sampled frame `df`;
# `dict_tags` is a dict of lists whose key is 'abstract', so the former lookup
# of 'abstracts' could not succeed.
cleaned = [clean_abstract(text) for text in df['abstract']]

In [None]:
tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
# `padding='max_length'` is the replacement for the deprecated
# `pad_to_max_length=True`; `truncation=True` makes the cut at 250 tokens
# explicit, so every row has the same width and can be returned as one tensor.
# NOTE(review): these keyword names assume transformers >= 3.0 -- confirm the
# installed version.
text_tokens = tokenizer.batch_encode_plus(
    cleaned,
    padding='max_length',
    truncation=True,
    max_length=250,
    return_tensors='pt',
)
text_tokens['input_ids'].shape

Train/Test Split

In [None]:
random.seed(27)
# Seeded random ordering of row positions, limited to at most 100k rows.
# Capping by the number of rows actually present keeps random.sample from
# raising ValueError when fewer than 100k abstracts were tokenised.
n_rows = text_tokens['input_ids'].shape[0]
samples = random.sample(range(n_rows), min(100000, n_rows))

In [None]:
# Token ids and label rows must describe the same papers; a row-count mismatch
# means the two were built from different collections.
assert text_tokens["input_ids"].shape[0] == labe_array.shape[0], "token/label row counts differ"

# Features are the selected rows of the token-id tensor, targets are the same
# rows of the multi-hot label matrix `labe_array`.
x_train, x_test, y_train, y_test = train_test_split(
    text_tokens["input_ids"][samples, :],
    labe_array[samples, :],
    test_size=0.2,
    random_state=27,  # fixed seed so the split is reproducible across runs
)

Data Model

In [None]:
class dataset(torch.utils.data.Dataset):
    """Row-aligned pairing of a 2-D feature container with a 2-D label container."""

    def __init__(self, abstract, category):
        # Both containers are stored as given; nothing is copied or converted.
        self.abstracts, self.categories = abstract, category

    def __len__(self):
        # The number of examples is the number of label rows.
        n_rows = self.categories.shape[0]
        return n_rows

    def __getitem__(self, index):
        # Row `index` of the features together with row `index` of the labels.
        return self.abstracts[index, :], self.categories[index, :]

In [None]:
#initialize
train_data = dataset(x_train, y_train)
test_data = dataset(x_test, y_test)

# Shuffling belongs on the training loader, so every epoch sees the batches in
# a new order; the evaluation loader keeps a fixed order.
train_gen = torch.utils.data.DataLoader(train_data, batch_size=128, shuffle=True)
test_gen = torch.utils.data.DataLoader(test_data, batch_size=128, shuffle=False)


Training BERT Model

In [None]:
class BERT(nn.Module):
    """Frozen SciBERT encoder followed by a small trainable two-layer head.

    Parameters
    ----------
    num_labels : int, optional
        Width of the output layer, i.e. the number of logits produced per
        input row. Defaults to 176, the value that used to be hard-coded.
    """

    def __init__(self, num_labels=176):
        super().__init__()
        self.encoder = BertForSequenceClassification.from_pretrained("allenai/scibert_scivocab_uncased",
                                                                    output_hidden_states=True)
        # The encoder is a fixed feature extractor: none of its weights are updated.
        for param in self.encoder.parameters():
            param.requires_grad = False

        self.dense_1 = nn.Linear(768, 384)
        self.dense_2 = nn.Linear(384, num_labels)

    def forward(self, tokens):
        """Return raw (pre-sigmoid) logits, one row per row of token ids in `tokens`."""
        # Inputs are padded to a fixed length, so build an attention mask that
        # hides the padding positions from the encoder.
        pad_id = getattr(self.encoder.config, 'pad_token_id', None)
        attention_mask = (tokens != pad_id).long() if pad_id is not None else None

        # With output_hidden_states=True and no labels passed, output index 1
        # is expected to be the tuple of per-layer hidden states; take the
        # last layer and its first sequence position.
        hidden_states = self.encoder(tokens, attention_mask=attention_mask)[1][-1][:, 0]
        x = F.relu(self.dense_1(hidden_states))
        x = self.dense_2(x)
        return x


# Size the output layer from the label vocabulary so that logits and multi-hot
# targets have the same width.
model = BERT(num_labels=len(dict_label2int))
# Move to the GPU when one is present; otherwise the model stays on the CPU.
if torch.cuda.is_available():
    model = model.cuda()

In [None]:
# Smoke test: run a single training batch through the model and print the
# shape of the logits it returns.
device = next(model.parameters()).device  # follow whichever device the model is on
with torch.no_grad():  # shape check only, no gradients needed
    for toks, _ in train_gen:
        print(model(toks.to(device)).shape)
        break

In [None]:
# Sigmoid + binary cross-entropy applied to each output logit independently.
criterion = nn.BCEWithLogitsLoss()

# Give the optimizer only the parameters that can receive gradients; the
# encoder weights have requires_grad=False and would never be updated.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=0.0006)

In [None]:
N_EPOCHS = 15
# Follow the model's device instead of assuming CUDA is available.
device = next(model.parameters()).device

loss_training = []  # mean training loss of each epoch, in order
for epoch in range(N_EPOCHS):
    running_loss = 0.0
    num_batches = 0
    for inputs, labels in train_gen:
        inputs = inputs.to(device)
        # Cast the targets to float32 so they match the logits' dtype; label
        # arrays built with np.zeros are float64 unless converted.
        labels = labels.to(device).float()

        #Zero gradients
        optimizer.zero_grad()
        logits = model(inputs)

        #BCE with logits
        loss = criterion(logits, labels)

        #Backprop, optimize
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1
        # No per-batch `del` / torch.cuda.empty_cache(): the names are rebound
        # on the next iteration, and emptying the allocator cache every step
        # only makes PyTorch request the same memory again.

    # max(..., 1) avoids ZeroDivisionError if the loader yielded no batches.
    loss_training.append(running_loss / max(num_batches, 1))

print(loss_training)

In [None]:
# Write the model's parameter dictionary to 'result.pt' in the working directory.
weights = model.state_dict()
torch.save(weights, 'result.pt')