In [None]:
!pip install transformers

In [None]:
import os
import pandas as pd
from time import time
import numpy as np

import torch
import torchvision
from torchvision import datasets, transforms

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data.dataloader import DataLoader
from torch.utils.data import random_split

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
import sklearn

In [None]:
%%capture
import transformers
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from transformers import TrainingArguments, Trainer
# Run on the first CUDA device when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
dataset = pd.read_csv('abstracts_cs_AI.csv', delimiter='|')

In [None]:
dataset.Date = pd.to_datetime(dataset.Date)
dataset.Date = dataset.Date.apply(lambda x: x.timetuple()[0])

In [None]:
dataset.drop(['Title'], axis = 1, inplace = True)

In [None]:
dataset.head()

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset['Abstract'], dataset['Date'], test_size=0.33, random_state=42)

In [None]:
train_data = pd.concat([X_train, y_train], axis=1, keys=['Abstract', 'Date'])
train_data.head()

In [None]:
test_data = pd.concat([X_test, y_test], axis=1, keys=['Abstract', 'Date'])
test_data.head()

In [None]:
class_encoder = sklearn.preprocessing.LabelEncoder()
class_encoder.fit(train_data['Date'])
train_data['Date'] = class_encoder.transform(train_data['Date'])
train_data.head()

In [None]:
test_data['Date'] = 0
test_data

In [None]:
# Single source of truth for the checkpoint: config, model and tokenizer must all come
# from the same one, so every loader below uses `name` instead of a repeated literal.
name = 'roberta-base'

# One output class per distinct label known to the encoder.
config = RobertaConfig.from_pretrained(name, num_labels=len(class_encoder.classes_))

model = AutoModelForSequenceClassification.from_pretrained(name, config=config)
tokenizer = RobertaTokenizer.from_pretrained(name)
model.to(device)

In [None]:
# Upper bound on the number of tokens kept per abstract.
MAX_LEN = 150

# Both splits go through the tokenizer with identical settings.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=MAX_LEN)
train_encodings = tokenizer(train_data['Abstract'].tolist(), **tokenize_kwargs)
test_encodings = tokenizer(test_data['Abstract'].tolist(), **tokenize_kwargs)

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Pairs a table of labelled rows with the matching tokenizer output.

    `dataset` is indexed positionally through `.iloc` and must provide a 'Date'
    entry per row; `encodings` maps a field name to a per-sample sequence.
    """

    def __init__(self, dataset, encodings):
        self.encodings = encodings
        self.dataset = dataset

    def __len__(self):
        # The number of samples is the number of rows in the table.
        n_rows = self.dataset.shape[0]
        return n_rows

    def __getitem__(self, idx):
        # Every encoding field of sample `idx` becomes a tensor under the same key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label travels with the sample under the key 'Date'.
        row = self.dataset.iloc[idx]
        sample['Date'] = torch.tensor(row['Date'])
        return sample

In [None]:
train_dataset_full = MyDataset(train_data, train_encodings)
test_dataset = MyDataset(test_data, test_encodings)

In [None]:
train_dataset_full.__getitem__(1)

In [None]:
# Carve a validation subset out of the training data (67% train / 33% validation).
train_size = int(0.67 * len(train_dataset_full))
# NOTE: despite its name, `test_size` is the size of the validation subset.
test_size = len(train_dataset_full) - train_size
# Seed the split so the same samples land in each subset on every run, in line with the
# fixed random_state already used for the train/test split.
train_dataset, val_dataset = torch.utils.data.random_split(
    train_dataset_full,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

In [None]:
BATCH_SZ = 20

# Training and validation batches are reshuffled on every pass; incomplete last batches are kept.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SZ, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SZ, shuffle=True)
# Test samples are served one at a time, in their stored order.
test_dataloader = DataLoader(test_dataset, batch_size=1)

In [None]:
# scipy is not imported anywhere else in the notebook; without this line the loop body
# raises NameError as soon as `years` is non-empty.
import scipy.stats

# Work in progress: `years` is still empty, so the loop below does nothing yet.
years = []

for year in years:
  # Normal distribution centred on the year with scale 1.0.
  norm = scipy.stats.norm(year, 1.0)

In [None]:
def pdfs(objects):
  """Return a zero-filled NumPy array with the same shape as `objects`.

  Placeholder: nothing is written into the array yet.
  NOTE(review): the name suggests per-object probability densities are meant to be
  filled in here -- confirm the intended computation.

  Args:
    objects: any object exposing a `.shape` attribute.

  Returns:
    A NumPy array of zeros with shape `objects.shape`.
  """
  result = np.zeros(objects.shape)
  # Previously the function fell off the end here, so every caller received None.
  return result

In [None]:
def train_model(model, optimizer, scheduler, criterion, num_epochs=25, alpha_mixup=0, on_plateau=False, name='model',
                train_loader=None, val_loader=None):
    """Fine-tune `model` for `num_epochs` epochs, validating after every epoch.

    The loss minimised during training is the model's own `outputs.loss`, i.e. the same
    quantity the validation pass reports. The earlier experimental path
    (`loss_criterion(outputs.pooler_output, pdfs(input_ids))`) could not run: it used an
    undefined name and an output field this function never receives.

    "Accuracy" counts a prediction as correct when the arg-max class index is at most one
    away from the label index.

    Args:
        model: module called as `model(input_ids, attention_mask=..., labels=...)` whose
            result exposes `.loss` and `.logits`.
        optimizer: optimizer over the model parameters; stepped once per training batch.
        scheduler: learning-rate scheduler; stepped once per training batch (may be None).
        criterion: accepted for interface compatibility; currently unused (see above).
        num_epochs: number of passes over the training loader.
        alpha_mixup, on_plateau, name: accepted for interface compatibility; currently unused.
        train_loader: iterable of batches with keys 'input_ids', 'attention_mask', 'Date'.
            Defaults to the notebook-level `train_dataloader`.
        val_loader: same format; defaults to the notebook-level `val_dataloader`.

    Returns:
        (model, train_history, val_history), where the histories hold one accuracy value
        per epoch.
    """
    # Fall back to the notebook-level loaders when none are passed explicitly.
    if train_loader is None:
        train_loader = train_dataloader
    if val_loader is None:
        val_loader = val_dataloader

    # Send every batch to wherever the model's weights live.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    train_history = []
    val_history = []

    start = time()

    for epoch in range(num_epochs):
        epoch_start = time()

        print('Epoch %i / %i \n' %(epoch + 1, num_epochs))

        train_loss, train_acc = _run_epoch(model, train_loader, run_device,
                                           optimizer=optimizer, scheduler=scheduler, start=start)
        train_history.append(train_acc)

        print("train statistics:   \t loss: %.3f\t accuracy: %.3f"  %(train_loss, train_acc), '\n')

        print('validation in progress... \n')
        val_loss, val_acc = _run_epoch(model, val_loader, run_device)
        val_history.append(val_acc)

        print("validation statistics:\t loss: %.3f\tAccuracy: %.3f \n"  %(val_loss, val_acc))
        # Report the duration of this epoch only, not the time since training started.
        current = time() - epoch_start
        print("epoch time: %im %is" %(int(current / 60), int(current % 60)))

    return model, train_history, val_history


def _count_within_one(logits, labels):
    """Count samples whose arg-max class index is at most one away from the label index."""
    pred = logits.argmax(dim=1)
    return int(((pred - labels).abs() <= 1).sum().item())


def _run_epoch(model, loader, run_device, optimizer=None, scheduler=None, start=None):
    """Run one pass over `loader`; weights are updated only when an optimizer is given.

    Returns (mean loss per sample, fraction of predictions within one class of the label).
    """
    training = optimizer is not None
    model.train(training)

    # Print progress roughly 30 times per pass; max(1, ...) avoids a modulo by zero when
    # the loader has fewer than 30 batches.
    log_every = max(1, len(loader) // 30)
    total_loss, n_correct, n_seen = 0.0, 0, 0

    for step, batch in enumerate(loader):
        input_ids = batch['input_ids'].to(run_device)
        attention_mask = batch['attention_mask'].to(run_device)
        labels = batch['Date'].to(run_device)

        if training and step % log_every == 0:
            current = time() - start
            print('  Batch %i  of  %i;\tElapsed time: ' %(step + 1, len(loader)) +
                  '%im %is' %(int(current / 60), int(current % 60)))

        # Gradients are tracked only while training.
        with torch.set_grad_enabled(training):
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        if training:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step()

        # `loss` is averaged over the batch, so weight it by the batch size before summing;
        # dividing by the sample count afterwards then gives a true per-sample mean.
        batch_size = len(input_ids)
        total_loss += loss.item() * batch_size
        n_correct += _count_within_one(outputs.logits.detach(), labels)
        n_seen += batch_size

    # An empty loader yields zeros instead of a ZeroDivisionError.
    n_seen = max(n_seen, 1)
    return total_loss / n_seen, n_correct / n_seen

In [None]:
NUM_EPOCHS = 40

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and weight_decay are
# passed explicitly so they keep the values transformers.AdamW used by default.
optimizer = optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# The linear schedule decays the learning rate to zero at `num_training_steps`, so it has
# to be the real number of optimizer steps (one per training batch), not an unrelated constant.
total_steps = len(train_dataloader) * NUM_EPOCHS
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=500,
    num_training_steps=total_steps,
)

model, train_history, val_history = train_model(model, optimizer, scheduler,
                                                nn.MSELoss(),
                                                num_epochs=NUM_EPOCHS)



In [None]:
text = 'Using an iterative tree construction we show that for simple computable subsets of the Cantor space Hausdorff, constructive and computable dimensions might be incomputable.'

In [None]:
encoding = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']
