In [200]:
import torch
import transformers
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import os
import random

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"]="0"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device}")

Using cuda


In [3]:
def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed = 471
seed_torch(seed)

In [110]:
class Hparams:
    def __init__(self, data_dir, output_dir, batch_size, token_max_length):
        self.data_dir = data_dir
        self.output_dir = output_dir
        self.batch_size = batch_size
        self.token_max_length = token_max_length

hps = Hparams(data_dir='./data', output_dir='./outputs', batch_size=64, token_max_length=256)

In [5]:
train_df = pd.read_csv(os.path.join(hps.data_dir, 'train.csv'))
test_df = pd.read_csv(os.path.join(hps.data_dir, 'test.csv'))
sample_submit_df = pd.read_csv(os.path.join(hps.data_dir, 'sample_submit.csv'))

In [6]:
train_df

Unnamed: 0,id,title,abstract,judgement
0,0,One-year age changes in MRI brain volumes in o...,Longitudinal studies indicate that declines in...,0
1,1,Supportive CSF biomarker evidence to enhance t...,The present study was undertaken to validate t...,0
2,2,Occurrence of basal ganglia germ cell tumors w...,Objective: To report a case series in which ba...,0
3,3,New developments in diagnosis and therapy of C...,The etiology and pathogenesis of idiopathic ch...,0
4,4,Prolonged shedding of SARS-CoV-2 in an elderly...,,0
...,...,...,...,...
27140,27140,The amyloidogenic pathway of amyloid precursor...,Amyloid beta-protein (A beta) is the main cons...,0
27141,27141,Technologic developments in radiotherapy and s...,We present a review of current technological p...,0
27142,27142,Novel screening cascade identifies MKK4 as key...,Phosphorylation of Tau at serine 422 promotes ...,0
27143,27143,Visualization of the gall bladder on F-18 FDOP...,The ability to label dihydroxyphenylalanine (D...,0


In [84]:
train_df.isna().sum()

id              0
title           0
abstract     4390
judgement       0
dtype: int64

In [48]:
base_tokenizer = transformers.AutoTokenizer.from_pretrained("allenai/biomed_roberta_base")
base_model = transformers.BertForSequenceClassification.from_pretrained("allenai/biomed_roberta_base", num_labels=1)
configuration = model.config

Some weights of the model checkpoint at allenai/biomed_roberta_base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
print(configuration)

RobertaConfig {
  "_name_or_path": "allenai/biomed_roberta_base",
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}



## Dataset

In [177]:
class TextClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, token_max_length=512):
        self.df = df
        self.tokenizer = tokenizer
        self.title_tokenized = tokenizer.batch_encode_plus(
            df.title.to_list(),
            padding = 'max_length',            
            max_length = token_max_length,
            truncation = True,
            return_attention_mask=True,
            return_tensors='pt'
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        title = dict(
            input_ids=self.title_tokenized['input_ids'][idx],
            attention_mask=self.title_tokenized['attention_mask'][idx]
        )
        label = torch.tensor(self.df.loc[idx, 'judgement'], dtype=torch.int8)
        return title, label
        

In [182]:
train_ds = TextClassificationDataset(df=train_df, tokenizer=base_tokenizer, token_max_length=hps.token_max_length)
token, label = train_ds[0]
print(token['input_ids'].shape)
print(token['input_ids'].shape)
print(label)

torch.Size([256])
torch.Size([256])
tensor(0, dtype=torch.int8)


## DataLoader

In [183]:
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=hps.batch_size, shuffle=True)

In [191]:
inputs, labels = next(iter(train_dl))
print(inputs['input_ids'].shape)
print(inputs['attention_mask'].shape)
print(labels)

torch.Size([64, 256])
torch.Size([64, 256])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=torch.int8)


## Model

In [193]:
class TextClassificationModel(torch.nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
    
    def forward(self, input_ids, attention_mask):
        out = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        return out

In [194]:
model = TextClassificationModel(base_model=base_model)

In [197]:
outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

In [198]:
print(outputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-4.3194e-02,  1.1181e-01, -6.5627e-02,  ..., -2.0005e-01,
          -5.9042e-03, -6.2817e-02],
         [-2.0745e-01,  1.5751e-01, -1.6291e-01,  ..., -4.3243e-01,
          -1.3206e-01,  3.3198e-01],
         [-1.8083e-01,  4.1004e-02,  7.3151e-02,  ..., -3.1200e-01,
          -6.8425e-02,  4.3357e-01],
         ...,
         [-6.9608e-05, -1.3627e-01,  1.6160e-01,  ..., -1.5264e-01,
           1.2920e-02,  1.6599e-02],
         [-6.9608e-05, -1.3627e-01,  1.6160e-01,  ..., -1.5264e-01,
           1.2920e-02,  1.6599e-02],
         [-6.9608e-05, -1.3627e-01,  1.6160e-01,  ..., -1.5264e-01,
           1.2920e-02,  1.6599e-02]],

        [[-4.2200e-02,  1.1607e-01, -1.5275e-02,  ..., -1.4232e-01,
          -6.3146e-02, -9.6721e-02],
         [-9.4649e-02,  2.7217e-02,  7.5241e-02,  ...,  9.6108e-02,
           1.5575e-01,  5.9891e-02],
         [ 1.8918e-01,  3.9893e-01, -6.5476e-03,  ...,  2.2967e-01,
           8.