In [19]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch import optim

import transformers
import pandas as pd
import numpy as np
import os
import random
import time
from tqdm.notebook import tqdm
from sklearn.metrics import fbeta_score

ModuleNotFoundError: No module named 'sklearn'

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"]="0"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device}")

Using cuda


In [3]:
def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed = 471
seed_torch(seed)

In [4]:
class Hparams:
    def __init__(self):
        self.data_dir = './data'
        self.output_dir = './outputs'
        self.batch_size = 64
        self.token_max_length = 256
        self.model_name = "allenai/biomed_roberta_base"
        self.num_epochs = 10

hps = Hparams()

In [5]:
train_df = pd.read_csv(os.path.join(hps.data_dir, 'train.csv'))
test_df = pd.read_csv(os.path.join(hps.data_dir, 'test.csv'))
sample_submit_df = pd.read_csv(os.path.join(hps.data_dir, 'sample_submit.csv'))

In [6]:
train_df

Unnamed: 0,id,title,abstract,judgement
0,0,One-year age changes in MRI brain volumes in o...,Longitudinal studies indicate that declines in...,0
1,1,Supportive CSF biomarker evidence to enhance t...,The present study was undertaken to validate t...,0
2,2,Occurrence of basal ganglia germ cell tumors w...,Objective: To report a case series in which ba...,0
3,3,New developments in diagnosis and therapy of C...,The etiology and pathogenesis of idiopathic ch...,0
4,4,Prolonged shedding of SARS-CoV-2 in an elderly...,,0
...,...,...,...,...
27140,27140,The amyloidogenic pathway of amyloid precursor...,Amyloid beta-protein (A beta) is the main cons...,0
27141,27141,Technologic developments in radiotherapy and s...,We present a review of current technological p...,0
27142,27142,Novel screening cascade identifies MKK4 as key...,Phosphorylation of Tau at serine 422 promotes ...,0
27143,27143,Visualization of the gall bladder on F-18 FDOP...,The ability to label dihydroxyphenylalanine (D...,0


In [7]:
train_df.isna().sum()

id              0
title           0
abstract     4390
judgement       0
dtype: int64

In [8]:
base_tokenizer = transformers.AutoTokenizer.from_pretrained(hps.model_name)
base_model = transformers.AutoModel.from_pretrained(hps.model_name)
base_model_config = transformers.AutoConfig.from_pretrained(hps.model_name)

Some weights of the model checkpoint at allenai/biomed_roberta_base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
print(base_model_config)

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}



## Dataset

In [10]:
class TextClassificationDataset(Dataset):
    def __init__(self, df, tokenizer, token_max_length=512):
        self.df = df
        self.tokenizer = tokenizer
        self.title_tokenized = tokenizer.batch_encode_plus(
            df.title.to_list(),
            padding = 'max_length',            
            max_length = token_max_length,
            truncation = True,
            return_attention_mask=True,
            return_tensors='pt'
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        title = dict(
            input_ids=self.title_tokenized['input_ids'][idx],
            attention_mask=self.title_tokenized['attention_mask'][idx]
        )
        label = torch.tensor(self.df.loc[idx, 'judgement'], dtype=torch.float32)
        return title, label
        

In [11]:
train_ds = TextClassificationDataset(df=train_df, tokenizer=base_tokenizer, token_max_length=hps.token_max_length)
token, label = train_ds[0]
print(token['input_ids'].shape)
print(token['input_ids'].shape)
print(label)

torch.Size([256])
torch.Size([256])
tensor(0.)


## DataLoader

In [12]:
train_dl = DataLoader(train_ds, batch_size=hps.batch_size, shuffle=True)

In [13]:
inputs, labels = next(iter(train_dl))
print(inputs['input_ids'].shape)
print(inputs['attention_mask'].shape)
print(labels)

torch.Size([64, 256])
torch.Size([64, 256])
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0.])


## Model

In [14]:
class TextClassificationModel(nn.Module):
    def __init__(self, base_model, hidden_size):
        super().__init__()
        self.base_model = base_model
        self.conv1d_1 = nn.Conv1d(hidden_size, 256, kernel_size=2, padding=1)
        self.conv1d_2 = nn.Conv1d(256, 1, kernel_size=2, padding=1)
        self.linear = nn.Linear(258, 1)
    
    def forward(self, input_ids, attention_mask):
        out = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = out['last_hidden_state'].permute(0, 2, 1)
        conv_embed = torch.relu(self.conv1d_1(last_hidden_state))
        conv_embed = self.conv1d_2(conv_embed).squeeze()
        logits = torch.sigmoid(self.linear(conv_embed)).squeeze()
        return logits



In [15]:
model = TextClassificationModel(base_model=base_model, hidden_size=base_model_config.hidden_size)

In [16]:
outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
print(outputs)

tensor([0.5214, 0.5158, 0.5237, 0.5164, 0.5151, 0.5174, 0.5205, 0.5088, 0.5186,
        0.5170, 0.5185, 0.5193, 0.5228, 0.5140, 0.5170, 0.5196, 0.5151, 0.5209,
        0.5234, 0.5234, 0.5175, 0.5157, 0.5204, 0.5148, 0.5156, 0.5195, 0.5183,
        0.5148, 0.5212, 0.5225, 0.5203, 0.5200, 0.5228, 0.5182, 0.5192, 0.5098,
        0.5225, 0.5221, 0.5142, 0.5235, 0.5176, 0.5195, 0.5173, 0.5140, 0.5185,
        0.5194, 0.5202, 0.5160, 0.5149, 0.5230, 0.5205, 0.5180, 0.5174, 0.5176,
        0.5177, 0.5170, 0.5191, 0.5214, 0.5176, 0.5174, 0.5193, 0.5057, 0.5179,
        0.5179], grad_fn=<SqueezeBackward0>)


In [17]:
def train(train_loader, model, criterion, optimizer, num_epochs, ds_size, device, batch_size):
    phase = 'train'
    since = time.time()
    print(f"Using device : {device}")
    for epoch in range(num_epochs):
        print(f"【 Epoch {epoch+1: 3}/{num_epochs: 3} 】")

        running_loss = 0.0
        running_corrects = 0

        for i, (inputs, labels) in enumerate(tqdm(train_loader)):
            input_ids = inputs['input_ids']
            attention_mask = inputs['attention_mask']
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.where(outputs >= 0.5, 1, 0)

            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            running_loss += loss.item() + input_ids.size(0)
            running_corrects += torch.sum(preds == labels)

            if i % 10 == 9:
                total_num = float((i * batch_size) + input_ids.size(0))
                print(f"{i+1: 4}/{len(train_loader): 4}  <{phase}> Loss:{(running_loss/total_num):.3f}  Acc:{(running_corrects/total_num):.3f}")

        epoch_loss = running_loss / ds_size
        epoch_acc = running_loss / ds_size

        print(f"Epoch {epoch+1:3} Done.  <{phase}> Loss:{epoch_loss:.4f}  Acc:{epoch_acc:.4f}")

    

    return model

In [18]:
model = model.to(device)
criterion = nn.BCELoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)


model = train(train_loader=train_dl, model=model, criterion=criterion, 
              optimizer=optimizer, num_epochs=hps.num_epochs, ds_size=len(train_ds), device=device, batch_size=hps.batch_size)

Using device : cuda
Epoch   1/ 10


  0%|          | 0/425 [00:00<?, ?it/s]

10/425  <train> Loss:1.008  Acc:0.877
20/425  <train> Loss:1.005  Acc:0.928
30/425  <train> Loss:1.004  Acc:0.947
40/425  <train> Loss:1.003  Acc:0.953
50/425  <train> Loss:1.003  Acc:0.957
60/425  <train> Loss:1.003  Acc:0.960
70/425  <train> Loss:1.003  Acc:0.962
80/425  <train> Loss:1.003  Acc:0.965
90/425  <train> Loss:1.002  Acc:0.967
100/425  <train> Loss:1.002  Acc:0.968
110/425  <train> Loss:1.002  Acc:0.968
120/425  <train> Loss:1.002  Acc:0.969
130/425  <train> Loss:1.002  Acc:0.969
140/425  <train> Loss:1.002  Acc:0.970
150/425  <train> Loss:1.002  Acc:0.971
160/425  <train> Loss:1.002  Acc:0.971
170/425  <train> Loss:1.002  Acc:0.971
180/425  <train> Loss:1.002  Acc:0.972
190/425  <train> Loss:1.002  Acc:0.972
200/425  <train> Loss:1.002  Acc:0.973
210/425  <train> Loss:1.002  Acc:0.973
220/425  <train> Loss:1.002  Acc:0.973
230/425  <train> Loss:1.002  Acc:0.973
240/425  <train> Loss:1.002  Acc:0.973
250/425  <train> Loss:1.002  Acc:0.974
260/425  <train> Loss:1.002  Acc:0

AttributeError: 'float' object has no attribute 'double'