In [3]:
import torch
import pandas as pd

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from transformers import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm
from datasets import load_metric
from sklearn.model_selection import train_test_split

# Kaggle input directory; train.csv, test.csv and sample_submission.csv are read from it.
base_path = '/kaggle/input/nlp-getting-started/'
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
def convert_to_text(data):
    """Return the 'text' column of ``data`` as a list of Python strings.

    Every entry of ``data['text'].values`` is passed through ``str``, so
    non-string entries end up as their string representation.
    """
    return [str(entry) for entry in data['text'].values]

def convert_to_target(data):
    """Return the values of the 'target' column of ``data``."""
    target_column = data['target']
    return target_column.values

def get_text_and_targets(data):
    """Split ``data`` into a ``(texts, targets)`` pair.

    ``texts`` comes from ``convert_to_text`` and ``targets`` from
    ``convert_to_target``, both applied to the same ``data``.
    """
    texts = convert_to_text(data)
    targets = convert_to_target(data)
    return texts, targets

In [5]:
class TransformerDataset(Dataset):
    """Dataset that tokenizes every text once and serves per-example tensors.

    Args:
        text: Sequence of strings handed to ``tokenizer`` in a single call.
        target: Indexable collection of labels aligned with ``text``, or
            ``None`` for unlabelled data (no 'labels' entry is produced then).
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=True, truncation=True)``. It must return
            a mapping that has an 'input_ids' entry and whose values can be
            indexed per example.
    """

    def __init__(self, text, target, tokenizer):
        self.target = target
        # Tokenize the whole corpus in one call so all examples share one
        # padded length (see the Hugging Face tokenizer docs for `padding=True`).
        self.dict = tokenizer(text, padding=True, truncation=True)

    def __len__(self):
        """Number of examples, taken from the tokenizer's 'input_ids' entry."""
        return len(self.dict['input_ids'])

    def __getitem__(self, ids):
        """Return a dict of ``torch.long`` tensors for the example at ``ids``.

        The dict contains one entry per key returned by the tokenizer, plus
        'labels' when targets were supplied.
        """
        # Iterate over whatever the tokenizer produced instead of hard-coding
        # 'token_type_ids': not every tokenizer returns that key, and indexing
        # it unconditionally raises KeyError for those that do not.
        item = {
            name: torch.tensor(values[ids], dtype=torch.long)
            for name, values in self.dict.items()
        }
        if self.target is not None:
            item['labels'] = torch.tensor(self.target[ids], dtype=torch.long)
        return item

In [7]:
# Labelled data: load it and pull out the texts and their targets.
train_data = pd.read_csv(base_path + 'train.csv')
train_text_full, train_targets_full = get_text_and_targets(train_data)

# Hold out 20% of the labelled examples as a dev set. The split is stratified
# on the targets, and the fixed random_state makes it repeatable.
train_text, dev_text, train_target, dev_target = train_test_split(
    train_text_full,
    train_targets_full,
    test_size=0.2,
    stratify=train_targets_full,
    random_state=42,
)

# Unlabelled data: only the texts are needed.
test_data = pd.read_csv(base_path + 'test.csv')
test_text = convert_to_text(test_data)

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

train_dataset = TransformerDataset(train_text, train_target, tokenizer)
dev_dataset = TransformerDataset(dev_text, dev_target, tokenizer)
test_dataset = TransformerDataset(test_text, None, tokenizer)

# Batch size shared by all three loaders; tune it here.
batch_size = 8

# Only the training loader shuffles; dev and test keep the dataset order.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [14]:
def evaluate(model, dataloader):
    """Score ``model`` on ``dataloader`` with the "accuracy" metric.

    The model is switched to eval mode, each batch is moved to ``device`` and
    run without gradient tracking, and the argmax over the last logits axis is
    compared against the batch's 'labels' entry.

    Returns:
        Whatever the metric's ``compute()`` returns.
    """
    accuracy = load_metric("accuracy")
    model.eval()
    for raw_batch in tqdm(dataloader):
        inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        with torch.no_grad():
            logits = model(**inputs).logits
        predicted = logits.argmax(dim=-1)
        accuracy.add_batch(predictions=predicted, references=inputs["labels"])

    return accuracy.compute()

def train(model, train_dataloader, initial_lr, nums_epochs, dev_dataloader):
    """Fine-tune ``model`` and report dev accuracy after every epoch.

    Args:
        model: Model whose forward pass returns an object with a ``loss``
            attribute when the batch contains labels.
        train_dataloader: Loader yielding dicts of tensors for training.
        initial_lr: Starting learning rate; a linear schedule without warmup
            is stepped once per batch over all training steps.
        nums_epochs: Number of passes over ``train_dataloader``. The spelling
            is kept so existing keyword callers continue to work.
        dev_dataloader: Loader passed to ``evaluate`` at the end of each epoch.

    Returns:
        None. Progress is reported through ``print``.
    """
    # Use the argument rather than a same-looking module-level name, so the
    # epoch count the caller passes is the one that is actually honoured.
    n_training_steps = nums_epochs * len(train_dataloader)
    optimizer = AdamW(model.parameters(), lr=initial_lr)
    lr_scheduler = get_scheduler("linear",
                                 optimizer=optimizer,
                                 num_warmup_steps=0,
                                 num_training_steps=n_training_steps)

    for epoch in range(nums_epochs):
        model.train()

        epoch_loss = 0.0
        num_batches = 0

        for batch in tqdm(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            # .item() yields a plain float, so the running sum does not keep
            # a reference to each step's autograd history.
            epoch_loss += loss.item()
            num_batches += 1

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        # Report the mean loss over the epoch rather than the last batch's
        # loss; an empty loader yields NaN instead of raising.
        mean_loss = epoch_loss / num_batches if num_batches else float("nan")
        print(f"[Training]: Epoch=[{epoch}], loss=[{mean_loss}]")

        dev_accuracy = evaluate(model, dev_dataloader)
        print(f"[Dev]: accuracy=[{dev_accuracy}]")
        
def test(model, test_dataloader):
    """Predict a class index for every example in ``test_dataloader``.

    Args:
        model: Model whose forward pass returns an object with ``logits``.
        test_dataloader: Loader yielding dicts of tensors (no labels needed).

    Returns:
        A 1-D NumPy array of integer (int64) class indices in loader order;
        empty when the loader yields no batches.
    """
    model.eval()
    batch_preds = []
    for batch in tqdm(test_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        batch_preds.append(predictions.cpu())

    if not batch_preds:
        return torch.empty(0, dtype=torch.long).numpy()

    # Concatenate once at the end. Seeding the accumulator with a float
    # `torch.Tensor()` mixed dtypes with the integer argmax output, which
    # either fails or turns the labels into floats, and re-copied all earlier
    # predictions on every batch.
    return torch.cat(batch_preds).numpy()

In [16]:
from transformers import AutoModelForSequenceClassification

# Load the "bert-base-cased" checkpoint (the same name the tokenizer is loaded
# from) as a sequence-classification model configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
# Move the model onto the selected device. This is the cell's last expression,
# so the notebook displays its return value.
model.to(device)

In [None]:
# Number of passes over the training set for this run.
num_epochs = 1

# Fine-tune with an initial learning rate of 5e-5, scoring on the dev split after each epoch.
train(model, train_dataloader, 5e-5, num_epochs, dev_dataloader=dev_dataloader)

In [18]:
import numpy as np

# Predict on the test split and write the predictions in the layout of the
# provided sample submission file.
res = test(model, test_dataloader)
print(f"[Test Results]: {res}")
sub = pd.read_csv(base_path + 'sample_submission.csv')
# Store the labels as integers so the CSV holds 0/1 rather than 0.0/1.0 when
# the predictions arrive as a float array.
sub['target'] = res.astype('int64')
sub.to_csv('submission.csv', index=False)