<a href="https://colab.research.google.com/github/ipavlopoulos/paremia/blob/main/GreekBERTreg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
import ast

# Load the balanced corpus (the first CSV column is used as the index), then
# carve out a 5% test split and a dev split of the same size; the remaining
# rows form the training set. The fixed random_state makes both splits repeatable.
balanced_corpus = pd.read_csv(
    "https://raw.githubusercontent.com/ipavlopoulos/paremia/main/data/balanced_corpus.csv",
    index_col=0,
)
train, test = train_test_split(balanced_corpus, random_state=2023, test_size=0.05)
# An integer test_size requests that many rows, so dev ends up as large as test.
train, dev = train_test_split(train, random_state=2023, test_size=len(test))

In [None]:
%%capture
# The %%capture magic above silences this cell's output (e.g. the pip install log).
!pip install transformers
from transformers import *
# Hugging Face model identifier; reused by later `from_pretrained(model_name)` calls.
model_name = 'nlpaueb/bert-base-greek-uncased-v1'
# Module-level tokenizer: the Dataset class reads this global when it tokenizes texts.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
import torch
import numpy as np

class Dataset(torch.utils.data.Dataset):
    """Torch dataset pairing pre-tokenized texts with ``(lat, lon)`` targets.

    All texts are tokenized eagerly in ``__init__`` with the module-level
    ``tokenizer``; every item is the tokenizer output for one text together
    with that row's ``[lat, lon]`` values.

    Args:
        df: DataFrame providing ``text``, ``lat`` and ``lon`` columns.
        max_length: Length every text is padded / truncated to.
        regression: Accepted because the call sites in this notebook pass it.
            Only ``True`` (coordinate regression targets) is supported;
            ``False`` raises ``NotImplementedError`` since no classification
            label column is defined here.

    Raises:
        NotImplementedError: If ``regression`` is false.
    """

    def __init__(self, df, max_length = 32, regression=True):
        if not regression:
            # Fail loudly rather than silently serving coordinates to a
            # classification loss.
            raise NotImplementedError(
                "Dataset only provides regression targets (lat, lon); "
                "regression=False is not supported."
            )
        self.max_length = max_length
        self.regression = regression
        self.labels = df[["lat", "lon"]].values

        def encode(txt):
            # return_tensors="pt" presumably yields tensors with a leading
            # dimension of 1 per text; the training loop squeezes it away.
            return tokenizer(
                txt,
                padding='max_length',
                max_length=self.max_length,
                truncation=True,
                return_tensors="pt",
            )

        self.texts = df.text.apply(encode).values

    def classes(self):
        """Return the full array of targets (one ``[lat, lon]`` row per text)."""
        return self.labels

    def __len__(self):
        """Number of examples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the target(s) at ``idx`` as a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(tokenized_text, target)`` for position ``idx``."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

In [None]:
from torch import nn

class BertClassifier(nn.Module):
    """Pre-trained BERT encoder followed by a two-layer feed-forward head.

    The head maps BERT's pooled output through
    ``hidden_size -> 128 -> num_classes`` with a ReLU after each linear layer.

    Args:
        dropout: Dropout probability applied to the pooled BERT output.
        num_classes: Width of the output layer (number of predicted values).
    """

    def __init__(self, dropout=0.1, num_classes=2):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the loaded encoder's config rather than a
        # hard-coded 768, so a checkpoint with another hidden size still fits.
        self.linear1 = nn.Linear(self.bert.config.hidden_size, 128, bias=True)
        self.linear2 = nn.Linear(128, num_classes, bias=True)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Encode a batch and return the head's output.

        Args:
            input_id: Token ids, passed to BERT as ``input_ids``.
            mask: Attention mask, passed to BERT as ``attention_mask``.

        Returns:
            Tensor whose last dimension is ``num_classes``; values are
            non-negative because of the final ReLU.
        """
        # With return_dict=False BERT returns a tuple; only the pooled output is used.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        x = self.dropout(pooled_output)
        x = self.relu(self.linear1(x))
        # NOTE(review): the ReLU on the output layer clamps predictions to >= 0
        # (negative targets are unreachable, and an output that goes negative
        # receives no gradient). Kept to preserve existing behaviour - confirm
        # this is intended.
        x = self.relu(self.linear2(x))
        return x

In [None]:
from torch.optim import Adam
from tqdm import tqdm

def finetune(model, train_data, val_data, learning_rate=2e-5, epochs=100, criterion=nn.MSELoss(), batch_size=32, max_length=32, regression=True):
    """Fine-tune ``model`` with Adam and print train/validation loss per epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_data: Dataset yielding ``(tokenizer_output, label)`` pairs, where
            the tokenizer output has ``'input_ids'`` and ``'attention_mask'``.
        val_data: Dataset with the same item structure, used for validation.
        learning_rate: Adam learning rate.
        epochs: Number of passes over ``train_data``.
        criterion: Loss module called as ``criterion(output, target)``.
        batch_size: Batch size for both data loaders.
        max_length: Unused; kept so existing callers keep working.
        regression: If true (default) labels are cast to ``float`` before the
            loss, as before. If false they are cast to ``long``, the dtype that
            class-index losses such as ``nn.CrossEntropyLoss`` expect.

    Notes:
        * The model is moved to CUDA when available, otherwise left on CPU.
        * Training batches run in train mode and validation batches in eval
          mode (dropout disabled), so after at least one epoch the model is
          left in eval mode.
        * The printed losses are the mean of the per-batch criterion values.
    """
    train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, drop_last=False)
    val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, drop_last=False)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = criterion.to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    def compute_loss(inputs, labels):
        """Run one batch through the model and return the criterion value."""
        labels = labels.to(device)
        # Each item was tokenized on its own, so the collated tensors carry a
        # singleton dimension 1; drop it from both ids and mask so BERT gets
        # the documented (batch, seq) shapes.
        mask = inputs['attention_mask'].squeeze(1).to(device)
        input_id = inputs['input_ids'].squeeze(1).to(device)
        output = model(input_id, mask)
        target = labels.float() if regression else labels.long()
        return criterion(output, target)

    # Guard the averages against an empty loader.
    n_train_batches = max(len(train_dataloader), 1)
    n_val_batches = max(len(val_dataloader), 1)

    for epoch_num in range(epochs):
        model.train()
        total_loss_train = 0
        for train_input, train_label in tqdm(train_dataloader):
            batch_loss = compute_loss(train_input, train_label)
            total_loss_train += batch_loss.item()
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        # Switch dropout off so the validation loss is deterministic.
        model.eval()
        total_loss_val = 0
        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                total_loss_val += compute_loss(val_input, val_label).item()

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / n_train_batches: .3f} '
            f'                | Val Loss: {total_loss_val / n_val_batches: .3f}')
                  
# Task switch: True trains a two-output regressor with MSE loss; False selects
# a single output with cross-entropy loss.
REGRESSION = True
if REGRESSION:
    num_classes, criterion = 2, nn.MSELoss()
else:
    num_classes, criterion = 1, nn.CrossEntropyLoss()

model = BertClassifier(num_classes=num_classes)
# Texts are padded / truncated to 16 tokens for both the train and dev splits.
finetune(
    model,
    train_data=Dataset(train, max_length=16, regression=REGRESSION),
    val_data=Dataset(dev, max_length=16, regression=REGRESSION),
    epochs=20,
    criterion=criterion,
    regression=REGRESSION,
)

100%|██████████| 324/324 [00:13<00:00, 23.19it/s]


Epochs: 1 | Train Loss:  0.071                 | Val Loss:  0.136


100%|██████████| 324/324 [00:14<00:00, 23.03it/s]


Epochs: 2 | Train Loss:  0.060                 | Val Loss:  0.145


100%|██████████| 324/324 [00:13<00:00, 23.41it/s]


Epochs: 3 | Train Loss:  0.052                 | Val Loss:  0.140


100%|██████████| 324/324 [00:13<00:00, 23.17it/s]


Epochs: 4 | Train Loss:  0.044                 | Val Loss:  0.148


100%|██████████| 324/324 [00:13<00:00, 23.19it/s]


Epochs: 5 | Train Loss:  0.039                 | Val Loss:  0.137


100%|██████████| 324/324 [00:13<00:00, 23.22it/s]


Epochs: 6 | Train Loss:  0.035                 | Val Loss:  0.140


100%|██████████| 324/324 [00:13<00:00, 23.20it/s]


Epochs: 7 | Train Loss:  0.032                 | Val Loss:  0.139


100%|██████████| 324/324 [00:13<00:00, 23.34it/s]


Epochs: 8 | Train Loss:  0.029                 | Val Loss:  0.146


100%|██████████| 324/324 [00:14<00:00, 23.05it/s]


Epochs: 9 | Train Loss:  0.026                 | Val Loss:  0.148


100%|██████████| 324/324 [00:13<00:00, 23.22it/s]


Epochs: 10 | Train Loss:  0.024                 | Val Loss:  0.149


In [None]:
def validate(model, dataloader, device="cpu"):
    """Run ``model`` over ``dataloader`` and collect predictions and labels.

    The model is switched to eval mode for the duration of the call, so
    dropout does not perturb the predictions, and its previous train/eval
    mode is restored afterwards. No gradients are tracked.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; it must
            already live on ``device``.
        dataloader: Iterable of ``(tokenizer_output, labels)`` batches, where
            the tokenizer output has ``'input_ids'`` and ``'attention_mask'``.
        device: Device the batch inputs are moved to.

    Returns:
        ``(predictions, gold_labels)``: a list with one NumPy array per
        example and a list with one CPU tensor per example.
    """
    predictions, gold_labels = [], []
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for val_input, val_label in dataloader:
                # Drop the singleton dimension 1 from both tensors so the
                # model receives (batch, seq) ids and mask.
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)
                output = model(input_id, mask)
                predictions.extend(output.cpu().numpy())
                # Labels are only collected, never fed to the model, so they
                # do not need a round trip through `device`.
                gold_labels.extend(val_label.cpu())
    finally:
        model.train(was_training)
    return predictions, gold_labels

# Evaluate on every held-out row: with drop_last=True the final partial batch
# would be silently excluded from the predictions and therefore from the metrics.
test_dataloader = torch.utils.data.DataLoader(Dataset(test, max_length=16, regression=REGRESSION), batch_size=32, drop_last=False)
# Inference runs on CPU; p holds the predictions and l the gold labels.
p,l = validate(model.to("cpu"), test_dataloader)

In [None]:
# Put predictions and gold coordinates side by side (column 0 -> latitude,
# column 1 -> longitude), save them, then print the mean absolute error of
# each coordinate to two decimals.
preds_pd = pd.concat(
    [
        pd.DataFrame(np.array(p)[:, :2], columns=["pred_lat", "pred_lon"]),
        pd.DataFrame(np.array([i.numpy() for i in l])[:, :2], columns=["gold_lat", "gold_lon"]),
    ],
    axis=1,
)
preds_pd.to_csv("proverb.predictions.csv", index=False)
from sklearn.metrics import *
print(f"{mean_absolute_error(preds_pd['pred_lat'], preds_pd['gold_lat']):.2f}")
print(f"{mean_absolute_error(preds_pd['pred_lon'], preds_pd['gold_lon']):.2f}")

1.39
1.82


In [None]:
# Persist only the learned weights (the state dict), not the whole module object.
torch.save(obj=model.state_dict(), f="checkpoint.pt")