<a href="https://colab.research.google.com/github/ipavlopoulos/paremia/blob/main/bert-gr-c.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
import ast

# load the data
balanced_corpus = pd.read_csv("https://raw.githubusercontent.com/ipavlopoulos/paremia/main/data/balanced_corpus.csv", index_col=0)
train, test = train_test_split(balanced_corpus, test_size=0.05, random_state=2023)
train, dev = train_test_split(train, test_size=test.shape[0], random_state=2023)

In [2]:
%%capture
!pip install transformers
from transformers import *
model_name = 'nlpaueb/bert-base-greek-uncased-v1'
tokenizer = BertTokenizer.from_pretrained(model_name)

In [3]:
import torch
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# the areas that will serve as target label indices
idx2loc = {i:a for i,a in enumerate(train.area.unique())}
loc2idx = {idx2loc[i]:i for i in idx2loc}

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df, max_length = 32):
        self.max_length = max_length
        self.labels = df.area.apply(lambda a: loc2idx[a])
        self.labels = np.array(self.labels.values)
        self.labels = np.reshape(self.labels, (self.labels.shape[0], 1))
        self.labels = OneHotEncoder(sparse_output=False).fit_transform(self.labels)
        self.texts = np.array(df.text.apply(lambda txt: tokenizer(txt, padding='max_length', max_length = self.max_length, truncation=True, return_tensors="pt")).values)

    def __len__(self):
        return self.labels.shape[0]

    def __getitem__(self, idx):
        batch_texts = self.texts[idx]
        batch_labels = self.labels[idx]
        return batch_texts, batch_labels

In [4]:
from torch import nn

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.1, num_classes=1):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        self.linear1 = nn.Linear(768, 128, bias=True)
        self.norm = nn.BatchNorm1d(128)
        self.linear2 = nn.Linear(128, num_classes, bias=True)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        x = pooled_output
        x = self.dropout(x)
        x = self.relu(self.linear1(x))
        x = self.norm(x)
        x = self.linear2(x)
        return x

In [21]:
from torch.optim import Adam
from tqdm import tqdm

def validate(model, dataloader, device="cpu", criterion=nn.CrossEntropyLoss()):
    predictions, gold_labels = [], []
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_id, (val_input, val_label) in enumerate(dataloader):
            val_label = val_label.to(device)
            mask = val_input['attention_mask'].to(device)
            input_id = val_input['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)
            batch_loss = criterion(output, val_label)
            gold = np.argmax(val_label.cpu().detach().numpy(), axis=1)
            pred = np.argmax(output.cpu().detach().numpy(), axis=1)
            predictions.extend(pred)
            gold_labels.extend(gold)
            val_loss += batch_loss.item()
    return predictions, gold_labels, val_loss/batch_id

def finetune(model, train_data, val_data, learning_rate=2e-5, epochs=10, criterion=nn.CrossEntropyLoss(), batch_size=32, max_length=32, patience=2):
    train_dataloader = torch.utils.data.DataLoader(Dataset(train_data, max_length=max_length), batch_size=batch_size, shuffle=True, drop_last=False)
    val_dataloader = torch.utils.data.DataLoader(Dataset(val_data, max_length=max_length), batch_size=batch_size, drop_last=False)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    optimizer = Adam(model.parameters(), lr=learning_rate)
    model.to(device)
    lowest_loss = 10
    best_epoch = 0
    epochs_not_improving = 0
    for epoch_num in range(epochs):
            total_acc_train = 0
            total_loss_train = 0
            for batch_id, (inputs, labels) in tqdm(enumerate(train_dataloader)):
                model.train()
                output = model(inputs['input_ids'].squeeze(1).to(device), 
                               inputs['attention_mask'].to(device))
                batch_loss = criterion(output.to(device), labels.to(device))
                total_loss_train += batch_loss.item()

                optimizer.zero_grad(); batch_loss.backward(); optimizer.step()           

            predictions, gold_labels, val_loss = validate(model, val_dataloader, device, criterion)
            if val_loss<lowest_loss:
              print(f"New best epoch found: {epoch_num} (val loss: {val_loss:.3f})!")
              lowest_loss = val_loss
              best_epoch = epoch_num
              torch.save(model.state_dict(), "checkpoint.pt")
              epochs_not_improving = 0
            else:
              if epochs_not_improving > patience:
                model.load_state_dict(torch.load("checkpoint.pt"))
                print('Patience is up, restoring the best model and exiting...')
                break
              epochs_not_improving +=1
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train/batch_id: .3f} \
                | Val Loss: {val_loss: .3f} (best epoch: {best_epoch} w/val_loss: {lowest_loss:.3f})')
    model.eval()    
    return model

model = finetune(BertClassifier(num_classes=len(loc2idx)), train, dev, epochs = 100, patience = 5, batch_size=64, max_length=16)

162it [00:30,  5.28it/s]


New best epoch found: 0 (val loss: 3.452)!
Epochs: 1 | Train Loss:  3.232                 | Val Loss:  3.452 (best epoch: 0 w/val_loss: 3.452)


In [23]:
test_dataloader = torch.utils.data.DataLoader(Dataset(test), batch_size=1, drop_last=False)
p,l,_ = validate(model.to("cpu"), test_dataloader, "cpu")
print(classification_report([idx2loc[i] for i in l], [idx2loc[i] for i in p]))

                 precision    recall  f1-score   support

        Ήπειρος       0.25      0.04      0.07        23
        Αιτωλία       0.33      0.08      0.13        24
        Αμοργός       0.00      0.00      0.00        22
Ανατολική Θράκη       0.07      0.25      0.11        24
        Αρκαδία       0.11      0.03      0.05        31
          Αχαΐα       0.14      0.16      0.15        32
      Επτάνησος       0.10      0.17      0.13        23
         Εύβοια       0.20      0.05      0.08        20
      Θεσπρωτία       0.03      0.09      0.04        22
          Θράκη       0.00      0.00      0.00        25
       Ιωάννινα       0.50      0.03      0.06        29
       Κάρπαθος       0.10      0.21      0.13        28
     Κεφαλληνία       0.06      0.07      0.07        27
          Κρήτη       0.05      0.03      0.04        30
         Κύπρος       0.27      0.54      0.36        24
         Λέσβος       0.13      0.17      0.15        24
        Λακωνία       0.08    

In [24]:
torch.save(model.state_dict(), "bert-gr-c.pt")

In [25]:
model = BertClassifier(num_classes=len(loc2idx))
model.load_state_dict(torch.load("bert-gr-c.pt"))

<All keys matched successfully>