<a href="https://colab.research.google.com/github/ipavlopoulos/paremia/blob/main/proverbs_gr_GreekBERTclf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
import ast

# load the data
balanced_corpus = pd.read_csv("https://raw.githubusercontent.com/ipavlopoulos/paremia/main/data/balanced_corpus.csv", index_col=0)
train, test = train_test_split(balanced_corpus, test_size=0.05, random_state=2023)
train, dev = train_test_split(train, test_size=test.shape[0], random_state=2023)

In [2]:
%%capture
!pip install transformers
from transformers import *
model_name = 'nlpaueb/bert-base-greek-uncased-v1'
tokenizer = BertTokenizer.from_pretrained(model_name)

In [5]:
import torch
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# the areas that will serve as target label indices
idx2loc = {i:a for i,a in enumerate(train.area.unique())}
loc2idx = {idx2loc[i]:i for i in idx2loc}

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df, max_length = 32):
        self.max_length = max_length
        self.labels = df.area.apply(lambda a: loc2idx[a])
        self.labels = np.array(self.labels.values)
        self.labels = np.reshape(self.labels, (self.labels.shape[0], 1))
        self.labels = OneHotEncoder(sparse_output=False).fit_transform(self.labels)
        self.texts = np.array(df.text.apply(lambda txt: tokenizer(txt, padding='max_length', max_length = self.max_length, truncation=True, return_tensors="pt")).values)

    def __len__(self):
        return self.labels.shape[0]

    def __getitem__(self, idx):
        batch_texts = self.texts[idx]
        batch_labels = self.labels[idx]
        return batch_texts, batch_labels

In [6]:
from torch import nn

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.1, num_classes=1):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        self.linear1 = nn.Linear(768, 128, bias=True)
        self.norm = nn.BatchNorm1d(128)
        self.linear2 = nn.Linear(128, num_classes, bias=True)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        x = pooled_output
        x = self.dropout(x)
        x = self.relu(self.linear1(x))
        x = self.norm(x)
        x = self.linear2(x)
        return x

In [24]:
from torch.optim import Adam
from tqdm import tqdm

def finetune(model, train_data, val_data, learning_rate=2e-5, epochs=100, criterion=nn.CrossEntropyLoss(), batch_size=32, max_length=16):
    train_dataloader = torch.utils.data.DataLoader(Dataset(train_data, max_length=max_length), batch_size=batch_size, shuffle=True, drop_last=False)
    val_dataloader = torch.utils.data.DataLoader(Dataset(val_data, max_length=max_length), batch_size=batch_size, drop_last=False)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    optimizer = Adam(model.parameters(), lr=learning_rate)
    model.to(device)
    for epoch_num in range(epochs):
            total_acc_train = 0
            total_loss_train = 0
            for batch_id, (inputs, labels) in tqdm(enumerate(train_dataloader)):
                model.train()
                output = model(inputs['input_ids'].squeeze(1).to(device), 
                               inputs['attention_mask'].to(device))
                batch_loss = criterion(output.to(device), labels.to(device))
                total_loss_train += batch_loss.item()

                optimizer.zero_grad(); batch_loss.backward(); optimizer.step()           

            total_loss_val = 0
            with torch.no_grad():
                for val_input, val_label in val_dataloader:
                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)
                    output = model(input_id, mask)
                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f}')
    model.eval()    
    return model
      
model = finetune(BertClassifier(num_classes=len(loc2idx)), train, dev, epochs=3, max_length=32)

324it [00:16, 19.08it/s]


Epochs: 1 | Train Loss:  0.033                 | Val Loss:  0.089


324it [00:17, 19.04it/s]


Epochs: 2 | Train Loss:  0.024                 | Val Loss:  0.092


324it [00:16, 19.10it/s]


Epochs: 3 | Train Loss:  0.018                 | Val Loss:  0.099


In [29]:
def validate(model, dataloader, device="cpu"):
    predictions, gold_labels = [], []
    with torch.no_grad():
        for val_input, val_label in dataloader:
            val_label = val_label.to(device)
            mask = val_input['attention_mask'].to(device)
            input_id = val_input['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)
            gold = np.argmax(val_label.cpu().detach().numpy(), axis=1)
            pred = np.argmax(output.cpu().detach().numpy(), axis=1)
            predictions.extend(pred)
            gold_labels.extend(gold)
    return predictions, gold_labels

test_dataloader = torch.utils.data.DataLoader(Dataset(test, max_length=32), batch_size=32, drop_last=False)
p,l = validate(model.to("cpu"), test_dataloader)
print(classification_report([idx2loc[i] for i in l], [idx2loc[i] for i in p]))

                 precision    recall  f1-score   support

        Ήπειρος       0.15      0.09      0.11        23
        Αιτωλία       0.43      0.38      0.40        24
        Αμοργός       0.28      0.36      0.31        22
Ανατολική Θράκη       0.12      0.21      0.16        24
        Αρκαδία       0.14      0.10      0.11        31
          Αχαΐα       0.55      0.53      0.54        32
      Επτάνησος       0.35      0.39      0.37        23
         Εύβοια       0.10      0.10      0.10        20
      Θεσπρωτία       0.13      0.18      0.15        22
          Θράκη       0.15      0.12      0.13        25
       Ιωάννινα       0.26      0.17      0.21        29
       Κάρπαθος       0.44      0.39      0.42        28
     Κεφαλληνία       0.21      0.26      0.23        27
          Κρήτη       0.48      0.33      0.39        30
         Κύπρος       0.64      0.75      0.69        24
         Λέσβος       0.58      0.46      0.51        24
        Λακωνία       0.28    

In [23]:
torch.save(model.state_dict(), "bert-gr-clf_checkpoint.pt")

In [27]:
model = BertClassifier(num_classes=len(loc2idx))
model.load_state_dict(torch.load("bert-gr-clf_checkpoint.pt"))

<All keys matched successfully>