### Classification Transformers
---------------------
Kilian Lüders & Bent Stohlmann

8.6.2023 (Submitted Version)

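This notebook fine-tunes and evaluates BERT-based text classifiers (DistilBERT, multilingual BERT and German BERT) using 5-fold stratified cross-validation.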

Made for Google Colab.

In [None]:
# Show which GPU (if any) is attached to this runtime.
!nvidia-smi

In [None]:
! pip install simpletransformers

In [None]:
# Mount Google Drive at /content/drive so the training data is reachable.
# Later cells use the relative path "drive/MyDrive/...", which only resolves
# while the working directory is /content.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Sanity check: list the data folder on the mounted Drive.
!ls drive/MyDrive/'Colab Notebooks'/data/

In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

import torch

from simpletransformers.classification import ClassificationModel

In [None]:
# Record whether a CUDA GPU is available; the flag is passed to every model
# below as `use_cuda`.
cuda_available = torch.cuda.is_available()
print(cuda_available)

In [None]:
# Load the training data and reduce it to the two columns the models need:
#   text   - the input text
#   labels - 1 if the original `prop` value is greater than 0, else 0
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted source.
data = (
    pd.read_pickle("drive/MyDrive/Colab Notebooks/data/vhmk_class/training_data.pkl")
    .assign(prop=lambda frame: (frame.prop > 0).astype('int'))
    .loc[:, ['text', 'prop']]
    .rename(columns={'prop': 'labels'})
)

# Class balance, followed by a preview of the first rows.
print(data.labels.value_counts())
data.head()

In [None]:
# Accumulator for results: one dict per evaluated fold, appended by the
# cross-validation cells below.
data_metrics = []

# function to report results
def report_result(
    y_pre,
    y_tes,
    model_name,
    feature_name,
    info_txt,
    class_type="sent",
    fold_num=float("nan"),
    loss=float("nan"),
):
    """Score binary predictions against gold labels and return one result row.

    Prints a one-line summary (model, feature, info, type, micro F1) and
    returns all metrics as a flat dict.

    Parameters
    ----------
    y_pre : sequence
        Predicted labels, encoded as 0 / 1.
    y_tes : sequence
        Gold labels, encoded as 0 / 1, same length as ``y_pre``.
    model_name, feature_name, info_txt, class_type :
        Descriptive tags that are copied unchanged into the result row
        (keys ``model``, ``feature``, ``info`` and ``type``).
    fold_num :
        Cross-validation fold identifier; NaN when not given.
    loss :
        Optional loss value to store with the row; NaN when not given.

    Returns
    -------
    dict
        Confusion-matrix counts (``tn``, ``fp``, ``fn``, ``tp``), per-class
        and micro-averaged precision / recall / F1, plus the tags above.
    """
    # The defaults above were `np.nan`, but numpy is never imported in this
    # notebook, so defining the function raised NameError on a fresh kernel.
    # float("nan") is the same value and needs no import.

    # Fix the label set so both classes are always reported, in the order
    # [0, 1]. Without it, a fold in which only one class occurs yields a 1x1
    # confusion matrix (unpacking into tn/fp/fn/tp fails) and per-class
    # arrays of length one (index [1] fails).
    binary_labels = [0, 1]

    precision_ind, recall_ind, fscore_ind, _support = precision_recall_fscore_support(
        y_tes, y_pre, labels=binary_labels, average=None
    )
    precision_micro, recall_micro, fscore_micro, _support_micro = precision_recall_fscore_support(
        y_tes, y_pre, average='micro'
    )
    print("{} - {} - {} \t {} \t F1:\t{:.2f}".format(model_name,feature_name, info_txt, class_type, fscore_micro))

    # With labels=[0, 1] the flattened matrix is ordered tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_tes, y_pre, labels=binary_labels).ravel()

    result = {
        'model': model_name,
        'feature': feature_name,
        'info': info_txt,
        'type': class_type,
        'fold': fold_num,
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'tp': tp,
        'precision_ind_0' : precision_ind[0],
        'precision_ind_1' : precision_ind[1],
        'precision_micro': precision_micro,
        'recall_ind_0': recall_ind[0],
        'recall_ind_1': recall_ind[1],
        'recall_micro': recall_micro,
        'fscore_ind_0': fscore_ind[0],
        'fscore_ind_1': fscore_ind[1],
        # NOTE(review): key is misspelled ("mirco"); kept as-is because it is
        # the column name in the CSV files this notebook writes.
        'fscore_mirco': fscore_micro,
        'loss': loss
    }
    return result

In [None]:
# Hyperparameters shared by all models; handed to every ClassificationModel
# below through its `args=` parameter.
train_args = dict(
    reprocess_input_data=True,
    overwrite_output_dir=True,
    max_seq_length=512,
    fp16=False,
    num_train_epochs=10,
    sliding_window=True,
)

In [None]:

# Build the 5-fold stratified splitter (shuffled, fixed seed) that the
# cross-validation cells below recreate with identical settings.
kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)
# NOTE(review): split() returns a lazy generator; as the last expression of
# the cell it is only displayed, never consumed here.
kf.split(data.text, data.labels)

In [None]:
# Peek at three rows (positions 5, 6 and 7) selected by an integer list.
data.iloc[[5, 6, 7], :]

In [None]:
# distilbert

kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data.text, data.labels)):
    ! rm -r /content/outputs/*
    train_df, test_df = data.iloc[train_index,], data.iloc[test_index,]

    model = ClassificationModel(
        "distilbert", "distilbert-base-german-cased",
        num_labels=2,
        args=train_args,
        use_cuda = cuda_available
    )
    
    model.train_model(train_df)
    pred, model_output = model.predict(test_df.text.to_list())
    
    data_metrics.append(report_result(pred, test_df.labels.to_list(), "distilbert", "distilbert-base-german-cased", "-", "-", i))
    
    output = pd.DataFrame(data_metrics)
    print(output.tail())
    output.to_csv("data/performance_data_tmp.csv")

In [None]:
# bert

kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data.text, data.labels)):
    ! rm -r /content/outputs/*
    train_df, test_df = data.iloc[train_index,], data.iloc[test_index,]

    model = ClassificationModel(
        "bert", "bert-base-multilingual-cased",
        num_labels=2,
        args=train_args,
        use_cuda = cuda_available
    )
    
    model.train_model(train_df)
    pred, model_output = model.predict(test_df.text.to_list())
    
    data_metrics.append(report_result(pred, test_df.labels.to_list(), "bert", "bert-base-multilingual-cased", "-", "-", i))
    
    output = pd.DataFrame(data_metrics)
    print(output.tail())
    output.to_csv("performance_data_simpletransformers_tmp.csv")

In [None]:
# bert german

kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=123)

for i, (train_index, test_index) in enumerate(kf.split(data.text, data.labels)):
  ! rm -r /content/outputs/*
  print(i)
  train_df, test_df = data.iloc[train_index,], data.iloc[test_index,]

  model = ClassificationModel(
        "bert", "bert-base-german-cased",
        num_labels=2,
        args=train_args,
        use_cuda = cuda_available
    )
    
  model.train_model(train_df)
  pred, model_output = model.predict(test_df.text.to_list())
    
  data_metrics.append(report_result(pred, test_df.labels.to_list(), "bert", "bert-base-german-cased", "-", "-", i))

  output = pd.DataFrame(data_metrics)
  print(output.tail())
  output.to_csv("performance_data_simpletransformers_tmp.csv")

In [None]:
# Collect every result row gathered so far (all models, all folds) into one
# table and show its last rows.
output = pd.DataFrame(data_metrics)
output.tail()

In [None]:
# Write the final results table to the current working directory.
# NOTE(review): presumably /content on Colab, which is not kept after the
# session ends - confirm the file is downloaded or copied to Drive.
output.to_csv("performance_data_simpletransformers.csv")