In [None]:
## get GPU info
# Shell call: `nvidia-smi -L` lists the GPUs attached to this runtime, so the
# hardware used for the run is recorded in the notebook output.
!nvidia-smi -L

In [None]:
## get CPU info
# Shell call: prints the contents of /proc/cpuinfo so the CPU of this runtime
# is recorded in the notebook output.
!cat /proc/cpuinfo

In [None]:
# define model
# Checkpoint identifier shared by the whole notebook: it is passed to
# AutoTokenizer.from_pretrained and, through GridSearchCustomized, to
# AutoModelForSequenceClassification.from_pretrained.
model_name = "bert-base-german-cased"

In [4]:
from google.colab import drive
# Mount Google Drive at /content/drive. Later cells read and write files under
# the relative path "./drive/My Drive/ArgMining-2022/".
# NOTE(review): those relative paths presumably rely on the working directory
# being /content - confirm when running outside a default Colab session.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Install the Hugging Face libraries used by the rest of the notebook.
# NOTE(review): versions are not pinned. The recorded output of this cell
# resolved datasets 2.5.2 and transformers 4.22.2; newer releases may behave
# differently - consider pinning for reproducibility.
! pip3 install datasets transformers
import transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.5.2-py3-none-any.whl (432 kB)
[K     |████████████████████████████████| 432 kB 14.6 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 94.7 MB/s 
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 86.9 MB/s 
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 64.9 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 77.7 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26

In [6]:
from transformers.file_utils import is_torch_available
from transformers import set_seed
import torch
import numpy as np
import random

# One seed shared by every random number generator used in this notebook,
# so that repeated runs start from the same state.
seed = 123123

# Python's and NumPy's global generators.
random.seed(seed)
np.random.seed(seed)

# PyTorch generators (CPU and every CUDA device), if torch is usable.
if is_torch_available():
    torch.cuda.manual_seed_all(seed)
    torch.manual_seed(seed)

# transformers' own seeding helper.
set_seed(seed)

In [7]:
# read data
import pickle
import pandas as pd

# Load the precomputed split definitions. The context manager guarantees the
# file handle is closed even if unpickling raises.
# NOTE(security): pickle.load can execute arbitrary code - only load split
# files from a trusted source.
with open("./drive/My Drive/ArgMining-2022/splits.pickle", "rb") as pickle_sf:
    splits = pickle.load(pickle_sf)
# One split collection per setting: joint, major position only, premise only.
splits_joint, splits_mpos, splits_prem = splits['joint'], splits['mpos'], splits['premise']

# Annotated dataset; the split indices above are used positionally (df.iloc)
# against this frame in later cells.
df = pd.read_csv("./drive/My Drive/ArgMining-2022/dataset+labels.csv")

# map labels to integers
concr_dict = {
    'high concreteness': 2,
    'intermediate concreteness': 1,
    'low concreteness': 0
}
# Direct dict lookup on purpose: an unexpected label raises KeyError instead of
# silently turning into NaN.
df['concreteness'] = df['concreteness'].apply(lambda x: concr_dict[x])

In [8]:
from transformers import AutoTokenizer
from datasets import load_from_disk, Dataset, Features, ClassLabel, Value, DatasetDict
import statistics

# Module-level tokenizer for `model_name`; Encoder.preprocess_function reads
# this object directly. do_lower_case=False is passed so the input casing is
# kept (the checkpoint name indicates a cased model).
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False, use_fast=True)

class Encoder:
    """Build and tokenize the train/val/test datasets for one target column.

    Each split is materialised as a `datasets.Dataset` with a 'text' column
    (taken from df['text']) and a 'label' column (taken from df[task]).
    Tokenization uses the module-level `tokenizer`.

    Args:
        df: DataFrame holding at least the 'text' column and the `task` column.
        train: positional row indices (used with df.iloc) of the training split.
        val: positional row indices of the validation split.
        test: positional row indices of the test split.
        task: name of the DataFrame column used as label.
        max_length: length every sequence is padded/truncated to (default 128).
    """

    def __init__(self, df, train, val, test, task, max_length=128):
        # Use the indices passed by the caller. The previous implementation
        # ignored these parameters and read the notebook globals
        # train_index / val_index / test_index instead.
        self.train = self._build_split(df, train, task)
        self.val = self._build_split(df, val, task)
        self.test = self._build_split(df, test, task)

        # Filled by prepare_dataset().
        self.train_encoded = None
        self.val_encoded = None
        self.test_encoded = None

        self.max_length = max_length

    @staticmethod
    def _build_split(df, index, task):
        """Return a Dataset with 'text' and 'label' columns for the rows at `index`."""
        rows = df.iloc[index]
        return Dataset.from_dict({'text': rows['text'], 'label': rows[task]})

    def preprocess_function(self, examples):
        """Tokenize a batch of examples, padding/truncating to self.max_length."""
        return tokenizer(examples['text'],
                         add_special_tokens=True,
                         padding='max_length',
                         max_length=self.max_length,
                         return_attention_mask=True,
                         truncation=True
                         )

    def prepare_dataset(self):
        """Tokenize all three splits and return (train, val, test) encoded datasets."""
        # encode train/val/test splits
        self.train_encoded = self.train.map(self.preprocess_function, batched=True)
        self.val_encoded = self.val.map(self.preprocess_function, batched=True)
        self.test_encoded = self.test.map(self.preprocess_function, batched=True)

        return self.train_encoded, self.val_encoded, self.test_encoded

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/485k [00:00<?, ?B/s]

In [9]:
# BERT
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report


class BertModel:
    """Sequence-classification model plus a Hugging Face `Trainer` around it.

    Args:
        num_labels: number of target classes.
        train_dataset: tokenized dataset used for training.
        eval_dataset: tokenized dataset evaluated after every epoch; the
            checkpoint with the best macro F1 on it is reloaded after training.
        lr: learning rate.
        optimizer: value forwarded to TrainingArguments(optim=...).
        adam_beta1, adam_beta2, adam_epsilon: Adam hyperparameters.
        batch_size: per-device batch size for both training and evaluation.
        num_train_epochs: number of training epochs.
        model_name: checkpoint passed to from_pretrained.
    """

    # Attributes released by free_memory().
    _RELEASABLE = ("model", "selection_metric_name", "train_dataset",
                   "eval_dataset", "args", "trainer")

    def __init__(self, num_labels, train_dataset, eval_dataset, lr, optimizer, adam_beta1, adam_beta2, adam_epsilon, batch_size, num_train_epochs, model_name):
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        # Key of compute_metrics() used to select the best checkpoint.
        self.selection_metric_name = "f1"
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.args = TrainingArguments(
            output_dir="./checkpoints",
            # Evaluate and save once per epoch so load_best_model_at_end can
            # pick the best epoch.
            evaluation_strategy = "epoch",
            save_strategy = "epoch",
            learning_rate=lr,
            optim=optimizer,
            adam_beta1=adam_beta1,
            adam_beta2=adam_beta2,
            adam_epsilon=adam_epsilon,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            load_best_model_at_end=True,
            metric_for_best_model=self.selection_metric_name,
            log_level='info',
            )
        self.trainer = Trainer(
            self.model,
            self.args,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            compute_metrics=self.compute_metrics
            )

    def free_memory(self):
        """Drop the model, trainer and datasets and release their memory.

        Safe to call more than once. Besides removing the attributes, a
        garbage-collection pass is forced (the trainer holds a bound method of
        this object, so reference cycles are possible) and PyTorch's cached
        CUDA blocks are released when a GPU is available.
        """
        import gc

        for attr in self._RELEASABLE:
            # pop() instead of `del` so a repeated call does not raise.
            self.__dict__.pop(attr, None)

        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def compute_metrics(self, eval_pred):
        """Compute accuracy, macro precision/recall/F1 and the full report.

        Args:
            eval_pred: pair (predictions, labels); predictions are per-class
                scores and are reduced with argmax over axis 1.

        Returns:
            dict with 'accuracy', 'f1', 'precision', 'recall' (macro averaged)
            and 'classification_report' (sklearn report as a nested dict).
        """
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
        acc = accuracy_score(labels, predictions)
        full_report = classification_report(labels, predictions, output_dict=True)
        return {
            'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall,
            'classification_report': full_report
        }

    def train(self):
        """Run the trainer's training loop."""
        self.trainer.train()

    def evaluate(self, test_dataset=None):
        """Evaluate the model.

        Args:
            test_dataset: dataset to predict on. When omitted, the trainer's
                own eval_dataset is evaluated and nothing is returned.

        Returns:
            None when `test_dataset` is None; otherwise a tuple
            (predicted class ids, metrics dict from compute_metrics).
        """
        if test_dataset is None:
            self.trainer.evaluate()
            return None

        test_preds = self.trainer.predict(test_dataset)
        # First two fields of the prediction output: scores and gold labels.
        scores_and_labels = test_preds[:2]
        results = self.compute_metrics(scores_and_labels)
        print(results['accuracy'])
        return np.argmax(scores_and_labels[0], axis=1), results

In [10]:
class GridSearchCustomized:
    """Grid search over learning rate and batch size for `BertModel`.

    Every (learning rate, batch size) combination is trained on
    `train_dataset` and scored by macro F1 on `eval_dataset`. The best
    combination is then trained again and evaluated on the test dataset.

    Args:
        num_labels: number of target classes.
        train_dataset: tokenized training dataset.
        eval_dataset: tokenized validation dataset used for model selection.
        task: name of the target variable (stored; not read by gridsearch()).
        model_name: checkpoint passed on to BertModel.
        test_dataset: tokenized test dataset. Optional for backward
            compatibility: when omitted, gridsearch() falls back to the
            notebook-level variable `test`, as earlier versions did.
    """

    def __init__(self, num_labels, train_dataset, eval_dataset, task, model_name, test_dataset=None):
        self.num_labels = num_labels
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.test_dataset = test_dataset
        self.task = task

        # Fixed settings shared by every configuration.
        self.model_name = model_name
        self.optimizer = "adamw_hf"
        self.adam_beta1=0.9
        self.adam_beta2=0.999
        self.adam_epsilon=1e-8
        self.num_train_epochs = 5

        # Searched values.
        self.lr = [5e-5,4e-5, 3e-5]
        self.bs = [16, 32]

    def _build_model(self, lr, batch_size):
        """Create a fresh BertModel for one (learning rate, batch size) pair."""
        return BertModel(
            num_labels = self.num_labels,
            train_dataset = self.train_dataset,
            eval_dataset = self.eval_dataset,
            lr = lr,
            optimizer = self.optimizer,
            adam_beta1 = self.adam_beta1,
            adam_beta2 = self.adam_beta2,
            adam_epsilon = self.adam_epsilon,
            batch_size = batch_size,
            num_train_epochs = self.num_train_epochs,
            model_name = self.model_name)

    def gridsearch(self):
        """Run the search and evaluate the best configuration on the test set.

        Returns:
            tuple (preds, report): predicted class ids on the test dataset and
            the sklearn classification report (dict) of the retrained best
            model.

        Raises:
            RuntimeError: if no configuration produced a usable validation F1.
        """
        # Resolve the test set before the expensive search so a missing
        # dataset fails immediately.
        test_dataset = self.test_dataset
        if test_dataset is None:
            # Backward compatibility: callers that do not pass `test_dataset`
            # rely on the notebook-level variable `test`.
            test_dataset = test

        grid = [[lr,bs] for bs in self.bs for lr in self.lr]

        # Start below any valid F1 so a configuration scoring exactly 0.0 can
        # still be selected (starting at 0 left best_values as None).
        best_f1 = float("-inf")
        best_values = None

        for e in grid:
            print(e)
            model = self._build_model(e[0], e[1])

            model.train()
            # Score on the validation set held by this object rather than on a
            # notebook-level `val` variable.
            _, val_results = model.evaluate(self.eval_dataset)

            if val_results['f1'] > best_f1:
                best_f1 = val_results['f1']
                best_values = e

            model.free_memory()
            del model

        if best_values is None:
            raise RuntimeError("Grid search did not yield a valid validation F1 for any configuration.")

        print()
        print("Best val F1:", best_f1)
        print("Best hyperparameters:", best_values)
        print()

        # The models trained during the search were released, so the best
        # configuration is trained again before scoring on the test set.
        best_model = self._build_model(best_values[0], best_values[1])

        best_model.train()
        preds, results = best_model.evaluate(test_dataset)

        print("Predictions:", preds)
        print("Results best model on test:", results)

        best_model.free_memory()
        del best_model

        return preds, results['classification_report']

In [11]:
def avg_results(results, labels):
    """Print mean and standard deviation over several classification reports.

    Args:
        results: list of report dicts (as produced by sklearn's
            classification_report with output_dict=True); each must contain
            an entry per str(label) plus 'accuracy' and 'macro avg'.
        labels: class labels whose per-class F1 is summarised.

    Prints three lines (per-label F1, accuracy, macro F1), each value rounded
    to two decimals, and returns nothing.
    """

    def _mean_and_std(values):
        # Mean and (population) standard deviation of one list of scores.
        return np.mean(values), np.std(values)

    # Per-label F1 statistics, in the order the labels were given.
    per_label = {}
    for label in labels:
        key = str(label)
        per_label[label] = _mean_and_std([report[key]['f1-score'] for report in results])

    acc, acc_std = _mean_and_std([report['accuracy'] for report in results])
    macro, macro_std = _mean_and_std([report['macro avg']['f1-score'] for report in results])

    f1_summary = []
    for label in labels:
        label_mean, label_std = per_label[label]
        f1_summary.append(str(label) + ": " + str(round(label_mean, 2)) + " +- " + str(round(label_std, 2)))

    print("f1:", f1_summary)
    print("acc:", round(acc, 2), round(acc_std, 2))
    print("macro f1:", round(macro, 2), round(macro_std, 2))

# **Example execution**
Joint analysis of the different argument component types (major position + premise):

In [None]:
print('Start experiments for predicting the concreteness...')

# define task
target_var = 'concreteness'
num_labels = 3

# One classification report per split: objective subset, subjective subset,
# and the complete test set.
results_c_obj = []
results_c_subj = []
results_c = []

splits_pred = {}

for k, v in splits_joint.items():
    print('+++++++++++++++++++')
    print("Iteration ", k)
    print('+++++++++++++++++++')
    # prepare data
    train_index, val_index, test_index = v['train_index'], v['val_index'], v['test_index']

    encoder = Encoder(df, train_index, val_index, test_index, target_var)
    train, val, test = encoder.prepare_dataset()

    # grid search and return best model result for test
    gsc = GridSearchCustomized(num_labels, train, val, target_var, model_name)
    preds, result = gsc.gridsearch()
    results_c.append(result)

    # persist the test predictions of this split; the context manager closes
    # the file even if pickling fails
    with open("./drive/My Drive/ArgMining-2022/preds_joint_concreteness"+str(k)+".pickle", "wb") as pickle_out:
        pickle.dump(preds, pickle_out)

    # analyze the results regarding the level of subjectivity (2 class)
    # .copy() gives an independent frame, so adding the prediction column does
    # not trigger pandas' SettingWithCopyWarning.
    df_preds = df.iloc[test_index].copy()
    df_preds['preds_concreteness'] = preds

    # Results subjectivity == 0
    df_obj = df_preds.loc[df_preds['subjectivity'] == 0]
    report_obj = classification_report(df_obj['concreteness'], df_obj['preds_concreteness'], output_dict=True)
    results_c_obj.append(report_obj)
    # Results subjectivity == 1
    df_subj = df_preds.loc[df_preds['subjectivity'] == 1]
    report_subj = classification_report(df_subj['concreteness'], df_subj['preds_concreteness'], output_dict=True)
    results_c_subj.append(report_subj)

print("AVG results overall:")
avg_results(results_c, [i for i in range(num_labels)])
print("AVG results overall objective:")
avg_results(results_c_obj, [i for i in range(num_labels)])
print("AVG results overall subjective:")
avg_results(results_c_subj, [i for i in range(num_labels)])

In [None]:
print('Start experiments for predicting the subjectivity (2 classes)...')

# define task
target_var = 'subjectivity_2-class'
num_labels = 2

# One classification report (dict) per split.
results_s2 = []

for k, v in splits_joint.items():
    print('+++++++++++++++++++')
    print("Iteration ", k)
    print('+++++++++++++++++++')
    # prepare data
    train_index, val_index, test_index = v['train_index'], v['val_index'], v['test_index']

    encoder = Encoder(df, train_index, val_index, test_index, target_var)
    train, val, test = encoder.prepare_dataset()

    # grid search and return best model result for test
    gsc = GridSearchCustomized(num_labels, train, val, target_var, model_name)
    # gridsearch() returns (predictions, report); only the report dict can be
    # averaged, so the tuple must be unpacked before appending.
    preds, result = gsc.gridsearch()
    results_s2.append(result)
avg_results(results_s2, [i for i in range(num_labels)])

In [None]:
print('Start experiments for predicting the subjectivity (4 classes)...')

# define task
target_var = 'subjectivity_4-class'
num_labels = 4

# One classification report (dict) per split.
results_s4 = []

for k, v in splits_joint.items():
    print('+++++++++++++++++++')
    print("Iteration ", k)
    print('+++++++++++++++++++')
    # prepare data
    train_index, val_index, test_index = v['train_index'], v['val_index'], v['test_index']

    encoder = Encoder(df, train_index, val_index, test_index, target_var)
    train, val, test = encoder.prepare_dataset()

    # grid search and return best model result for test
    gsc = GridSearchCustomized(num_labels, train, val, target_var, model_name)
    # gridsearch() returns (predictions, report); only the report dict can be
    # averaged, so the tuple must be unpacked before appending.
    preds, result = gsc.gridsearch()
    results_s4.append(result)
avg_results(results_s4, [i for i in range(num_labels)])