# Argument Structure Construction Classifier: Combining BERT with Linguistic Features

# 1. Preprocessing


In [None]:
# @title Mount Google drive in Colab

# `google.colab` is the Colab runtime helper package; this cell is only meaningful inside Colab.
from google.colab import drive

# Mount Google Drive at /content/drive; the trained weights are later written under
# /content/drive/MyDrive by the "Save the model" cell.
drive.mount('/content/drive')

In [None]:
# @title Dataset into a pandas DataFrame

import pandas as pd

# Read the raw annotation table ('YOUR_DATASET.csv' is a placeholder path to replace).
data = pd.read_csv('YOUR_DATASET.csv', encoding='latin-1')

# Drop every row whose `reference` is "ANC" and renumber the remaining rows 0..n-1.
# Filtering and re-indexing in one non-inplace chain produces an independent frame,
# so the column assignments made in later cells do not write into a flagged slice of
# `data` (which is what triggers pandas' SettingWithCopyWarning).
data_df = data[data['reference'] != "ANC"].reset_index(drop=True)
data_df


In [None]:
# @title Assign an integer value to each class

# Class names in label-id order: the position of a name in this list is its integer id.
construction_names = [
    'attributive',
    'caused_motion',
    'ditransitive',
    'existential',
    'intransitive_motion',
    'intransitive_resultative',
    'passive',
    'simple_intransitive',
    'simple_transitive',
    'transitive_resultative',
]
construction_to_id = {name: idx for idx, name in enumerate(construction_names)}

# Translate names to ids with an explicit lookup instead of a list-to-list `replace`,
# whose integer result depended on pandas' implicit (now deprecated) downcasting.
# Values that are not known names (e.g. ids from a previous run of this cell) pass
# through unchanged, so re-running the cell is harmless; the final cast to int64
# raises immediately if any unmapped label text or missing value is left, rather
# than failing later when the labels are turned into a tensor.
data_df['construction'] = (
    data_df['construction']
    .map(lambda label: construction_to_id.get(label, label))
    .astype('int64')
)

data_df.head()

# 2. Modeling

## 2.1. Add features to text

In [None]:
# @title Create input data for analysis

# Build the model input for every row as
#     "<dependency> [SEP]<verb> [SEP]<sentence_raw>"
# in one pass over the three columns. This replaces a per-row `iterrows` loop that
# wrote back with `iloc[index, ...]`, i.e. used the index *label* as a *position*,
# which is only correct while the frame has a default 0..n-1 index.
# (The `pos` column is intentionally not part of the input.)
# NOTE(review): the literal "[SEP]" is BERT's separator; the RoBERTa tokenizer loaded
# later uses "</s>" as its separator token, so "[SEP]" is presumably tokenized as
# ordinary text there - confirm this is intended.
data_df['final_input'] = [
    "{:} [SEP]{:} [SEP]".format(dependency, verb) + str(sentence)
    for dependency, verb, sentence in zip(
        data_df["dependency"], data_df["verb"], data_df["sentence_raw"]
    )
]

data_df.head()

## 2.2. GPU & Transformers setup

In [None]:
# @title Install huggingface transformers

!pip install transformers

In [None]:
# @title GPU

import torch

# Choose the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
gpu_ready = torch.cuda.is_available()
device = torch.device("cuda" if gpu_ready else "cpu")

if not gpu_ready:
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs are visible and the name of the first one.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
# @title Load tokenizer

from transformers import BertTokenizer, RobertaTokenizer

print('Loading tokenizer...')
# BERT alternative, kept for reference (disabled):
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=False)
# Active tokenizer: the pretrained `roberta-base` tokenizer, matching the
# `roberta-base` classifier loaded in the "Classification model" cell.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [None]:
# @title Classification model

from transformers import BertForSequenceClassification, RobertaForSequenceClassification

# BERT alternative, kept for reference (disabled):
# model = BertForSequenceClassification.from_pretrained(
#     "bert-base-uncased", # Use the 12-layer BERT model
#     num_labels = 10 # Put the number of output labels
# )

# Pretrained RoBERTa encoder with a 10-way classification head
# (one output per construction class).
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=10)


# Move the model to the device chosen in the "GPU" cell. Using `.to(device)` instead of
# `.cuda()` keeps the notebook runnable on CPU-only runtimes, where `.cuda()` raises.
# The result is bound to a name only to keep the cell from echoing the model repr.
desc = model.to(device)

## 2.3. Parameter setting

In [None]:
# @title Define training parameters

# Number of examples per batch; used by the training, validation and prediction DataLoaders.
batch_size = 32

# Number of full passes over the training set performed by the training loop.
epochs = 10

In [None]:
# @title Find the longest sequence for "maximum sequence length"

# Token count of every `final_input` string, special tokens included.
token_counts = (
    len(tokenizer.encode(text, add_special_tokens=True, padding=True, truncation=True))
    for text in data_df["final_input"]
)

# Longest encoded input in the dataset (0 when the frame has no rows).
max_len = max(token_counts, default=0)


print('Max sentence length: ', max_len)

In [None]:
# @title Use the maximum length

# Hard-coded cap that replaces the value measured in the previous cell. It is passed to the
# tokenizer as `max_length` with truncation enabled, so longer inputs are cut to this size.
max_len = 196 #BERT: 200; ROBERTA: 196

## 2.4. Train-Validation-Test split

In [None]:
# @title Data split by source


def _rows_from_source(source_name):
    """Return the rows of `data_df` whose `source` equals `source_name`, re-indexed from 0.

    The filter and the re-index are chained without `inplace=True`, so each result is
    an independent frame rather than an in-place-modified slice of `data_df`.
    """
    return data_df[data_df['source'] == source_name].reset_index(drop=True)


# Curated examples (used for training only in the split cell below).
df_curated = _rows_from_source('curated_data')

# Naturalistic written sources.
df_academic = _rows_from_source('naturalistic_data_academic')
df_blog = _rows_from_source('naturalistic_data_blog')
df_email = _rows_from_source('naturalistic_data_email')
df_newsgroup = _rows_from_source('naturalistic_data_newsgroup')
df_review = _rows_from_source('naturalistic_data_review')
df_yahoo_answers = _rows_from_source('naturalistic_data_yahoo_answers')

# Naturalistic L2 speech source.
df_L2_speech = _rows_from_source('naturalistic_data_L2_speech')


In [None]:
# @title Train-Validation-Test split

from sklearn.model_selection import train_test_split
import pandas as pd

def _split_50_25_25(source_df, stratify_val_test=True, seed=1):
    """Split one source's rows into train (50%), validation (25%) and test (25%).

    The first split halves `source_df`, stratified by `construction`. The held-out
    half is then halved again into validation and test.

    Args:
        source_df: rows of a single source; must contain a `construction` column.
        stratify_val_test: whether the second (validation/test) split is also
            stratified by `construction`.
        seed: `random_state` used for both splits.

    Returns:
        (train_part, held_out_part, val_part, test_part), where `held_out_part` is
        the intermediate 50% from which `val_part` and `test_part` are drawn.
    """
    train_part, held_out_part = train_test_split(
        source_df,
        test_size=0.5,  # 50% train, 50% held out
        stratify=source_df[['construction']],
        random_state=seed,
    )
    val_part, test_part = train_test_split(
        held_out_part,
        test_size=0.5,  # 50% of 50% = 25% of the source each
        stratify=held_out_part[['construction']] if stratify_val_test else None,
        random_state=seed,
    )
    return train_part, held_out_part, val_part, test_part


##  1. naturalistic_data_academic
# The validation/test step is not stratified for this source (the only one handled so).
# NOTE(review): presumably some class has too few held-out rows to stratify - confirm.
(train_df_academic, val_test_df_academic,
 val_df_academic, test_df_academic) = _split_50_25_25(df_academic, stratify_val_test=False)

##  2. naturalistic_data_blog
(train_df_blog, val_test_df_blog,
 val_df_blog, test_df_blog) = _split_50_25_25(df_blog)

##  3. naturalistic_data_email
(train_df_email, val_test_df_email,
 val_df_email, test_df_email) = _split_50_25_25(df_email)

##  4. naturalistic_data_newsgroup
(train_df_newsgroup, val_test_df_newsgroup,
 val_df_newsgroup, test_df_newsgroup) = _split_50_25_25(df_newsgroup)

##  5. naturalistic_data_review
(train_df_review, val_test_df_review,
 val_df_review, test_df_review) = _split_50_25_25(df_review)

##  6. naturalistic_data_yahoo_answers
(train_df_yahoo_answers, val_test_df_yahoo_answers,
 val_df_yahoo_answers, test_df_yahoo_answers) = _split_50_25_25(df_yahoo_answers)

##  7. naturalistic_data_L2_speech
(train_df_L2_speech, val_test_df_L2_speech,
 val_df_L2_speech, test_df_L2_speech) = _split_50_25_25(df_L2_speech)


# Combine the test parts of all seven naturalistic sources
test_df = pd.concat([test_df_academic, test_df_blog, test_df_email, test_df_newsgroup, test_df_review, test_df_yahoo_answers, test_df_L2_speech], ignore_index=True)

# Combine the validation parts of all seven naturalistic sources
val_df = pd.concat([val_df_academic, val_df_blog, val_df_email, val_df_newsgroup, val_df_review, val_df_yahoo_answers, val_df_L2_speech], ignore_index=True)

# Combine the training parts of all seven naturalistic sources with the full curated set
# (curated data is used for training only)
train_df = pd.concat([df_curated, train_df_academic, train_df_blog, train_df_email, train_df_newsgroup, train_df_review, train_df_yahoo_answers, train_df_L2_speech], ignore_index=True)



# Print the count of instances for each combination of construction and source in the test set
print("Test set distribution:")
print(test_df.groupby(['source', 'construction']).count())


In [None]:
# @title Save data to a CSV file

# Persist each split next to the notebook (without the index column) so the exact
# partition can be reloaded later instead of being re-drawn.
for csv_name, split_frame in (
    ('test_df.csv', test_df),
    ('val_df.csv', val_df),
    ('train_df.csv', train_df),
):
    split_frame.to_csv(csv_name, index=False)

In [None]:
# @title Load data

# test_df = pd.read_csv('test_df.csv')
# val_df = pd.read_csv('val_df.csv')
# train_df = pd.read_csv('train_df.csv')

In [None]:
# @title Input data to tensors

from torch.utils.data import TensorDataset
import torch

def encode_data(data, labels=None):
    """Tokenize a collection of texts and, optionally, turn their labels into a tensor.

    The texts are encoded in one batch with the notebook-level `tokenizer`: special
    tokens are added, sequences longer than the notebook-level `max_len` are
    truncated, and every sequence is padded to the longest one in this batch.

    Args:
        data: iterable of input strings (converted to a list before tokenizing).
        labels: optional labels aligned with `data`; anything `torch.tensor` accepts.

    Returns:
        `(input_ids, attention_masks)` when `labels` is None, otherwise
        `(input_ids, attention_masks, labels_tensor)`.
    """
    batch = tokenizer.batch_encode_plus(
        list(data),  # "data" for BERT and RoBERTa, "list(data)" for DeBERTa
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        return_attention_mask=True,
        padding='longest',
        return_tensors='pt',
    )
    encoded = (batch['input_ids'], batch['attention_mask'])

    if labels is None:
        return encoded

    return encoded + (torch.tensor(labels),)


# Tokenize each split (`final_input` holds the text, `construction` the integer label)
# and wrap the resulting tensors in a TensorDataset.

# Training split
train_encoding = encode_data(train_df["final_input"].values, train_df["construction"].values)
input_ids_train, attention_masks_train, labels_train = train_encoding

# Validation split
val_encoding = encode_data(val_df["final_input"].values, val_df["construction"].values)
input_ids_val, attention_masks_val, labels_val = val_encoding

# Test split
test_encoding = encode_data(test_df["final_input"].values, test_df["construction"].values)
input_ids_test, attention_masks_test, labels_test = test_encoding

# One dataset per split; each item is (input_ids, attention_mask, label)
train_dataset = TensorDataset(*train_encoding)
val_dataset = TensorDataset(*val_encoding)
test_dataset = TensorDataset(*test_encoding)

# Optional: Print the dataset sizes to verify
# print(f"Train dataset size: {len(train_dataset)}")
# print(f"Validation dataset size: {len(val_dataset)}")
# print(f"Test dataset size: {len(test_dataset)}")


## 2.5. Training

### 2.5.1. Setup

In [None]:
# @title Create an iterator for our dataset using the torch DataLoader class

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler


# `shuffle=True` makes the DataLoader draw batches through a RandomSampler over the
# dataset, i.e. a fresh random order on every pass.

# Batches of training examples, reshuffled each epoch.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Batches of validation examples (also drawn in random order).
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# @title Set the optimizer to pass it the weights from the running model

# Use PyTorch's AdamW. The `AdamW` class shipped with `transformers` was deprecated in
# favour of this one and is no longer importable from recent `transformers` releases.
from torch.optim import AdamW

# Learning-rate notes from earlier experiments:
#   5e-5 for BERT and DeBERTa-base; 3e-5 for RoBERTa
#   (5e-5, 5e-6; 1e-6, 2e-6, 1e-7 5e-7, 1e-5, 1e-4 X);
#   DeBERTa-small: 1e-5 (2e-5, 5e-5, 3e-5, 1e-6 x)
# Epsilon notes: 1e-8 for BERT and DeBERTa; 5e-7 for RoBERTa
# NOTE(review): the note above suggests eps=5e-7 for RoBERTa, yet 1e-8 is used below - confirm.

# # BERT
# optimizer = AdamW(model.parameters(),
#                   lr = 5e-5,
#                   eps = 1e-8,
#                   weight_decay = 0.0
#                 )


# ROBERTA
# `weight_decay=0.0` is spelled out because torch's AdamW defaults to 0.01, whereas the
# transformers implementation used previously defaulted to 0.0.
optimizer = AdamW(model.parameters(),
                   lr = 3e-5,
                   eps = 1e-8,
                   weight_decay = 0.0
                 )


In [None]:
# @title Create a learning rate scheduler

from transformers import get_linear_schedule_with_warmup

# Total number of optimizer steps: batches per epoch times the number of epochs.
total_steps = len(train_dataloader) * epochs

# Linear learning-rate schedule over `total_steps` with no warm-up phase.
# NOTE(review): the "Training" cell rebinds `scheduler` to a ReduceLROnPlateau instance and
# never steps this one, so this schedule appears to be unused - confirm which is intended.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [None]:
# @title Define a function to calculate the accuracy of predictions

import numpy as np

def flat_accuracy(preds, labels):
    """Return the fraction of rows in `preds` whose highest-scoring column equals the label.

    Args:
        preds: 2-D array-like of per-class scores, one row per example.
        labels: NumPy array of integer class ids, one per example (any shape; flattened).

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    gold_classes = labels.ravel()
    n_correct = np.equal(predicted_classes, gold_classes).sum()
    return n_correct / gold_classes.size


In [None]:
# @title Define a function to get the time

import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as text.

    The value is rounded to a whole number of seconds and formatted the way
    `datetime.timedelta` prints itself, e.g. '0:01:05' (or '1 day, 0:00:05'
    for durations of a day or more).
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


### 2.5.2. Training Loop

In [None]:
# @title Training

import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Initialize optimizer
# NOTE: this rebinds `optimizer`, so the loop below trains with lr=5e-5 and torch's
# default AdamW settings, regardless of any optimizer configured in an earlier cell.
optimizer = optim.AdamW(model.parameters(), lr=5e-5)


# Initialize ReduceLROnPlateau scheduler: multiply the learning rate by 0.1 once the
# validation loss has not improved for more than 2 consecutive epochs. This rebinds
# `scheduler` as well. The `verbose` flag is not passed: it is deprecated in recent
# PyTorch, and the current learning rate is printed after every epoch below anyway.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)


# Training loop: one training pass and one validation pass per epoch
for epoch_i in range(0, epochs):

    print(f"\n======== Epoch {epoch_i + 1} / {epochs} ========")
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        # Progress line every 40 batches (skipping the very first batch)
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print(f"  Batch {step}  of  {len(train_dataloader)}.    Elapsed: {elapsed}.")

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)  # for BERT, ROBERTA

        # Clear gradients left over from the previous step
        model.zero_grad()

        # Attention maps are not requested here: nothing in the training step reads
        # them, and returning them for every batch only costs memory and time.
        result = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
            return_dict=True
        )

        loss = result.loss
        total_train_loss += loss.item()

        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    # Calculate average training loss (mean of the per-batch losses)
    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print(f"\n  Average training loss: {avg_train_loss:.2f}")
    print(f"  Training epoch took: {training_time}")

    # Validation phase
    print("\nRunning Validation...")
    t0 = time.time()
    model.eval()

    total_eval_correct = 0
    total_eval_samples = 0
    total_eval_loss = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)  # for BERT, ROBERTA

        with torch.no_grad():
            result = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels,
                return_dict=True
            )

        loss = result.loss
        logits = result.logits
        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Weight each batch's accuracy by its size so that a smaller final batch does
        # not count as much as a full one; the result is the per-sample accuracy.
        total_eval_correct += flat_accuracy(logits, label_ids) * len(label_ids)
        total_eval_samples += len(label_ids)

    avg_val_accuracy = total_eval_correct / total_eval_samples
    # Mean of the per-batch validation losses; this is the value the scheduler monitors
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)

    print(f"  Accuracy: {avg_val_accuracy:.2f}")
    print(f"  Validation Loss: {avg_val_loss:.2f}")
    print(f"  Validation took: {validation_time}")

    # Step the scheduler with the validation loss
    scheduler.step(avg_val_loss)

    # Log the current learning rate
    current_lr = optimizer.param_groups[0]['lr']
    print(f"  Current Learning Rate: {current_lr}")

print("\nTraining complete!")


In [None]:
# @title Save the model

import os
import datetime

# A timestamp in the file name keeps checkpoints from different runs from overwriting each other.
current_datetime = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
# Prefix the file with the architecture that is actually being saved (e.g. "roberta")
# instead of a fixed "bert", so a checkpoint is not later loaded into the wrong model class.
model_tag = model.config.model_type
filename = f'{model_tag}_{current_datetime}_model.pt'
file_path = os.path.join('/content/drive/MyDrive', filename)
# Only the weights (state dict) are stored; loading requires building the same model first.
torch.save(model.state_dict(), file_path)

In [None]:
# @title Load the model

#model.load_state_dict(torch.load('/content/drive/MyDrive/YOUR_MODEL_NAME.pt'))

## 2.6. Evaluating the model on the test dataset


In [None]:
# @title Evaluation of the entire dataset

import random
import numpy as np
import torch
from torch.cuda.amp import autocast
from torch.utils.checkpoint import checkpoint
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, SequentialSampler, Subset
from sklearn.metrics import f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import pandas as pd


# Set the seed value all over the place to make this reproducible
# (in this cell the seed determines how KFold shuffles the test set into folds)
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


# Number of folds the test set is partitioned into. The model is NOT retrained per
# fold: the same trained model is scored on each fold to show the spread in accuracy.
k_folds = 5


# Initialize k-fold
kf = KFold(n_splits=k_folds, shuffle=True, random_state=seed_val)


# Results accumulated over all folds
fold_accuracies = []
all_predictions = []
all_true_labels = []
attention_scores = []

# Mixed precision is only enabled on GPU; on CPU the forward pass runs in full precision
use_amp = device.type == "cuda"

# Put model in evaluation mode
model.eval()


# Evaluation loop (only the held-out indices of each fold are used)
for fold, (train_idx, test_idx) in enumerate(kf.split(test_dataset)):
    print(f'Fold {fold + 1}/{k_folds}')
    print('--------------------------------')

    # Create test dataloader for the current fold, iterating in `test_idx` order
    test_subsampler = Subset(test_dataset, test_idx)
    prediction_sampler = SequentialSampler(test_subsampler)
    prediction_dataloader = DataLoader(test_subsampler, sampler=prediction_sampler, batch_size=batch_size)

    print(f'Predicting labels for {len(test_subsampler)} test samples...')

    # Tracking variables
    predictions, true_labels = [], []

    # Make predictions
    for batch in prediction_dataloader:
        # Add batch to the selected device
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Plain forward pass without gradients. The model is called directly: wrapping it
        # in torch.utils.checkpoint only matters when gradients are computed, and it is
        # of no use inside `no_grad`.
        with torch.no_grad():
            with torch.autocast(device_type=device.type, enabled=use_amp):
                result = model(b_input_ids, attention_mask=b_input_mask)

        # Store attention scores.
        # NOTE(review): attentions are not requested in the call above, so this is expected
        # to append None for every batch - confirm whether attention maps are needed.
        attention_scores.append(result.attentions)

        logits = result.logits

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Store predictions and true labels
        predictions.extend(np.argmax(logits, axis=1))
        true_labels.extend(label_ids)

    print('    DONE.')

    # Calculate accuracy for this fold
    accuracy = np.mean(np.array(predictions) == np.array(true_labels))
    fold_accuracies.append(accuracy)
    print(f'Accuracy for fold {fold + 1}: {accuracy:.2f}')

    # Store all predictions and true labels for the current fold
    all_predictions.extend(predictions)
    all_true_labels.extend(true_labels)


##  Calculate and print the average accuracy over all folds
average_accuracy = np.mean(fold_accuracies)
print(f'Average accuracy over {k_folds} folds: {average_accuracy:.2f}')


##  Calculate the F1 score
f1 = f1_score(all_true_labels, all_predictions, average="micro")
print('F1 Score (micro): %.3f' % f1)


##  Classification report
print('Classification report:')
# Class names in label-id order (index in this list == integer class id)
labels = ['attributive',
          'caused_motion',
          'ditransitive',
          'existential',
          'intransitive_motion',
          'intransitive_resultative',
          'passive',
          'simple_intransitive',
          'simple_transitive',
          'transitive_resultative']
# Passing the full id list keeps names and rows aligned even when a class is absent
# from both the true labels and the predictions (otherwise the name list would not
# match the number of classes found and the report would fail).
label_ids_all = list(range(len(labels)))

print(classification_report(all_true_labels, all_predictions, labels=label_ids_all,
                            target_names=labels, digits=3, zero_division=0))


##  Display the confusion matrix
cm = confusion_matrix(all_true_labels, all_predictions, labels=label_ids_all)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap='Blues', xticks_rotation='vertical')
plt.show()

In [None]:
# @title Evaluation by source

import random
import numpy as np
import torch
from torch.cuda.amp import autocast
from torch.utils.checkpoint import checkpoint
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, SequentialSampler, Subset
from sklearn.metrics import f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import pandas as pd


# Set the seed value all over the place to make this reproducible
# (in this cell the seed determines how KFold shuffles the test set into folds)
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


# Number of folds the test set is partitioned into (the model is not retrained per fold)
k_folds = 5


# Initialize k-fold
kf = KFold(n_splits=k_folds, shuffle=True, random_state=seed_val)


# Results accumulated over all folds
fold_accuracies = []
all_predictions = []
all_true_labels = []
attention_scores = []
# Source of every evaluated example, collected in the SAME order as the predictions.
# The folds visit the test set in shuffled order, so reading the `source` column in
# frame order would pair each prediction with another row's source.
all_sources = []

# `test_dataset` is built from `test_df` row by row, so position i refers to the same example
test_sources = test_df['source'].to_numpy()
assert len(test_sources) == len(test_dataset), "test_df and test_dataset are out of sync"

# Mixed precision is only enabled on GPU; on CPU the forward pass runs in full precision
use_amp = device.type == "cuda"

# Put model in evaluation mode
model.eval()


# Evaluation loop (only the held-out indices of each fold are used)
for fold, (train_idx, test_idx) in enumerate(kf.split(test_dataset)):
    print(f'Fold {fold + 1}/{k_folds}')
    print('--------------------------------')

    # Create test dataloader for the current fold, iterating in `test_idx` order
    test_subsampler = Subset(test_dataset, test_idx)
    prediction_sampler = SequentialSampler(test_subsampler)
    prediction_dataloader = DataLoader(test_subsampler, sampler=prediction_sampler, batch_size=batch_size)

    print(f'Predicting labels for {len(test_subsampler)} test samples...')

    # Tracking variables
    predictions, true_labels = [], []

    # Make predictions
    for batch in prediction_dataloader:
        # Add batch to the selected device
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Plain forward pass without gradients. The model is called directly: wrapping it
        # in torch.utils.checkpoint only matters when gradients are computed, and it is
        # of no use inside `no_grad`.
        with torch.no_grad():
            with torch.autocast(device_type=device.type, enabled=use_amp):
                result = model(b_input_ids, attention_mask=b_input_mask)

        # Store attention scores.
        # NOTE(review): attentions are not requested in the call above, so this is expected
        # to append None for every batch - confirm whether attention maps are needed.
        attention_scores.append(result.attentions)

        logits = result.logits

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Store predictions and true labels
        predictions.extend(np.argmax(logits, axis=1))
        true_labels.extend(label_ids)

    print('    DONE.')

    # Calculate accuracy for this fold
    accuracy = np.mean(np.array(predictions) == np.array(true_labels))
    fold_accuracies.append(accuracy)
    print(f'Accuracy for fold {fold + 1}: {accuracy:.2f}')

    # Store predictions, true labels and sources for the current fold, all in `test_idx` order
    all_predictions.extend(predictions)
    all_true_labels.extend(true_labels)
    all_sources.extend(test_sources[test_idx].tolist())


##  Calculate and print the average accuracy over all folds
average_accuracy = np.mean(fold_accuracies)
print(f'Average accuracy over {k_folds} folds: {average_accuracy:.2f}')


##  Calculate the F1 score
f1 = f1_score(all_true_labels, all_predictions, average="micro")
print('F1 Score (micro): %.3f' % f1)


##  Display the results by source
# Class names in label-id order (index in this list == integer class id)
labels = ['attributive',
          'caused_motion',
          'ditransitive',
          'existential',
          'intransitive_motion',
          'intransitive_resultative',
          'passive',
          'simple_intransitive',
          'simple_transitive',
          'transitive_resultative']
# A single source may not contain every class; passing the full id list keeps the
# report rows and the confusion-matrix axes aligned with the ten class names.
label_ids_all = list(range(len(labels)))

sources_array = np.array(all_sources)
true_array = np.array(all_true_labels)
pred_array = np.array(all_predictions)

unique_sources = np.unique(all_sources)
for source in unique_sources:
    in_source = sources_array == source
    source_true_labels = true_array[in_source]
    source_predictions = pred_array[in_source]
    source_accuracy = np.mean(source_predictions == source_true_labels)
    print(f'Accuracy for source "{source}": {source_accuracy:.2f}')

    # Classification report for the current source
    print(f'\nClassification report for source "{source}":')
    print(classification_report(source_true_labels, source_predictions, labels=label_ids_all,
                                target_names=labels, digits=3, zero_division=0))

    # Confusion matrix for the current source, drawn on an explicitly sized axes
    # (creating a figure separately and then plotting would leave an empty extra figure)
    cm = confusion_matrix(source_true_labels, source_predictions, labels=label_ids_all)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    fig, ax = plt.subplots(figsize=(12, 10))
    disp.plot(ax=ax, cmap='Blues', xticks_rotation='vertical')
    ax.set_title(f'Confusion Matrix for Source: "{source}"')
    plt.show()