In [6]:
# Install necessary libraries if not already installed
# !pip install transformers datasets pandas
# !pip install matplotlib

# Import necessary libraries
from datasets import load_dataset, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from sentence_transformers import SentenceTransformer, losses
import pandas as pd
import torch
import logging
from datetime import datetime




In [4]:
# Timestamped INFO-level logging for the rest of the notebook
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Choose the compute backend in priority order: MPS, then CUDA, then CPU.
# Each branch only selects the values; the device is created and reported once below.
if torch.backends.mps.is_available():
    backend, banner, log_line = 'mps', "MPS is available", "Using MPS"
elif torch.cuda.is_available():
    backend, banner, log_line = 'cuda', "CUDA is available", "Using CUDA"
else:
    backend, banner, log_line = 'cpu', "Using CPU", "Using CPU"

device = torch.device(backend)
print(banner)
logging.info(log_line)




2024-10-18 14:48:17 - Using MPS


MPS is available


In [5]:

try:
    # Read the train / validation / test splits from their CSV files
    logging.info("Loading datasets from CSV files")
    train_df = pd.read_csv('data/train.csv')
    valid_df = pd.read_csv('data/dev.csv')
    test_df = pd.read_csv('data/test.csv')
    logging.info("Datasets loaded successfully")

    # Wrap each DataFrame in a Hugging Face Dataset so it can be mapped and fed to the Trainer
    logging.info("Converting DataFrames to Hugging Face Dataset objects")
    train_dataset = Dataset.from_pandas(train_df)
    valid_dataset = Dataset.from_pandas(valid_df)
    test_dataset = Dataset.from_pandas(test_df)
    logging.info("Conversion completed successfully")

    # Record the split sizes as a quick sanity check
    logging.info(f"Train dataset size: {len(train_dataset)}")
    logging.info(f"Validation dataset size: {len(valid_dataset)}")
    logging.info(f"Test dataset size: {len(test_dataset)}")

except Exception as e:
    # Log the failure, then re-raise. Every later cell needs these datasets, so
    # swallowing the error here would only surface later as an unrelated
    # NameError on train_dataset / valid_dataset / test_dataset.
    logging.error(f"An error occurred: {str(e)}", exc_info=True)
    raise


2024-10-18 14:48:17 - Loading datasets from CSV files
2024-10-18 14:48:18 - Datasets loaded successfully
2024-10-18 14:48:18 - Converting DataFrames to Hugging Face Dataset objects
2024-10-18 14:48:19 - Conversion completed successfully
2024-10-18 14:48:19 - Train dataset size: 293298
2024-10-18 14:48:19 - Validation dataset size: 5000
2024-10-18 14:48:19 - Test dataset size: 5000


In [None]:
# Checkpoint name, reused for the tokenizer here and for the model in a later cell
model_name = 'onlplab/alephbert-base'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Class ids must be the contiguous integers 0..num_labels-1: the classification
# head is trained with cross-entropy, which treats each label as an index into
# the logits, and the evaluation cells index metrics by [0, 1, 2]. A negative id
# (previously 'contradiction': -1) is not a valid class index.
label_mapping = {'contradiction': 0, 'entailment': 1, 'neutral': 2}


# Define a function to preprocess the input data
def preprocess_function(examples):
    """Tokenize a batch of sentence pairs and attach integer class labels.

    Args:
        examples: a batch with the columns 'translation1', 'translation2' and
            'gold_label', each a list of equal length (the shape that
            ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The tokenizer output for the pairs, with an extra 'labels' entry holding
        ``label_mapping[gold_label]`` for every example.

    Raises:
        KeyError: if a 'gold_label' value is not a key of ``label_mapping``.
    """
    # Encode (translation1, translation2) as sentence pairs. Truncation must be
    # enabled: with truncation off, pairs longer than max_length keep their full
    # length, so the "fixed-length" examples are no longer all 128 tokens long.
    inputs = tokenizer(
        examples['translation1'],
        examples['translation2'],
        truncation=True,
        padding='max_length',
        max_length=128  # Or another max length depending on your needs
    )

    # Convert the string gold labels to their integer class ids
    inputs['labels'] = [label_mapping[label] for label in examples['gold_label']]

    return inputs

# Run the preprocessing over every split in batched mode, so each call to
# preprocess_function receives lists of examples rather than a single row
train_dataset, valid_dataset, test_dataset = [
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, valid_dataset, test_dataset)
]

# NOTE: the pretrained BERT model for sequence classification is loaded in a later cell

In [5]:
from sklearn.metrics import accuracy_score
def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max predictions.

    Args:
        eval_pred: a ``(logits, labels)`` pair; ``logits`` must support
            ``.argmax(axis=-1)``, which picks the predicted class per example.

    Returns:
        A dict with the single key "accuracy".
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=-1)
    return {"accuracy": accuracy_score(gold, predicted)}


In [None]:

# Pretrained encoder with a 3-way classification head (one output per NLI class);
# adjust num_labels if the label set changes
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)
model.to(device)

# Collator that pads the examples of each batch when the batch is assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Hyperparameters for fine-tuning
training_args = TrainingArguments(
    output_dir="./output",            # where the Trainer writes its outputs
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",      # evaluate on the validation split after every epoch
)

# Bundle model, data, collator and metric into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



In [None]:
# Baseline: score the model on the validation split before any fine-tuning
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Fine-tune on the training split with the arguments configured above
trainer.train()

In [None]:
# Score the fine-tuned model on the validation split
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Persist model and tokenizer side by side in a timestamped directory;
# "/" in the checkpoint name is replaced so it stays a single path component
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
safe_model_name = model_name.replace("/", "-")
output_dir = f"output/_{safe_model_name}-{timestamp}"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
# Score the model on the held-out test split and show the metrics
test_results = trainer.evaluate(eval_dataset=test_dataset)
print(f"Test Results: {test_results}")

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib
import numpy as np

# Run inference on the test split
predictions = trainer.predict(test_dataset)

# Predicted class = index of the largest logit; gold classes come from the dataset labels
predicted_labels = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

# Class ids in ascending order, and the class names in that same order.
# The names must follow the numeric ids (not the dict's insertion order),
# because the confusion-matrix rows/columns are laid out by label id.
class_ids = sorted(label_mapping.values())
class_names = sorted(label_mapping, key=label_mapping.get)

# Passing labels= pins the matrix to one row/column per known class, so its
# shape always matches class_names even if a class never occurs in this split
conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=class_ids)

# Show the raw counts
print(f"Confusion Matrix:\n{conf_matrix}")

# Plot the matrix with human-readable class names on the axes
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=class_names)
disp.plot(cmap='Blues')


In [None]:
# Correct predictions sit on the diagonal; dividing by the total count gives accuracy
accuracy = np.diag(conf_matrix).sum() / conf_matrix.sum()

print(f"Accuracy from Confusion Matrix: {accuracy:.4f}")


In [None]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, roc_curve, auc
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
import numpy as np

# Re-run inference on the test split so this cell does not rely on variables
# left behind by the confusion-matrix cell
predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

# Class ids in ascending order and the class names in that same order.
# Everything below (per-class metrics, ROC columns, legend entries) is indexed
# by position in class_ids, so names are ordered by id rather than by the
# dict's insertion order.
class_ids = sorted(label_mapping.values())
class_names = sorted(label_mapping, key=label_mapping.get)
n_classes = len(class_ids)

# One-vs-rest indicator matrix (one column per class) for the ROC computation
true_labels_binarized = label_binarize(true_labels, classes=class_ids)

# Precision, recall and F1 per class, plus micro- and macro-averaged summaries
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average=None, labels=class_ids)
precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='micro')
precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='macro')

# Per-class report
print("Per-class Precision, Recall, F1-Score:")
for idx, label in enumerate(class_names):
    print(f"Class {label} - Precision: {precision[idx]:.4f}, Recall: {recall[idx]:.4f}, F1-Score: {f1[idx]:.4f}")

# Overall report
print(f"\nOverall Micro-Averaged Metrics - Precision: {precision_micro:.4f}, Recall: {recall_micro:.4f}, F1-Score: {f1_micro:.4f}")
print(f"Overall Macro-Averaged Metrics - Precision: {precision_macro:.4f}, Recall: {recall_macro:.4f}, F1-Score: {f1_macro:.4f}")

# One-vs-rest ROC curve and AUC per class. The score for class i is column i of
# the raw model outputs, which assumes the class ids are exactly 0..n_classes-1.
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(true_labels_binarized[:, i], predictions.predictions[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Macro-average ROC: pool every false-positive-rate point seen in any class curve...
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# ...interpolate each class curve onto that common grid and sum...
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# ...then average over classes and integrate
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot the macro-average curve and one curve per class
plt.figure()
plt.plot(fpr["macro"], tpr["macro"], label=f'Macro-average ROC curve (area = {roc_auc["macro"]:.2f})', color='navy', linestyle=':', linewidth=2)
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], lw=2, label=f'ROC curve of class {class_names[i]} (area = {roc_auc[i]:.2f})')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curves for Multi-Class NLI Model')
plt.legend(loc="lower right")
plt.show()
