In [1]:
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [2]:

# Now you can access your files
project_path = '/content/drive/My Drive/Irony-Detection'

In [3]:
import os

# Specify the path to your project folder
project_path = '/content/drive/My Drive/Irony-Detection'

# List files and directories in the project folder
files = os.listdir(project_path)
print(files)

['algo', 'predictions', 'data', '.git', 'models', '__pycache__', 'bert_exp.py', 'config.txt', 'embeddings.py', 'preprocessing.py', 'README.md', 'requirements.txt', 'run_experiment.py']


In [4]:
import os

# Specify the path to the project folder
project_path = '/content/drive/My Drive/Irony-Detection'

# List directories and files in the project folder
for root, dirs, files in os.walk(project_path):
    print(f"Root: {root}")
    print(f"Directories: {dirs}")
    print(f"Files: {files}")
    print()

Root: /content/drive/My Drive/Irony-Detection
Directories: ['algo', 'predictions', 'data', '.git', 'models', '__pycache__']
Files: ['bert_exp.py', 'config.txt', 'embeddings.py', 'preprocessing.py', 'README.md', 'requirements.txt', 'run_experiment.py']

Root: /content/drive/My Drive/Irony-Detection/algo
Directories: ['nn', '__pycache__']
Files: ['__init__.py']

Root: /content/drive/My Drive/Irony-Detection/algo/nn
Directories: ['__pycache__']
Files: ['layers.py', 'callbacks.py', 'models.py', 'utility.py', 'wrappers.py', '__init__.py']

Root: /content/drive/My Drive/Irony-Detection/algo/nn/__pycache__
Directories: []
Files: ['layers.cpython-37.pyc', 'models.cpython-36.pyc', 'layers.cpython-36.pyc', 'utility.cpython-36.pyc', 'models.cpython-37.pyc', 'utility.cpython-37.pyc', '__init__.cpython-36.pyc', 'wrappers.cpython-37.pyc', '__init__.cpython-37.pyc']

Root: /content/drive/My Drive/Irony-Detection/algo/__pycache__
Directories: []
Files: ['__init__.cpython-36.pyc', '__init__.cpython-37.

In [5]:
!pip install --upgrade tensorflow




In [8]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score, precision_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# Define the f1_smart function as needed
def f1_smart(y_true, y_pred_probs):
    thresholds = np.arange(0.0, 1.1, 0.01)
    f1_scores = []

    for threshold in thresholds:
        y_pred = (y_pred_probs > threshold).astype(int)
        f1 = f1_score(y_true, y_pred)
        f1_scores.append(f1)

    optimal_idx = np.argmax(f1_scores)
    optimal_f1 = f1_scores[optimal_idx]
    optimal_threshold = thresholds[optimal_idx]

    return optimal_f1, optimal_threshold

# Set random seeds
np.random.seed(726)
torch.manual_seed(726)

# Define paths directly
MODEL_PATH = "models/capsule_weights_best.pth"
PREDICTION_FILE = "bert_predictions.csv"

# Create the directory if it does not exist
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Correct file path for colab
file_path = '/content/drive/My Drive/Irony-Detection/data/arabic/training.csv'

# Read the CSV file
full = pd.read_csv(file_path, sep='\t', header=None, names=["id", "tweet", "label"], index_col=0)
print("Number of tweets in the dataset: ", full.shape[0])

# Split dataset
train, test = train_test_split(full, test_size=0.2)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

print("Completed reading")
print("Train shape : ", train.shape)
print("Test shape : ", test.shape)

# Variables
TEXT_COLUMN = "tweet"
LABEL_COLUMN = "label"

# Tokenizer and Encoding
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_and_encode(texts, max_length=128):
    encodings = tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt'  # Return PyTorch tensors
    )
    return encodings

# Tokenize and encode sequences in the training set
train_encodings = tokenize_and_encode(list(train[TEXT_COLUMN].values))
test_encodings = tokenize_and_encode(list(test[TEXT_COLUMN].values))

# Convert to PyTorch Datasets
def create_torch_dataset(encodings, labels):
    dataset = TensorDataset(
        encodings['input_ids'],
        encodings['attention_mask'],
        torch.tensor(labels, dtype=torch.long)
    )
    return dataset

# Prepare datasets
label_encoder = LabelEncoder()

# Fit the encoder on the training labels
Y_train = label_encoder.fit_transform(train[LABEL_COLUMN].values)

# Transform the test labels using the fitted encoder
Y_test = label_encoder.transform(test[LABEL_COLUMN].values)

train_dataset = create_torch_dataset(train_encodings, Y_train)
test_dataset = create_torch_dataset(test_encodings, Y_test)

# Define DataLoader
def create_dataloader(dataset, batch_size=64, shuffle=True):
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

train_dataloader = create_dataloader(train_dataset)
test_dataloader = create_dataloader(test_dataset, shuffle=False)

# Define the model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(np.unique(Y_train)))

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Define training loop
def train_model(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Average Training Loss: {avg_loss:.4f}")

# Training the model
kfold = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)

best_f1_scores = []
best_fold_metrics = {}
best_fold_conf_matrix = None
best_fold_index = -1
y_test = np.zeros((len(Y_test),))

for fold, (train_index, valid_index) in enumerate(kfold.split(train_encodings['input_ids'], Y_train)):
    print(f"Starting fold {fold+1}")

    X_train_encodings = {
        'input_ids': torch.tensor(np.array(train_encodings['input_ids'])[train_index]),
        'attention_mask': torch.tensor(np.array(train_encodings['attention_mask'])[train_index]),
    }
    X_val_encodings = {
        'input_ids': torch.tensor(np.array(train_encodings['input_ids'])[valid_index]),
        'attention_mask': torch.tensor(np.array(train_encodings['attention_mask'])[valid_index]),
    }
    Y_train_split = torch.tensor(Y_train[train_index], dtype=torch.long)
    Y_val_split = torch.tensor(Y_train[valid_index], dtype=torch.long)

    # Prepare the datasets for training and validation
    train_split_dataset = create_torch_dataset(X_train_encodings, Y_train_split)
    val_split_dataset = create_torch_dataset(X_val_encodings, Y_val_split)

    train_split_dataloader = create_dataloader(train_split_dataset)
    val_split_dataloader = create_dataloader(val_split_dataset, shuffle=False)

    # Training
    train_model(model, train_split_dataloader, optimizer, device)

    # Save the best model weights
    torch.save(model.state_dict(), MODEL_PATH)

    # Load the best model weights
    model.load_state_dict(torch.load(MODEL_PATH))
    model.eval()

    # Predict and calculate F1 score
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in val_split_dataloader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    f1, threshold = f1_smart(np.array(all_labels), np.array(all_preds))
    accuracy = accuracy_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    tn, fp, fn, tp = confusion_matrix(all_labels, all_preds).ravel()

    print(f"Fold {fold+1} - F1: {f1:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

    if not best_f1_scores or f1 > max(best_f1_scores):
        best_f1_scores.append(f1)
        best_fold_metrics = {
            'f1': f1,
            'accuracy': accuracy,
            'recall': recall,
            'precision': precision,
        }
        best_fold_conf_matrix = (tn, fp, fn, tp)
        best_fold_index = fold + 1

    # Prediction on the test set
    all_test_preds = []
    model.eval()
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, _ = batch
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            all_test_preds.extend(preds.cpu().numpy())

    y_test_pred = np.array(all_test_preds)
    y_test += y_test_pred / kfold.n_splits

print('Finished Training')

# Final predictions on the test set
y_test = y_test.reshape((-1, 1))
pred_test_y = (y_test > np.mean(best_f1_scores)).astype(int)
test['predictions'] = pred_test_y

# Save predictions
test.to_csv(PREDICTION_FILE, sep='\t', encoding='utf-8')
print('Saved Predictions')

# Print the best fold metrics
print(f"Best Fold: {best_fold_index}")
print(f"Best F1: {best_fold_metrics['f1']:.4f}")
print(f"Best Accuracy: {best_fold_metrics['accuracy']:.4f}")
print(f"Best Precision: {best_fold_metrics['precision']:.4f}")
print(f"Best Recall: {best_fold_metrics['recall']:.4f}")
print("Best Confusion Matrix (tn, fp, fn, tp):", best_fold_conf_matrix)


Number of tweets in the dataset:  4024
Completed reading
Train shape :  (3219, 2)
Test shape :  (805, 2)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  torch.tensor(labels, dtype=torch.long)


Starting fold 1
Average Training Loss: 0.6547


  model.load_state_dict(torch.load(MODEL_PATH))


Fold 1 - F1: 0.7282, Accuracy: 0.6801, Precision: 0.6509, Recall: 0.8263
Starting fold 2


  torch.tensor(labels, dtype=torch.long)


Average Training Loss: 0.5952


  model.load_state_dict(torch.load(MODEL_PATH))


Fold 2 - F1: 0.7815, Accuracy: 0.7578, Precision: 0.7342, Recall: 0.8353
Starting fold 3


  torch.tensor(labels, dtype=torch.long)


Average Training Loss: 0.5117


  model.load_state_dict(torch.load(MODEL_PATH))


Fold 3 - F1: 0.8188, Accuracy: 0.7966, Precision: 0.7609, Recall: 0.8862
Starting fold 4


  torch.tensor(labels, dtype=torch.long)


Average Training Loss: 0.4713


  model.load_state_dict(torch.load(MODEL_PATH))


Fold 4 - F1: 0.8412, Accuracy: 0.8230, Precision: 0.7885, Recall: 0.9015
Starting fold 5


  torch.tensor(labels, dtype=torch.long)


Average Training Loss: 0.4308


  model.load_state_dict(torch.load(MODEL_PATH))


Fold 5 - F1: 0.8437, Accuracy: 0.8351, Precision: 0.8314, Recall: 0.8563
Finished Training
Saved Predictions
Best Fold: 5
Best F1: 0.8437
Best Accuracy: 0.8351
Best Precision: 0.8314
Best Recall: 0.8563
Best Confusion Matrix (tn, fp, fn, tp): (251, 58, 48, 286)


In [None]:
from google.colab import files
files.download('bert_predictions.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [7]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score, precision_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# Define the f1_smart function as needed
def f1_smart(y_true, y_pred_probs):
    thresholds = np.arange(0.0, 1.1, 0.01)
    f1_scores = []

    for threshold in thresholds:
        y_pred = (y_pred_probs > threshold).astype(int)
        f1 = f1_score(y_true, y_pred)
        f1_scores.append(f1)

    optimal_idx = np.argmax(f1_scores)
    optimal_f1 = f1_scores[optimal_idx]
    optimal_threshold = thresholds[optimal_idx]

    return optimal_f1, optimal_threshold

# Set random seeds
np.random.seed(726)
torch.manual_seed(726)

# Define paths directly
MODEL_PATH = "models/capsule_weights_best.pth"
PREDICTION_FILE = "arabert_predictions.csv"

# Create the directory if it does not exist
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Correct file path
file_path = '/content/drive/My Drive/Irony-Detection/data/arabic/training.csv'

# Read the CSV file
full = pd.read_csv(file_path, sep='\t', header=None, names=["id", "tweet", "label"], index_col=0)
print("Number of tweets in the dataset: ", full.shape[0])

# Split dataset
train, test = train_test_split(full, test_size=0.2)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

print("Completed reading")
print("Train shape : ", train.shape)
print("Test shape : ", test.shape)

# Variables
TEXT_COLUMN = "tweet"
LABEL_COLUMN = "label"

# Tokenizer and Encoding for AraBERT
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")

def tokenize_and_encode(texts, max_length=128):
    encodings = tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt'  # Return PyTorch tensors
    )
    return encodings

# Tokenize and encode sequences in the training set
train_encodings = tokenize_and_encode(list(train[TEXT_COLUMN].values))
test_encodings = tokenize_and_encode(list(test[TEXT_COLUMN].values))

# Convert to PyTorch Datasets
def create_torch_dataset(encodings, labels):
    dataset = TensorDataset(
        encodings['input_ids'],
        encodings['attention_mask'],
        torch.tensor(labels, dtype=torch.long)
    )
    return dataset

# Prepare datasets
label_encoder = LabelEncoder()

# Fit the encoder on the training labels
Y_train = label_encoder.fit_transform(train[LABEL_COLUMN].values)

# Transform the test labels using the fitted encoder
Y_test = label_encoder.transform(test[LABEL_COLUMN].values)

train_dataset = create_torch_dataset(train_encodings, Y_train)
test_dataset = create_torch_dataset(test_encodings, Y_test)

# Define DataLoader
def create_dataloader(dataset, batch_size=64, shuffle=True):
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

train_dataloader = create_dataloader(train_dataset)
test_dataloader = create_dataloader(test_dataset, shuffle=False)

# Define the model
model = AutoModelForSequenceClassification.from_pretrained("aubmindlab/bert-base-arabertv2", num_labels=len(np.unique(Y_train)))

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Define training loop
def train_model(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Average Training Loss: {avg_loss:.4f}")

# Training the model
kfold = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)

best_f1_scores = []
best_fold_metrics = {}
best_fold_conf_matrix = None
best_fold_index = -1
y_test = np.zeros((len(Y_test),))

for fold, (train_index, valid_index) in enumerate(kfold.split(train_encodings['input_ids'], Y_train)):
    print(f"Starting fold {fold+1}")

    X_train_encodings = {
        'input_ids': torch.tensor(np.array(train_encodings['input_ids'])[train_index]),
        'attention_mask': torch.tensor(np.array(train_encodings['attention_mask'])[train_index]),
    }
    X_val_encodings = {
        'input_ids': torch.tensor(np.array(train_encodings['input_ids'])[valid_index]),
        'attention_mask': torch.tensor(np.array(train_encodings['attention_mask'])[valid_index]),
    }
    Y_train_split = torch.tensor(Y_train[train_index], dtype=torch.long)
    Y_val_split = torch.tensor(Y_train[valid_index], dtype=torch.long)

    # Prepare the datasets for training and validation
    train_split_dataset = create_torch_dataset(X_train_encodings, Y_train_split)
    val_split_dataset = create_torch_dataset(X_val_encodings, Y_val_split)

    train_split_dataloader = create_dataloader(train_split_dataset)
    val_split_dataloader = create_dataloader(val_split_dataset, shuffle=False)

    # Training
    train_model(model, train_split_dataloader, optimizer, device)

    # Save the best model weights
    torch.save(model.state_dict(), MODEL_PATH)

    # Load the best model weights
    model.load_state_dict(torch.load(MODEL_PATH))
    model.eval()

    # Predict and calculate F1 score
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in val_split_dataloader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate performance metrics
    f1, threshold = f1_smart(np.array(all_labels), np.array(all_preds))
    accuracy = accuracy_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds, average='weighted')
    precision = precision_score(all_labels, all_preds, average='weighted')
    tn, fp, fn, tp = confusion_matrix(all_labels, all_preds).ravel()

    print(f"Fold {fold+1} - F1: {f1:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

    if not best_f1_scores or f1 > max(best_f1_scores):
        best_f1_scores.append(f1)
        best_fold_metrics = {
            'f1': f1,
            'accuracy': accuracy,
            'recall': recall,
            'precision': precision,
        }
        best_fold_conf_matrix = (tn, fp, fn, tp)
        best_fold_index = fold + 1

    # Prediction on the test set
    all_test_preds = []
    model.eval()
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, _ = batch
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            all_test_preds.extend(preds.cpu().numpy())

    y_test_pred = np.array(all_test_preds)
    y_test += y_test_pred / kfold.n_splits

print('Finished Training')

# Final predictions on the test set
y_test = y_test.reshape((-1, 1))
pred_test_y = (y_test > np.mean(best_f1_scores)).astype(int)
test['predictions'] = pred_test_y

# Save predictions
test.to_csv(PREDICTION_FILE, sep='\t', encoding='utf-8')
print('Saved Predictions')

# Print the best fold metrics
print(f"Best Fold: {best_fold_index}")
print(f"Best F1: {best_fold_metrics['f1']:.4f}")
print(f"Best Accuracy: {best_fold_metrics['accuracy']:.4f}")
print(f"Best Precision: {best_fold_metrics['precision']:.4f}")
print(f"Best Recall: {best_fold_metrics['recall']:.4f}")
print("Best Confusion Matrix (tn, fp, fn, tp):", best_fold_conf_matrix)


Number of tweets in the dataset:  4024
Completed reading
Train shape :  (3219, 2)
Test shape :  (805, 2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  torch.tensor(labels, dtype=torch.long)


Starting fold 1
Average Training Loss: 0.5622


  model.load_state_dict(torch.load(MODEL_PATH))


Fold 1 - F1: 0.8176, Accuracy: 0.7873, Precision: 0.8059, Recall: 0.7873
Starting fold 2


  torch.tensor(labels, dtype=torch.long)


Average Training Loss: 0.4485


  model.load_state_dict(torch.load(MODEL_PATH))


Fold 2 - F1: 0.8412, Accuracy: 0.8276, Precision: 0.8301, Recall: 0.8276
Starting fold 3


  torch.tensor(labels, dtype=torch.long)


Average Training Loss: 0.4037


  model.load_state_dict(torch.load(MODEL_PATH))


Fold 3 - F1: 0.8819, Accuracy: 0.8711, Precision: 0.8751, Recall: 0.8711
Starting fold 4


  torch.tensor(labels, dtype=torch.long)


Average Training Loss: 0.3442


  model.load_state_dict(torch.load(MODEL_PATH))


Fold 4 - F1: 0.8981, Accuracy: 0.8898, Precision: 0.8922, Recall: 0.8898
Starting fold 5


  torch.tensor(labels, dtype=torch.long)


Average Training Loss: 0.2989


  model.load_state_dict(torch.load(MODEL_PATH))


Fold 5 - F1: 0.9238, Accuracy: 0.9191, Precision: 0.9198, Recall: 0.9191
Finished Training
Saved Predictions
Best Fold: 5
Best F1: 0.9238
Best Accuracy: 0.9191
Best Precision: 0.9198
Best Recall: 0.9191
Best Confusion Matrix (tn, fp, fn, tp): (276, 33, 19, 315)


In [None]:
from google.colab import files
files.download('arabert_predictions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>