In [5]:
import os
import json
import random
from glob import glob
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import f1_score


In [6]:
class DialogueCoherenceDataset(Dataset):
    """
    Binary coherence dataset built from the ``*.jsonl`` files of a folder.

    Each non-empty line is expected to be a JSON object whose ``"dialogue"`` value is a
    list of ``{speaker: utterance}`` dicts. Dialogues with at least two turns are kept
    and truncated to ``max_utterances`` turns. For each kept dialogue the dataset holds:
        - The original (label=1)
        - At most one negative (label=0): a random subset (between 2 and all) of one
          speaker's utterances is permuted among that speaker's own turns. The negative
          is skipped when the dialogue has fewer than two speakers, when no speaker has
          more than one turn, or when no permutation different from the original is
          found within the attempt budget.

    Args:
        data_folder: Directory that is searched (non-recursively) for ``*.jsonl`` files.
        max_utterances: Maximum number of turns kept per dialogue.
        seed: Seed of the private random generator used to build the negatives.

    Raises:
        ValueError: If no dialogue with at least two turns could be loaded.
    """

    def __init__(self, data_folder, max_utterances=16, seed=42):
        self.dialogues = []
        self.labels = []
        self.max_utterances = max_utterances
        # Private generator: construction no longer reseeds the process-wide `random` state.
        rng = random.Random(seed)

        all_dialogues = self._load_dialogues(data_folder)
        if not all_dialogues:
            raise ValueError("No valid dialogues found in the provided folder.")

        for original_dialogue in all_dialogues:
            self.dialogues.append(original_dialogue)
            self.labels.append(1)

            neg_dialogue = self._make_negative(original_dialogue, rng)
            if neg_dialogue is not None:
                self.dialogues.append(neg_dialogue)
                self.labels.append(0)

    def _load_dialogues(self, data_folder):
        """Parse every ``*.jsonl`` file in ``data_folder`` into lists of (speaker, utterance) turns."""
        loaded = []
        # sorted(): glob returns files in arbitrary order, which would change the
        # dataset order (and therefore the sampled negatives) between machines.
        for file in sorted(glob(os.path.join(data_folder, "*.jsonl"))):
            with open(file, encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        obj = json.loads(line)
                    except json.JSONDecodeError:
                        print(f"Warning: could not parse line in {file}")
                        continue

                    # Valid JSON that is not an object, or whose "dialogue" is not a
                    # list (e.g. null), is skipped instead of crashing the whole load.
                    raw_dialogue = obj.get("dialogue", []) if isinstance(obj, dict) else None
                    if not isinstance(raw_dialogue, list):
                        print(f"Warning: unexpected record structure in {file}")
                        continue

                    dialogue = [
                        (speaker, utterance)
                        for turn in raw_dialogue
                        if isinstance(turn, dict)
                        for speaker, utterance in turn.items()
                    ]
                    if len(dialogue) >= 2:
                        loaded.append(dialogue[:self.max_utterances])
        return loaded

    @staticmethod
    def _make_negative(original_dialogue, rng, max_attempts=20):
        """Return a copy of the dialogue with one speaker's utterances permuted, or None if impossible."""
        # A dict keeps first-appearance order. Iterating a set of strings instead would
        # give an order that varies between interpreter runs and defeat the seed.
        turn_counts = {}
        for spk, _ in original_dialogue:
            turn_counts[spk] = turn_counts.get(spk, 0) + 1
        if len(turn_counts) < 2:
            return None

        valid_speakers = [spk for spk, count in turn_counts.items() if count > 1]
        if not valid_speakers:
            return None

        target_spk = rng.choice(valid_speakers)
        indices = [idx for idx, (spk, _) in enumerate(original_dialogue) if spk == target_spk]
        utts = [original_dialogue[idx][1] for idx in indices]

        for _ in range(max_attempts):
            num_to_shuffle = rng.randint(2, len(utts))
            selected = rng.sample(range(len(utts)), num_to_shuffle)
            picked = [utts[i] for i in selected]
            rng.shuffle(picked)

            shuffled_utts = utts.copy()
            for pos, utt in zip(selected, picked):
                shuffled_utts[pos] = utt
            if shuffled_utts != utts:
                break
        else:
            # Every attempt reproduced the original order (e.g. identical utterances).
            return None

        neg_dialogue = original_dialogue.copy()
        for pos, orig_idx in enumerate(indices):
            neg_dialogue[orig_idx] = (target_spk, shuffled_utts[pos])
        return neg_dialogue

    def __len__(self):
        return len(self.dialogues)

    def __getitem__(self, idx):
        """Return ``(utterances, label)`` where utterances is the list of turn texts without speakers."""
        dialogue = self.dialogues[idx]
        utterances = [utt for _, utt in dialogue]
        return utterances, self.labels[idx]


In [None]:

class SentenceEncoder(nn.Module):
    """Turns raw utterance strings into one vector each using a pretrained HuggingFace model.

    All backbone parameters are frozen, then the parameters of the last
    ``n_unfrozen`` entries of ``model.encoder.layer`` are made trainable again.

    Args:
        model_name: HuggingFace model identifier passed to ``from_pretrained``.
        n_unfrozen: Number of trailing encoder layers left trainable.
    """

    def __init__(self, model_name="SI2M-Lab/DarijaBERT", n_unfrozen=2):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

        # Start from a fully frozen backbone ...
        self._set_trainable(self.model, False)

        # ... then re-enable gradients for the trailing encoder layers only.
        layers = self.model.encoder.layer
        layer_count = len(layers)
        for layer_idx in range(layer_count - n_unfrozen, layer_count):
            self._set_trainable(layers[layer_idx], True)

    @staticmethod
    def _set_trainable(module, flag):
        """Set ``requires_grad`` to ``flag`` on every parameter of ``module``."""
        for weight in module.parameters():
            weight.requires_grad = flag

    def forward(self, utterances):
        """Tokenize ``utterances`` (list of str) and return the position-0 hidden state of each.

        Inputs are padded to the longest item and truncated to 128 tokens; the
        tokenized tensors are moved to the device of the wrapped model.
        Position 0 is presumably the [CLS] token for this BERT-style tokenizer.
        """
        encoded = self.tokenizer(
            utterances,
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt",
        )
        target_device = self.model.device
        model_inputs = {name: tensor.to(target_device) for name, tensor in encoded.items()}

        hidden_states = self.model(**model_inputs).last_hidden_state
        return hidden_states[:, 0, :]


In [None]:
class OrderAwareDocEncoder(nn.Module):
    """Scores a sequence of sentence embeddings with a small Transformer encoder.

    Learnable positional embeddings are added to the inputs so the model can see
    utterance order; the encoder output at position 0 is fed through a two-layer
    head that produces one logit per document.

    Args:
        hidden_size: Dimension of the incoming sentence embeddings.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout rate used in the encoder layers and in the head.
        max_len: Number of positions the positional table holds, i.e. the longest
            sequence ``forward`` accepts.
    """

    def __init__(self, hidden_size=768, nhead=4, num_layers=2, dropout=0.1, max_len=32):
        super().__init__()
        # Learnable positional embeddings, small-normal initialised.
        # Attribute names are part of the checkpoint format (state_dict keys).
        self.positional = nn.Parameter(torch.zeros(1, max_len, hidden_size))
        nn.init.normal_(self.positional, std=0.02)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=nhead,
            dropout=dropout,
            batch_first=True,  # inputs are (batch, seq, features)
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Classification head
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(hidden_size, 1)

    def forward(self, sent_emb, mask):
        """
        Args:
            sent_emb: Tensor of shape (batch_size, seq_len, hidden_size), sentence embeddings for each document
            mask: Tensor of shape (batch_size, seq_len), 1 for real sentences, 0 for padding
        Returns:
            logits: Tensor of shape (batch_size, 1), document coherence logits
        Raises:
            ValueError: If seq_len is larger than the ``max_len`` the module was built with.
        """
        seq_len = sent_emb.size(1)
        max_len = self.positional.size(1)
        if seq_len > max_len:
            # Without this check the addition below fails with an opaque broadcast error.
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_len={max_len} of the positional embeddings."
            )

        # Add positional information to sentence embeddings
        x = sent_emb + self.positional[:, :seq_len, :]

        # The Transformer expects True at PAD positions (to be ignored as attention keys)
        x = self.encoder(x, src_key_padding_mask=~mask.bool())

        # Document representation = encoder output at position 0. There is no dedicated
        # document-level [CLS] token here: position 0 is the first utterance's slot.
        doc_emb = x[:, 0, :]

        h = self.dropout(self.relu(self.dense(doc_emb)))
        return self.out(h)  # raw logits (no sigmoid applied)


In [None]:
def collate_fn(batch):
    """Collate ``(utterances, label)`` samples into a padded batch.

    Every utterance list is right-padded with empty strings to the longest
    dialogue in the batch.

    Returns:
        padded_utts: list of equally long lists of str.
        mask: float tensor (batch, maxlen), 1 for real utterances and 0 for padding.
        labels: float tensor (batch, 1).
    """
    utter_lists, labels = zip(*batch)
    lengths = [len(utts) for utts in utter_lists]
    longest = max(lengths)

    padded_utts = []
    mask_rows = []
    for utts, n_real in zip(utter_lists, lengths):
        n_pad = longest - n_real
        padded_utts.append(utts + [""] * n_pad)
        mask_rows.append([1] * n_real + [0] * n_pad)

    mask = torch.tensor(mask_rows, dtype=torch.float)
    label_column = torch.tensor(labels, dtype=torch.float).view(-1, 1)
    return padded_utts, mask, label_column


In [None]:
def train_one_epoch(sent_encoder, doc_encoder, dataloader, optimizer, device, criterion):
    """Run one optimisation pass over ``dataloader``.

    Each batch yields ``(utter_lists, mask, labels)``. All utterances of the batch
    are encoded in one call to ``sent_encoder``, regrouped per dialogue, zero-padded
    to the longest dialogue and scored by ``doc_encoder``.

    Returns:
        (avg_loss, avg_acc): sample-weighted mean loss and accuracy at threshold 0.5.
    """
    for module in (sent_encoder, doc_encoder):
        module.train()

    running_loss = 0
    n_correct = 0
    n_seen = 0

    for utter_lists, mask, labels in tqdm(dataloader):
        n_dialogues = len(utter_lists)

        # One encoder call for the whole batch: concatenate all dialogues' utterances.
        all_utts = []
        for dialogue in utter_lists:
            all_utts.extend(dialogue)
        utt_vectors = sent_encoder(all_utts)

        # Cut the flat embedding matrix back into per-dialogue chunks and pad each
        # chunk with zero rows up to the longest dialogue of the batch.
        lengths = [len(dialogue) for dialogue in utter_lists]
        longest = max(lengths)
        padded_docs = []
        for doc_vectors in torch.split(utt_vectors, lengths, dim=0):
            filler = torch.zeros(longest - doc_vectors.size(0), doc_vectors.size(1), device=device)
            padded_docs.append(torch.cat([doc_vectors, filler], dim=0))
        doc_batch = torch.stack(padded_docs)  # (batch, seq_len, hidden)

        mask = mask.to(device)
        labels = labels.to(device)

        logits = doc_encoder(doc_batch, mask)
        loss = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Threshold the sigmoid at 0.5 to obtain hard predictions for the accuracy count.
        predicted = (torch.sigmoid(logits) > 0.5).long()
        n_correct += (predicted == labels.long()).sum().item()
        n_seen += n_dialogues
        running_loss += loss.item() * n_dialogues

    avg_loss = running_loss / n_seen
    avg_acc = n_correct / n_seen
    print(f"Train loss: {avg_loss:.4f} | Train acc: {avg_acc:.4f}")
    return avg_loss, avg_acc


In [None]:
#Same logic used in the train function, however, without allowing the gradients to flow 
def val_one_epoch(sent_encoder, doc_encoder, dataloader, device, criterion):
    """Evaluate both encoders on ``dataloader`` without gradient tracking.

    Mirrors the batching of the training step (encode all utterances at once,
    regroup per dialogue, zero-pad, score) but performs no parameter update.
    Prints loss, accuracy and binary F1.

    Returns:
        (avg_loss, avg_acc): sample-weighted mean loss and accuracy at threshold 0.5,
        both as plain Python floats.
    """
    sent_encoder.eval()
    doc_encoder.eval()
    total_loss, total_correct, total = 0.0, 0, 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for utter_lists, mask, labels in tqdm(dataloader):
            batch_size = len(utter_lists)
            flat_utterances = [utt for dialogue in utter_lists for utt in dialogue]
            flat_embeddings = sent_encoder(flat_utterances)
            splits = [len(utts) for utts in utter_lists]
            utter_emb_batch = torch.split(flat_embeddings, splits, dim=0)
            maxlen = max(splits)
            utter_emb_batch = [
                torch.cat([e, torch.zeros(maxlen - e.size(0), e.size(1), device=device)], dim=0)
                for e in utter_emb_batch
            ]
            utter_emb_batch = torch.stack(utter_emb_batch)
            mask = mask.to(device)
            labels = labels.to(device)

            logits = doc_encoder(utter_emb_batch, mask)
            loss = criterion(logits, labels)

            preds = (torch.sigmoid(logits) > 0.5).long().cpu().numpy().flatten()
            true_labels = labels.long().cpu().numpy().flatten()

            all_preds.extend(preds.tolist())
            all_labels.extend(true_labels.tolist())

            # int(): keep the counter (and the returned accuracy) a Python number
            # rather than a numpy scalar.
            total_correct += int((preds == true_labels).sum())
            total += batch_size
            total_loss += loss.item() * batch_size

    avg_loss = total_loss / total
    avg_acc = total_correct / total
    # F1 was previously computed and then dropped; report it next to accuracy.
    f1 = f1_score(all_labels, all_preds, average="binary")
    print(f"Val loss: {avg_loss:.4f} | Val acc: {avg_acc:.4f} | Val F1: {f1:.4f}")
    return avg_loss, avg_acc


In [None]:
#Same as the previous function 
def test_one_epoch(sent_encoder, doc_encoder, dataloader, device, criterion):
    """Evaluate both encoders on ``dataloader`` and collect the per-sample predictions.

    Runs in eval mode under ``torch.no_grad()`` and prints loss, accuracy and
    binary F1.

    Returns:
        (avg_loss, avg_acc, f1, all_preds, all_labels) where the last two are flat
        lists of 0/1 predictions and ground-truth labels in iteration order.
    """
    for module in (sent_encoder, doc_encoder):
        module.eval()

    total_loss = 0
    total_correct = 0
    total = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for utter_lists, mask, labels in tqdm(dataloader):
            n_dialogues = len(utter_lists)

            # Encode every utterance of the batch in a single call.
            all_utts = []
            for dialogue in utter_lists:
                all_utts.extend(dialogue)
            utt_vectors = sent_encoder(all_utts)

            # Regroup per dialogue and zero-pad to the longest dialogue of the batch.
            lengths = [len(dialogue) for dialogue in utter_lists]
            longest = max(lengths)
            padded_docs = []
            for doc_vectors in torch.split(utt_vectors, lengths, dim=0):
                filler = torch.zeros(longest - doc_vectors.size(0), doc_vectors.size(1), device=device)
                padded_docs.append(torch.cat([doc_vectors, filler], dim=0))
            doc_batch = torch.stack(padded_docs)

            mask = mask.to(device)
            labels = labels.to(device)

            logits = doc_encoder(doc_batch, mask)
            loss = criterion(logits, labels)

            # Hard 0/1 decisions at a 0.5 probability threshold, as flat numpy arrays.
            batch_preds = (torch.sigmoid(logits) > 0.5).long().cpu().numpy().flatten()
            batch_truth = labels.long().cpu().numpy().flatten()

            all_preds.extend(batch_preds.tolist())
            all_labels.extend(batch_truth.tolist())

            total_correct += (batch_preds == batch_truth).sum()
            total += n_dialogues
            total_loss += loss.item() * n_dialogues

    avg_loss = total_loss / total
    avg_acc = total_correct / total
    f1 = f1_score(all_labels, all_preds, average="binary")
    print(f"Test loss: {avg_loss:.4f} | Test acc: {avg_acc:.4f} | Test F1: {f1:.4f}")
    return avg_loss, avg_acc, f1, all_preds, all_labels


In [None]:
# Training cell (Skip if using pre-trained models)

# Settings

TRAIN_DATA_FOLDER = "final_datasets_cross"
# NOTE(review): the evaluation cell further down also reads the "Test" folder, so the
# checkpoint is selected on the same data it is later tested on. Point this at a
# separate held-out split if one is available.
VAL_DATA_FOLDER = "Test"
BATCH_SIZE = 8
EPOCHS = 12
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prepare datasets and loaders
train_dataset = DialogueCoherenceDataset(TRAIN_DATA_FOLDER)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

val_dataset = DialogueCoherenceDataset(VAL_DATA_FOLDER)
# Evaluation does not depend on sample order, so the validation loader is not shuffled.
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# Class-balanced losses: positives are weighted by the negative/positive ratio of each split.
n_positive = sum(train_dataset.labels)
n_negative = len(train_dataset.labels) - n_positive
pos_weight = torch.tensor([n_negative / n_positive]).to(DEVICE)
train_criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)

val_n_positive = sum(val_dataset.labels)
val_n_negative = len(val_dataset.labels) - val_n_positive
val_pos_weight = torch.tensor([val_n_negative / val_n_positive]).to(DEVICE)
# Fixed: the validation loss previously reused the training pos_weight,
# leaving val_pos_weight computed but unused.
val_criterion = torch.nn.BCEWithLogitsLoss(pos_weight=val_pos_weight)

# Build models
sent_encoder = SentenceEncoder(n_unfrozen=2).to(DEVICE)
doc_encoder = OrderAwareDocEncoder(hidden_size=768, num_layers=2, nhead=4, dropout=0.1, max_len=32).to(DEVICE)
# Optimise the whole document encoder plus only the unfrozen sentence-encoder parameters.
optimizer = torch.optim.Adam(
    list(doc_encoder.parameters()) +
    list(filter(lambda p: p.requires_grad, sent_encoder.parameters())),
    lr=1e-5
)

best_val_acc = 0.0

# Training loop: keep the checkpoint of the epoch with the highest validation accuracy.
for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS}")
    train_one_epoch(sent_encoder, doc_encoder, train_dataloader, optimizer, DEVICE, train_criterion)
    val_loss, val_acc = val_one_epoch(sent_encoder, doc_encoder, val_dataloader, DEVICE, val_criterion)
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(
            {
                "sent_encoder_state_dict": sent_encoder.state_dict(),
                "doc_encoder_state_dict": doc_encoder.state_dict(),
            },
            "best_HT_cross",
        )


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/307 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


model.safetensors:   0%|          | 0.00/836M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at SI2M-Lab/DarijaBERT and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/12


100%|██████████| 140/140 [00:33<00:00,  4.19it/s]


Train loss: 0.6912 | Train acc: 0.5439
Epoch 2/12


100%|██████████| 140/140 [00:34<00:00,  4.02it/s]


Train loss: 0.6666 | Train acc: 0.5860
Epoch 3/12


100%|██████████| 140/140 [00:32<00:00,  4.28it/s]


Train loss: 0.6441 | Train acc: 0.6129
Epoch 4/12


100%|██████████| 140/140 [00:33<00:00,  4.21it/s]


Train loss: 0.6303 | Train acc: 0.6120
Epoch 5/12


100%|██████████| 140/140 [00:33<00:00,  4.17it/s]


Train loss: 0.6189 | Train acc: 0.6380
Epoch 6/12


100%|██████████| 140/140 [00:33<00:00,  4.12it/s]


Train loss: 0.6060 | Train acc: 0.6192
Epoch 7/12


100%|██████████| 140/140 [00:33<00:00,  4.22it/s]


Train loss: 0.5989 | Train acc: 0.6470
Epoch 8/12


100%|██████████| 140/140 [00:33<00:00,  4.19it/s]


Train loss: 0.5850 | Train acc: 0.6559
Epoch 9/12


100%|██████████| 140/140 [00:32<00:00,  4.30it/s]


Train loss: 0.5760 | Train acc: 0.6487
Epoch 10/12


100%|██████████| 140/140 [00:33<00:00,  4.21it/s]


Train loss: 0.5672 | Train acc: 0.6604
Epoch 11/12


100%|██████████| 140/140 [00:32<00:00,  4.25it/s]


Train loss: 0.5626 | Train acc: 0.6747
Epoch 12/12


100%|██████████| 140/140 [00:33<00:00,  4.24it/s]

Train loss: 0.5654 | Train acc: 0.6640





In [None]:
DATA_FOLDER = "Test"
BATCH_SIZE = 8
# Defined here as well: the training cell is marked as skippable, and on a fresh
# kernel DEVICE would otherwise be undefined when this cell runs.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prepare dataset and loader (no shuffling needed for evaluation)
test_dataset = DialogueCoherenceDataset(DATA_FOLDER)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_n_positive = sum(test_dataset.labels)
test_n_negative = len(test_dataset.labels) - test_n_positive
test_pos_weight = torch.tensor([test_n_negative / test_n_positive]).to(DEVICE)

criterion = torch.nn.BCEWithLogitsLoss(pos_weight=test_pos_weight)

# Load the saved weights. The checkpoint is read onto the CPU; load_state_dict then
# copies the tensors into the models, which already live on DEVICE.
# weights_only=True restricts unpickling to tensors and plain containers.
checkpoint = torch.load("best_HT_cross", map_location="cpu", weights_only=True)

sent_encoder = SentenceEncoder(n_unfrozen=2).to(DEVICE)
sent_encoder.load_state_dict(checkpoint["sent_encoder_state_dict"])

doc_encoder = OrderAwareDocEncoder(hidden_size=768, num_layers=2, nhead=4, dropout=0.1, max_len=32).to(DEVICE)
doc_encoder.load_state_dict(checkpoint["doc_encoder_state_dict"])

# Run testing.
# The results are test-set metrics; the val_* names are kept because the
# confusion-matrix cell reads val_labels / val_preds.
val_loss, val_acc, val_f1, val_preds, val_labels = test_one_epoch(
    sent_encoder,
    doc_encoder,
    test_dataloader,
    DEVICE,
    criterion
)


  output = torch._nested_tensor_from_mask(
100%|██████████| 23/23 [00:04<00:00,  4.69it/s]

Test loss: 0.5536 | Test acc: 0.6793 | Test F1: 0.7531





In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test predictions.
# Rows are true labels, columns are predicted labels, both ordered [0, 1]
# (0 = shuffled dialogue, 1 = original dialogue).
# labels=[0, 1] pins the matrix to 2x2 even if one class is absent from the results.
cm = confusion_matrix(val_labels, val_preds, labels=[0, 1])
print("Confusion matrix:")
print(cm)

Confusion matrix:
[[35 57]
 [ 2 90]]
