In [3]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
import os
import json
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm
import time

In [None]:
class SentenceEncoder(nn.Module):
    def __init__(self, model_name="SI2M-Lab/DarijaBERT"):
        super().__init__()
        self.tokenizer=AutoTokenizer.from_pretrained(model_name)
        self.model=AutoModel.from_pretrained(model_name)
        self.output_dim=self.model.config.hidden_size
        #Freeze all layers by default
        for param in self.model.parameters():
            param.requires_grad = False
        #Unfreeze the top 4 layers
        for i in range(8, 12):
            for param in self.model.encoder.layer[i].parameters():
                param.requires_grad=True

    def forward(self, sentences: list[str]) -> torch.Tensor:
        """
        Input: list of utterances
        Output: Tensor of shape (len(sentences), hidden_size)
        """
        encoded=self.tokenizer(sentences, padding=True, max_length=128,truncation=True, return_tensors="pt").to(self.model.device)


        output=self.model(**encoded)

        # Mean pooling over token embeddings

        embeddings=output.last_hidden_state
        attention_mask=encoded["attention_mask"].unsqueeze(-1)
        masked_embeddings=embeddings*attention_mask
        summed=masked_embeddings.sum(dim=1)
        counts=attention_mask.sum(dim=1).clamp(min=1)

        return summed/counts


In [None]:
def build_pairwise_features(s:torch.Tensor, t:torch.Tensor) ->torch.Tensor:
    """
    Given two senetence embeddings s and t of shape [d]
    the function returns feature vector of shape [5*d]: [s,t,s-t,|s-t|,s*t]
    """
    diff=s-t
    abs_diff=torch.abs(s-t)
    dot=s*t

    features=torch.cat([s,t,diff, abs_diff, dot], dim=-1)

    return features

In [None]:
class LCD_scorer(nn.Module):
    def __init__(self, embedding_dim=768, hidden_dim=128, hidden_dropout=0.1, input_dropout=0.1):
        super().__init__()
        input_dim=5*embedding_dim
        #initializing the one layer MLP
        self.mlp=nn.Sequential(
            nn.Dropout(input_dropout),
            nn.Linear(input_dim,hidden_dim),
            nn.ReLU(),
            nn.Dropout(hidden_dropout),
            nn.Linear(hidden_dim,1) #to output a single score
        )
    def forward(self, features: torch.Tensor) -> torch.Tensor:
        return self.mlp(features).squeeze(-1)

In [None]:
def load_dialogues_with_speakers(folder_path="/content/final_datasets"):
    all_dialogues = []

    for filename in os.listdir(folder_path):
        if filename.endswith(".jsonl"):
            filepath = os.path.join(folder_path, filename)
            with open(filepath, "r", encoding="utf-8") as f:
                for line in f:
                    try:
                        data = json.loads(line)
                        raw_dialogue = data.get("dialogue", [])

                        # Extract list of (speaker, utterance)
                        dialogue = []
                        for turn in raw_dialogue:
                            if isinstance(turn, dict):
                                for speaker, utterance in turn.items():
                                    dialogue.append((speaker, utterance))
                        if len(dialogue) >= 2:  # skip too-short dialogues
                            all_dialogues.append(dialogue)
                    except json.JSONDecodeError:
                        print(f"Warning: could not parse line in {filename}")

    return all_dialogues # Each dialogue has form: [(speaker, utterance), ...], all_dialogues of form [dialogue1, dialogue2, ...]


In [None]:
def bounded_margin_loss(
    score_pos,
    score_neg,
    margin=10,
    upper_bound=10.0,
    lower_bound=0.0,
    lambda_upper=1.0,
    lambda_lower=1.0
):
    # standard margin ranking loss
    margin_loss = torch.clamp(margin - (score_pos - score_neg), min=0)
    # Upper and lower bound penalties
    upper_penalty = torch.clamp(score_pos - upper_bound, min=0)
    lower_penalty = torch.clamp(lower_bound - score_pos, min=0)
    loss = margin_loss + lambda_upper * upper_penalty + lambda_lower * lower_penalty
    return loss


In [None]:
def train_one_epoch(dialogues, encoder, scorer_fwd, scorer_bwd, criterion, optimizer_fwd, optimizer_bwd, device):
    scorer_fwd.train()
    scorer_bwd.train()
    encoder.train()
    total_loss = 0
    correct = 0
    total = 0

    for dialogue in dialogues:
        if len(dialogue) < 3:
            continue
        speakers, utterances = zip(*dialogue)
        #using the sentence encoder, get the embeddings of the utterances
        senetence_embeddings = encoder(list(utterances)).to(device)
        dialogue_loss = 0
        pair_count = 0

        for i in range(len(senetence_embeddings) - 1):
            s_i = senetence_embeddings[i].unsqueeze(0)
            s_pos = senetence_embeddings[i + 1].unsqueeze(0)
            current_speaker = speakers[i]

            # Sample negatives from the opposite speaker (not i+1)
            candidates = [
                j for j in range(len(senetence_embeddings))
                if j != i + 1 and speakers[j] != current_speaker
            ]
            if not candidates:
                continue
            neg_idx = random.choice(candidates)
            s_neg = senetence_embeddings[neg_idx].unsqueeze(0)

            # Build features for both directions
            feat_pos_fwd = build_pairwise_features(s_i, s_pos).to(device)
            feat_neg_fwd = build_pairwise_features(s_i, s_neg).to(device)
            feat_pos_bwd = build_pairwise_features(s_pos, s_i).to(device)
            feat_neg_bwd = build_pairwise_features(s_neg, s_i).to(device)

            # Score the features
            score_pos_fwd = scorer_fwd(feat_pos_fwd.unsqueeze(0)).view(1)
            score_neg_fwd = scorer_fwd(feat_neg_fwd.unsqueeze(0)).view(1)
            score_pos_bwd = scorer_bwd(feat_pos_bwd.unsqueeze(0)).view(1)
            score_neg_bwd = scorer_bwd(feat_neg_bwd.unsqueeze(0)).view(1)

            # Average scores
            score_pos = (score_pos_fwd + score_pos_bwd) / 2
            score_neg = (score_neg_fwd + score_neg_bwd) / 2

            # Compute loss
            loss = bounded_margin_loss(
                score_pos,
                score_neg,
            )

            dialogue_loss += loss
            pair_count += 1
        if pair_count > 0:
            # Backpropagate and update
            dialogue_loss = dialogue_loss / pair_count
            optimizer_fwd.zero_grad()
            optimizer_bwd.zero_grad()
            dialogue_loss.backward()
            optimizer_fwd.step()
            optimizer_bwd.step()

            total_loss += dialogue_loss.item()
            #Increment the correct counter if the score of the positive is bigger than the negative one
            if score_pos.detach().item() > score_neg.detach().item():
                correct += 1
            total += 1
    accuracy = correct / total if total > 0 else 0.0

    return total_loss, accuracy


In [None]:
def eval_one_epoch(dialogues, encoder, scorer_fwd, scorer_bwd, criterion, device):
    scorer_fwd.eval()
    scorer_bwd.eval()
    encoder.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for dialogue in dialogues:
            if len(dialogue) < 3:
                continue
            speakers, utterances = zip(*dialogue)
            senetence_embeddings = encoder(list(utterances)).to(device)

            for i in range(len(senetence_embeddings) - 1):
                s_i = senetence_embeddings[i].unsqueeze(0).detach()
                s_pos = senetence_embeddings[i + 1].unsqueeze(0).detach()
                current_speaker = speakers[i]

                # Sample negatives from the same speaker (not i+1)
                candidates = [
                    j for j in range(len(senetence_embeddings))
                    if j != i + 1 and speakers[j] != current_speaker
                ]
                if not candidates:
                    continue
                neg_idx = random.choice(candidates)
                s_neg = senetence_embeddings[neg_idx].unsqueeze(0).detach()

                # Build features for both directions
                feat_pos_fwd = build_pairwise_features(s_i, s_pos).to(device)
                feat_neg_fwd = build_pairwise_features(s_i, s_neg).to(device)
                feat_pos_bwd = build_pairwise_features(s_pos, s_i).to(device)
                feat_neg_bwd = build_pairwise_features(s_neg, s_i).to(device)

                # Score the features
                score_pos_fwd = scorer_fwd(feat_pos_fwd.unsqueeze(0)).view(1)
                score_neg_fwd = scorer_fwd(feat_neg_fwd.unsqueeze(0)).view(1)
                score_pos_bwd = scorer_bwd(feat_pos_bwd.unsqueeze(0)).view(1)
                score_neg_bwd = scorer_bwd(feat_neg_bwd.unsqueeze(0)).view(1)

                # Average scores
                score_pos = (score_pos_fwd + score_pos_bwd) / 2
                score_neg = (score_neg_fwd + score_neg_bwd) / 2
                # Compute loss
                target = torch.tensor([1.0], device=device)

                loss = bounded_margin_loss(
                    score_pos,
                    score_neg,
                )
                total_loss += loss.item()

                if score_pos.detach().item() > score_neg.detach().item():
                    correct += 1
                total += 1

    accuracy = correct / total if total > 0 else 0.0
    return total_loss, accuracy

In [None]:
def train_val_split():
    all_dialogues=load_dialogues_with_speakers()
    train_d, test_val_d= train_test_split(all_dialogues, test_size=0.15, shuffle=True)
    test_d, val_d=train_test_split(test_val_d, test_size=0.333, shuffle=True)
    return train_d, test_d ,val_d #train (85%) test(10%) val(5%)

In [None]:

def permute_dialogue(dialogue, speaker_to_permute, num_permutations=1):
    # Gather indices and utterances of the target speaker
    speaker_indices = []
    speaker_utterances = []
    other_turns = []

    for idx, turn in enumerate(dialogue):
        if turn[0] == speaker_to_permute:
            speaker_indices.append(idx)
            speaker_utterances.append(turn[1])
        else:
            other_turns.append((idx, turn))

    permutations = []
    for _ in range(num_permutations):
        if len(speaker_indices) < 2:
            # Not enough to permute, just return original dialogue
            permutations.append(dialogue.copy())
            continue

        # Randomly choose how many utterances to permute (at least 2)
        k = random.randint(2, len(speaker_indices))
        # Pick random indices to permute
        permute_sub_idx = random.sample(range(len(speaker_indices)), k)
        # Extract their utterances and shuffle
        to_shuffle = [speaker_utterances[i] for i in permute_sub_idx]
        shuffled = to_shuffle.copy()
        while True:
            random.shuffle(shuffled)
            if shuffled != to_shuffle:
                break

        # Build a new utterance list for the target speaker
        new_speaker_utterances = speaker_utterances.copy()
        for orig, new in zip(permute_sub_idx, shuffled):
            new_speaker_utterances[orig] = new

        # Reconstruct dialogue
        permuted = []
        speaker_ptr = 0
        for idx, (speaker, _) in enumerate(dialogue):
            if speaker == speaker_to_permute:
                permuted.append((speaker, new_speaker_utterances[speaker_ptr]))
                speaker_ptr += 1
            else:
                # Get original non-permuted turn at this index
                orig_idx, orig_turn = other_turns.pop(0)
                permuted.append(orig_turn)

        permutations.append(permuted)

    return permutations


In [None]:
def compute_dialogue_pred(dialogue, encoder, scorer_fwd, scorer_bwd, device):
    speakers, utterances = zip(*dialogue)
    embeddings = encoder(list(utterances)).to(device)

    score_sum = 0
    for i in range(len(embeddings) - 1):
        s_i = embeddings[i].to(device)
        s_j = embeddings[i + 1].to(device)

        feat_fwd = build_pairwise_features(s_i, s_j).to(device)
        feat_bwd = build_pairwise_features(s_j, s_i).to(device)

        score_fwd = scorer_fwd(feat_fwd.unsqueeze(0)).view(1)
        score_bwd = scorer_bwd(feat_bwd.unsqueeze(0)).view(1)

        score = (score_fwd + score_bwd) / 2

        if score <0:
          return 0 # return 0 (as incoherent) if the score of a pair is negative 


    return 1 # return 1 (as coherent) if no pair has a negative score


In [None]:
def test_experiment(dialogues, encoder, scorer_fwd, scorer_bwd, device , num_permutations=1):
    encoder.eval()
    scorer_fwd.eval()
    scorer_bwd.eval()

    true=[]
    pred=[]

    with torch.no_grad():
        for dialogue in dialogues:
            if len(dialogue)< 3:
                continue
            #predict the class(coherent 1 or incoherent 0) of the original dialogue
            score=compute_dialogue_pred(dialogue,encoder, scorer_fwd, scorer_bwd, device)
            true.append(1)
            pred.append(score)

            #predict the class of the permuted dialogues in A's or B's utterances
            choice = random.choice(["A", "B"])
            permuted=permute_dialogue(dialogue, choice, num_permutations=num_permutations)
            for perm in permuted:
                score2=compute_dialogue_pred(perm, encoder, scorer_fwd, scorer_bwd, device)
                true.append(0)
                pred.append(score2)

            
        f1=f1_score(true , pred)
        accuracy=accuracy_score(true, pred)

        return f1, accuracy,  true, pred




In [None]:
if __name__=="__main__":
    device= torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder=SentenceEncoder()
    encoder.to(device)
    scorer_fwd=LCD_scorer()
    scorer_bwd=LCD_scorer()
    scorer_fwd.to(device)
    scorer_bwd.to(device)
    criterion=nn.MarginRankingLoss(margin=10)
    params_fwd = list(filter(lambda p: p.requires_grad, encoder.parameters())) + list(scorer_fwd.parameters())
    params_bwd = list(filter(lambda p: p.requires_grad, encoder.parameters())) + list(scorer_bwd.parameters())
    optimizer_fwd=torch.optim.Adam(params_fwd, lr=1e-5)
    optimizer_bwd=torch.optim.Adam(params_bwd, lr=1e-5)


    train_data, test_data ,val_data=train_val_split()

    num_epochs=8

    best_val_acc=0.0

    # Training loop, saving the model whenever we reach a higher validation accuracy to avoid overfitting 

    for epoch in range(num_epochs):
        start_time = time.time()

        with tqdm(total=1, desc=f"Epoch {epoch+1}/{num_epochs}", bar_format="{l_bar}{bar} [elapsed: {elapsed}]") as pbar:
            train_loss, train_acc = train_one_epoch(dialogues=train_data, encoder=encoder, scorer_fwd=scorer_fwd, scorer_bwd=scorer_bwd, criterion=criterion, optimizer_fwd=optimizer_fwd, optimizer_bwd=optimizer_bwd, device=device)
            val_loss, val_acc = eval_one_epoch(dialogues=val_data, encoder=encoder, scorer_fwd=scorer_fwd, scorer_bwd=scorer_bwd, criterion=criterion, device=device)


            pbar.update(1)
        print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, "f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
        if val_acc>best_val_acc:
            best_val_acc=val_acc
            print(f"Saving the current best model with validation accuracy {val_acc} at epoch {epoch+1}")
            torch.save(
                {
                    "encoder_state_dict": encoder.state_dict(),
                    "scorer_fwd_state_dict": scorer_fwd.state_dict(),
                    "scorer_bwd_state_dict": scorer_bwd.state_dict()
                }, "/content/best_lcd_in.pt"
            )



The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
Some weights of BertModel were not initialized from the model checkpoint at SI2M-Lab/DarijaBERT and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Epoch 1/8: 100%|██████████ [elapsed: 00:40]


Train Loss: 4876.0948, Train Acc: 0.7790, Val Loss: 1851.9860, Val Acc: 0.7016
Saving the current best model with validation accuracy 0.7016129032258065 at epoch 1


Epoch 2/8: 100%|██████████ [elapsed: 00:38]


Train Loss: 3570.0419, Train Acc: 0.7935, Val Loss: 1534.3780, Val Acc: 0.7298
Saving the current best model with validation accuracy 0.7298387096774194 at epoch 2


Epoch 3/8: 100%|██████████ [elapsed: 00:38]


Train Loss: 2989.9386, Train Acc: 0.8007, Val Loss: 1421.7869, Val Acc: 0.7782
Saving the current best model with validation accuracy 0.7782258064516129 at epoch 3


Epoch 4/8: 100%|██████████ [elapsed: 00:37]


Train Loss: 2580.6839, Train Acc: 0.8225, Val Loss: 1491.6307, Val Acc: 0.7863
Saving the current best model with validation accuracy 0.7862903225806451 at epoch 4


Epoch 5/8: 100%|██████████ [elapsed: 00:38]


Train Loss: 2342.1787, Train Acc: 0.8478, Val Loss: 1251.9556, Val Acc: 0.8145
Saving the current best model with validation accuracy 0.8145161290322581 at epoch 5


Epoch 6/8: 100%|██████████ [elapsed: 00:38]


Train Loss: 2082.8186, Train Acc: 0.8786, Val Loss: 1245.5516, Val Acc: 0.7782


Epoch 7/8: 100%|██████████ [elapsed: 00:37]


Train Loss: 1835.9685, Train Acc: 0.9058, Val Loss: 1266.0744, Val Acc: 0.7984


Epoch 8/8: 100%|██████████ [elapsed: 00:38]

Train Loss: 1665.2188, Train Acc: 0.9203, Val Loss: 1307.8588, Val Acc: 0.8145





In [None]:
#load the model wiht the best validation accuracy and testing it on binary classification 

checkpoint = torch.load("/content/best_lcd_in.pt", map_location="cuda")
encoder=SentenceEncoder()
encoder.load_state_dict(checkpoint["encoder_state_dict"])
encoder.to(device)
scorer_fwd=LCD_scorer()
scorer_fwd.load_state_dict(checkpoint["scorer_fwd_state_dict"])
scorer_bwd=LCD_scorer()
scorer_bwd.load_state_dict(checkpoint["scorer_bwd_state_dict"])
scorer_fwd.to(device)
scorer_bwd.to(device)

with tqdm(total=1, desc=f"Test", bar_format="{l_bar}{bar} [elapsed: {elapsed}]") as pbar:
    f1,accuracy, true, pred=test_experiment(test_data, encoder, scorer_fwd, scorer_bwd, 0, device)

    pbar.update(1)

print(f"F1 score: {f1}")
print(f"accuracy score: {accuracy}")

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
Some weights of BertModel were not initialized from the model checkpoint at SI2M-Lab/DarijaBERT and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Test: 100%|██████████ [elapsed: 00:01]

F1 score: 0.7627118644067796
accuracy score: 0.7846153846153846





In [None]:
#Printing the confusion matrix 
from sklearn.metrics import confusion_matrix
print(confusion_matrix(true, pred))


[[57  8]
 [20 45]]
