<a href="https://colab.research.google.com/github/khered20/MTL-Dial2MSA/blob/main/MTLtrain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Datasets and Libraries Preparation**

In [1]:
!pip install sacremoses sacrebleu  -q

In [2]:
import os

# Clone your GitHub repository
if not os.path.exists("MTL-Dial2MSA"):
    !git clone https://github.com/khered20/MTL-Dial2MSA.git
    %cd MTL-Dial2MSA
else:
    %cd MTL-Dial2MSA

/content/MTL-Dial2MSA


In [3]:
import os

# Clone your GitHub repository
if not os.path.exists("Dial2MSA-Verified"):
    !git clone https://github.com/khered20/Dial2MSA-Verified.git

In [4]:
import pandas as pd
import glob, os
import os

# Create the 'data' directory if it doesn't exist
if not os.path.exists("data"):
    os.makedirs("data")


# Define mapping of dialects to msa column name
msa_mapping = {
    "egy": "msa",
    "mgr": "msa",
    "glf": "msa_verified",
    "lev": "msa_verified"
}

def merge_csvs(folder):
    dfs = []
    files = glob.glob(os.path.join("Dial2MSA-Verified", folder, "*.csv"))
    for f in files:
        dialect = f.split("/")[-1].split("_")[0]   # e.g., "egy_train.csv" -> "egy"
        msa_col = msa_mapping[dialect]             # choose correct msa column
        df = pd.read_csv(f)

        # Extract required columns
        df = df[["cleanedtweet2", msa_col]].rename(
            columns={"cleanedtweet2": "dialect_sentence", msa_col: "msa_translation"}
        )
        df["dialect_label"] = dialect.upper()

        dfs.append(df)
    merged = pd.concat(dfs, ignore_index=True)
    merged = merged[["dialect_label", "dialect_sentence", "msa_translation"]]
    return merged

# Merge train & dev
train_df = merge_csvs("train")
dev_df   = merge_csvs("dev")

print("Train samples:", len(train_df))
print("Dev samples:", len(dev_df))
print(train_df.head())


Train samples: 23087
Dev samples: 800
  dialect_label                                   dialect_sentence  \
0           MGR  حررو المقر الاصلي لمصرف الوحدة السوق ميدان الش...   
1           MGR    الان صباحكم مفتاح شقلوف ومعاش بنقول أكثر من هكي   
2           MGR  سلمىٰ تي هذا مرا من المرات ياتامو ومشكلا فجأه هكي   
3           MGR                          صار هكي يا جيجي توا نوريك   
4           MGR                         حتى انا في الاول هكي نحساب   

                                     msa_translation  
0  حرروا المقر الاصلي لمصرف  الوحدة السوق ميدان ا...  
1             الان صباحكم مفتاح ولن اقول اكثر من هذا  
2  سلمى إن هذا في مرة من المرات  يتجمعون و فجأة ت...  
3                        هكذا اذا يا جيجي الان سأريك  
4              حتى انا كنت  اظنها في بادئ الامر هكذا  


In [5]:
print("Available dialect labels:", train_df['dialect_label'].unique())

Available dialect labels: ['MGR' 'LEV' 'GLF' 'EGY']


In this notebook, we only used the Dial2MSA-Verified training set.

---

Combine the following Additional Corpora in the training (**PADIC, MADAR,Arabic STS, Emi-NADI and LahjaTube datasets**) to reproduce the models from the paper


In [6]:
### Optional if you want augmenting the data with MSA pairs similar to the paper
# Duplicate train_df
train_df_msa = train_df.copy()

# Change dialect_label to 'MSA' in the duplicated DataFrame
train_df_msa['dialect_label'] = 'MSA'
train_df_msa['dialect_sentence'] = train_df_msa['msa_translation']

# Concatenate the original and duplicated DataFrames
train_df = pd.concat([train_df, train_df_msa], ignore_index=True)

train_df = train_df.drop_duplicates(["dialect_sentence", "msa_translation"], keep="first")
# Display the first few rows and the new length to verify
print("New length of train_df:", len(train_df))
print(train_df.head())
print("Available dialect labels:", train_df['dialect_label'].unique())

New length of train_df: 45760
  dialect_label                                   dialect_sentence  \
0           MGR  حررو المقر الاصلي لمصرف الوحدة السوق ميدان الش...   
1           MGR    الان صباحكم مفتاح شقلوف ومعاش بنقول أكثر من هكي   
2           MGR  سلمىٰ تي هذا مرا من المرات ياتامو ومشكلا فجأه هكي   
3           MGR                          صار هكي يا جيجي توا نوريك   
4           MGR                         حتى انا في الاول هكي نحساب   

                                     msa_translation  
0  حرروا المقر الاصلي لمصرف  الوحدة السوق ميدان ا...  
1             الان صباحكم مفتاح ولن اقول اكثر من هذا  
2  سلمى إن هذا في مرة من المرات  يتجمعون و فجأة ت...  
3                        هكذا اذا يا جيجي الان سأريك  
4              حتى انا كنت  اظنها في بادئ الامر هكذا  
Available dialect labels: ['MGR' 'LEV' 'GLF' 'EGY' 'MSA']


In [7]:
# Save train_df and dev_df
os.makedirs("data", exist_ok=True)
train_df.to_csv("data/All_train_mtl.csv", index=False)
dev_df.to_csv("data/All_dev_mtl.csv", index=False)

**Training Phase**

In [8]:
import sys
sys.path.append('./MTL-Dial2MSA')

from mtl.dataset import create_data_loaders
from mtl.models import MultiTaskT5, MultiTaskMBart
from mtl.train import train
from mtl.utils import cleanup
from transformers import AutoTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW
import torch


In [9]:
import json

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 16
max_length = 128
num_epochs = 1
alpha=0.5

# === USER CHOICE ===
MODEL_TYPE = "AraT5"      # options: "AraT5" or "AraBART"
if MODEL_TYPE == "AraT5":
    SAVE_PATH = "saved_models/mtl_AraT5"
    MODEL_NAME = "UBC-NLP/AraT5v2-base-1024"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = MultiTaskT5(num_labels=5, pretrained_model=MODEL_NAME).to(device)
elif MODEL_TYPE == "AraBART":
    SAVE_PATH = "saved_models/mtl_AraBART"
    MODEL_NAME="moussaKam/AraBART"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = MultiTaskMBart(num_labels=5, pretrained_model=MODEL_NAME).to(device)


train_loader, val_loader, tokenizer = create_data_loaders(
    "data/All_train_mtl.csv", "data/All_dev_mtl.csv",
    tokenizer, batch_size=batch_size, max_length=max_length
)

print(f"Initialized {MODEL_TYPE} with {len(train_loader.dataset)} training samples and {len(val_loader.dataset)} dev samples.")
print(f"Using {device} for training.")


tokenizer.save_pretrained(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH+'/trns')
tokenizer.save_pretrained(SAVE_PATH+'/cls')
tokenizer.save_pretrained(SAVE_PATH+'/last')

config = {
    "MODEL_TYPE": MODEL_TYPE,
    "base_model": MODEL_NAME,
    "num_labels": 5,
    "max_length": max_length,
    "batch_size": batch_size,
    "custom_parameters": {
        "alpha": alpha,
    }
}

config_path = SAVE_PATH+'/config.json'
with open(config_path, 'w') as json_file:
    json.dump(config, json_file, indent=4)

config_path = SAVE_PATH+'/trns'+'/config.json'
with open(config_path, 'w') as json_file:
    json.dump(config, json_file, indent=4)

config_path = SAVE_PATH+'/cls'+'/config.json'
with open(config_path, 'w') as json_file:
    json.dump(config, json_file, indent=4)

config_path = SAVE_PATH+'/last'+'/config.json'
with open(config_path, 'w') as json_file:
    json.dump(config, json_file, indent=4)


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


Initialized AraT5 with 45760 training samples and 800 dev samples.
Using cuda for training.


In [10]:
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * num_epochs
)


In [11]:
best_bleu, best_f1 = train(
    model, train_loader, optimizer, scheduler, device,
    val_loader, tokenizer, epochs=num_epochs, save_path=SAVE_PATH,alpha=alpha
)
print("Training finished!")
print("Best BLEU:", best_bleu)
print("Best F1:", best_f1)

cleanup()


Epoch 1:  17%|█▋        | 499/2860 [11:05<52:06,  1.32s/it, loss=1.37] 

Step 500: BLEU=21.6114, F1=0.9684


Epoch 1:  35%|███▍      | 999/2860 [23:05<39:45,  1.28s/it, loss=0.29]

Step 1000: BLEU=25.4980, F1=0.9705


Epoch 1:  52%|█████▏    | 1499/2860 [35:12<29:14,  1.29s/it, loss=0.806]

Step 1500: BLEU=26.4580, F1=0.9570


Epoch 1:  70%|██████▉   | 1999/2860 [47:20<18:34,  1.29s/it, loss=0.355]

Step 2000: BLEU=27.4874, F1=0.9630


Epoch 1:  87%|████████▋ | 2500/2860 [1:00:14<1:40:19, 16.72s/it, loss=0.741]

Step 2500: BLEU=27.4769, F1=0.9665


Epoch 1: 100%|██████████| 2860/2860 [1:07:58<00:00,  1.43s/it, loss=0.65]


dev Epoch 1, BLEU Score: 27.7920, F1 Score: 0.9694, -best bleu: 27.7920, best f1: 0.9705
Training finished!
Best BLEU: 27.792019431319627
Best F1: 0.9705434651555342


**Prepare libraries for loading MTL Model**:

In [9]:
import os

if os.path.exists("MTL-Dial2MSA"):
    %cd MTL-Dial2MSA
else:
    print("MTL-Dial2MSA directory does not exist.")

import sys
sys.path.append('./MTL-Dial2MSA')
from mtl.dataset import create_data_loaders
from mtl.models import MultiTaskT5, MultiTaskMBart
from mtl.train import train
from mtl.utils import cleanup
from transformers import AutoTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW
import torch
import json
import pandas as pd
import glob

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if os.path.exists("saved_models/mtl_AraT5") and 'SAVE_PATH' not in locals():
    SAVE_PATH="saved_models/mtl_AraT5"
else:
    print("saved_models/mtl_AraT5 directory does not exist.")
    SAVE_PATH = None # Or handle the case where the directory doesn't exist

MTL-Dial2MSA directory does not exist.


**Load MTL Model**:
1.   trns: for best BLUE score translation model
2.   cls: for best weighted F1 score classification model
3.   last: for last model saved during training





In [10]:
# Load the model with best translation results on DEV set
best_model_path = os.path.join(SAVE_PATH, "trns")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(best_model_path)

# Load full model
model = torch.load(best_model_path + "/model.pt", map_location=device, weights_only=False)
model.to(device)
model.eval()



MultiTaskT5(
  (translator): T5ForConditionalGeneration(
    (shared): Embedding(110208, 768)
    (encoder): T5Stack(
      (embed_tokens): Embedding(110208, 768)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=768, out_features=768, bias=False)
                (k): Linear(in_features=768, out_features=768, bias=False)
                (v): Linear(in_features=768, out_features=768, bias=False)
                (o): Linear(in_features=768, out_features=768, bias=False)
                (relative_attention_bias): Embedding(32, 12)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseGatedActDense(
                (wi_0): Linear(in_features=768, out_features=2048, bias=False)
                (wi_1): L

In [25]:
import json
config_path = os.path.join(SAVE_PATH, 'config.json')
if os.path.exists(config_path) and 'batch_size' not in locals() and 'MODEL_TYPE' not in locals():
    with open(config_path, 'r') as f:
        config = json.load(f)
    batch_size = config.get("batch_size")
    max_length = config.get("max_length")
    MODEL_TYPE = config.get("MODEL_TYPE")
    print(f"Batch size: {batch_size}")
    print(f"Max length: {max_length}")
    print(f"Model type: {MODEL_TYPE}")
else:
    print(f"MODEL_TYPE and batch_size already exists or Config file not found at {config_path}")
    batch_size = 16  # Default value
    max_length = 128 # Default value
    MODEL_TYPE = "AraT5" # Default value


Config file not found at saved_models/mtl_AraT5/config.json


**Test the MTL Model**

In [11]:
from mtl.predict import predict

samples = "شو عم تعمل"
outputs = predict(model, tokenizer, samples, device)

for o in outputs:
    print("\nInput:", o["input"])
    print("Predicted Dialect:", o["dialect"])
    print("Predicted Translation:", o["translation"])



Input: شو عم تعمل
Predicted Dialect: LEV
Predicted Translation: ماذا تفعل؟


In [12]:
samples = ["إزاي عملت كده", "شلونك ي ابوي"]
outputs = predict(model, tokenizer, samples, device)

for o in outputs:
    print("\nInput:", o["input"])
    print("Predicted Dialect:", o["dialect"])
    print("Predicted Translation:", o["translation"])


Input: إزاي عملت كده
Predicted Dialect: EGY
Predicted Translation: كيف فعلت ذلك؟

Input: شلونك ي ابوي
Predicted Dialect: GLF
Predicted Translation: كيف حالك يا أبي؟


**Get the Dial2MSA-Verified test set**

In [15]:
!7z x Dial2MSA-Verified/test.7z



7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.00GHz (50653),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 1041763 bytes (1018 KiB)

Extracting archive: Dial2MSA-Verified/test.7z
--
Path = Dial2MSA-Verified/test.7z
Type = 7z
Physical Size = 1041763
Headers Size = 680
Method = LZMA2:12m
Solid = +
Blocks = 1

  0%    Everything is Ok

Folders: 5
Files: 24
Size:       10699521
Compressed: 1041763


In [13]:
egy_file_path = "test/egy/cln2_tweet_egy_ts.txt"
glf_file_path = "test/glf/cln_tweet_glf_ts.txt"
lev_file_path = "test/lev/cln_tweet_lev_ts.txt"
mgr_file_path = "test/mgr/cln_tweet_mgr_ts.txt"

with open(egy_file_path, 'r') as f:
    egy_tweets = [line.strip() for line in f.readlines()]

with open(glf_file_path, 'r') as f:
    glf_tweets = [line.strip() for line in f.readlines()]

with open(lev_file_path, 'r') as f:
    lev_tweets = [line.strip() for line in f.readlines()]

with open(mgr_file_path, 'r') as f:
    mgr_tweets = [line.strip() for line in f.readlines()]

print(f"Read {len(egy_tweets)} Egyptian tweets.")
print(f"Read {len(glf_tweets)} Gulf tweets.")
print(f"Read {len(lev_tweets)} Levantine tweets.")
print(f"Read {len(mgr_tweets)} Maghrebi tweets.")

Read 2000 Egyptian tweets.
Read 2000 Gulf tweets.
Read 2000 Levantine tweets.
Read 2000 Maghrebi tweets.


In [31]:
import pandas as pd
from mtl.dataset import create_data_loaders
from mtl.predict import BigListPred

# Create a DataFrame from the test tweets
test_data = {
    "dialect_label": ["EGY"] * len(egy_tweets)+ ["MGR"] * len(mgr_tweets) + ["GLF"] * len(glf_tweets) + ["LEV"] * len(lev_tweets) ,
    "dialect_sentence": egy_tweets + mgr_tweets + glf_tweets + lev_tweets,
    "msa_translation": egy_tweets + mgr_tweets + glf_tweets + lev_tweets # There are multiple MSA references, we will not load them now
}
test_df = pd.DataFrame(test_data)
test_df.to_csv("data/All_test.csv", index=False)

# Create a data loader for the test set
# Assuming create_data_loaders can handle a single DataFrame for testing

shuffled_test_loader, test_loader, tokenizer = create_data_loaders(
    "data/All_test.csv", "data/All_test.csv",
    tokenizer, batch_size=batch_size, max_length=max_length
)

# Evaluate the model
model_trns_pred, model_dialects_pred = BigListPred(model, test_loader, tokenizer, device)

In [32]:
# Save the trns_pred list to a file
model_test_pred_8000 = f"test_pred_8000_MTL_{MODEL_TYPE}.txt"
with open(model_test_pred_8000, "w", encoding="utf-8") as f:
    for item in model_trns_pred:
        f.write("%s\n" % item)

print(f"Predictions saved to {model_test_pred_8000}")

# Run the evaluation script
!python Dial2MSA-Verified/bleu_chrf2.py Dial2MSA_test_eval_4_dialects {model_test_pred_8000}

Predictions saved to test_pred_8000_MTL_AraT5.txt
test_pred_8000_MTL_AraT5_egy_pred.txt
MSA refrences: 3 refrences files
bleu score:  30.000766912223803
chrf++ score:  52.12574286048247
====
test_pred_8000_MTL_AraT5_egy_pred.txt
MSA refrences: 1 refrences files
bleu score:  14.825464987161311
chrf++ score:  41.12207622682467
====
test_pred_8000_MTL_AraT5_egy_pred.txt
MSA refrences: 1 refrences files
bleu score:  14.024623649031575
chrf++ score:  39.85025457432316
====
test_pred_8000_MTL_AraT5_egy_pred.txt
MSA refrences: 1 refrences files
bleu score:  14.436049008179106
chrf++ score:  40.4512791364493
====
test_pred_8000_MTL_AraT5_egy_pred.txt
MSA refrences: 2 refrences files
bleu score:  23.51528303866898
chrf++ score:  47.96493751861695
====
test_pred_8000_MTL_AraT5_egy_pred.txt
MSA refrences: 2 refrences files
bleu score:  23.31289897261804
chrf++ score:  47.51616912765657
====
test_pred_8000_MTL_AraT5_mgr_pred.txt
MSA refrences: 2 refrences files
bleu score:  34.95442082173521
chrf+

In [33]:
# Show the overall translation performance on all MSA reference for each dialect
import pandas as pd
df = pd.read_csv('eval_dial2msa.csv')
df

Unnamed: 0,model,g_file,p_file,metrx,Egy,Glf,Mgr,Lev,Avg
0,test_pred_8000_MTL_AraT5,Dial2MSA_test_eval_4_dialects,test_pred_8000_MTL_AraT5.txt,bleu,30.000767,51.877278,34.954421,45.941933,40.6936
1,test_pred_8000_MTL_AraT5,Dial2MSA_test_eval_4_dialects,test_pred_8000_MTL_AraT5.txt,chrf++,52.125743,70.089353,58.749621,65.427929,61.598161


In [34]:
# Show the translation performance on each MSA reference for each dialect
import pandas as pd
df = pd.read_csv('eval_trans.csv')
df

Unnamed: 0,model,di,source,bleu,chrf2
0,test_pred_8000_MTL_AraT5_egy_pred.txt,3_filesRefs_test/egy/gold_msa_egy_ts,2000,30.000767,52.125743
1,test_pred_8000_MTL_AraT5_egy_pred.txt,file1_,2000,14.825465,41.122076
2,test_pred_8000_MTL_AraT5_egy_pred.txt,file2_,2000,14.024624,39.850255
3,test_pred_8000_MTL_AraT5_egy_pred.txt,file3_,2000,14.436049,40.451279
4,test_pred_8000_MTL_AraT5_egy_pred.txt,file12_,2000,23.515283,47.964938
5,test_pred_8000_MTL_AraT5_egy_pred.txt,file23_,2000,23.312899,47.516169
6,test_pred_8000_MTL_AraT5_mgr_pred.txt,2_filesRefs_test/mgr/gold_msa_mgr_ts,2000,34.954421,58.749621
7,test_pred_8000_MTL_AraT5_mgr_pred.txt,file1_,2000,23.951199,52.048655
8,test_pred_8000_MTL_AraT5_mgr_pred.txt,file2_,2000,23.387989,51.712088
9,test_pred_8000_MTL_AraT5_glf_pred.txt,3_filesRefs_test/glf/gold_msa_glf_ts,2000,51.877278,70.089353


In [35]:
from sklearn.metrics import accuracy_score, f1_score


true_labels = test_df['dialect_label'].tolist()

if len(true_labels) != len(model_dialects_pred):
    print("Warning: Length of true_labels and dialects_pred do not match. Cannot calculate metrics.")
else:
    # Calculate Accuracy
    accuracy = accuracy_score(true_labels, model_dialects_pred)
    print(f"Accuracy: {accuracy:.4f}")

    # Calculate F1-macro
    f1_macro = f1_score(true_labels, model_dialects_pred, average='macro')
    print(f"F1-macro: {f1_macro:.4f}")

    # Calculate F1-weighted
    f1_weighted = f1_score(true_labels, model_dialects_pred, average='weighted')
    print(f"F1-weighted: {f1_weighted:.4f}")

Accuracy: 0.9657
F1-macro: 0.7853
F1-weighted: 0.9816
