When choosing a model, we were given two approaches: either building our own model using TF-IDF, or using an existing pre-trained model such as BERT, RoBERTa, etc.

The first model below uses a TF-IDF vectorizer and Naive Bayes — a classic approach.

# STEP 4-TF IDF AND NAIVE BAYES

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load the data
import pandas as pd
df = pd.read_csv("train_cleaned_numeric_labels.csv")

# Split the raw texts BEFORE vectorising, so that the TF-IDF vocabulary and
# IDF weights are learned from the training portion only. Fitting the
# vectorizer on the whole corpus would leak test documents into the features.
# Missing texts are replaced by empty strings so the vectorizer does not
# crash on NaN values.
texts_train, texts_test, y_train, y_test = train_test_split(
    df["Text"].fillna("").astype(str),
    df["Label"],
    test_size=0.2,
    random_state=42,
)

# Turn the texts into TF-IDF vectors (word uni-, bi- and tri-grams)
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
X_train = vectorizer.fit_transform(texts_train)  # learn vocabulary + IDF here
X_test = vectorizer.transform(texts_test)        # reuse them, no re-fitting

# Train the Naive Bayes model
model_nb = MultinomialNB()
model_nb.fit(X_train, y_train)

# Predict on the held-out split
y_pred = model_nb.predict(X_test)

# Report accuracy
print(f"Accuracy du modèle Naive Bayes : {accuracy_score(y_test, y_pred):.4f}")


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from sklearn.model_selection import train_test_split

# Load the data
df = pd.read_csv("train_cleaned_numeric_labels.csv")

# Hyper-parameters shared by the tokenizer, the padding step and the model,
# defined once so they cannot drift apart.
VOCAB_SIZE = 50000
MAX_LEN = 200
# The softmax layer needs one unit per label id in [0, max_id]; using the
# largest id (rather than the number of distinct values) stays valid even
# when some ids are absent from the file.
num_classes = int(df["Label"].max()) + 1

# Split the raw texts first so the tokenizer vocabulary is learned from the
# training portion only.
texts_train, texts_test, y_train, y_test = train_test_split(
    df["Text"].fillna("").astype(str),
    df["Label"],
    test_size=0.2,
    random_state=42,
)

# Tokenise the texts
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(texts_train)

# Convert to integer sequences and pad/truncate to a fixed length
X_train = pad_sequences(
    tokenizer.texts_to_sequences(texts_train),
    maxlen=MAX_LEN, padding='post', truncating='post',
)
X_test = pad_sequences(
    tokenizer.texts_to_sequences(texts_test),
    maxlen=MAX_LEN, padding='post', truncating='post',
)

# Build the LSTM model.
# mask_zero=True makes the downstream layers skip the padding index 0;
# without it the LSTM keeps updating its state over the trailing padding
# and the final state mostly reflects padding for short texts.
model_lstm = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=128, input_length=MAX_LEN, mask_zero=True),
    SpatialDropout1D(0.2),
    LSTM(128, return_sequences=False),
    Dense(128, activation="relu"),
    Dense(num_classes, activation="softmax")
])

# Compile the model
model_lstm.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train.
# NOTE(review): the test split doubles as validation data here, so the final
# score below is not measured on data unseen during model selection.
model_lstm.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

# Evaluate
loss, acc = model_lstm.evaluate(X_test, y_test)
print(f"Accuracy du modèle LSTM : {acc:.4f}")


# STEP 5-Pre-existing Model

The code below is our first attempt at using a pre-existing (pre-trained) model.

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import pandas as pd
from datasets import Dataset

# Load the dataset
df = pd.read_csv("train_cleaned_numeric_labels.csv")

# Load the multilingual BERT tokenizer and classification model.
# The classification head is sized from the data (largest label id + 1)
# instead of a hard-coded class count.
model_name = 'bert-base-multilingual-cased'  # or 'xlm-roberta-base' for XLM-RoBERTa
num_labels = int(df["Label"].max()) + 1
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)


def tokenize_function(examples):
    """Tokenise a batch of examples for BERT.

    `examples` is the batch mapping handed over by `Dataset.map(batched=True)`;
    the texts are read from its 'text' column (the column name used after the
    rename below). Every sequence is padded/truncated to 512 tokens.
    """
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)


# Keep only the two needed columns, under the names used by tokenize_function
# ('text') and by the training step ('label').
dataset_frame = df[['Text', 'Label']].rename(columns={'Text': 'text', 'Label': 'label'})
dataset_frame['text'] = dataset_frame['text'].fillna("").astype(str)
dataset_frame['label'] = dataset_frame['label'].astype(int)

# Split into train/test while the data is still a DataFrame
train_frame, test_frame = train_test_split(dataset_frame, test_size=0.2, random_state=42)

# Convert to Hugging Face Datasets, then tokenise.
# `.map(..., batched=True)` is a `datasets.Dataset` method, so the conversion
# has to happen before the tokenisation, not after.
train_dataset = Dataset.from_pandas(train_frame, preserve_index=False).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_frame, preserve_index=False).map(tokenize_function, batched=True)

# Training configuration: the hyper-parameters are gathered in one mapping
# and expanded into TrainingArguments.
training_options = {
    "output_dir": './results',        # where results are written
    "evaluation_strategy": "epoch",   # evaluate once per epoch
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    "logging_dir": './logs',          # where logs are written
    "logging_steps": 200,
}
training_args = TrainingArguments(**training_options)

# Evaluation callback used to compute accuracy
def compute_metrics(p):
    """Return the accuracy of a batch of predictions.

    Args:
        p: a `(predictions, labels)` pair, where `predictions` holds one row
            of class scores per example and `labels` the true class ids.
            Array-likes are accepted; they are converted to NumPy arrays,
            which is why `torch.argmax` (tensor-only) is not used here.

    Returns:
        A dict `{'accuracy': float}` with the fraction of examples whose
        highest-scoring class equals the label.
    """
    import numpy as np  # local import: this cell does not import numpy itself

    predictions, labels = p
    preds = np.asarray(predictions).argmax(axis=1)
    labels = np.asarray(labels)
    return {'accuracy': float((preds == labels).mean())}

# Build the Hugging Face Trainer from the model, its training arguments,
# the two dataset splits and the metric callback.
trainer = Trainer(
    model=model, args=training_args,
    train_dataset=train_dataset, eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Evaluate on the held-out split and report the accuracy
results = trainer.evaluate()
print(f"Accuracy sur le dataset de test : {results['eval_accuracy']:.4f}")



# THE CODE WITH AN ACCURACY OF 0.73

The results of this code are in the notebook named Models.ipynb (we ran it in a separate file, so the code and its results had to be split across two notebooks). The code below, however, explains our model.

In [None]:
!pip install transformers datasets torch scikit-learn pandas numpy tqdm accelerate

In [None]:
import pandas as pd
import numpy as np
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.cuda.amp import GradScaler, autocast

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Read the training data and discard rows without a label
train_path = "train_submission.csv"
train_df = pd.read_csv(train_path)
train_df = train_df.dropna(subset=["Label"])

# Text cleaning
def clean_text(text, max_length=256):
    """Normalise one raw text for the classifier.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed, whitespace runs are collapsed to a
    single space, and the result is cut to `max_length` CHARACTERS (token
    truncation is a separate step done by the tokenizer).

    Args:
        text: the raw value; non-string values are converted with `str()`.
        max_length: maximum number of characters kept.

    Returns:
        The cleaned string. Missing values (`None` or NaN) give an empty
        string instead of the literal text "none"/"nan", so that they are
        removed by the empty-text filter applied after cleaning.
    """
    # NaN is the only float that differs from itself
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text[:max_length]

# Clean every text and drop the rows that end up empty
train_df["Cleaned_Text"] = train_df["Text"].apply(clean_text)
train_df = train_df[train_df["Cleaned_Text"].str.len() > 0]

# Keep only the classes that have at least 2 samples.
# `.copy()` gives an independent frame so the column assignment below does
# not write into a slice of the previous one.
valid_classes = train_df["Label"].value_counts()
valid_classes = valid_classes[valid_classes >= 2].index
train_df = train_df[train_df["Label"].isin(valid_classes)].copy()

# Encode the labels AFTER the filtering above, so that ids are contiguous
# and `len(label_to_id)` (used to size the classifier head) only counts
# classes that are actually present in the training data.
labels = train_df["Label"].unique().tolist()
label_to_id = {label: i for i, label in enumerate(labels)}
id_to_label = {i: label for label, i in label_to_id.items()}
train_df["label_id"] = train_df["Label"].map(label_to_id)

# Downsample the dataset (about 25 000 examples at most), evenly per class
max_samples = 25000
if len(train_df) > max_samples:
    # Never go below 2 per class: the stratified split needs at least two
    # members in every class.
    per_class_cap = max(2, max_samples // len(labels))
    train_df = train_df.groupby("Label", group_keys=False).apply(
        lambda x: x.sample(min(len(x), per_class_cap), random_state=42)
    )

# Train/validation split, stratified on the label id
X_train, X_val, y_train, y_val = train_test_split(
    train_df["Cleaned_Text"], train_df["label_id"], test_size=0.1, stratify=train_df["label_id"], random_state=42
)

# Load the RoBERTa tokenizer and a sequence-classification model whose head
# has one output per entry of `label_to_id`, then move it to `device`.
model_name = "roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_to_id),
)
model = model.to(device)

# Enable gradient checkpointing. Per the Hugging Face documentation this
# lowers activation memory at the cost of extra compute during the backward
# pass; it does not save the model to disk.
model.gradient_checkpointing_enable()

# Dataset definition
class TextDataset(Dataset):
    """Dataset that tokenises all texts once, at construction time.

    `texts` and `labels` must expose `.tolist()`. The tokenizer is called a
    single time on the full list of texts (with truncation, padding and
    PyTorch tensors as output); items are then served by indexing into the
    resulting tensors.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        text_list = texts.tolist()
        label_list = labels.tolist()
        self.encodings = tokenizer(
            text_list,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt",
        )
        self.labels = torch.tensor(label_list, dtype=torch.long)

    def __getitem__(self, idx):
        # One entry per tokenizer output, plus the target under 'labels'
        sample = {}
        for name, tensor in self.encodings.items():
            sample[name] = tensor[idx]
        sample['labels'] = self.labels[idx]
        return sample

    def __len__(self):
        return len(self.labels)

# Datasets and DataLoaders (only the training loader is shuffled)
batch_size = 32
train_dataset = TextDataset(X_train, y_train, tokenizer)
val_dataset = TextDataset(X_val, y_val, tokenizer)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Optimisation settings
learning_rate = 2e-5
epochs = 5
optimizer = AdamW(model.parameters(), weight_decay=0.01, lr=learning_rate)

# Linear schedule over all optimisation steps, with 1000 warm-up steps
total_training_steps = epochs * len(train_loader)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=1000,
    num_training_steps=total_training_steps,
)

# Gradient scaler for mixed-precision training
scaler = GradScaler()

# Training: one pass over train_loader per epoch, followed by a validation
# pass over val_loader.
print("🚀 Training model...")
for epoch in range(epochs):
    model.train()
    train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for batch in progress_bar:
        # Move every tensor of the batch to the training device
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()

        # Forward pass under autocast (mixed precision)
        with autocast():
            outputs = model(**batch)
            loss = outputs.loss

        # Backward pass and optimizer step go through the gradient scaler
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        train_loss += loss.item()
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    print(f"Epoch {epoch+1}/{epochs} - Avg Training Loss: {train_loss / len(train_loader):.4f}")

    # Validation
    model.eval()
    all_preds, all_labels = [], []
    val_loss = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(batch['labels'].cpu().numpy())

    # The validation loss was accumulated above but never shown; report its
    # per-batch average next to the accuracy (guarding an empty loader).
    print(f"Avg Validation Loss: {val_loss / max(len(val_loader), 1):.4f}")
    print(f"Validation Accuracy: {accuracy_score(all_labels, all_preds):.4f}")

print(" Training Complete")