In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Jun__8_16:49:14_PDT_2022
Cuda compilation tools, release 11.7, V11.7.99
Build cuda_11.7.r11.7/compiler.31442593_0


In [1]:
# 📌 Installer les packages nécessaires si besoin
# !pip install transformers datasets torch scikit-learn pandas numpy tqdm accelerate

import pandas as pd
import numpy as np
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.cuda.amp import GradScaler, autocast

# ✅ Détection du GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🚀 Using device: {device}")

# ✅ Chargement des données
train_path = "train_submission.csv"
train_df = pd.read_csv(train_path).dropna(subset=["Label"])

# ✅ Nettoyage des textes
def clean_text(text, max_length=256):  # ✅ Réduction à 256 tokens pour accélérer
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text[:max_length]

train_df["Cleaned_Text"] = train_df["Text"].apply(clean_text)
train_df = train_df[train_df["Cleaned_Text"].str.len() > 0]

# ✅ Encodage des labels
labels = train_df["Label"].unique().tolist()
label_to_id = {label: i for i, label in enumerate(labels)}
id_to_label = {i: label for label, i in label_to_id.items()}
train_df["label_id"] = train_df["Label"].map(label_to_id)

# ✅ Filtrer les classes qui ont au moins 5 échantillons
valid_classes = train_df["Label"].value_counts()
valid_classes = valid_classes[valid_classes >= 5].index
train_df = train_df[train_df["Label"].isin(valid_classes)]

# ✅ Réduction du dataset (max 25 000 exemples pour rapidité)
max_samples = 25000
if len(train_df) > max_samples:
    train_df = train_df.groupby("Label", group_keys=False).apply(
        lambda x: x.sample(min(len(x), max_samples // len(labels)), random_state=42)
    )

# ✅ Division Train/Validation
X_train, X_val, y_train, y_val = train_test_split(
    train_df["Cleaned_Text"], train_df["label_id"], test_size=0.1, stratify=train_df["label_id"], random_state=42
)

# ✅ Chargement du modèle DistilBERT (Rapide et performant)
model_name = "distilbert-base-uncased"  # 📌 Plus rapide que DeBERTa
print(f"🚀 Loading {model_name}...")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_to_id)).to(device)

# ✅ Définition du Dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):  # ✅ max_length optimisé
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=max_length, return_tensors="pt")
        self.labels = torch.tensor(labels.tolist(), dtype=torch.long)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# ✅ Création des DataLoaders
batch_size = 32  # ✅ Batch optimisé pour rapidité
train_dataset = TextDataset(X_train, y_train, tokenizer)
val_dataset = TextDataset(X_val, y_val, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# ✅ Optimisation : AdamW + Scheduler
learning_rate = 2e-5  # ✅ LR ajusté pour rapidité
epochs = 7  # ✅ Réduction des époques pour accélérer
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=500, num_training_steps=epochs * len(train_loader))

# ✅ Mixed Precision Training
scaler = GradScaler()

# ✅ Entraînement (Optimisé)
print("🚀 Training model...")
for epoch in range(epochs):
    model.train()
    train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()

        with autocast():  # ✅ Mixed Precision Training
            outputs = model(**batch)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        train_loss += loss.item()
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    print(f"Epoch {epoch+1}/{epochs} - Avg Training Loss: {train_loss / len(train_loader):.4f}")

    # ✅ Validation
    model.eval()
    all_preds, all_labels = [], []
    val_loss = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(batch['labels'].cpu().numpy())

    print(f"Validation Accuracy: {accuracy_score(all_labels, all_preds):.4f}")

print("✅ Training Complete")


  from .autonotebook import tqdm as notebook_tqdm


🚀 Using device: cuda
🚀 Loading distilbert-base-uncased...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


🚀 Training model...


  with autocast():  # ✅ Mixed Precision Training
Epoch 1/7: 100%|██████████| 663/663 [01:22<00:00,  7.99it/s, loss=4.3890]


Epoch 1/7 - Avg Training Loss: 5.4645


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.19it/s]


Validation Accuracy: 0.2294


  with autocast():  # ✅ Mixed Precision Training
Epoch 2/7: 100%|██████████| 663/663 [01:26<00:00,  7.66it/s, loss=2.7561]


Epoch 2/7 - Avg Training Loss: 3.5443


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.19it/s]


Validation Accuracy: 0.4334


  with autocast():  # ✅ Mixed Precision Training
Epoch 3/7: 100%|██████████| 663/663 [01:27<00:00,  7.57it/s, loss=1.8300]


Epoch 3/7 - Avg Training Loss: 2.4533


Validating: 100%|██████████| 74/74 [00:07<00:00,  9.99it/s]


Validation Accuracy: 0.5233


  with autocast():  # ✅ Mixed Precision Training
Epoch 4/7: 100%|██████████| 663/663 [01:28<00:00,  7.51it/s, loss=1.8888]


Epoch 4/7 - Avg Training Loss: 1.9162


Validating: 100%|██████████| 74/74 [00:07<00:00,  9.91it/s]


Validation Accuracy: 0.5882


  with autocast():  # ✅ Mixed Precision Training
Epoch 5/7: 100%|██████████| 663/663 [01:28<00:00,  7.47it/s, loss=1.6611]


Epoch 5/7 - Avg Training Loss: 1.6036


Validating: 100%|██████████| 74/74 [00:07<00:00,  9.80it/s]


Validation Accuracy: 0.6162


  with autocast():  # ✅ Mixed Precision Training
Epoch 6/7: 100%|██████████| 663/663 [01:29<00:00,  7.44it/s, loss=1.3162]


Epoch 6/7 - Avg Training Loss: 1.4190


Validating: 100%|██████████| 74/74 [00:07<00:00,  9.76it/s]


Validation Accuracy: 0.6183


  with autocast():  # ✅ Mixed Precision Training
Epoch 7/7: 100%|██████████| 663/663 [01:29<00:00,  7.44it/s, loss=1.0920]


Epoch 7/7 - Avg Training Loss: 1.3199


Validating: 100%|██████████| 74/74 [00:07<00:00,  9.69it/s]

Validation Accuracy: 0.6387
✅ Training Complete





In [1]:
# 📌 Installer les packages nécessaires si besoin
# !pip install transformers datasets torch scikit-learn pandas numpy tqdm accelerate

import pandas as pd
import numpy as np
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.cuda.amp import GradScaler, autocast

# ✅ Détection du GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🚀 Using device: {device}")

# ✅ Chargement des données
train_path = "train_submission.csv"
train_df = pd.read_csv(train_path).dropna(subset=["Label"])

# ✅ Nettoyage des textes
def clean_text(text, max_length=256):  # ✅ Réduction à 256 tokens
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text[:max_length]

train_df["Cleaned_Text"] = train_df["Text"].apply(clean_text)
train_df = train_df[train_df["Cleaned_Text"].str.len() > 0]

# ✅ Encodage des labels
labels = train_df["Label"].unique().tolist()
label_to_id = {label: i for i, label in enumerate(labels)}
id_to_label = {i: label for label, i in label_to_id.items()}
train_df["label_id"] = train_df["Label"].map(label_to_id)

# ✅ Filtrer les classes qui ont au moins 2 échantillons
valid_classes = train_df["Label"].value_counts()
valid_classes = valid_classes[valid_classes >= 2].index
train_df = train_df[train_df["Label"].isin(valid_classes)]

# ✅ Réduction du dataset (max 25 000 exemples)
max_samples = 25000
if len(train_df) > max_samples:
    train_df = train_df.groupby("Label", group_keys=False).apply(
        lambda x: x.sample(min(len(x), max_samples // len(labels)), random_state=42)
    )

# ✅ Division Train/Validation
X_train, X_val, y_train, y_val = train_test_split(
    train_df["Cleaned_Text"], train_df["label_id"], test_size=0.1, stratify=train_df["label_id"], random_state=42
)

# ✅ Chargement du modèle RoBERTa (Meilleur que DistilBERT)
model_name = "roberta-base"  # 📌 Plus puissant que DistilBERT
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_to_id)).to(device)

# ✅ Activation du `gradient_checkpointing` (économie de mémoire)
model.gradient_checkpointing_enable()

# ✅ Définition du Dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):  # ✅ max_length optimisé
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=max_length, return_tensors="pt")
        self.labels = torch.tensor(labels.tolist(), dtype=torch.long)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# ✅ Création des DataLoaders
batch_size = 32  # ✅ Optimisé pour GPU
train_dataset = TextDataset(X_train, y_train, tokenizer)
val_dataset = TextDataset(X_val, y_val, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# ✅ Optimisation : AdamW + Scheduler
learning_rate = 2e-5  # ✅ LR plus bas pour une meilleure convergence
epochs = 10  # ✅ Augmenté pour une meilleure précision
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=1000, num_training_steps=epochs * len(train_loader))

# ✅ Mixed Precision Training
scaler = GradScaler()

# ✅ Entraînement (Optimisé)
print("🚀 Training model...")
for epoch in range(epochs):
    model.train()
    train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()

        with autocast():  # ✅ Mixed Precision Training
            outputs = model(**batch)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        train_loss += loss.item()
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    print(f"Epoch {epoch+1}/{epochs} - Avg Training Loss: {train_loss / len(train_loader):.4f}")

    # ✅ Validation
    model.eval()
    all_preds, all_labels = [], []
    val_loss = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(batch['labels'].cpu().numpy())

    print(f"Validation Accuracy: {accuracy_score(all_labels, all_preds):.4f}")

print("✅ Training Complete")


  from .autonotebook import tqdm as notebook_tqdm


🚀 Using device: cuda


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


🚀 Training model...


  with autocast():  # ✅ Mixed Precision Training
Epoch 1/10: 100%|██████████| 664/664 [01:54<00:00,  5.79it/s, loss=4.8026]


Epoch 1/10 - Avg Training Loss: 5.4961


Validating: 100%|██████████| 74/74 [00:07<00:00,  9.68it/s]


Validation Accuracy: 0.2970


  with autocast():  # ✅ Mixed Precision Training
Epoch 2/10: 100%|██████████| 664/664 [01:46<00:00,  6.23it/s, loss=3.1351]


Epoch 2/10 - Avg Training Loss: 3.8583


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.51it/s]


Validation Accuracy: 0.4597


  with autocast():  # ✅ Mixed Precision Training
Epoch 3/10: 100%|██████████| 664/664 [01:47<00:00,  6.19it/s, loss=1.9742]


Epoch 3/10 - Avg Training Loss: 2.5508


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.50it/s]


Validation Accuracy: 0.5733


  with autocast():  # ✅ Mixed Precision Training
Epoch 4/10: 100%|██████████| 664/664 [01:46<00:00,  6.21it/s, loss=1.2819]


Epoch 4/10 - Avg Training Loss: 1.8354


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.51it/s]


Validation Accuracy: 0.6110


  with autocast():  # ✅ Mixed Precision Training
Epoch 5/10: 100%|██████████| 664/664 [01:46<00:00,  6.21it/s, loss=1.5340]


Epoch 5/10 - Avg Training Loss: 1.4243


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.51it/s]


Validation Accuracy: 0.6750


  with autocast():  # ✅ Mixed Precision Training
Epoch 6/10: 100%|██████████| 664/664 [01:47<00:00,  6.21it/s, loss=0.8184]


Epoch 6/10 - Avg Training Loss: 1.1545


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.51it/s]


Validation Accuracy: 0.6928


  with autocast():  # ✅ Mixed Precision Training
Epoch 7/10: 100%|██████████| 664/664 [01:47<00:00,  6.21it/s, loss=0.9530]


Epoch 7/10 - Avg Training Loss: 0.9738


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.52it/s]


Validation Accuracy: 0.7136


  with autocast():  # ✅ Mixed Precision Training
Epoch 8/10: 100%|██████████| 664/664 [01:47<00:00,  6.20it/s, loss=0.5812]


Epoch 8/10 - Avg Training Loss: 0.8431


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.51it/s]


Validation Accuracy: 0.7263


  with autocast():  # ✅ Mixed Precision Training
Epoch 9/10: 100%|██████████| 664/664 [01:47<00:00,  6.19it/s, loss=0.9668]


Epoch 9/10 - Avg Training Loss: 0.7561


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.50it/s]


Validation Accuracy: 0.7331


  with autocast():  # ✅ Mixed Precision Training
Epoch 10/10: 100%|██████████| 664/664 [01:47<00:00,  6.20it/s, loss=0.7178]


Epoch 10/10 - Avg Training Loss: 0.7026


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.51it/s]

Validation Accuracy: 0.7364
✅ Training Complete





In [None]:
# 📌 Installer les packages nécessaires si besoin
# !pip install transformers datasets torch scikit-learn pandas numpy tqdm accelerate

import pandas as pd
import numpy as np
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.cuda.amp import GradScaler, autocast

# ✅ Détection du GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🚀 Using device: {device}")

# ✅ Chargement des données
train_path = "train_submission.csv"
train_df = pd.read_csv(train_path).dropna(subset=["Label"])

# ✅ Nettoyage des textes
def clean_text(text, max_length=256):  # ✅ Réduction à 256 tokens
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text[:max_length]

train_df["Cleaned_Text"] = train_df["Text"].apply(clean_text)
train_df = train_df[train_df["Cleaned_Text"].str.len() > 0]

# ✅ Encodage des labels
labels = train_df["Label"].unique().tolist()
label_to_id = {label: i for i, label in enumerate(labels)}
id_to_label = {i: label for label, i in label_to_id.items()}
train_df["label_id"] = train_df["Label"].map(label_to_id)

# ✅ Filtrer les classes qui ont au moins 2 échantillons
valid_classes = train_df["Label"].value_counts()
valid_classes = valid_classes[valid_classes >= 2].index
train_df = train_df[train_df["Label"].isin(valid_classes)]

# ✅ Réduction du dataset (max 25 000 exemples)
max_samples = 25000
if len(train_df) > max_samples:
    train_df = train_df.groupby("Label", group_keys=False).apply(
        lambda x: x.sample(min(len(x), max_samples // len(labels)), random_state=42)
    )

# ✅ Division Train/Validation
X_train, X_val, y_train, y_val = train_test_split(
    train_df["Cleaned_Text"], train_df["label_id"], test_size=0.1, stratify=train_df["label_id"], random_state=42
)

# ✅ Chargement du modèle RoBERTa (Meilleur que DistilBERT)
model_name = "microsoft/deberta-v3-large"  # 📌 Plus puissant que DistilBERT
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_to_id)).to(device)

# ✅ Activation du `gradient_checkpointing` (économie de mémoire)
model.gradient_checkpointing_enable()

# ✅ Définition du Dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):  # ✅ max_length optimisé
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=max_length, return_tensors="pt")
        self.labels = torch.tensor(labels.tolist(), dtype=torch.long)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# ✅ Création des DataLoaders
batch_size = 32  # ✅ Optimisé pour GPU
train_dataset = TextDataset(X_train, y_train, tokenizer)
val_dataset = TextDataset(X_val, y_val, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# ✅ Optimisation : AdamW + Scheduler
learning_rate = 2e-5  # ✅ LR plus bas pour une meilleure convergence
epochs = 10  # ✅ Augmenté pour une meilleure précision
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=1000, num_training_steps=epochs * len(train_loader))

# ✅ Mixed Precision Training
scaler = GradScaler()

# ✅ Entraînement (Optimisé)
print("🚀 Training model...")
for epoch in range(epochs):
    model.train()
    train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()

        with autocast():  # ✅ Mixed Precision Training
            outputs = model(**batch)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        train_loss += loss.item()
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    print(f"Epoch {epoch+1}/{epochs} - Avg Training Loss: {train_loss / len(train_loader):.4f}")

    # ✅ Validation
    model.eval()
    all_preds, all_labels = [], []
    val_loss = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(batch['labels'].cpu().numpy())

    print(f"Validation Accuracy: {accuracy_score(all_labels, all_preds):.4f}")

print("✅ Training Complete")

🚀 Using device: cuda


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


🚀 Training model...


  with autocast():  # ✅ Mixed Precision Training
Epoch 1/10: 100%|██████████| 664/664 [08:56<00:00,  1.24it/s, loss=4.7588]


Epoch 1/10 - Avg Training Loss: 5.6270


Validating: 100%|██████████| 74/74 [00:32<00:00,  2.31it/s]


Validation Accuracy: 0.2169


  with autocast():  # ✅ Mixed Precision Training
Epoch 2/10: 100%|██████████| 664/664 [08:56<00:00,  1.24it/s, loss=2.4141]


Epoch 2/10 - Avg Training Loss: 3.2152


Validating: 100%|██████████| 74/74 [00:31<00:00,  2.31it/s]


Validation Accuracy: 0.4983


  with autocast():  # ✅ Mixed Precision Training
Epoch 3/10: 100%|██████████| 664/664 [08:56<00:00,  1.24it/s, loss=0.9851]


Epoch 3/10 - Avg Training Loss: 1.5922


Validating: 100%|██████████| 74/74 [00:32<00:00,  2.31it/s]


Validation Accuracy: 0.6475


  with autocast():  # ✅ Mixed Precision Training
Epoch 4/10: 100%|██████████| 664/664 [08:56<00:00,  1.24it/s, loss=0.9995]


Epoch 4/10 - Avg Training Loss: 1.0448


Validating: 100%|██████████| 74/74 [00:32<00:00,  2.31it/s]


Validation Accuracy: 0.7081


  with autocast():  # ✅ Mixed Precision Training
Epoch 5/10: 100%|██████████| 664/664 [08:56<00:00,  1.24it/s, loss=0.4428]


Epoch 5/10 - Avg Training Loss: 0.7092


Validating: 100%|██████████| 74/74 [00:32<00:00,  2.31it/s]


Validation Accuracy: 0.7352


  with autocast():  # ✅ Mixed Precision Training
Epoch 6/10: 100%|██████████| 664/664 [08:56<00:00,  1.24it/s, loss=0.3385]


Epoch 6/10 - Avg Training Loss: 0.5267


Validating: 100%|██████████| 74/74 [00:32<00:00,  2.31it/s]


Validation Accuracy: 0.7521


  with autocast():  # ✅ Mixed Precision Training
Epoch 7/10: 100%|██████████| 664/664 [08:56<00:00,  1.24it/s, loss=0.4267]


Epoch 7/10 - Avg Training Loss: 0.4022


Validating: 100%|██████████| 74/74 [00:32<00:00,  2.31it/s]


Validation Accuracy: 0.7589


  with autocast():  # ✅ Mixed Precision Training
Epoch 8/10: 100%|██████████| 664/664 [08:56<00:00,  1.24it/s, loss=0.4626]


Epoch 8/10 - Avg Training Loss: 0.3156


Validating: 100%|██████████| 74/74 [00:32<00:00,  2.31it/s]


Validation Accuracy: 0.7674


  with autocast():  # ✅ Mixed Precision Training
Epoch 9/10: 100%|██████████| 664/664 [08:56<00:00,  1.24it/s, loss=0.1822]


Epoch 9/10 - Avg Training Loss: 0.2421


Validating: 100%|██████████| 74/74 [00:31<00:00,  2.31it/s]


Validation Accuracy: 0.7716


  with autocast():  # ✅ Mixed Precision Training
Epoch 10/10:  34%|███▍      | 226/664 [03:02<05:53,  1.24it/s, loss=0.3311]

In [3]:
# Process test data
print("Processing test data...")
test_path = "test_without_labels.csv"
test_df = pd.read_csv(test_path)

# Clean test data
test_df["Cleaned_Text"] = test_df["Text"].apply(clean_text)

# Create test dataset and dataloader
class TestDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, 
                                  max_length=max_length, return_tensors="pt")
    
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item
    
    def __len__(self):
        return len(self.encodings.input_ids)

test_dataset = TestDataset(test_df["Cleaned_Text"], tokenizer)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Generate predictions
model.eval()
all_test_preds = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Generating predictions"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        
        # Get predictions
        preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
        all_test_preds.extend(preds)

# Convert predictions back to original labels
predicted_labels = [id_to_label[pred_id] for pred_id in all_test_preds]

# Create submission file
submission = pd.DataFrame({
    "ID": test_df.index + 1,  # Start IDs from 1
    "Label": predicted_labels
})

# Save submission file
submission_path = "submission_bert.csv"
submission.to_csv(submission_path, index=False)

print(f"✅ Submission file '{submission_path}' generated successfully!")

Processing test data...


Generating predictions: 100%|██████████| 5956/5956 [05:10<00:00, 19.21it/s]


✅ Submission file 'submission_bert.csv' generated successfully!


In [4]:
import pandas as pd
import numpy as np
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
import warnings
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

warnings.filterwarnings('ignore')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load training data
train_path = "train_submission.csv"
train_df = pd.read_csv(train_path)
train_df_cleaned = train_df.dropna(subset=["Label"])

def clean_text(text, max_length=512):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text[:max_length]

train_df_cleaned["Cleaned_Text"] = train_df_cleaned["Text"].apply(lambda x: clean_text(x, max_length=256))
train_df_cleaned = train_df_cleaned[train_df_cleaned["Cleaned_Text"].str.len() > 0]

labels = train_df_cleaned["Label"].unique().tolist()
label_to_id = {label: i for i, label in enumerate(labels)}
id_to_label = {i: label for label, i in label_to_id.items()}
train_df_cleaned["label_id"] = train_df_cleaned["Label"].map(label_to_id)

# Balance dataset
min_samples = 20
valid_classes = train_df_cleaned["Label"].value_counts()[lambda x: x >= min_samples].index
train_df_filtered = train_df_cleaned[train_df_cleaned["Label"].isin(valid_classes)]

# Stratified split
X_train, X_val, y_train, y_val = train_test_split(
    train_df_filtered["Cleaned_Text"], 
    train_df_filtered["label_id"], 
    test_size=0.1,
    random_state=42, 
    stratify=train_df_filtered["label_id"]
)

# Load model & tokenizer
model_name = "bert-base-uncased"  # Upgrade to full BERT
print(f"Loading {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(label_to_id)).to(device)

# Define Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, 
                                  max_length=max_length, return_tensors="pt")
        self.labels = torch.tensor(labels.tolist(), dtype=torch.long)
    
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item
    
    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(X_train, y_train, tokenizer)
val_dataset = TextDataset(X_val, y_val, tokenizer)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Training setup
learning_rate = 3e-5  # Slightly lower learning rate
epochs = 6
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs
)

# Training loop
print("Training model...")
for epoch in range(epochs):
    model.train()
    train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
    
    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_loss += loss.item()
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})
    
    print(f"Epoch {epoch+1}/{epochs} - Avg loss: {train_loss / len(train_loader):.4f}")
    
    model.eval()
    all_preds, all_labels = [], []
    
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(batch['labels'].cpu().numpy())
    
    val_accuracy = accuracy_score(all_labels, all_preds)
    print(f"Validation Accuracy: {val_accuracy:.4f}")

# Final evaluation
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Final evaluation"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(batch['labels'].cpu().numpy())

final_accuracy = accuracy_score(all_labels, all_preds)
print(f"Final validation accuracy: {final_accuracy:.4f}")
print(classification_report(all_labels, all_preds))

Using device: cuda
Loading bert-base-uncased...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training model...


Epoch 1/6:  40%|███▉      | 2127/5344 [22:02<33:19,  1.61it/s, loss=1.7069]


KeyboardInterrupt: 

In [2]:
import pandas as pd
import numpy as np
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.cuda.amp import GradScaler, autocast

torch.backends.cuda.matmul.allow_tf32 = True  # ✅ Accélération supplémentaire

# ✅ Détection du GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🚀 Using device: {device}")

# ✅ Chargement des données
train_path = "train_submission.csv"
train_df = pd.read_csv(train_path).dropna(subset=["Label"])

# ✅ Nettoyage des textes
def clean_text(text, max_length=128):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text[:max_length]

train_df["Cleaned_Text"] = train_df["Text"].apply(clean_text)
train_df = train_df[train_df["Cleaned_Text"].str.len() > 0]

# ✅ Encodage des labels
labels = train_df["Label"].unique().tolist()
label_to_id = {label: i for i, label in enumerate(labels)}
id_to_label = {i: label for label, i in label_to_id.items()}
train_df["label_id"] = train_df["Label"].map(label_to_id)

# ✅ Filtrer les classes qui ont au moins 3 échantillons (moins strict)
valid_classes = train_df["Label"].value_counts()
valid_classes = valid_classes[valid_classes >= 3].index
train_df = train_df[train_df["Label"].isin(valid_classes)]

# ✅ Vérifier la taille du dataset après filtrage
print(f"Nombre total d'exemples après filtrage : {len(train_df)}")

# ✅ Réduction du dataset (max 20 000 exemples pour vitesse)
max_samples = 20000
if len(train_df) > max_samples:
    train_df = train_df.groupby("Label", group_keys=False).apply(
        lambda x: x.sample(min(len(x), max_samples // len(labels)), random_state=42)
    )

# ✅ Division Train/Validation
X_train, X_val, y_train, y_val = train_test_split(
    train_df["Cleaned_Text"], train_df["label_id"], test_size=0.1, stratify=train_df["label_id"], random_state=42
)

# ✅ Chargement du modèle **correctement initialisé**
model_name = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(label_to_id)
).to(device)

# ✅ Réinitialiser les poids de classification
torch.nn.init.xavier_uniform_(model.classifier.out_proj.weight)
torch.nn.init.zeros_(model.classifier.out_proj.bias)

# ✅ Activation de `gradient_checkpointing`
model.gradient_checkpointing_enable()

# ✅ Définition du Dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=max_length, return_tensors="pt")
        self.labels = torch.tensor(labels.tolist(), dtype=torch.long)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# ✅ Création des DataLoaders
batch_size = 32  # ✅ Plus stable
gradient_accumulation_steps = 2  # ✅ Accumulation pour économiser la mémoire GPU

train_dataset = TextDataset(X_train, y_train, tokenizer)
val_dataset = TextDataset(X_val, y_val, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# ✅ Optimisation : AdamW + Scheduler
learning_rate = 2e-5  # ✅ Learning rate plus stable
epochs = 10  # ✅ Légèrement augmenté pour convergence
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=1000, num_training_steps=epochs * len(train_loader))

# ✅ Mixed Precision Training
scaler = GradScaler()

# ✅ Entraînement optimisé (1h max)
print("🚀 Training model...")
for epoch in range(epochs):
    model.train()
    train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for step, batch in enumerate(progress_bar):
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()

        with autocast():
            outputs = model(**batch)
            loss = outputs.loss / gradient_accumulation_steps

        scaler.scale(loss).backward()

        if (step + 1) % gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()

        train_loss += loss.item()
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    print(f"Epoch {epoch+1}/{epochs} - Avg Training Loss: {train_loss / len(train_loader):.4f}")

    # ✅ Validation
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(batch['labels'].cpu().numpy())

    print(f"Validation Accuracy: {accuracy_score(all_labels, all_preds):.4f}")

print("✅ Training Complete")

🚀 Using device: cuda
Nombre total d'exemples après filtrage : 190087


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


🚀 Training model...


  with autocast():
Epoch 1/10: 100%|██████████| 531/531 [00:35<00:00, 14.88it/s, loss=2.8405]


Epoch 1/10 - Avg Training Loss: 2.9818


Validating: 100%|██████████| 59/59 [00:03<00:00, 19.52it/s]


Validation Accuracy: 0.0515


  with autocast():
Epoch 2/10: 100%|██████████| 531/531 [00:36<00:00, 14.39it/s, loss=2.4766]


Epoch 2/10 - Avg Training Loss: 2.5694


Validating: 100%|██████████| 59/59 [00:03<00:00, 19.64it/s]


Validation Accuracy: 0.2175


  with autocast():
Epoch 3/10: 100%|██████████| 531/531 [00:36<00:00, 14.40it/s, loss=2.4141]


Epoch 3/10 - Avg Training Loss: 2.0705


Validating: 100%|██████████| 59/59 [00:02<00:00, 19.85it/s]


Validation Accuracy: 0.3247


  with autocast():
Epoch 4/10: 100%|██████████| 531/531 [00:36<00:00, 14.42it/s, loss=2.0706]


Epoch 4/10 - Avg Training Loss: 1.6889


Validating: 100%|██████████| 59/59 [00:02<00:00, 19.73it/s]


Validation Accuracy: 0.3989


  with autocast():
Epoch 5/10: 100%|██████████| 531/531 [00:36<00:00, 14.41it/s, loss=1.5877]


Epoch 5/10 - Avg Training Loss: 1.3878


Validating: 100%|██████████| 59/59 [00:03<00:00, 19.08it/s]


Validation Accuracy: 0.4769


  with autocast():
Epoch 6/10: 100%|██████████| 531/531 [00:37<00:00, 14.35it/s, loss=0.3924]


Epoch 6/10 - Avg Training Loss: 1.1640


Validating: 100%|██████████| 59/59 [00:02<00:00, 19.74it/s]


Validation Accuracy: 0.5347


  with autocast():
Epoch 7/10: 100%|██████████| 531/531 [00:36<00:00, 14.46it/s, loss=0.6785]


Epoch 7/10 - Avg Training Loss: 1.0025


Validating: 100%|██████████| 59/59 [00:03<00:00, 19.30it/s]


Validation Accuracy: 0.5634


  with autocast():
Epoch 8/10: 100%|██████████| 531/531 [00:37<00:00, 14.30it/s, loss=0.6249]


Epoch 8/10 - Avg Training Loss: 0.8795


Validating: 100%|██████████| 59/59 [00:02<00:00, 19.75it/s]


Validation Accuracy: 0.6069


  with autocast():
Epoch 9/10: 100%|██████████| 531/531 [00:36<00:00, 14.43it/s, loss=1.2419]


Epoch 9/10 - Avg Training Loss: 0.7849


Validating: 100%|██████████| 59/59 [00:02<00:00, 19.74it/s]


Validation Accuracy: 0.6212


  with autocast():
Epoch 10/10: 100%|██████████| 531/531 [00:36<00:00, 14.56it/s, loss=0.8906]


Epoch 10/10 - Avg Training Loss: 0.7077


Validating: 100%|██████████| 59/59 [00:02<00:00, 20.18it/s]

Validation Accuracy: 0.6403
✅ Training Complete





In [1]:
# 📌 Installer les packages nécessaires si besoin
# !pip install transformers datasets torch scikit-learn pandas numpy tqdm accelerate

import pandas as pd
import numpy as np
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.cuda.amp import GradScaler, autocast

# ✅ Détection du GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🚀 Using device: {device}")

# ✅ Chargement des données
train_path = "train_submission.csv"
train_df = pd.read_csv(train_path).dropna(subset=["Label"])

# ✅ Nettoyage des textes
def clean_text(text, max_length=256):  # ✅ Réduction à 256 tokens
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text[:max_length]

train_df["Cleaned_Text"] = train_df["Text"].apply(clean_text)
train_df = train_df[train_df["Cleaned_Text"].str.len() > 0]

# ✅ Encodage des labels
labels = train_df["Label"].unique().tolist()
label_to_id = {label: i for i, label in enumerate(labels)}
id_to_label = {i: label for label, i in label_to_id.items()}
train_df["label_id"] = train_df["Label"].map(label_to_id)

# ✅ Filtrer les classes qui ont au moins 2 échantillons
valid_classes = train_df["Label"].value_counts()
valid_classes = valid_classes[valid_classes >= 2].index
train_df = train_df[train_df["Label"].isin(valid_classes)]

# ✅ Réduction du dataset (max 25 000 exemples)
max_samples = 25000
if len(train_df) > max_samples:
    train_df = train_df.groupby("Label", group_keys=False).apply(
        lambda x: x.sample(min(len(x), max_samples // len(labels)), random_state=42)
    )

# ✅ Division Train/Validation
X_train, X_val, y_train, y_val = train_test_split(
    train_df["Cleaned_Text"], train_df["label_id"], test_size=0.1, stratify=train_df["label_id"], random_state=42
)

# ✅ Chargement du modèle RoBERTa (Meilleur que DistilBERT)
model_name = "roberta-large"  # 📌 Plus puissant que DistilBERT
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_to_id)).to(device)

# ✅ Activation du `gradient_checkpointing` (économie de mémoire)
model.gradient_checkpointing_enable()

# ✅ Définition du Dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):  # ✅ max_length optimisé
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=max_length, return_tensors="pt")
        self.labels = torch.tensor(labels.tolist(), dtype=torch.long)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# ✅ Création des DataLoaders
batch_size = 32  # ✅ Optimisé pour GPU
train_dataset = TextDataset(X_train, y_train, tokenizer)
val_dataset = TextDataset(X_val, y_val, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# ✅ Optimisation : AdamW + Scheduler
learning_rate = 2e-5  # ✅ LR plus bas pour une meilleure convergence
epochs = 5  # ✅ Augmenté pour une meilleure précision
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=1000, num_training_steps=epochs * len(train_loader))

# ✅ Mixed Precision Training
scaler = GradScaler()

# ✅ Entraînement (Optimisé)
print("🚀 Training model...")
for epoch in range(epochs):
    model.train()
    train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()

        with autocast():  # ✅ Mixed Precision Training
            outputs = model(**batch)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        train_loss += loss.item()
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    print(f"Epoch {epoch+1}/{epochs} - Avg Training Loss: {train_loss / len(train_loader):.4f}")

    # ✅ Validation
    model.eval()
    all_preds, all_labels = [], []
    val_loss = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(batch['labels'].cpu().numpy())

    print(f"Validation Accuracy: {accuracy_score(all_labels, all_preds):.4f}")

print("✅ Training Complete")

  from .autonotebook import tqdm as notebook_tqdm


🚀 Using device: cuda


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


🚀 Training model...


  with autocast():  # ✅ Mixed Precision Training
Epoch 1/5: 100%|██████████| 664/664 [09:00<00:00,  1.23it/s, loss=4.0703]


Epoch 1/5 - Avg Training Loss: 5.1601


Validating: 100%|██████████| 74/74 [00:48<00:00,  1.53it/s]


Validation Accuracy: 0.3445


  with autocast():  # ✅ Mixed Precision Training
Epoch 2/5: 100%|██████████| 664/664 [09:02<00:00,  1.22it/s, loss=2.1680]


Epoch 2/5 - Avg Training Loss: 2.7878


Validating: 100%|██████████| 74/74 [00:49<00:00,  1.48it/s]


Validation Accuracy: 0.5746


  with autocast():  # ✅ Mixed Precision Training
Epoch 3/5: 100%|██████████| 664/664 [09:02<00:00,  1.22it/s, loss=1.4139]


Epoch 3/5 - Avg Training Loss: 1.5903


Validating: 100%|██████████| 74/74 [00:50<00:00,  1.47it/s]


Validation Accuracy: 0.6678


  with autocast():  # ✅ Mixed Precision Training
Epoch 4/5: 100%|██████████| 664/664 [09:02<00:00,  1.22it/s, loss=1.0661]


Epoch 4/5 - Avg Training Loss: 1.1267


Validating: 100%|██████████| 74/74 [00:50<00:00,  1.47it/s]


Validation Accuracy: 0.7174


  with autocast():  # ✅ Mixed Precision Training
Epoch 5/5: 100%|██████████| 664/664 [09:03<00:00,  1.22it/s, loss=1.3816]


Epoch 5/5 - Avg Training Loss: 0.8968


Validating: 100%|██████████| 74/74 [00:49<00:00,  1.49it/s]

Validation Accuracy: 0.7246
✅ Training Complete





In [None]:
# Process test data
print("Processing test data...")
test_path = "test_without_labels.csv"
test_df = pd.read_csv(test_path)

# Clean test data
test_df["Cleaned_Text"] = test_df["Text"].apply(clean_text)

# Create test dataset and dataloader
class TestDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, 
                                  max_length=max_length, return_tensors="pt")
    
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item
    
    def __len__(self):
        return len(self.encodings.input_ids)

test_dataset = TestDataset(test_df["Cleaned_Text"], tokenizer)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Generate predictions
model.eval()
all_test_preds = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Generating predictions"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        
        # Get predictions
        preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
        all_test_preds.extend(preds)

# Convert predictions back to original labels
predicted_labels = [id_to_label[pred_id] for pred_id in all_test_preds]

# Create submission file
submission = pd.DataFrame({
    "ID": test_df.index + 1,  # Start IDs from 1
    "Label": predicted_labels
})

# Save submission file
submission_path = "submission.csv"
submission.to_csv(submission_path, index=False)

print(f"✅ Submission file '{submission_path}' generated successfully!")

Processing test data...


Generating predictions: 100%|█████████▉| 5949/5956 [35:26<00:02,  2.83it/s]

In [1]:
#%% Initial Setup
!pip install transformers datasets torch scikit-learn pandas numpy tqdm accelerate wandb -q
!pip install sentencepiece tensorboardx -q

#%% Imports
import pandas as pd
import numpy as np
import torch
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup
)
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

#%% Configuration
class Config:
    # Model
    MODEL_NAME = "microsoft/deberta-v3-large"  # Modèle state-of-the-art
    MAX_LENGTH = 256  # Augmenter la longueur contextuelle
    DROPOUT = 0.1
    
    # Training
    EPOCHS = 5
    BATCH_SIZE = 8  # Réduit pour gérer la mémoire
    LR = 2e-5
    WARMUP_STEPS = 100
    WEIGHT_DECAY = 0.01
    GRAD_CLIP = 1.0
    
    # Data
    SAMPLE_LIMIT = 50000  # Augmenter la taille de l'échantillon
    MIN_SAMPLES_PER_CLASS = 20
    TEST_SIZE = 0.1
    
    # Advanced
    USE_CLASS_WEIGHTS = True
    USE_FP16 = True  # Activation du mixed-precision
    
config = Config()

#%% Device Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

#%% Advanced Text Cleaning
def clean_text(text):
    # Nettoyage approfondi
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Hashtags/Mentions
    text = re.sub(r'[^\w\s]', ' ', text)  # Ponctuation
    text = re.sub(r'\s+', ' ', text).strip()
    return text[:config.MAX_LENGTH]

#%% Data Loading & Processing
def load_data(path):
    df = pd.read_csv(path)
    df["Cleaned_Text"] = df["Text"].apply(clean_text)
    df = df[df["Cleaned_Text"].str.len() > 0]
    return df

# Load and preprocess data
train_df = load_data("train_submission.csv").dropna(subset=["Label"])
test_df = load_data("test_without_labels.csv")

# Gestion avancée des classes
class_counts = train_df["Label"].value_counts()
valid_classes = class_counts[class_counts >= config.MIN_SAMPLES_PER_CLASS].index
train_df = train_df[train_df["Label"].isin(valid_classes)]

# Stratified sampling
train_df = train_df.groupby("Label", group_keys=False).apply(
    lambda x: x.sample(min(len(x), config.SAMPLE_LIMIT // len(valid_classes)), 
)

# Label mapping
label_to_id = {label: i for i, label in enumerate(valid_classes)}
id_to_label = {i: label for label, i in label_to_id.items()}
train_df["label_id"] = train_df["Label"].map(label_to_id)

# Split stratifié
X_train, X_val, y_train, y_val = train_test_split(
    train_df["Cleaned_Text"],
    train_df["label_id"],
    test_size=config.TEST_SIZE,
    stratify=train_df["label_id"],
    random_state=42
)

#%% Class Weight Calculation
if config.USE_CLASS_WEIGHTS:
    class_weights = compute_class_weight(
        'balanced',
        classes=np.unique(y_train),
        y=y_train
    )
    class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
else:
    class_weights = None

#%% Tokenizer & Model
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    config.MODEL_NAME,
    num_labels=len(label_to_id),
    attention_probs_dropout_prob=config.DROPOUT,
    hidden_dropout_prob=config.DROPOUT
).to(device)

#%% Dataset Class
class TextDataset(Dataset):
    def __init__(self, texts, labels=None, tokenizer=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=config.MAX_LENGTH,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }
        
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels.iloc[idx], dtype=torch.long)
            
        return item

# Create datasets
train_dataset = TextDataset(X_train, y_train, tokenizer)
val_dataset = TextDataset(X_val, y_val, tokenizer)

#%% Data Loaders
def create_loaders():
    train_loader = DataLoader(
        train_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        pin_memory=True
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=config.BATCH_SIZE*2,
        shuffle=False,
        pin_memory=True
    )
    
    return train_loader, val_loader

train_loader, val_loader = create_loaders()

#%% Optimizer & Scheduler
optimizer = AdamW(
    model.parameters(),
    lr=config.LR,
    weight_decay=config.WEIGHT_DECAY
)

total_steps = len(train_loader) * config.EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=config.WARMUP_STEPS,
    num_training_steps=total_steps
)

#%% Mixed Precision Training
scaler = torch.cuda.amp.GradScaler(enabled=config.USE_FP16)

#%% Training Loop
def train_epoch(model, loader):
    model.train()
    total_loss = 0
    progress_bar = tqdm(loader, desc="Training")
    
    for batch in progress_bar:
        optimizer.zero_grad()
        
        inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
            'labels': batch['labels'].to(device)
        }
        
        with torch.cuda.amp.autocast(enabled=config.USE_FP16):
            outputs = model(**inputs)
            loss = outputs.loss
            
            if config.USE_CLASS_WEIGHTS:
                logits = outputs.logits
                loss = torch.nn.functional.cross_entropy(
                    logits.view(-1, model.config.num_labels),
                    inputs['labels'].view(-1),
                    weight=class_weights
                )
        
        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), config.GRAD_CLIP)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        
        total_loss += loss.item()
        progress_bar.set_postfix({'loss': f"{loss.item():.4f}"})
    
    return total_loss / len(loader)

#%% Evaluation Function
def evaluate(model, loader):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device),
                'labels': batch['labels'].to(device)
            }
            
            outputs = model(**inputs)
            loss = outputs.loss
            total_loss += loss.item()
            
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(inputs['labels'].cpu().numpy())
    
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    
    return {
        'loss': total_loss / len(loader),
        'accuracy': accuracy,
        'f1': f1
    }

#%% Training with Early Stopping
best_accuracy = 0
patience_counter = 0
patience = 2

for epoch in range(config.EPOCHS):
    print(f"\nEpoch {epoch+1}/{config.EPOCHS}")
    
    # Training
    train_loss = train_epoch(model, train_loader)
    
    # Evaluation
    val_metrics = evaluate(model, val_loader)
    
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Val Loss: {val_metrics['loss']:.4f}")
    print(f"Val Accuracy: {val_metrics['accuracy']:.4f}")
    print(f"Val F1: {val_metrics['f1']:.4f}")
    
    # Early Stopping
    if val_metrics['accuracy'] > best_accuracy:
        best_accuracy = val_metrics['accuracy']
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pt')
        print("New best model saved!")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered!")
            break

#%% Load Best Model
model.load_state_dict(torch.load('best_model.pt'))

#%% Final Evaluation
final_metrics = evaluate(model, val_loader)
print("\nFinal Evaluation:")
print(f"Accuracy: {final_metrics['accuracy']:.4f}")
print(f"F1 Score: {final_metrics['f1']:.4f}")
print(classification_report(y_val, all_preds, target_names=label_to_id.keys()))

#%% Test Predictions
class TestDataset(Dataset):
    def __init__(self, texts, tokenizer):
        self.texts = texts
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=config.MAX_LENGTH,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

test_dataset = TestDataset(test_df["Cleaned_Text"], tokenizer)
test_loader = DataLoader(test_dataset, batch_size=config.BATCH_SIZE*2)

#%% Generate Predictions
model.eval()
test_preds = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device)
        }
        
        outputs = model(**inputs)
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        test_preds.extend(preds)

#%% Create Submission
submission = pd.DataFrame({
    "ID": test_df.index + 1,
    "Label": [id_to_label[pred] for pred in test_preds]
})

submission.to_csv("submission_final.csv", index=False)
print("Submission file created!")

SyntaxError: '(' was never closed (628005962.py, line 81)

In [1]:
import pandas as pd
import numpy as np
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, get_linear_schedule_with_warmup
import warnings
warnings.filterwarnings('ignore')

# Advanced text cleaning function
def advanced_clean_text(text, max_length=512):
    if not isinstance(text, str):
        text = str(text)
    
    # Normalize Unicode characters
    text = text.lower()
    
    # Keep some punctuation that might be language-specific
    text = re.sub(r'[^\w\s\'"-]', '', text)
    
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text[:max_length]

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load training data
print("Loading and preprocessing data...")
train_path = "train_submission.csv"
train_df = pd.read_csv(train_path)

# Remove missing labels and clean data
train_df_cleaned = train_df.dropna(subset=["Label"])
train_df_cleaned["Cleaned_Text"] = train_df_cleaned["Text"].apply(advanced_clean_text)

# Remove empty texts
train_df_cleaned = train_df_cleaned[train_df_cleaned["Cleaned_Text"].str.len() > 0]

# Create label mapping
labels = train_df_cleaned["Label"].unique().tolist()
label_to_id = {label: i for i, label in enumerate(labels)}
id_to_label = {i: label for label, i in label_to_id.items()}

# Convert labels to IDs
train_df_cleaned["label_id"] = train_df_cleaned["Label"].map(label_to_id)

# Check and filter class distribution
class_counts = train_df_cleaned["Label"].value_counts()
min_samples = 10
valid_classes = class_counts[class_counts >= min_samples].index
train_df_filtered = train_df_cleaned[train_df_cleaned["Label"].isin(valid_classes)]

print(f"Number of classes: {len(valid_classes)}")
print(f"Sample distribution: {class_counts[valid_classes][:5]}...")

# Stratified sampling
max_samples = 20000
if len(train_df_filtered) > max_samples:
    train_df_filtered = train_df_filtered.groupby("Label", group_keys=False).apply(
        lambda x: x.sample(min(len(x), max_samples // len(valid_classes)), random_state=42)
    )

# Reset index to avoid potential indexing issues
train_df_filtered = train_df_filtered.reset_index(drop=True)

# Split data with stratification
X_train, X_val, y_train, y_val = train_test_split(
    train_df_filtered["Cleaned_Text"], 
    train_df_filtered["label_id"], 
    test_size=0.1,
    random_state=42, 
    stratify=train_df_filtered["label_id"]
)

# Use XLM-RoBERTa model
model_name = "xlm-roberta-base"
print(f"Loading {model_name}...")

tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
model = XLMRobertaForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_to_id),
    ignore_mismatched_sizes=True  # Add this to handle potential size mismatches
).to(device)

# Custom Dataset with safer data handling
from torch.utils.data import Dataset, DataLoader
import random

class SafeTextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        self.encodings = self._prepare_encodings()
    
    def _prepare_encodings(self):
        return self.tokenizer(
            self.texts, 
            truncation=True, 
            padding=True, 
            max_length=self.max_length, 
            return_tensors="pt"
        )
    
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item
    
    def __len__(self):
        return len(self.labels)

# Create datasets
train_dataset = SafeTextDataset(X_train, y_train, tokenizer)
val_dataset = SafeTextDataset(X_val, y_val, tokenizer)

# Improved training configuration with a suitable batch size for XLM-RoBERTa
batch_size = 16  # Can be reduced to 8 if memory issues occur
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Advanced training setup
from torch.optim import AdamW

# Optimized hyperparameters for XLM-RoBERTa
learning_rate = 1e-5  # Slightly lower learning rate for XLM-RoBERTa
epochs = 4  # Increased epochs for better performance
weight_decay = 0.01
warmup_ratio = 0.1

# Prepare optimizer and schedule
optimizer = AdamW(
    model.parameters(), 
    lr=learning_rate, 
    weight_decay=weight_decay
)

# Learning rate scheduler
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=int(warmup_ratio * total_steps),
    num_training_steps=total_steps
)

# Training loop with early stopping and accuracy tracking
from tqdm.auto import tqdm
import time

best_val_accuracy = 0
patience = 3  # Increased patience
no_improve_epochs = 0
training_start_time = time.time()

print("Training model...")
for epoch in range(epochs):
    epoch_start_time = time.time()
    
    # Training phase
    model.train()
    train_loss = 0
    
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        
        train_loss += loss.item()
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})
    
    avg_train_loss = train_loss / len(train_loader)
    
    # Validation phase
    model.eval()
    val_loss = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            
            val_loss += outputs.loss.item()
            
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(batch['labels'].cpu().numpy())
    
    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = accuracy_score(all_labels, all_preds)
    
    epoch_time = time.time() - epoch_start_time
    
    print(f"Epoch {epoch+1}/{epochs} - Time: {epoch_time:.2f}s")
    print(f"Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    
    # Early stopping logic
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        no_improve_epochs = 0
        # Save best model
        torch.save(model.state_dict(), 'best_xlm_roberta_model.pth')
        print(f"✓ New best model saved with accuracy: {val_accuracy:.4f}")
    else:
        no_improve_epochs += 1
        print(f"No improvement for {no_improve_epochs} epochs")
        
    if no_improve_epochs >= patience:
        print(f"Early stopping triggered after {epoch+1} epochs")
        break

total_training_time = time.time() - training_start_time
print(f"Total training time: {total_training_time:.2f} seconds")

# Load best model for final evaluation
model.load_state_dict(torch.load('best_xlm_roberta_model.pth'))
model.eval()

# Final evaluation
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Final evaluation"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        
        preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(batch['labels'].cpu().numpy())

final_accuracy = accuracy_score(all_labels, all_preds)
print(f"Final validation accuracy: {final_accuracy:.4f}")
print(classification_report(all_labels, all_preds))

# Process test data
print("Processing test data...")
test_path = "test_without_labels.csv"
test_df = pd.read_csv(test_path)

# Clean test data
test_df["Cleaned_Text"] = test_df["Text"].apply(advanced_clean_text)

# Create test dataset
class TestDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.encodings = tokenizer(
            texts.tolist(), 
            truncation=True, 
            padding=True, 
            max_length=max_length, 
            return_tensors="pt"
        )
    
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}
    
    def __len__(self):
        return len(self.encodings.input_ids)

test_dataset = TestDataset(test_df["Cleaned_Text"], tokenizer)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Generate predictions
model.eval()
all_test_preds = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Generating predictions"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        
        preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
        all_test_preds.extend(preds)

# Convert predictions back to original labels
predicted_labels = [id_to_label[pred_id] for pred_id in all_test_preds]

# Create submission file
submission = pd.DataFrame({
    "ID": test_df.index + 1,
    "Label": predicted_labels
})

# Save submission file
submission_path = "submission_file.csv"
submission.to_csv(submission_path, index=False)

print(f"✅ Submission file '{submission_path}' generated successfully!")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda
Loading and preprocessing data...
Number of classes: 374
Sample distribution: Label
tgk    1500
hbs    1000
mon    1000
crh    1000
som    1000
Name: count, dtype: int64...
Loading xlm-roberta-base...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training model...


Epoch 1/4: 100%|██████████| 1100/1100 [02:18<00:00,  7.92it/s, loss=5.3197]
Validating: 100%|██████████| 123/123 [00:03<00:00, 31.99it/s]


Epoch 1/4 - Time: 142.77s
Train Loss: 5.7318, Val Loss: 5.0986
Validation Accuracy: 0.2757
✓ New best model saved with accuracy: 0.2757


Epoch 2/4: 100%|██████████| 1100/1100 [02:24<00:00,  7.59it/s, loss=4.6401]
Validating: 100%|██████████| 123/123 [00:03<00:00, 33.80it/s]


Epoch 2/4 - Time: 148.63s
Train Loss: 4.8803, Val Loss: 4.2738
Validation Accuracy: 0.3867
✓ New best model saved with accuracy: 0.3867


Epoch 3/4: 100%|██████████| 1100/1100 [02:23<00:00,  7.68it/s, loss=4.7164]
Validating: 100%|██████████| 123/123 [00:03<00:00, 33.87it/s]


Epoch 3/4 - Time: 146.94s
Train Loss: 4.2757, Val Loss: 3.8597
Validation Accuracy: 0.4440
✓ New best model saved with accuracy: 0.4440


Epoch 4/4: 100%|██████████| 1100/1100 [02:24<00:00,  7.61it/s, loss=4.1735]
Validating: 100%|██████████| 123/123 [00:03<00:00, 34.21it/s]


Epoch 4/4 - Time: 148.15s
Train Loss: 3.9681, Val Loss: 3.7116
Validation Accuracy: 0.4588


KeyboardInterrupt: 

In [None]:
# 📌 Installer les packages nécessaires si besoin
# !pip install transformers datasets torch scikit-learn pandas numpy tqdm accelerate

import pandas as pd
import numpy as np
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.cuda.amp import GradScaler, autocast

# ✅ Détection du GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🚀 Using device: {device}")

# ✅ Chargement des données
train_path = "train_submission.csv"
train_df = pd.read_csv(train_path).dropna(subset=["Label"])

# ✅ Nettoyage des textes
def clean_text(text, max_length=256):  # ✅ Réduction à 256 tokens
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text[:max_length]

train_df["Cleaned_Text"] = train_df["Text"].apply(clean_text)
train_df = train_df[train_df["Cleaned_Text"].str.len() > 0]

# ✅ Encodage des labels
labels = train_df["Label"].unique().tolist()
label_to_id = {label: i for i, label in enumerate(labels)}
id_to_label = {i: label for label, i in label_to_id.items()}
train_df["label_id"] = train_df["Label"].map(label_to_id)

# ✅ Filtrer les classes qui ont au moins 2 échantillons
valid_classes = train_df["Label"].value_counts()
valid_classes = valid_classes[valid_classes >= 2].index
train_df = train_df[train_df["Label"].isin(valid_classes)]

# ✅ Réduction du dataset (max 25 000 exemples)
max_samples = 25000
if len(train_df) > max_samples:
    train_df = train_df.groupby("Label", group_keys=False).apply(
        lambda x: x.sample(min(len(x), max_samples // len(labels)), random_state=42)
    )

# ✅ Division Train/Validation
X_train, X_val, y_train, y_val = train_test_split(
    train_df["Cleaned_Text"], train_df["label_id"], test_size=0.1, stratify=train_df["label_id"], random_state=42
)

# ✅ Chargement du modèle RoBERTa (Meilleur que DistilBERT)
model_name = "roberta-large"  # 📌 Plus puissant que DistilBERT
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_to_id)).to(device)

# ✅ Activation du `gradient_checkpointing` (économie de mémoire)
model.gradient_checkpointing_enable()

# ✅ Définition du Dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):  # ✅ max_length optimisé
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=max_length, return_tensors="pt")
        self.labels = torch.tensor(labels.tolist(), dtype=torch.long)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# ✅ Création des DataLoaders
batch_size = 32  # ✅ Optimisé pour GPU
train_dataset = TextDataset(X_train, y_train, tokenizer)
val_dataset = TextDataset(X_val, y_val, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# ✅ Optimisation : AdamW + Scheduler
learning_rate = 2e-5  # ✅ LR plus bas pour une meilleure convergence
epochs = 5  # ✅ Augmenté pour une meilleure précision
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=1000, num_training_steps=epochs * len(train_loader))

# ✅ Mixed Precision Training
scaler = GradScaler()

# ✅ Entraînement (Optimisé)
print("🚀 Training model...")
for epoch in range(epochs):
    model.train()
    train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()

        with autocast():  # ✅ Mixed Precision Training
            outputs = model(**batch)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        train_loss += loss.item()
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    print(f"Epoch {epoch+1}/{epochs} - Avg Training Loss: {train_loss / len(train_loader):.4f}")

    # ✅ Validation
    model.eval()
    all_preds, all_labels = [], []
    val_loss = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(batch['labels'].cpu().numpy())

    print(f"Validation Accuracy: {accuracy_score(all_labels, all_preds):.4f}")

print("✅ Training Complete")