In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Jun__8_16:49:14_PDT_2022
Cuda compilation tools, release 11.7, V11.7.99
Build cuda_11.7.r11.7/compiler.31442593_0


In [1]:
# 📌 Installer les packages nécessaires si besoin
# !pip install transformers datasets torch scikit-learn pandas numpy tqdm accelerate

import pandas as pd
import numpy as np
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.cuda.amp import GradScaler, autocast

# ✅ Détection du GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🚀 Using device: {device}")

# ✅ Chargement des données
train_path = "train_submission.csv"
train_df = pd.read_csv(train_path).dropna(subset=["Label"])

# ✅ Nettoyage des textes
def clean_text(text, max_length=256):  # ✅ Réduction à 256 tokens pour accélérer
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text[:max_length]

train_df["Cleaned_Text"] = train_df["Text"].apply(clean_text)
train_df = train_df[train_df["Cleaned_Text"].str.len() > 0]

# ✅ Encodage des labels
labels = train_df["Label"].unique().tolist()
label_to_id = {label: i for i, label in enumerate(labels)}
id_to_label = {i: label for label, i in label_to_id.items()}
train_df["label_id"] = train_df["Label"].map(label_to_id)

# ✅ Filtrer les classes qui ont au moins 5 échantillons
valid_classes = train_df["Label"].value_counts()
valid_classes = valid_classes[valid_classes >= 5].index
train_df = train_df[train_df["Label"].isin(valid_classes)]

# ✅ Réduction du dataset (max 25 000 exemples pour rapidité)
max_samples = 25000
if len(train_df) > max_samples:
    train_df = train_df.groupby("Label", group_keys=False).apply(
        lambda x: x.sample(min(len(x), max_samples // len(labels)), random_state=42)
    )

# ✅ Division Train/Validation
X_train, X_val, y_train, y_val = train_test_split(
    train_df["Cleaned_Text"], train_df["label_id"], test_size=0.1, stratify=train_df["label_id"], random_state=42
)

# ✅ Chargement du modèle DistilBERT (Rapide et performant)
model_name = "distilbert-base-uncased"  # 📌 Plus rapide que DeBERTa
print(f"🚀 Loading {model_name}...")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_to_id)).to(device)

# ✅ Définition du Dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):  # ✅ max_length optimisé
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=max_length, return_tensors="pt")
        self.labels = torch.tensor(labels.tolist(), dtype=torch.long)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# ✅ Création des DataLoaders
batch_size = 32  # ✅ Batch optimisé pour rapidité
train_dataset = TextDataset(X_train, y_train, tokenizer)
val_dataset = TextDataset(X_val, y_val, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# ✅ Optimisation : AdamW + Scheduler
learning_rate = 2e-5  # ✅ LR ajusté pour rapidité
epochs = 7  # ✅ Réduction des époques pour accélérer
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=500, num_training_steps=epochs * len(train_loader))

# ✅ Mixed Precision Training
scaler = GradScaler()

# ✅ Entraînement (Optimisé)
print("🚀 Training model...")
for epoch in range(epochs):
    model.train()
    train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()

        with autocast():  # ✅ Mixed Precision Training
            outputs = model(**batch)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        train_loss += loss.item()
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    print(f"Epoch {epoch+1}/{epochs} - Avg Training Loss: {train_loss / len(train_loader):.4f}")

    # ✅ Validation
    model.eval()
    all_preds, all_labels = [], []
    val_loss = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(batch['labels'].cpu().numpy())

    print(f"Validation Accuracy: {accuracy_score(all_labels, all_preds):.4f}")

print("✅ Training Complete")


  from .autonotebook import tqdm as notebook_tqdm


🚀 Using device: cuda
🚀 Loading distilbert-base-uncased...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


🚀 Training model...


  with autocast():  # ✅ Mixed Precision Training
Epoch 1/7: 100%|██████████| 663/663 [01:22<00:00,  7.99it/s, loss=4.3890]


Epoch 1/7 - Avg Training Loss: 5.4645


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.19it/s]


Validation Accuracy: 0.2294


  with autocast():  # ✅ Mixed Precision Training
Epoch 2/7: 100%|██████████| 663/663 [01:26<00:00,  7.66it/s, loss=2.7561]


Epoch 2/7 - Avg Training Loss: 3.5443


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.19it/s]


Validation Accuracy: 0.4334


  with autocast():  # ✅ Mixed Precision Training
Epoch 3/7: 100%|██████████| 663/663 [01:27<00:00,  7.57it/s, loss=1.8300]


Epoch 3/7 - Avg Training Loss: 2.4533


Validating: 100%|██████████| 74/74 [00:07<00:00,  9.99it/s]


Validation Accuracy: 0.5233


  with autocast():  # ✅ Mixed Precision Training
Epoch 4/7: 100%|██████████| 663/663 [01:28<00:00,  7.51it/s, loss=1.8888]


Epoch 4/7 - Avg Training Loss: 1.9162


Validating: 100%|██████████| 74/74 [00:07<00:00,  9.91it/s]


Validation Accuracy: 0.5882


  with autocast():  # ✅ Mixed Precision Training
Epoch 5/7: 100%|██████████| 663/663 [01:28<00:00,  7.47it/s, loss=1.6611]


Epoch 5/7 - Avg Training Loss: 1.6036


Validating: 100%|██████████| 74/74 [00:07<00:00,  9.80it/s]


Validation Accuracy: 0.6162


  with autocast():  # ✅ Mixed Precision Training
Epoch 6/7: 100%|██████████| 663/663 [01:29<00:00,  7.44it/s, loss=1.3162]


Epoch 6/7 - Avg Training Loss: 1.4190


Validating: 100%|██████████| 74/74 [00:07<00:00,  9.76it/s]


Validation Accuracy: 0.6183


  with autocast():  # ✅ Mixed Precision Training
Epoch 7/7: 100%|██████████| 663/663 [01:29<00:00,  7.44it/s, loss=1.0920]


Epoch 7/7 - Avg Training Loss: 1.3199


Validating: 100%|██████████| 74/74 [00:07<00:00,  9.69it/s]

Validation Accuracy: 0.6387
✅ Training Complete





In [1]:
# 📌 Installer les packages nécessaires si besoin
# !pip install transformers datasets torch scikit-learn pandas numpy tqdm accelerate

import pandas as pd
import numpy as np
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.cuda.amp import GradScaler, autocast

# ✅ Détection du GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🚀 Using device: {device}")

# ✅ Chargement des données
train_path = "train_submission.csv"
train_df = pd.read_csv(train_path).dropna(subset=["Label"])

# ✅ Nettoyage des textes
def clean_text(text, max_length=256):  # ✅ Réduction à 256 tokens
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text[:max_length]

train_df["Cleaned_Text"] = train_df["Text"].apply(clean_text)
train_df = train_df[train_df["Cleaned_Text"].str.len() > 0]

# ✅ Encodage des labels
labels = train_df["Label"].unique().tolist()
label_to_id = {label: i for i, label in enumerate(labels)}
id_to_label = {i: label for label, i in label_to_id.items()}
train_df["label_id"] = train_df["Label"].map(label_to_id)

# ✅ Filtrer les classes qui ont au moins 2 échantillons
valid_classes = train_df["Label"].value_counts()
valid_classes = valid_classes[valid_classes >= 2].index
train_df = train_df[train_df["Label"].isin(valid_classes)]

# ✅ Réduction du dataset (max 25 000 exemples)
max_samples = 25000
if len(train_df) > max_samples:
    train_df = train_df.groupby("Label", group_keys=False).apply(
        lambda x: x.sample(min(len(x), max_samples // len(labels)), random_state=42)
    )

# ✅ Division Train/Validation
X_train, X_val, y_train, y_val = train_test_split(
    train_df["Cleaned_Text"], train_df["label_id"], test_size=0.1, stratify=train_df["label_id"], random_state=42
)

# ✅ Chargement du modèle RoBERTa (Meilleur que DistilBERT)
model_name = "roberta-base"  # 📌 Plus puissant que DistilBERT
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_to_id)).to(device)

# ✅ Activation du `gradient_checkpointing` (économie de mémoire)
model.gradient_checkpointing_enable()

# ✅ Définition du Dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):  # ✅ max_length optimisé
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=max_length, return_tensors="pt")
        self.labels = torch.tensor(labels.tolist(), dtype=torch.long)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# ✅ Création des DataLoaders
batch_size = 32  # ✅ Optimisé pour GPU
train_dataset = TextDataset(X_train, y_train, tokenizer)
val_dataset = TextDataset(X_val, y_val, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# ✅ Optimisation : AdamW + Scheduler
learning_rate = 2e-5  # ✅ LR plus bas pour une meilleure convergence
epochs = 10  # ✅ Augmenté pour une meilleure précision
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=1000, num_training_steps=epochs * len(train_loader))

# ✅ Mixed Precision Training
scaler = GradScaler()

# ✅ Entraînement (Optimisé)
print("🚀 Training model...")
for epoch in range(epochs):
    model.train()
    train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()

        with autocast():  # ✅ Mixed Precision Training
            outputs = model(**batch)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        train_loss += loss.item()
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    print(f"Epoch {epoch+1}/{epochs} - Avg Training Loss: {train_loss / len(train_loader):.4f}")

    # ✅ Validation
    model.eval()
    all_preds, all_labels = [], []
    val_loss = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(batch['labels'].cpu().numpy())

    print(f"Validation Accuracy: {accuracy_score(all_labels, all_preds):.4f}")

print("✅ Training Complete")


  from .autonotebook import tqdm as notebook_tqdm


🚀 Using device: cuda


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


🚀 Training model...


  with autocast():  # ✅ Mixed Precision Training
Epoch 1/10: 100%|██████████| 664/664 [01:54<00:00,  5.79it/s, loss=4.8026]


Epoch 1/10 - Avg Training Loss: 5.4961


Validating: 100%|██████████| 74/74 [00:07<00:00,  9.68it/s]


Validation Accuracy: 0.2970


  with autocast():  # ✅ Mixed Precision Training
Epoch 2/10: 100%|██████████| 664/664 [01:46<00:00,  6.23it/s, loss=3.1351]


Epoch 2/10 - Avg Training Loss: 3.8583


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.51it/s]


Validation Accuracy: 0.4597


  with autocast():  # ✅ Mixed Precision Training
Epoch 3/10: 100%|██████████| 664/664 [01:47<00:00,  6.19it/s, loss=1.9742]


Epoch 3/10 - Avg Training Loss: 2.5508


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.50it/s]


Validation Accuracy: 0.5733


  with autocast():  # ✅ Mixed Precision Training
Epoch 4/10: 100%|██████████| 664/664 [01:46<00:00,  6.21it/s, loss=1.2819]


Epoch 4/10 - Avg Training Loss: 1.8354


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.51it/s]


Validation Accuracy: 0.6110


  with autocast():  # ✅ Mixed Precision Training
Epoch 5/10: 100%|██████████| 664/664 [01:46<00:00,  6.21it/s, loss=1.5340]


Epoch 5/10 - Avg Training Loss: 1.4243


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.51it/s]


Validation Accuracy: 0.6750


  with autocast():  # ✅ Mixed Precision Training
Epoch 6/10: 100%|██████████| 664/664 [01:47<00:00,  6.21it/s, loss=0.8184]


Epoch 6/10 - Avg Training Loss: 1.1545


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.51it/s]


Validation Accuracy: 0.6928


  with autocast():  # ✅ Mixed Precision Training
Epoch 7/10: 100%|██████████| 664/664 [01:47<00:00,  6.21it/s, loss=0.9530]


Epoch 7/10 - Avg Training Loss: 0.9738


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.52it/s]


Validation Accuracy: 0.7136


  with autocast():  # ✅ Mixed Precision Training
Epoch 8/10: 100%|██████████| 664/664 [01:47<00:00,  6.20it/s, loss=0.5812]


Epoch 8/10 - Avg Training Loss: 0.8431


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.51it/s]


Validation Accuracy: 0.7263


  with autocast():  # ✅ Mixed Precision Training
Epoch 9/10: 100%|██████████| 664/664 [01:47<00:00,  6.19it/s, loss=0.9668]


Epoch 9/10 - Avg Training Loss: 0.7561


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.50it/s]


Validation Accuracy: 0.7331


  with autocast():  # ✅ Mixed Precision Training
Epoch 10/10: 100%|██████████| 664/664 [01:47<00:00,  6.20it/s, loss=0.7178]


Epoch 10/10 - Avg Training Loss: 0.7026


Validating: 100%|██████████| 74/74 [00:07<00:00, 10.51it/s]

Validation Accuracy: 0.7364
✅ Training Complete





In [None]:
# 📌 Installer les packages nécessaires si besoin
# !pip install transformers datasets torch scikit-learn pandas numpy tqdm accelerate

import pandas as pd
import numpy as np
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.cuda.amp import GradScaler, autocast

# ✅ Détection du GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🚀 Using device: {device}")

# ✅ Chargement des données
train_path = "train_submission.csv"
train_df = pd.read_csv(train_path).dropna(subset=["Label"])

# ✅ Nettoyage des textes
def clean_text(text, max_length=256):  # ✅ Réduction à 256 tokens
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text[:max_length]

train_df["Cleaned_Text"] = train_df["Text"].apply(clean_text)
train_df = train_df[train_df["Cleaned_Text"].str.len() > 0]

# ✅ Encodage des labels
labels = train_df["Label"].unique().tolist()
label_to_id = {label: i for i, label in enumerate(labels)}
id_to_label = {i: label for label, i in label_to_id.items()}
train_df["label_id"] = train_df["Label"].map(label_to_id)

# ✅ Filtrer les classes qui ont au moins 2 échantillons
valid_classes = train_df["Label"].value_counts()
valid_classes = valid_classes[valid_classes >= 2].index
train_df = train_df[train_df["Label"].isin(valid_classes)]

# ✅ Réduction du dataset (max 25 000 exemples)
max_samples = 25000
if len(train_df) > max_samples:
    train_df = train_df.groupby("Label", group_keys=False).apply(
        lambda x: x.sample(min(len(x), max_samples // len(labels)), random_state=42)
    )

# ✅ Division Train/Validation
X_train, X_val, y_train, y_val = train_test_split(
    train_df["Cleaned_Text"], train_df["label_id"], test_size=0.1, stratify=train_df["label_id"], random_state=42
)

# ✅ Chargement du modèle RoBERTa (Meilleur que DistilBERT)
model_name = "microsoft/deberta-v3-large"  # 📌 Plus puissant que DistilBERT
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_to_id)).to(device)

# ✅ Activation du `gradient_checkpointing` (économie de mémoire)
model.gradient_checkpointing_enable()

# ✅ Définition du Dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):  # ✅ max_length optimisé
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=max_length, return_tensors="pt")
        self.labels = torch.tensor(labels.tolist(), dtype=torch.long)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# ✅ Création des DataLoaders
batch_size = 32  # ✅ Optimisé pour GPU
train_dataset = TextDataset(X_train, y_train, tokenizer)
val_dataset = TextDataset(X_val, y_val, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# ✅ Optimisation : AdamW + Scheduler
learning_rate = 2e-5  # ✅ LR plus bas pour une meilleure convergence
epochs = 10  # ✅ Augmenté pour une meilleure précision
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=1000, num_training_steps=epochs * len(train_loader))

# ✅ Mixed Precision Training
scaler = GradScaler()

# ✅ Entraînement (Optimisé)
print("🚀 Training model...")
for epoch in range(epochs):
    model.train()
    train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()

        with autocast():  # ✅ Mixed Precision Training
            outputs = model(**batch)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        train_loss += loss.item()
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    print(f"Epoch {epoch+1}/{epochs} - Avg Training Loss: {train_loss / len(train_loader):.4f}")

    # ✅ Validation
    model.eval()
    all_preds, all_labels = [], []
    val_loss = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(batch['labels'].cpu().numpy())

    print(f"Validation Accuracy: {accuracy_score(all_labels, all_preds):.4f}")

print("✅ Training Complete")

🚀 Using device: cuda


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


🚀 Training model...


  with autocast():  # ✅ Mixed Precision Training
Epoch 1/10: 100%|██████████| 664/664 [08:56<00:00,  1.24it/s, loss=4.7588]


Epoch 1/10 - Avg Training Loss: 5.6270


Validating: 100%|██████████| 74/74 [00:32<00:00,  2.31it/s]


Validation Accuracy: 0.2169


  with autocast():  # ✅ Mixed Precision Training
Epoch 2/10: 100%|██████████| 664/664 [08:56<00:00,  1.24it/s, loss=2.4141]


Epoch 2/10 - Avg Training Loss: 3.2152


Validating: 100%|██████████| 74/74 [00:31<00:00,  2.31it/s]


Validation Accuracy: 0.4983


  with autocast():  # ✅ Mixed Precision Training
Epoch 3/10: 100%|██████████| 664/664 [08:56<00:00,  1.24it/s, loss=0.9851]


Epoch 3/10 - Avg Training Loss: 1.5922


Validating: 100%|██████████| 74/74 [00:32<00:00,  2.31it/s]


Validation Accuracy: 0.6475


  with autocast():  # ✅ Mixed Precision Training
Epoch 4/10: 100%|██████████| 664/664 [08:56<00:00,  1.24it/s, loss=0.9995]


Epoch 4/10 - Avg Training Loss: 1.0448


Validating: 100%|██████████| 74/74 [00:32<00:00,  2.31it/s]


Validation Accuracy: 0.7081


  with autocast():  # ✅ Mixed Precision Training
Epoch 5/10: 100%|██████████| 664/664 [08:56<00:00,  1.24it/s, loss=0.4428]


Epoch 5/10 - Avg Training Loss: 0.7092


Validating: 100%|██████████| 74/74 [00:32<00:00,  2.31it/s]


Validation Accuracy: 0.7352


  with autocast():  # ✅ Mixed Precision Training
Epoch 6/10: 100%|██████████| 664/664 [08:56<00:00,  1.24it/s, loss=0.3385]


Epoch 6/10 - Avg Training Loss: 0.5267


Validating: 100%|██████████| 74/74 [00:32<00:00,  2.31it/s]


Validation Accuracy: 0.7521


  with autocast():  # ✅ Mixed Precision Training
Epoch 7/10: 100%|██████████| 664/664 [08:56<00:00,  1.24it/s, loss=0.4267]


Epoch 7/10 - Avg Training Loss: 0.4022


Validating: 100%|██████████| 74/74 [00:32<00:00,  2.31it/s]


Validation Accuracy: 0.7589


  with autocast():  # ✅ Mixed Precision Training
Epoch 8/10: 100%|██████████| 664/664 [08:56<00:00,  1.24it/s, loss=0.4626]


Epoch 8/10 - Avg Training Loss: 0.3156


Validating: 100%|██████████| 74/74 [00:32<00:00,  2.31it/s]


Validation Accuracy: 0.7674


  with autocast():  # ✅ Mixed Precision Training
Epoch 9/10: 100%|██████████| 664/664 [08:56<00:00,  1.24it/s, loss=0.1822]


Epoch 9/10 - Avg Training Loss: 0.2421


Validating: 100%|██████████| 74/74 [00:31<00:00,  2.31it/s]


Validation Accuracy: 0.7716


  with autocast():  # ✅ Mixed Precision Training
Epoch 10/10:  34%|███▍      | 226/664 [03:02<05:53,  1.24it/s, loss=0.3311]

In [3]:
# Process test data
print("Processing test data...")
test_path = "test_without_labels.csv"
test_df = pd.read_csv(test_path)

# Clean test data
test_df["Cleaned_Text"] = test_df["Text"].apply(clean_text)

# Create test dataset and dataloader
class TestDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, 
                                  max_length=max_length, return_tensors="pt")
    
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item
    
    def __len__(self):
        return len(self.encodings.input_ids)

test_dataset = TestDataset(test_df["Cleaned_Text"], tokenizer)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Generate predictions
model.eval()
all_test_preds = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Generating predictions"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        
        # Get predictions
        preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
        all_test_preds.extend(preds)

# Convert predictions back to original labels
predicted_labels = [id_to_label[pred_id] for pred_id in all_test_preds]

# Create submission file
submission = pd.DataFrame({
    "ID": test_df.index + 1,  # Start IDs from 1
    "Label": predicted_labels
})

# Save submission file
submission_path = "submission_bert.csv"
submission.to_csv(submission_path, index=False)

print(f"✅ Submission file '{submission_path}' generated successfully!")

Processing test data...


Generating predictions: 100%|██████████| 5956/5956 [05:10<00:00, 19.21it/s]


✅ Submission file 'submission_bert.csv' generated successfully!


In [2]:
import pandas as pd
import numpy as np
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.cuda.amp import GradScaler, autocast

torch.backends.cuda.matmul.allow_tf32 = True  # ✅ Accélération supplémentaire

# ✅ Détection du GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🚀 Using device: {device}")

# ✅ Chargement des données
train_path = "train_submission.csv"
train_df = pd.read_csv(train_path).dropna(subset=["Label"])

# ✅ Nettoyage des textes
def clean_text(text, max_length=128):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text[:max_length]

train_df["Cleaned_Text"] = train_df["Text"].apply(clean_text)
train_df = train_df[train_df["Cleaned_Text"].str.len() > 0]

# ✅ Encodage des labels
labels = train_df["Label"].unique().tolist()
label_to_id = {label: i for i, label in enumerate(labels)}
id_to_label = {i: label for label, i in label_to_id.items()}
train_df["label_id"] = train_df["Label"].map(label_to_id)

# ✅ Filtrer les classes qui ont au moins 3 échantillons (moins strict)
valid_classes = train_df["Label"].value_counts()
valid_classes = valid_classes[valid_classes >= 3].index
train_df = train_df[train_df["Label"].isin(valid_classes)]

# ✅ Vérifier la taille du dataset après filtrage
print(f"Nombre total d'exemples après filtrage : {len(train_df)}")

# ✅ Réduction du dataset (max 20 000 exemples pour vitesse)
max_samples = 20000
if len(train_df) > max_samples:
    train_df = train_df.groupby("Label", group_keys=False).apply(
        lambda x: x.sample(min(len(x), max_samples // len(labels)), random_state=42)
    )

# ✅ Division Train/Validation
X_train, X_val, y_train, y_val = train_test_split(
    train_df["Cleaned_Text"], train_df["label_id"], test_size=0.1, stratify=train_df["label_id"], random_state=42
)

# ✅ Chargement du modèle **correctement initialisé**
model_name = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(label_to_id)
).to(device)

# ✅ Réinitialiser les poids de classification
torch.nn.init.xavier_uniform_(model.classifier.out_proj.weight)
torch.nn.init.zeros_(model.classifier.out_proj.bias)

# ✅ Activation de `gradient_checkpointing`
model.gradient_checkpointing_enable()

# ✅ Définition du Dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=max_length, return_tensors="pt")
        self.labels = torch.tensor(labels.tolist(), dtype=torch.long)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# ✅ Création des DataLoaders
batch_size = 32  # ✅ Plus stable
gradient_accumulation_steps = 2  # ✅ Accumulation pour économiser la mémoire GPU

train_dataset = TextDataset(X_train, y_train, tokenizer)
val_dataset = TextDataset(X_val, y_val, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# ✅ Optimisation : AdamW + Scheduler
learning_rate = 2e-5  # ✅ Learning rate plus stable
epochs = 10  # ✅ Légèrement augmenté pour convergence
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=1000, num_training_steps=epochs * len(train_loader))

# ✅ Mixed Precision Training
scaler = GradScaler()

# ✅ Entraînement optimisé (1h max)
print("🚀 Training model...")
for epoch in range(epochs):
    model.train()
    train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for step, batch in enumerate(progress_bar):
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()

        with autocast():
            outputs = model(**batch)
            loss = outputs.loss / gradient_accumulation_steps

        scaler.scale(loss).backward()

        if (step + 1) % gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()

        train_loss += loss.item()
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    print(f"Epoch {epoch+1}/{epochs} - Avg Training Loss: {train_loss / len(train_loader):.4f}")

    # ✅ Validation
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(batch['labels'].cpu().numpy())

    print(f"Validation Accuracy: {accuracy_score(all_labels, all_preds):.4f}")

print("✅ Training Complete")

🚀 Using device: cuda
Nombre total d'exemples après filtrage : 190087


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


🚀 Training model...


  with autocast():
Epoch 1/10: 100%|██████████| 531/531 [00:35<00:00, 14.88it/s, loss=2.8405]


Epoch 1/10 - Avg Training Loss: 2.9818


Validating: 100%|██████████| 59/59 [00:03<00:00, 19.52it/s]


Validation Accuracy: 0.0515


  with autocast():
Epoch 2/10: 100%|██████████| 531/531 [00:36<00:00, 14.39it/s, loss=2.4766]


Epoch 2/10 - Avg Training Loss: 2.5694


Validating: 100%|██████████| 59/59 [00:03<00:00, 19.64it/s]


Validation Accuracy: 0.2175


  with autocast():
Epoch 3/10: 100%|██████████| 531/531 [00:36<00:00, 14.40it/s, loss=2.4141]


Epoch 3/10 - Avg Training Loss: 2.0705


Validating: 100%|██████████| 59/59 [00:02<00:00, 19.85it/s]


Validation Accuracy: 0.3247


  with autocast():
Epoch 4/10: 100%|██████████| 531/531 [00:36<00:00, 14.42it/s, loss=2.0706]


Epoch 4/10 - Avg Training Loss: 1.6889


Validating: 100%|██████████| 59/59 [00:02<00:00, 19.73it/s]


Validation Accuracy: 0.3989


  with autocast():
Epoch 5/10: 100%|██████████| 531/531 [00:36<00:00, 14.41it/s, loss=1.5877]


Epoch 5/10 - Avg Training Loss: 1.3878


Validating: 100%|██████████| 59/59 [00:03<00:00, 19.08it/s]


Validation Accuracy: 0.4769


  with autocast():
Epoch 6/10: 100%|██████████| 531/531 [00:37<00:00, 14.35it/s, loss=0.3924]


Epoch 6/10 - Avg Training Loss: 1.1640


Validating: 100%|██████████| 59/59 [00:02<00:00, 19.74it/s]


Validation Accuracy: 0.5347


  with autocast():
Epoch 7/10: 100%|██████████| 531/531 [00:36<00:00, 14.46it/s, loss=0.6785]


Epoch 7/10 - Avg Training Loss: 1.0025


Validating: 100%|██████████| 59/59 [00:03<00:00, 19.30it/s]


Validation Accuracy: 0.5634


  with autocast():
Epoch 8/10: 100%|██████████| 531/531 [00:37<00:00, 14.30it/s, loss=0.6249]


Epoch 8/10 - Avg Training Loss: 0.8795


Validating: 100%|██████████| 59/59 [00:02<00:00, 19.75it/s]


Validation Accuracy: 0.6069


  with autocast():
Epoch 9/10: 100%|██████████| 531/531 [00:36<00:00, 14.43it/s, loss=1.2419]


Epoch 9/10 - Avg Training Loss: 0.7849


Validating: 100%|██████████| 59/59 [00:02<00:00, 19.74it/s]


Validation Accuracy: 0.6212


  with autocast():
Epoch 10/10: 100%|██████████| 531/531 [00:36<00:00, 14.56it/s, loss=0.8906]


Epoch 10/10 - Avg Training Loss: 0.7077


Validating: 100%|██████████| 59/59 [00:02<00:00, 20.18it/s]

Validation Accuracy: 0.6403
✅ Training Complete





In [1]:
# 📌 Installer les packages nécessaires si besoin
# !pip install transformers datasets torch scikit-learn pandas numpy tqdm accelerate

import pandas as pd
import numpy as np
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.cuda.amp import GradScaler, autocast

# ✅ Détection du GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🚀 Using device: {device}")

# ✅ Chargement des données
train_path = "train_submission.csv"
train_df = pd.read_csv(train_path).dropna(subset=["Label"])

# ✅ Nettoyage des textes
def clean_text(text, max_length=256):  # ✅ Réduction à 256 tokens
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text[:max_length]

train_df["Cleaned_Text"] = train_df["Text"].apply(clean_text)
train_df = train_df[train_df["Cleaned_Text"].str.len() > 0]

# ✅ Encodage des labels
labels = train_df["Label"].unique().tolist()
label_to_id = {label: i for i, label in enumerate(labels)}
id_to_label = {i: label for label, i in label_to_id.items()}
train_df["label_id"] = train_df["Label"].map(label_to_id)

# ✅ Filtrer les classes qui ont au moins 2 échantillons
valid_classes = train_df["Label"].value_counts()
valid_classes = valid_classes[valid_classes >= 2].index
train_df = train_df[train_df["Label"].isin(valid_classes)]

# ✅ Réduction du dataset (max 25 000 exemples)
max_samples = 25000
if len(train_df) > max_samples:
    train_df = train_df.groupby("Label", group_keys=False).apply(
        lambda x: x.sample(min(len(x), max_samples // len(labels)), random_state=42)
    )

# ✅ Division Train/Validation
X_train, X_val, y_train, y_val = train_test_split(
    train_df["Cleaned_Text"], train_df["label_id"], test_size=0.1, stratify=train_df["label_id"], random_state=42
)

# ✅ Chargement du modèle RoBERTa (Meilleur que DistilBERT)
model_name = "roberta-large"  # 📌 Plus puissant que DistilBERT
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_to_id)).to(device)

# ✅ Activation du `gradient_checkpointing` (économie de mémoire)
model.gradient_checkpointing_enable()

# ✅ Définition du Dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):  # ✅ max_length optimisé
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=max_length, return_tensors="pt")
        self.labels = torch.tensor(labels.tolist(), dtype=torch.long)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# ✅ Création des DataLoaders
batch_size = 32  # ✅ Optimisé pour GPU
train_dataset = TextDataset(X_train, y_train, tokenizer)
val_dataset = TextDataset(X_val, y_val, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# ✅ Optimisation : AdamW + Scheduler
learning_rate = 2e-5  # ✅ LR plus bas pour une meilleure convergence
epochs = 5  # ✅ Augmenté pour une meilleure précision
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=1000, num_training_steps=epochs * len(train_loader))

# ✅ Mixed Precision Training
scaler = GradScaler()

# ✅ Entraînement (Optimisé)
print("🚀 Training model...")
for epoch in range(epochs):
    model.train()
    train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()

        with autocast():  # ✅ Mixed Precision Training
            outputs = model(**batch)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        train_loss += loss.item()
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    print(f"Epoch {epoch+1}/{epochs} - Avg Training Loss: {train_loss / len(train_loader):.4f}")

    # ✅ Validation
    model.eval()
    all_preds, all_labels = [], []
    val_loss = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(batch['labels'].cpu().numpy())

    print(f"Validation Accuracy: {accuracy_score(all_labels, all_preds):.4f}")

print("✅ Training Complete")

  from .autonotebook import tqdm as notebook_tqdm


🚀 Using device: cuda


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


🚀 Training model...


  with autocast():  # ✅ Mixed Precision Training
Epoch 1/5: 100%|██████████| 664/664 [09:00<00:00,  1.23it/s, loss=4.0703]


Epoch 1/5 - Avg Training Loss: 5.1601


Validating: 100%|██████████| 74/74 [00:48<00:00,  1.53it/s]


Validation Accuracy: 0.3445


  with autocast():  # ✅ Mixed Precision Training
Epoch 2/5: 100%|██████████| 664/664 [09:02<00:00,  1.22it/s, loss=2.1680]


Epoch 2/5 - Avg Training Loss: 2.7878


Validating: 100%|██████████| 74/74 [00:49<00:00,  1.48it/s]


Validation Accuracy: 0.5746


  with autocast():  # ✅ Mixed Precision Training
Epoch 3/5: 100%|██████████| 664/664 [09:02<00:00,  1.22it/s, loss=1.4139]


Epoch 3/5 - Avg Training Loss: 1.5903


Validating: 100%|██████████| 74/74 [00:50<00:00,  1.47it/s]


Validation Accuracy: 0.6678


  with autocast():  # ✅ Mixed Precision Training
Epoch 4/5: 100%|██████████| 664/664 [09:02<00:00,  1.22it/s, loss=1.0661]


Epoch 4/5 - Avg Training Loss: 1.1267


Validating: 100%|██████████| 74/74 [00:50<00:00,  1.47it/s]


Validation Accuracy: 0.7174


  with autocast():  # ✅ Mixed Precision Training
Epoch 5/5: 100%|██████████| 664/664 [09:03<00:00,  1.22it/s, loss=1.3816]


Epoch 5/5 - Avg Training Loss: 0.8968


Validating: 100%|██████████| 74/74 [00:49<00:00,  1.49it/s]

Validation Accuracy: 0.7246
✅ Training Complete





In [None]:
# Process test data
print("Processing test data...")
test_path = "test_without_labels.csv"
test_df = pd.read_csv(test_path)

# Clean test data
test_df["Cleaned_Text"] = test_df["Text"].apply(clean_text)

# Create test dataset and dataloader
class TestDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, 
                                  max_length=max_length, return_tensors="pt")
    
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item
    
    def __len__(self):
        return len(self.encodings.input_ids)

test_dataset = TestDataset(test_df["Cleaned_Text"], tokenizer)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Generate predictions
model.eval()
all_test_preds = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Generating predictions"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        
        # Get predictions
        preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
        all_test_preds.extend(preds)

# Convert predictions back to original labels
predicted_labels = [id_to_label[pred_id] for pred_id in all_test_preds]

# Create submission file
submission = pd.DataFrame({
    "ID": test_df.index + 1,  # Start IDs from 1
    "Label": predicted_labels
})

# Save submission file
submission_path = "submission.csv"
submission.to_csv(submission_path, index=False)

print(f"✅ Submission file '{submission_path}' generated successfully!")

Processing test data...


Generating predictions: 100%|█████████▉| 5949/5956 [35:26<00:02,  2.83it/s]