# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from tqdm import tqdm
import random

# Initial configuration: seed every random number generator used by this
# script with the same value so that runs are repeatable.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(42)

# Pinned to the CPU to avoid CUDA errors.
device = torch.device("cpu")

# Load the raw train and test splits from the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Optional: cap the number of rows to speed up experiments
# (leave commented out to use the full dataset).
# train_df = train_df.head(1000)
# test_df = test_df.head(1000)

def clean_text(text):
    """Return *text* with whitespace normalised, or "neutral" if it is missing.

    The value is converted with ``str()``, split on any run of whitespace and
    re-joined with single spaces, so the result has no leading, trailing or
    repeated whitespace. Missing values (as detected by ``pd.isna``) are
    replaced by the placeholder string "neutral".
    """
    if pd.isna(text):
        return "neutral"
    words = str(text).split()
    # Joining the split words can never produce outer whitespace, so no
    # extra strip is required.
    return ' '.join(words)

# Normalise the text column of both splits with the same cleaner.
for _frame in (train_df, test_df):
    _frame['text'] = _frame['text'].apply(clean_text)

# Data sanity checks: show the available columns and count the missing
# values left in the two training columns used below.
print("Train columns:", train_df.columns)
print("Test columns:", test_df.columns)
_missing_counts = train_df[['text', 'sentiment']].isna().sum()
print("NaN in train_df['text']:", _missing_counts['text'])
print("NaN in train_df['sentiment']:", _missing_counts['sentiment'])

# Sentiment name -> integer class id
sentiment_map = dict(negative=0, neutral=1, positive=2)
print("Unique sentiments before mapping:", train_df['sentiment'].unique())
train_df['label'] = train_df['sentiment'].map(sentiment_map)

# Drop the rows whose sentiment has no entry in the mapping; `map` leaves
# their label as NaN.
_unmapped = train_df['label'].isna()
invalid_rows = train_df[_unmapped]
if _unmapped.any():
    print("Invalid sentiments found:", invalid_rows['sentiment'].unique())
    train_df = train_df[train_df['label'].notna()]
    print("Removed invalid rows. New train_df size:", len(train_df))

# Final check after the clean-up: every remaining label must be a known
# class id.
print("Unique labels after mapping:", train_df['label'].unique())
_unknown_labels = ~train_df['label'].isin([0, 1, 2])
if _unknown_labels.any():
    raise ValueError("Labels must be in [0, 1, 2]")


# Tokenization with DistilBERT: load the pretrained uncased vocabulary and
# lower-case the input text.
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased',
    do_lower_case=True,
)

def encode_texts(texts, max_len=64):
    """Tokenize a batch of strings into fixed-length PyTorch tensors.

    Args:
        texts: Iterable of strings to encode. It is materialised as a list,
            so a pandas Series or a generator is accepted as well.
        max_len: Length, special tokens included, that every sequence is
            padded or truncated to.

    Returns:
        The tokenizer's batch encoding. Callers in this file read its
        'input_ids' and 'attention_mask' entries, which are returned as
        PyTorch tensors because of ``return_tensors='pt'``.
    """
    # Call the tokenizer object directly: `batch_encode_plus` is deprecated
    # in favour of `__call__`, which accepts the same keyword arguments.
    return tokenizer(
        list(texts),
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

# Encode the training texts and build the label tensor that goes with them.
encoded_train = encode_texts(train_df['text'].tolist())
input_ids, attention_masks = encoded_train['input_ids'], encoded_train['attention_mask']
# Labels are stored as int64 (torch.long) class ids.
labels = torch.tensor(train_df['label'].values, dtype=torch.long)
print(f"Input IDs shape: {input_ids.shape}")
print(f"Labels shape: {labels.shape}")

# Build the DataLoaders from a random 80/20 train/validation split.
dataset = TensorDataset(input_ids, attention_masks, labels)
_n_samples = len(dataset)
train_size = int(0.8 * _n_samples)
val_size = _n_samples - train_size  # the remainder, so both sizes sum to the total
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
)

# Small batches keep CPU training manageable; only the training loader is
# shuffled.
_loader_options = dict(batch_size=4)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, **_loader_options)

# Model initialisation: pretrained DistilBERT with a 3-class classification
# head, returning neither attention weights nor hidden states, moved to the
# configured device (`Module.to` returns the module itself).
_model_options = dict(
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
)
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", **_model_options
).to(device)

# Training configuration
epochs = 2  # kept low to shorten the run
# The scheduler advances once per batch, so it needs the total batch count.
total_steps = epochs * len(train_loader)
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
# Linear decay of the learning rate over the whole run, without warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training loop: one full pass over train_loader per epoch.
for epoch in range(epochs):
    model.train()
    _batch_losses = []

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        # Each batch holds (input ids, attention mask, labels).
        inputs, masks, lbls = (tensor.to(device) for tensor in batch)

        model.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=lbls)
        loss = outputs.loss
        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        _batch_losses.append(loss.item())

    # Mean of the per-batch losses for this epoch.
    total_loss = sum(_batch_losses)
    print(f"Perte moyenne: {total_loss/len(train_loader):.4f}")

# Validation: evaluate the trained model on the held-out split.
#
# Metrics are accumulated per sample rather than as a mean of per-batch
# means. When the last batch is smaller than the others, averaging batch
# means gives its samples too much weight and skews both numbers.
model.eval()
val_loss_sum = 0.0  # sum over samples of the loss
val_correct = 0     # number of correctly classified samples
val_total = 0       # number of samples seen
with torch.no_grad():  # no gradients are needed for evaluation
    for batch in tqdm(val_loader, desc="Validation"):
        inputs, masks, lbls = tuple(t.to(device) for t in batch)
        outputs = model(inputs, attention_mask=masks, labels=lbls)

        n_in_batch = lbls.size(0)
        # The model reports the batch-mean loss (per the transformers
        # documentation); scale it back to a per-sample sum before adding.
        val_loss_sum += outputs.loss.item() * n_in_batch
        preds = torch.argmax(outputs.logits, dim=1)
        val_correct += (preds == lbls).sum().item()
        val_total += n_in_batch

# An empty validation split yields NaN instead of a ZeroDivisionError.
val_loss = val_loss_sum / val_total if val_total else float("nan")
val_accuracy = val_correct / val_total if val_total else float("nan")

print(f"Validation Loss: {val_loss:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Prediction
def predict(texts, batch_size=4):
    """Predict a sentiment name for every text with the fine-tuned model.

    Args:
        texts: Iterable of strings to classify. It is materialised as a
            list, so any iterable is accepted.
        batch_size: Number of texts sent through the model per forward pass.
            The default of 4 matches the loaders used for training.

    Returns:
        A list with one of 'negative', 'neutral' or 'positive' per input
        text, in input order. An empty input gives an empty list.
    """
    texts = list(texts)
    if not texts:
        # Nothing to classify: return before tokenizing, since building
        # tensors from an empty batch is not guaranteed to work.
        return []

    # Position in the tuple is the class id, mirroring the encoding used for
    # the training labels (negative=0, neutral=1, positive=2).
    label_names = ('negative', 'neutral', 'positive')

    model.eval()
    encoded = encode_texts(texts)
    pred_loader = DataLoader(
        TensorDataset(encoded['input_ids'], encoded['attention_mask']),
        batch_size=batch_size,
    )

    predictions = []
    with torch.no_grad():  # inference only, no gradients required
        for ids_batch, mask_batch in tqdm(pred_loader, desc="Predicting"):
            outputs = model(ids_batch.to(device), attention_mask=mask_batch.to(device))
            # The highest-scoring class per row, as plain Python ints.
            predictions.extend(torch.argmax(outputs.logits, dim=1).tolist())

    return [label_names[class_id] for class_id in predictions]

# Build the results file: one predicted sentiment per test row, next to the
# row's id.
test_predictions = predict(test_df['text'].tolist())
submission = test_df[['id']].assign(sentiment=test_predictions)
submission.to_csv('submission.csv', index=False)
print("Prédictions sauvegardées dans submission.csv")