In [1]:
# Etape 1 Importation des bibliothèque necessaires

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW
from sklearn.metrics import classification_report, accuracy_score
from google.colab import drive

# Monter Google Drive
drive.mount('/content/drive')

# Étape 2 : Charger les données à partir des fichiers CSV téléchargé (voir questionnaire de l'examen)

df_train = pd.read_csv('/content/drive/My Drive/train.csv')
df_test = pd.read_csv('/content/drive/My Drive/test.csv')
df_val = pd.read_csv('/content/drive/My Drive/valid.csv')

# Étape 3 : Créer un Dataset personnalisé

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Initialiser le tokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Créer les datasets

MAX_LENGTH = 128
BATCH_SIZE = 16

train_dataset = SentimentDataset(df_train['review'], df_train['polarity'], tokenizer, MAX_LENGTH)
test_dataset = SentimentDataset(df_test['review'], df_test['polarity'], tokenizer, MAX_LENGTH)
val_dataset = SentimentDataset(df_val['review'], df_val['polarity'], tokenizer, MAX_LENGTH)

# Créer les DataLoaders

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Étape 4 : Créer le modèle BERT

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Configurer le modèle pour l'entraînement

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)

# Étape 5 : Entraîner le modèle

EPOCHS = 2

for epoch in range(EPOCHS):
    model.train()
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    for i, batch in enumerate(train_dataloader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Afficher l'itération et la perte
        if (i + 1) % 1000 == 0:  # Afficher toutes les 1000 itérations
            print(f'  Iteration {i + 1}/{len(train_dataloader)}, Loss: {loss.item():.4f}')

# Étape 6 :Sauvegarde du modèle entraîné

torch.save(model.state_dict(), '/content/drive/My Drive/classification.pth')
print("Modèle sauvegardé avec succès.")

# Étape 7 : Évaluer le modèle sur l'ensemble de test

model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.extend(batch['labels'].cpu().numpy())

# Calculer l'accuracy pour l'ensemble de test

test_accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy sur l'ensemble de test : {test_accuracy:.4f}")

# Afficher le rapport de classification pour l'ensemble de test

print("Rapport de classification sur l'ensemble de test :")
print(classification_report(true_labels, predictions, target_names=["Négatif", "Positif"]))

# Étape 8 : Évaluer le modèle sur l'ensemble de validation

val_predictions, val_true_labels = [], []

with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        val_predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
        val_true_labels.extend(batch['labels'].cpu().numpy())

# Calculer l'accuracy pour l'ensemble de validation

val_accuracy = accuracy_score(val_true_labels, val_predictions)
print(f"Accuracy sur l'ensemble de validation : {val_accuracy:.4f}")

# Afficher le rapport de classification pour l'ensemble de validation

print("Rapport de classification sur l'ensemble de validation :")
print(classification_report(val_true_labels, val_predictions, target_names=["Négatif", "Positif"]))

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2
  Iteration 1000/10000, Loss: 0.3123
  Iteration 2000/10000, Loss: 0.5282
  Iteration 3000/10000, Loss: 0.1874
  Iteration 4000/10000, Loss: 0.1920
  Iteration 5000/10000, Loss: 0.3244
  Iteration 6000/10000, Loss: 0.2063
  Iteration 7000/10000, Loss: 0.1781
  Iteration 8000/10000, Loss: 0.2246
  Iteration 9000/10000, Loss: 0.2035
  Iteration 10000/10000, Loss: 0.1948
Epoch 2/2
  Iteration 1000/10000, Loss: 0.2977
  Iteration 2000/10000, Loss: 0.4362
  Iteration 3000/10000, Loss: 0.0286
  Iteration 4000/10000, Loss: 0.1166
  Iteration 5000/10000, Loss: 0.0930
  Iteration 6000/10000, Loss: 0.1892
  Iteration 7000/10000, Loss: 0.2472
  Iteration 8000/10000, Loss: 0.1013
  Iteration 9000/10000, Loss: 0.1476
  Iteration 10000/10000, Loss: 0.1134
Modèle sauvegardé avec succès.
Accuracy sur l'ensemble de test : 0.9181
Rapport de classification sur l'ensemble de test :
              precision    recall  f1-score   support

     Négatif       0.96      0.88      0.92     10408
     P