**1. L'importation des bibliothèques**

In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
from torch.utils.data import Dataset

**2. Chargement des données**

In [2]:
data = pd.read_excel('/content/hotel_reviews_preprocessed.xlsx')

**3. Préparation des données pour le fine-tunning**

In [3]:
X = data['Normalized_Review']
y = data['Sentiment']
y = y.map({'Positive': 1, 'Negative': 0})

**4. Chargement du modèle CAMeL-Lab/bert-base-arabic-camelbert-da-sentiment**

In [5]:
MODEL_NAME = 'CAMeL-Lab/bert-base-arabic-camelbert-da-sentiment'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3, ignore_mismatched_sizes=True)
# Préparer QLoRA
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Classification de séquence
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query", "key", "value"],
)
model = get_peft_model(model, lora_config)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/305k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [6]:
#Déplacement du modèle sur GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30000, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default

**5. Crétion d'une classe personnalisée pour le dataset**

In [7]:
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

**6.Préparation les données d'entrée pour l'entraînement**

In [8]:
encodings = tokenizer(list(X), truncation=True, padding=True, max_length=256)
dataset = TextDataset(encodings, list(y))

**7. Configuration des arguments d'entraînement**

In [9]:
training_args = TrainingArguments(
    output_dir="./camelbert-finetuned-qlora",
    num_train_epochs=10,  # Nombre d'époques
    learning_rate=1e-4,  # Taux d'apprentissage pour QLoRA
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",  # Évaluation à chaque époque
    save_strategy="epoch",  # Sauvegarde à chaque époque
    fp16=True,  # Utiliser le calcul en précision mixte
    save_total_limit=2,
    push_to_hub=False,  # Ne pas envoyer les modèles sur le Hub
)



**8. Création l'objet Trainer**

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(


**9. Lancement du fine-tuning**

In [11]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.5936,0.554933
2,0.1073,0.220019
3,0.2376,0.146399
4,0.1399,0.115268
5,0.2093,0.103223
6,0.0941,0.096143
7,0.2672,0.078656
8,0.0285,0.073192
9,0.0049,0.063772
10,0.0028,0.063094


TrainOutput(global_step=2500, training_loss=0.387016287188977, metrics={'train_runtime': 416.7061, 'train_samples_per_second': 47.971, 'train_steps_per_second': 5.999, 'total_flos': 2657054946170880.0, 'train_loss': 0.387016287188977, 'epoch': 10.0})

**La sauvegarde du Model**

In [13]:
from google.colab import drive
import os

drive.mount('/content/drive')

drive_model_dir = "/content/drive/My Drive/camelbert-finetuned"

model.save_pretrained(drive_model_dir)
tokenizer.save_pretrained(drive_model_dir)

print(f"Le modèle est sauvegardé dans : {drive_model_dir}")


Mounted at /content/drive
Le modèle est sauvegardé dans : /content/drive/My Drive/camelbert-finetuned


**10. Évaluation du modèle fine-tuné**

In [14]:
from sklearn.metrics import classification_report
import torch

test_data = pd.read_excel('/content/hotel_reviews_test.xlsx')
X_test = test_data['Normalized_Review']
y_test = test_data['Sentiment'].map({'Positive': 1, 'Negative': 0})

encodings_test = tokenizer(list(X_test), truncation=True, padding=True, max_length=256)
test_dataset = TextDataset(encodings_test, list(y_test))

# Faire des prédictions sur le jeu de test
predictions = trainer.predict(test_dataset)

predicted_labels = predictions.predictions.argmax(axis=-1)  # Indices des classes prédites

# Afficher le classification_report
print(classification_report(y_test, predicted_labels, target_names=["Negative", "Positive"]))


              precision    recall  f1-score   support

    Negative       0.95      0.95      0.95      1190
    Positive       0.93      0.93      0.93       809

    accuracy                           0.94      1999
   macro avg       0.94      0.94      0.94      1999
weighted avg       0.94      0.94      0.94      1999



**11. Test du modele**

In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Chargement du tokenizer et le modèle depuis Google Drive
model = AutoModelForSequenceClassification.from_pretrained(drive_model_dir)
tokenizer = AutoTokenizer.from_pretrained(drive_model_dir)

print("Modèle et tokenizer chargés depuis Google Drive.")

Modèle et tokenizer chargés depuis Google Drive.


In [20]:
# Vérifier si un GPU est disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Fonction pour prédire le sentiment d'une critique
def predict_sentiment(review):
    # Tokeniser la revue
    inputs = tokenizer(review, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Faire la prédiction
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=-1).item()

    if predicted_class == 1:
        return "Positif"
    else:
        return "Négatif"

while True:
    review = input("Entrez une critique (ou 'quit' pour quitter) : ")

    if review.lower() == 'quit':
        print("Au revoir!")
        break

    sentiment = predict_sentiment(review)
    print(f"Sentiment prédit : {sentiment}")

Entrez une critique (ou 'quit' pour quitter) :  استمتعت بكل لحظة في هذا الفندق. الإفطار كان لذيذًا والخدمات ممتازة. سأعود بالتأكيد
Sentiment prédit : Positif
Entrez une critique (ou 'quit' pour quitter) :  الفندق قديم، يحتاج إلى تجديد
Sentiment prédit : Négatif
Entrez une critique (ou 'quit' pour quitter) : موقع الفندق ممتاز، قريب من كل شيء
Sentiment prédit : Positif
Entrez une critique (ou 'quit' pour quitter) : quit
Au revoir!
