In [2]:
# =========================
# 1. Imports
# =========================
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# =========================
# 2. Charger les données
# =========================
N_SAMPLES = 2000  # only the first N_SAMPLES rows are used
LABEL_MAP = {"Positive": 1, "Negative": 0}

data = pd.read_excel("../hotel_reviews_preprocessed.xlsx")

# Show which columns are available
print("Colonnes disponibles:", data.columns.tolist())

# Use 'review' when present, otherwise fall back to 'Normalized_Review'
review_col = 'review' if 'review' in data.columns else 'Normalized_Review'
sentiment_col = 'Sentiment'

reviews_head = data[review_col].iloc[:N_SAMPLES]
labels_head = data[sentiment_col].map(LABEL_MAP).iloc[:N_SAMPLES]

# Series.map returns NaN for any sentiment that is not a key of LABEL_MAP,
# which silently turns the label column into floats. Keep only rows that
# have both a review and a mapped label.
is_valid = reviews_head.notna() & labels_head.notna()
if not is_valid.all():
    print("Dropped rows with missing review/label:", int((~is_valid).sum()))

X = reviews_head[is_valid].astype(str)       # the tokenizer is fed list(X)
y = labels_head[is_valid].astype("int64")    # integer class ids 0 / 1

print("Samples:", len(X))
print("Labels:\n", y.value_counts())

Colonnes disponibles: ['no', 'Hotel name', 'rating', 'user type', 'room type', 'nights', 'review', 'Sentiment']
Samples: 2000
Labels:
 Sentiment
0    1140
1     860
Name: count, dtype: int64


In [4]:
# =========================
# 3. Split Train / Test
# =========================
# Hold out 20% of the samples for evaluation; the split is seeded and
# stratified on y so both parts keep the same label proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [6]:
# =========================
# 4. Modèle AraBERT
# =========================
MODEL_NAME = "aubmindlab/bert-base-arabertv2"

# Pick the GPU when one is visible to torch, otherwise stay on CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and a 2-label sequence-classification model from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [7]:
# =========================
# 5. Dataset
# =========================
class ReviewDataset(Dataset):
    """Torch dataset that tokenizes all texts up front.

    The module-level ``tokenizer`` is called once in ``__init__`` on the
    full list of texts (truncation on, padding on, ``max_length=64``);
    ``__getitem__`` then only converts the stored values to tensors.
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` and keep ``labels`` as a plain list."""
        text_list = list(texts)
        self.enc = tokenizer(text_list, truncation=True, padding=True, max_length=64)
        self.labels = list(labels)

    def __len__(self):
        """Return the number of stored labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample: every encoding field plus ``"labels"``, as tensors."""
        item = {}
        for key, values in self.enc.items():
            item[key] = torch.tensor(values[idx])
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Tokenize the train and test splits (train first, then test)
train_dataset, test_dataset = (
    ReviewDataset(X_train, y_train),
    ReviewDataset(X_test, y_test),
)

# =========================
# 6. Metrics
# =========================
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for an evaluation pass.

    Args:
        eval_pred: a pair that unpacks into ``(logits, labels)``. ``logits``
            must support ``.argmax(axis=1)``; presumably a
            (n_samples, n_classes) array handed over by ``Trainer`` —
            TODO confirm.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``
        and ``"f1"``; the last three use ``average="binary"``.
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)

    # zero_division=0 yields the same 0.0 scores as the default when no
    # sample is predicted (or labelled) positive, but without emitting an
    # UndefinedMetricWarning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", zero_division=0
    )
    acc = accuracy_score(labels, preds)

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [9]:
# =========================
# 7. Entraînement
# =========================
training_args = TrainingArguments(
    output_dir="./results_arabert",
    num_train_epochs=2,
    per_device_train_batch_size=32,
    learning_rate=2e-5,
    logging_steps=20,
    save_strategy="no",      # no intermediate checkpoints
    eval_strategy="epoch",   # evaluate on eval_dataset after each epoch
    report_to="none",
    # Mixed precision needs a CUDA device. The notebook falls back to CPU
    # when no GPU is present, so only request fp16 when CUDA is available
    # instead of failing on CPU-only machines.
    fp16=torch.cuda.is_available()
)

# NOTE(review): recent transformers releases deprecate `tokenizer=` in favour
# of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.evaluate()

NameError: name 'compute_metrics' is not defined

In [None]:
# =========================
# 8. Sauvegarde du modèle
# =========================
SAVE_PATH = "../arabert_sentiment_model"

# Write the fine-tuned model, then the tokenizer files, to the same directory
trainer.save_model(output_dir=SAVE_PATH)
tokenizer.save_pretrained(save_directory=SAVE_PATH)

print("✅ AraBERT sauvegardé")

✅ AraBERT sauvegardé


In [None]:
# =========================
# 9. Test rapide
# =========================
def predict(text):
    """Classify one review, or a list of reviews, with the module-level model.

    Args:
        text: a single string, or an iterable of strings.

    Returns:
        ``"Positif"`` when the highest logit is class 1, ``"Négatif"``
        otherwise. A single string gives a single label; an iterable gives
        a list of labels in the same order (empty input gives ``[]``).
    """
    single = isinstance(text, str)
    batch = [text] if single else list(text)
    if not batch:
        return []

    # Inference must run with dropout disabled; the model may still be in
    # training mode if this is called right after training.
    model.eval()

    inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits

    # argmax per row: a flattened argmax is only meaningful for one input
    class_ids = logits.argmax(dim=-1).tolist()
    results = ["Positif" if class_id == 1 else "Négatif" for class_id in class_ids]
    return results[0] if single else results

# Smoke test on two hand-written Arabic reviews
for sample in ("الفندق ممتاز والخدمة رائعة", "التجربة كانت سيئة جدا"):
    print(predict(sample))

Positif
Négatif
