In [None]:
# Import the libraries
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from scipy.sparse import hstack, csr_matrix
import joblib
import nltk
from nltk.corpus import stopwords
from flask import Flask, request, jsonify
from flask_cors import CORS

# -----------------------------
# Step 0: download the French stopword list
# -----------------------------
# Fetch the NLTK "stopwords" corpus so the French list can be loaded below.
nltk.download('stopwords')
french_stopwords = stopwords.words('french')

# -----------------------------
# Step 1: load the multilingual SMS spam dataset and keep the French text
# -----------------------------
dataset = load_dataset("dbarbedillo/SMS_Spam_Multilingual_Collection_Dataset")

# Keep only the French version of each message
texts_fr = [x["text_fr"] for x in dataset["train"]]  # French text column
labels = [1 if x["labels"] == "spam" else 0 for x in dataset["train"]]  # "labels" column: 1 for "spam", 0 for anything else

print("Nombre de messages :", len(texts_fr))
print("Labels disponibles :", set(labels))
print("Exemple message :", texts_fr[0], "| Label :", labels[0])

# -----------------------------
# Step 2: train/test split
# -----------------------------
# 80/20 split, stratified on the labels so both parts keep the same
# spam/ham ratio; the fixed random_state makes the split reproducible.
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    texts_fr, labels, test_size=0.2, random_state=42, stratify=labels
)

# -----------------------------
# Step 3: TF-IDF (with French stopwords)
# -----------------------------
# max_df=0.9 drops terms found in more than 90% of the training messages and
# max_features=2000 caps the vocabulary size. The vectorizer is fitted on the
# training texts only and then reused as-is to transform the test texts.
vectorizer = TfidfVectorizer(stop_words=french_stopwords, max_df=0.9, max_features=2000)
X_train_tfidf = vectorizer.fit_transform(X_train_texts)
X_test_tfidf = vectorizer.transform(X_test_texts)

# -----------------------------
# Step 4: additional hand-crafted features
# -----------------------------
def add_manual_features(texts):
    """Build simple hand-crafted spam features for a batch of messages.

    For every message, three counts are produced, in this column order:

    0. number of digit characters,
    1. number of special symbols among ``!``, ``$`` and the euro sign,
    2. total length of the message in characters.

    Args:
        texts: Iterable of message strings. It is traversed only once, so a
            generator is accepted as well as a list.

    Returns:
        An integer ``numpy.ndarray`` of shape ``(n_messages, 3)``; an empty
        input yields an array of shape ``(0, 3)``.
    """
    # The euro sign is written as an escape so the symbol set survives the
    # file being saved or read with the wrong encoding.
    special_symbols = "!$\u20ac"

    rows = [
        (
            sum(ch.isdigit() for ch in text),
            sum(ch in special_symbols for ch in text),
            len(text),
        )
        for text in texts
    ]
    # reshape keeps the result two-dimensional even when there are no rows.
    return np.array(rows, dtype=np.int64).reshape(-1, 3)

# Wrap the dense hand-crafted features as sparse matrices so they can be
# stacked next to the sparse TF-IDF matrices.
X_train_manual = csr_matrix(add_manual_features(X_train_texts))
X_test_manual = csr_matrix(add_manual_features(X_test_texts))

# -----------------------------
# Step 5: combine TF-IDF with the hand-crafted features
# -----------------------------
# Column layout: all TF-IDF columns first, then the hand-crafted columns.
# Anything fed to the model later must follow the same layout.
X_train_final = hstack([X_train_tfidf, X_train_manual])
X_test_final = hstack([X_test_tfidf, X_test_manual])

print("Données combinées TF-IDF + features manuelles ✅")

# -----------------------------
# Step 6: train the Random Forest
# -----------------------------
# class_weight='balanced' re-weights the classes inversely to their
# frequency, which compensates for spam being the minority class.
rf_model = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    class_weight='balanced'
)
rf_model.fit(X_train_final, y_train)
print("Modèle Random Forest entraîné ✅")

# -----------------------------
# Step 7: evaluate the model
# -----------------------------
preds = rf_model.predict(X_test_final)
print("Accuracy :", accuracy_score(y_test, preds))
print("\nClassification Report :\n", classification_report(y_test, preds, target_names=["HAM", "SPAM"]))

# -----------------------------
# Step 8: save the model and the vectorizer
# -----------------------------
# Only these two objects are persisted. Code that reloads them must also
# reproduce add_manual_features, because the model expects its extra columns
# after the TF-IDF ones.
joblib.dump(rf_model, "spam_rf_model_fr.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer_fr.pkl")
print("Modèle et vectorizer sauvegardés ✅")

# -----------------------------
# Step 9: predict a new message
# -----------------------------
def predict_message(message):
    """Classify a single message as spam or ham.

    The message goes through the same feature pipeline as the training data:
    TF-IDF from the fitted module-level ``vectorizer``, followed by the
    columns produced by ``add_manual_features``.

    Args:
        message: The raw message text.

    Returns:
        A dict with ``"prediction"`` (``"SPAM"`` or ``"HAM"``) and
        ``"confidence"``, the probability of the predicted class expressed
        as a percentage rounded to two decimals (a builtin ``float``).
    """
    tfidf_vec = vectorizer.transform([message])
    manual_vec = csr_matrix(add_manual_features([message]))
    combined_vec = hstack([tfidf_vec, manual_vec])

    # Run the forest once: the predicted class is the most probable one, so
    # a separate predict() call would only repeat the same tree traversal.
    probabilities = rf_model.predict_proba(combined_vec)[0]
    best = int(np.argmax(probabilities))
    # Map the column position back to its label through classes_ instead of
    # assuming that label values double as column indices.
    pred = rf_model.classes_[best]
    conf = float(probabilities[best]) * 100
    return {"prediction": "SPAM" if pred == 1 else "HAM", "confidence": round(conf, 2)}

# -----------------------------
# Step 10: smoke tests
# -----------------------------
# One hand-written spam-like message and one ordinary message, run through
# predict_message to eyeball the prediction and its confidence.
message_test_spam = "Je n'y connaissais rien en informatique ou en finance. En suivant simplement les instructions pendant 10 minutes par jour, j'ai généré 4 500 € dès la première semaine. Ma vie a totalement changé, merci !"
message_test_ham = "Salut, on se retrouve demain pour le cours de maths ?"

result_spam = predict_message(message_test_spam)
result_ham = predict_message(message_test_ham)

print("\nTest SPAM :", message_test_spam)
print("Prédiction :", result_spam["prediction"], "| Confiance :", result_spam["confidence"], "%")

print("\nTest HAM :", message_test_ham)
print("Prédiction :", result_ham["prediction"], "| Confiance :", result_ham["confidence"], "%")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Nombre de messages : 5572
Labels disponibles : {0, 1}
Exemple message : Allez jusqu'à Jurong point, fou.. Disponible seulement dans bugis n grand monde la e buffet... Cine il y a eu plus... | Label : 0
Données combinées TF-IDF + features manuelles ✅
Modèle Random Forest entraîné ✅
Accuracy : 0.9883408071748879

Classification Report :
               precision    recall  f1-score   support

         HAM       0.99      1.00      0.99       966
        SPAM       1.00      0.91      0.95       149

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

Modèle et vectorizer sauvegardés ✅

Test SPAM : Je n'y connaissais rien en informatique ou en finance. En suivant simplement les instructions pendant 10 minutes par jour, j'ai généré 4 500 € dès la première semaine. Ma vie a totalement changé, merci !
Prédiction : SPAM | Confiance : 67.67 %

Test HAM : Salut, o