In [2]:
# ============================================
# FINAL MODEL RETRAINING
# ============================================

# Opening banner: the title framed by two 70-character rules.
print(
    "=" * 70,
    "üî• R√âENTRA√éNEMENT DU MOD√àLE LOGISTICREGRESSION",
    "=" * 70,
    sep="\n",
)

import pandas as pd
import numpy as np
import joblib
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack, csr_matrix
import warnings
warnings.filterwarnings('ignore')

# ============================================
# 1. DATA LOADING
# ============================================

print("\nüì¶ 1. CHARGEMENT DES DONN√âES COMPL√àTES", "-" * 70, sep="\n")

# Keep only the label column (v1) and the message column (v2) of the CSV and
# give them descriptive names.
df = (
    pd.read_csv('../data/spam.csv', encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

print(f"‚úÖ Donn√©es charg√©es: {len(df)} messages")
print(f"   ‚Ä¢ HAM: {(df['label'] == 'ham').sum()} messages")
print(f"   ‚Ä¢ SPAM: {(df['label'] == 'spam').sum()} messages")

# Numeric target: ham -> 0, spam -> 1. Any other label value maps to NaN.
label_to_num = {'ham': 0, 'spam': 1}
df['label_num'] = df['label'].map(label_to_num)

# ============================================
# 2. TEXT CLEANING
# ============================================

print("\nüßπ 2. NETTOYAGE DU TEXTE", "-" * 70, sep="\n")

def clean_text(text):
    """Return a lower-cased copy of ``text`` with noisy tokens stripped.

    The input is coerced with ``str()`` first, so non-string values are
    accepted. Removal happens in a fixed order: URLs, e-mail addresses,
    phone-number-like digit runs, ASCII punctuation, remaining digits, and
    finally any character that is neither a word character nor whitespace.
    Whitespace is not collapsed, so removed tokens can leave double spaces.
    """
    cleaned = str(text).lower()

    # These passes must run before punctuation is stripped, because the
    # patterns rely on '.', '@', '/', '+', '(' and '-' still being present.
    for pattern in (
        r'https?://\S+|www\.\S+',                  # URLs
        r'\S+@\S+',                                # e-mail addresses
        r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]',     # phone numbers
    ):
        cleaned = re.sub(pattern, '', cleaned)

    # Drop every ASCII punctuation character in a single pass.
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))

    # Remaining digits, then leftover special characters (non-ASCII symbols).
    for pattern in (r'\d+', r'[^\w\s]'):
        cleaned = re.sub(pattern, '', cleaned)

    return cleaned

# Clean every raw message; the cleaned text feeds the TF-IDF vectorizer only.
df['clean_message'] = df['message'].map(clean_text)
print("‚úÖ Texte nettoy√©")

# ============================================
# 3. TF-IDF VECTORIZATION
# ============================================

print("\nüî§ 3. VECTORISATION TF-IDF", "-" * 70, sep="\n")

# Unigrams and bigrams, English stop words removed, vocabulary capped at 1000
# terms; terms seen in fewer than 5 documents or in more than 70% of them are
# discarded.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    min_df=5,
    max_df=0.7,
    max_features=1000
)

# NOTE(review): the vectorizer is fitted on ALL messages here, before the
# train/test split performed later in this script, so vocabulary and IDF
# statistics include the test messages (mild leakage into the reported
# test metrics).
X_tfidf = vectorizer.fit_transform(df['clean_message'])
print(f"‚úÖ TF-IDF cr√©√©: {X_tfidf.shape[1]} features")

# ============================================
# 4. NUMERIC FEATURES (16 features)
# ============================================

print("\nüî¢ 4. EXTRACTION DES FEATURES NUM√âRIQUES", "-" * 70, sep="\n")

def extract_numeric_features(text):
    """Build the 16-value numeric feature vector for one raw message.

    Args:
        text: The raw (uncleaned) message. Non-string values are coerced
            with ``str()``, mirroring ``clean_text``, so a missing cell
            (``None`` / NaN) no longer raises ``TypeError``.

    Returns:
        A 1-D ``float32`` NumPy array of length 16, laid out as:
        ``[0]`` character count, ``[1]`` whitespace-separated word count,
        ``[2]`` characters per word, ``[3:11]`` one 0/1 flag per spam
        keyword, ``[11]`` number of ``!``, ``[12]`` number of ``?``,
        ``[13]`` share of upper-case characters, ``[14]`` 1 if longer than
        100 characters, ``[15]`` 1 if the text contains ``!`` or ``?``.
    """
    text = str(text)
    lowered = text.lower()  # computed once instead of once per keyword

    # 1. Length statistics (max(..., 1) guards the empty-message division).
    char_count = len(text)
    word_count = len(text.split())
    avg_word_length = char_count / max(word_count, 1)

    # 2. Spam keyword flags. Only the first 8 keywords are used so that the
    #    vector keeps its 16 columns; the rest are currently ignored.
    #    Matching is by substring: 'win' also fires on 'winner' or 'window'.
    spam_keywords = [
        'free', 'win', 'winner', 'won', 'prize', 'cash',
        'urgent', 'congratulations', 'claim', 'offer',
        'money', 'guaranteed', 'risk', 'limited'
    ]
    keyword_features = [
        1 if keyword in lowered else 0 for keyword in spam_keywords[:8]
    ]

    # 3. Punctuation and capitalisation, measured on the original casing.
    exclamation_count = text.count('!')
    question_count = text.count('?')
    upper_case_ratio = sum(1 for c in text if c.isupper()) / max(char_count, 1)

    # 4. Binary flags.
    is_long_message = 1 if char_count > 100 else 0
    has_punctuation = 1 if (exclamation_count or question_count) else 0

    # Column order is part of the model contract: do not reorder.
    features = [
        char_count,
        word_count,
        avg_word_length,
        *keyword_features,  # 8 features
        exclamation_count,
        question_count,
        upper_case_ratio,
        is_long_message,
        has_punctuation
    ]

    return np.array(features, dtype=np.float32)

# One 16-value row per raw (uncleaned) message, stacked into a dense matrix
# and then converted to CSR so it can be concatenated with the TF-IDF matrix.
numeric_features_list = [
    extract_numeric_features(raw_message) for raw_message in df['message']
]
X_numeric = np.array(numeric_features_list)
X_numeric_sparse = csr_matrix(X_numeric)

print(f"‚úÖ Features num√©riques extraites: {X_numeric.shape[1]} features")

# ============================================
# 5. FEATURE COMBINATION
# ============================================

print("\nüîó 5. COMBINAISON DES FEATURES", "-" * 70, sep="\n")

# Targets, then the TF-IDF columns (at most 1000) followed by the 16 numeric
# columns, concatenated side by side.
y = df['label_num'].to_numpy()
X_combined = hstack((X_tfidf, X_numeric_sparse))

print(f"‚úÖ Features combin√©es: {X_combined.shape[1]} features total")
print(f"   ‚Ä¢ TF-IDF: {X_tfidf.shape[1]}")
print(f"   ‚Ä¢ Num√©riques: {X_numeric.shape[1]}")
print(f"   ‚Ä¢ Cibles: {len(y)}")

# ============================================
# 6. TRAIN/TEST SPLIT
# ============================================

print("\nüìä 6. SPLIT DES DONN√âES", "-" * 70, sep="\n")

# 80/20 split, stratified on the label so both parts keep the same
# ham/spam ratio; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2
)

print("‚úÖ Split termin√©:")
print(f"   ‚Ä¢ Train: {X_train.shape[0]} √©chantillons")
print(f"   ‚Ä¢ Test: {X_test.shape[0]} √©chantillons")

# ============================================
# 7. MODEL TRAINING
# ============================================

print("\nü§ñ 7. ENTRA√éNEMENT DU MOD√àLE LOGISTICREGRESSION", "-" * 70, sep="\n")

# Build the classifier and fit it on the training part only.
# class_weight='balanced' matters because the two classes are imbalanced.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    class_weight='balanced',
    max_iter=1000,
    random_state=42
).fit(X_train, y_train)
print("‚úÖ Mod√®le entra√Æn√©")

# ============================================
# 8. EVALUATION
# ============================================

print("\nüìà 8. √âVALUATION DU MOD√àLE", "-" * 70, sep="\n")

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predictions on both parts: the train score is only there to compare
# against the test score.
y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)

train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)

# Precision, recall and F1 are computed on the test part for the positive
# class (label 1, spam); zero_division=0 yields 0 instead of a warning when
# a denominator is empty.
precision, recall, f1 = (
    metric(y_test, y_pred, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)

print("üìä PERFORMANCES:")
print(f"   ‚Ä¢ Accuracy (train): {train_accuracy:.2%}")
print(f"   ‚Ä¢ Accuracy (test):  {test_accuracy:.2%}")
print(f"   ‚Ä¢ Precision:        {precision:.2%}")
print(f"   ‚Ä¢ Recall:           {recall:.2%}")
print(f"   ‚Ä¢ F1-Score:         {f1:.2%}")

# Per-class breakdown.
print("\nüìã RAPPORT DE CLASSIFICATION:")
print(classification_report(y_test, y_pred, target_names=['HAM', 'SPAM']))

# ============================================
# 9. SAVING THE MODELS
# ============================================

print("\nüíæ 9. SAUVEGARDE DES MOD√àLES", "-" * 70, sep="\n")

import os
os.makedirs('../models', exist_ok=True)

# The "label encoder" written here is a plain dict; the correction section
# at the end of this script overwrites the same file with a LabelEncoder.
label_encoder = {'ham': 0, 'spam': 1}

# Persist the classifier, the fitted vectorizer and the label mapping, in
# that order, confirming each one as soon as it is written.
for artifact, target, confirmation in (
    (
        model,
        '../models/logistic_regression_model_final.joblib',
        "‚úÖ Mod√®le sauvegard√©: models/logistic_regression_model_final.joblib",
    ),
    (
        vectorizer,
        '../models/tfidf_vectorizer_final.joblib',
        "‚úÖ Vectorizer sauvegard√©: models/tfidf_vectorizer_final.joblib",
    ),
    (
        label_encoder,
        '../models/label_encoder_final.joblib',
        "‚úÖ Label encoder sauvegard√©: models/label_encoder_final.joblib",
    ),
):
    joblib.dump(artifact, target)
    print(confirmation)

# ============================================
# 10. PREDICTION TESTS
# ============================================

print("\nüß™ 10. TESTS DE PR√âDICTION", "-" * 70, sep="\n")

# Hand-written (message, expected label) pairs used as a smoke test.
test_messages = [
    ("Congratulations! You won a free iPhone! Call now to claim!", "SPAM"),
    ("Hey, are we still meeting for lunch tomorrow?", "HAM"),
    ("URGENT: Your bank account has been compromised.", "SPAM"),
    ("Don't forget to buy milk on your way home", "HAM"),
    ("FREE entry to win ¬£1000 cash prize. Text WIN now!", "SPAM"),
    ("Meeting rescheduled to 3 PM. See you then.", "HAM")
]

print("üìù Tests de pr√©diction:", "-" * 40, sep="\n")

for message, expected in test_messages:
    # Rebuild the feature row the same way as for training: TF-IDF of the
    # cleaned text, followed by the numeric features of the raw text.
    tfidf_features = vectorizer.transform([clean_text(message)])
    numeric_row = extract_numeric_features(message)[np.newaxis, :]
    features = hstack((tfidf_features, csr_matrix(numeric_row)))

    # Column 1 of predict_proba is the probability of class 1 (spam).
    spam_prob = model.predict_proba(features)[0][1]
    is_spam = model.predict(features)[0] == 1
    prediction = 'SPAM' if is_spam else 'HAM'
    verdict = '‚úÖ CORRECT' if prediction == expected else '‚ùå INCORRECT'

    # The preview always shows the first 50 characters followed by '...'.
    print(f"üì® '{message[:50]}...'")
    print(f"   ‚Üí Pr√©dit: {prediction} ({spam_prob:.2%})")
    print(f"   ‚Üí Attendu: {expected}")
    print(f"   ‚Üí {verdict}")
    print()

# ============================================
# 11. API UPDATE
# ============================================

print("\nüåê 11. MISE √Ä JOUR DE L'API", "-" * 70, sep="\n")

api_path = '../api/app.py'

# Load the current API source.
with open(api_path, 'r', encoding='utf-8') as f:
    api_content = f.read()

# Point every single-quoted model path at its "_final" counterpart. Paths
# that are already updated, or written with double quotes, are left as is.
for old_ref, new_ref in (
    ("'../models/logistic_regression_model.joblib'",
     "'../models/logistic_regression_model_final.joblib'"),
    ("'../models/tfidf_vectorizer.joblib'",
     "'../models/tfidf_vectorizer_final.joblib'"),
    ("'../models/label_encoder.joblib'",
     "'../models/label_encoder_final.joblib'"),
):
    api_content = api_content.replace(old_ref, new_ref)

# Write the patched source back in place.
with open(api_path, 'w', encoding='utf-8') as f:
    f.write(api_content)

print("‚úÖ API mise √† jour avec le nouveau mod√®le")

# ============================================
# 12. FINAL SUMMARY
# ============================================

print(
    "\n" + "=" * 70,
    "üéâ R√âENTRA√éNEMENT TERMIN√â !",
    "=" * 70,
    sep="\n",
)

# Headline test-set metrics computed in the evaluation section.
print("\nüìä PERFORMANCES FINALES:")
print(f"   ‚Ä¢ Accuracy:  {test_accuracy:.2%}")
print(f"   ‚Ä¢ F1-Score:  {f1:.2%}")

# Files written to the models directory.
print(
    "\nüìÅ MOD√àLES SAUVEGARD√âS:",
    "   ‚Ä¢ logistic_regression_model_final.joblib",
    "   ‚Ä¢ tfidf_vectorizer_final.joblib",
    "   ‚Ä¢ label_encoder_final.joblib",
    sep="\n",
)

# Next steps for the user.
print(
    "\nüöÄ POUR TESTER:",
    "   1. Red√©marre l'API: python api/app.py",
    "   2. Teste: python test_api.py",
    sep="\n",
)

print("\n‚úÖ PROJET COMPLET √Ä 100% !")
# ============================================
# LABEL ENCODER FIX
# ============================================

print("\nüîß CORRECTION DU LABEL ENCODER")
print("-"*70)

from sklearn.preprocessing import LabelEncoder

# Replace the plain dict saved earlier with a real LabelEncoder, which
# offers inverse_transform. LabelEncoder sorts its classes, so 'ham' -> 0
# and 'spam' -> 1, matching the mapping used to build label_num.
label_encoder_obj = LabelEncoder()
label_encoder_obj.fit(['ham', 'spam'])

# Overwrite the previously saved label-encoder artifact.
joblib.dump(label_encoder_obj, '../models/label_encoder_final.joblib')
print("‚úÖ Vrai LabelEncoder sauvegard√©")

# The API source needs no change at this step, so it is only read: this
# still fails loudly if the file is missing and refreshes api_content.
# The previous version reopened the file in 'w' mode to write the same
# text back, which truncates it first (the source is lost if the write is
# interrupted) and can silently alter its line endings.
with open('../api/app.py', 'r', encoding='utf-8') as f:
    api_content = f.read()

print("‚úÖ API pr√™te avec le vrai LabelEncoder")

🔥 RÉENTRAÎNEMENT DU MODÈLE LOGISTICREGRESSION

📦 1. CHARGEMENT DES DONNÉES COMPLÈTES
----------------------------------------------------------------------
✅ Données chargées: 5572 messages
   • HAM: 4825 messages
   • SPAM: 747 messages

🧹 2. NETTOYAGE DU TEXTE
----------------------------------------------------------------------
✅ Texte nettoyé

🔤 3. VECTORISATION TF-IDF
----------------------------------------------------------------------
✅ TF-IDF créé: 1000 features

🔢 4. EXTRACTION DES FEATURES NUMÉRIQUES
----------------------------------------------------------------------
✅ Features numériques extraites: 16 features

🔗 5. COMBINAISON DES FEATURES
----------------------------------------------------------------------
✅ Features combinées: 1016 features total
   • TF-IDF: 1000
   • Numériques: 16
   • Cibles: 5572

📊 6. SPLIT DES DONNÉES
----------------------------------------------------------------------
‚úÖ Split termi