In [7]:
# 🚀 SPAM DETECTION - Déploiement et Production (CORRIGÉ)
# Correction du problème de features (1016 vs 1000)

import pandas as pd
import numpy as np
import joblib
import json
import time
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print("="*70)
print("üöÄ PHASE 4 : D√âPLOIEMENT ET PRODUCTION (CORRIG√â)")
print("="*70)

# ============================================
# 1. LOADING AND REBUILDING THE PIPELINE
# ============================================

print("\nüìÇ 1. RECONSTRUCTION DU PIPELINE COMPLET")
print("-"*70)

# Load the three persisted artifacts (classifier, text vectorizer, label
# encoder). The paths are relative, so they resolve against the current
# working directory of the notebook.
model = joblib.load('../models/logistic_regression_model.joblib')
vectorizer = joblib.load('../models/tfidf_vectorizer.joblib')
label_encoder = joblib.load('../models/label_encoder.joblib')

print("‚úÖ Mod√®les de base charg√©s :")
print(f"   ‚Ä¢ Mod√®le : {type(model).__name__}")
print(f"   ‚Ä¢ Vectorizer : {type(vectorizer).__name__} ({len(vectorizer.get_feature_names_out())} features)")
print(f"   ‚Ä¢ Classes : {label_encoder.classes_}")

# Compare the number of input columns the model was fitted on with the number
# of columns the vectorizer produces; the difference is how many extra columns
# must be appended to the vectorizer output before calling the model.
model_expected_features = model.n_features_in_
vectorizer_features = len(vectorizer.get_feature_names_out())
missing_features = model_expected_features - vectorizer_features

print(f"\nüîç Analyse des features :")
print(f"   ‚Ä¢ Mod√®le attend : {model_expected_features} features")
print(f"   ‚Ä¢ Vectorizer a : {vectorizer_features} features")
print(f"   ‚Ä¢ Features manquantes : {missing_features}")

# NOTE(review): the missing columns are presumed to be the numeric features
# added at training time (length, suspicious words, etc.). Nothing in this
# file verifies that assumption -- confirm against the training notebook.
print(f"\nüí° Les {missing_features} features manquantes sont les features num√©riques")
print(f"   ajout√©es pendant l'entra√Ænement (longueur, ponctuation, etc.)")

# ============================================
# 2. FONCTIONS DE NETTOYAGE ET FEATURES NUMÉRIQUES
# ============================================

print("\nüßπ 2. FONCTIONS DE PR√âTRAITEMENT COMPLET")
print("-"*70)

import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Make sure every NLTK corpus used by clean_text() is installed.
# Each resource is probed on its own: checking only 'stopwords' would skip the
# 'wordnet' download on machines that already have the stop-word list, and the
# lemmatizer would then fail at prediction time.
# nltk.data.find() signals a missing resource with LookupError, so only that
# exception triggers a download; anything else is allowed to propagate.
for _resource in ('stopwords', 'wordnet'):
    try:
        nltk.data.find(f'corpora/{_resource}')
    except LookupError:
        nltk.download(_resource)

# SMS shorthand and apostrophe-less contractions that are dropped in addition
# to NLTK's English stop words.
_SMS_STOP_WORDS = frozenset(
    {'u', 'ur', 'im', 'gt', 'lt', 'amp', 'll', 've', 'dont', 'cant', 'wont'}
)

# Lazily-built (stop_words, lemmatizer) pair shared by every clean_text() call.
_cleaning_resources = None


def _get_cleaning_resources():
    """Return the (stop_words, lemmatizer) pair, building it on first use."""
    global _cleaning_resources
    if _cleaning_resources is None:
        stop_words = frozenset(stopwords.words('english')) | _SMS_STOP_WORDS
        _cleaning_resources = (stop_words, WordNetLemmatizer())
    return _cleaning_resources


def clean_text(text):
    """
    Normalise a raw message into the space-separated token string that is
    fed to the TF-IDF vectorizer.

    Steps, in order: lowercase; remove URLs, e-mail addresses and phone-like
    digit runs; remove ASCII punctuation; remove remaining digits; remove any
    other non-word, non-space character; drop stop words; lemmatize.

    The stop-word set and the lemmatizer are created once and reused, instead
    of re-reading the NLTK stop-word list on every call. The output is the
    same as before.

    Args:
        text: the raw message (must be a ``str``).

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    # Phone-like sequences: optional '+' or '(', a non-zero digit, then at
    # least eight digits/separators, ending on a digit.
    text = re.sub(r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    # Catches symbols that string.punctuation does not cover (non-ASCII).
    text = re.sub(r'[^\w\s]', '', text)

    stop_words, lemmatizer = _get_cleaning_resources()
    tokens = [word for word in text.split() if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

def extract_numeric_features(text):
    """
    Compute the 16 hand-crafted numeric features of a raw (uncleaned) message.

    Layout of the returned vector:
        0      character count
        1      whitespace-separated word count
        2      character count divided by word count (word count floored at 1)
        3-10   one 0/1 flag per spam keyword, matched as a case-insensitive
               substring ('win' also matches 'window')
        11     number of '!'
        12     number of '?'
        13     share of upper-case characters
        14     1 when the message is longer than 100 characters
        15     1 when the message contains '!' or '?'

    NOTE(review): these are meant to mirror the features used at training
    time; that cannot be verified from this file.

    Returns:
        A 1-D ``float32`` numpy array of length 16.
    """
    n_chars = len(text)
    n_words = len(text.split())
    lowered = text.lower()

    watched_words = ('free', 'win', 'cash', 'prize', 'claim',
                     'urgent', 'offer', 'congratulations')
    keyword_flags = [int(word in lowered) for word in watched_words]

    n_exclaim = text.count('!')
    n_question = text.count('?')
    n_upper = sum(map(str.isupper, text))

    values = [n_chars, n_words, n_chars / max(n_words, 1)]
    values.extend(keyword_flags)
    values.extend([
        n_exclaim,
        n_question,
        n_upper / max(n_chars, 1),
        n_chars > 100,
        n_exclaim > 0 or n_question > 0,
    ])

    return np.asarray(values, dtype=np.float32)

# Confirm that both preprocessing helpers are defined.
print("‚úÖ Fonctions de pr√©traitement cr√©√©es")
print(f"   ‚Ä¢ clean_text() pour le texte")
print(f"   ‚Ä¢ extract_numeric_features() pour les {missing_features} features num√©riques")

# ============================================
# 3. COMPLETE PREDICTION FUNCTION
# ============================================

print("\nüîÆ 3. FONCTION DE PR√âDICTION CORRIG√âE")
print("-"*70)

# Sparse helpers used to append the numeric columns to the TF-IDF matrix.
from scipy.sparse import hstack, csr_matrix

def _message_preview(message, limit=200):
    """Return a printable preview of *message*, cut to *limit* characters."""
    text = message if isinstance(message, str) else str(message)
    return text[:limit] + "..." if len(text) > limit else text


def predict_spam_complete(message, threshold=0.5):
    """
    Classify one message using the TF-IDF columns plus the numeric columns.

    The function never raises for a bad input or a pipeline failure: any
    exception is reported through the returned dictionary. This now also
    holds for non-string messages (e.g. ``None``), which used to crash
    inside the error handler itself.

    Args:
        message: raw message text. Must be a ``str``; any other type yields
            an error result.
        threshold: spam-probability cut-off. When it differs from 0.5 the
            model's own decision is replaced by
            ``spam_probability >= threshold``.

    Returns:
        On success, a dict with ``success=True``, a message preview, the
        decoded label, spam/ham probabilities, the threshold used, a
        HIGH/MEDIUM/LOW confidence bucket, the processing time in
        milliseconds, a timestamp and the feature counts.
        On failure, a dict with ``success=False``, ``error``, a message
        preview and a timestamp.
    """
    # perf_counter() is monotonic, so the elapsed time cannot be distorted by
    # system clock adjustments.
    start_time = time.perf_counter()

    try:
        if not isinstance(message, str):
            raise TypeError(
                f"message must be a string, got {type(message).__name__}"
            )

        # 1. Clean the text
        cleaned_text = clean_text(message)

        # 2. Vectorize the cleaned text (TF-IDF)
        text_vectorized = vectorizer.transform([cleaned_text])

        # 3. Numeric features are computed on the RAW message, because
        #    cleaning strips the case and punctuation they measure.
        numeric_features = extract_numeric_features(message)
        numeric_features_sparse = csr_matrix(numeric_features.reshape(1, -1))

        # 4. Append the numeric columns after the TF-IDF columns
        all_features = hstack([text_vectorized, numeric_features_sparse])

        # Sanity check: only warns; a real mismatch makes the model call
        # below raise, which is reported through the error result.
        if all_features.shape[1] != model_expected_features:
            print(f"‚ö†Ô∏è  Warning: {all_features.shape[1]} features vs {model_expected_features} attendues")

        # 5. Predict
        prediction = model.predict(all_features)[0]

        # 6. Probabilities. Index 1 is read as the spam class -- assumes the
        #    label encoder maps 'spam' to 1 (TODO confirm against training).
        if hasattr(model, 'predict_proba'):
            probabilities = model.predict_proba(all_features)[0]
            spam_probability = probabilities[1]
            ham_probability = probabilities[0]
        else:
            spam_probability = 1.0 if prediction == 1 else 0.0
            ham_probability = 1.0 - spam_probability

        # 7. Apply a custom threshold, if any
        if threshold != 0.5:
            prediction = 1 if spam_probability >= threshold else 0

        # 8. Decode the class index into its label
        label = label_encoder.inverse_transform([prediction])[0]

        # 9. Confidence bucket from the larger of the two probabilities
        top_probability = max(spam_probability, ham_probability)
        if top_probability > 0.8:
            confidence = 'HIGH'
        elif top_probability > 0.6:
            confidence = 'MEDIUM'
        else:
            confidence = 'LOW'

        result = {
            'success': True,
            'message': _message_preview(message),
            'prediction': label,
            'spam_probability': float(spam_probability),
            'ham_probability': float(ham_probability),
            'threshold_used': float(threshold),
            'confidence': confidence,
            'processing_time_ms': round((time.perf_counter() - start_time) * 1000, 2),
            'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'features_used': {
                'tfidf_features': text_vectorized.shape[1],
                'numeric_features': numeric_features_sparse.shape[1],
                'total_features': all_features.shape[1]
            }
        }

    except Exception as e:
        # Deliberate catch-all: callers rely on always getting a dict back.
        result = {
            'success': False,
            'error': str(e),
            'message': _message_preview(message),
            'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }

    return result

print("‚úÖ Fonction de pr√©diction corrig√©e cr√©√©e")

# Smoke test of the prediction function on a few hand-written messages
print("\nüß™ Tests de la pr√©diction compl√®te :")
print("-"*70)

# Mix of spam-looking and ordinary messages; no expected labels are asserted,
# the results are only printed for visual inspection.
test_messages = [
    "Congratulations! You've won a free iPhone. Call now to claim!",
    "Hey, are we still meeting for lunch tomorrow?",
    "URGENT: Your account has been compromised. Click to secure.",
    "Don't forget to buy milk on your way home",
    "FREE entry to win ¬£1000 cash. Text WIN now!"
]

for i, msg in enumerate(test_messages, 1):
    result = predict_spam_complete(msg)
    
    # predict_spam_complete() reports failures through 'success'/'error'
    # instead of raising, so both outcomes are handled here.
    if result['success']:
        print(f"\n{i}. '{result['message']}'")
        print(f"   ‚Üí Pr√©diction: {result['prediction'].upper()}")
        print(f"   ‚Üí Probabilit√© SPAM: {result['spam_probability']:.2%}")
        print(f"   ‚Üí Confiance: {result['confidence']}")
        print(f"   ‚Üí Features: {result['features_used']['total_features']} total")
    else:
        print(f"\n{i}. ‚ùå Erreur: {result['error']}")

# ============================================
# 4. CRÉATION DE L'API FLASK AVEC TOUTES LES FEATURES
# ============================================

print("\nüåê 4. CR√âATION DE L'API FLASK COMPL√àTE")
print("-"*70)

# Create the api folder next to the notebook folder
os.makedirs('../api', exist_ok=True)

# Source of the standalone Flask service, written verbatim to api/app.py.
# Backslashes are doubled because this is a regular (non-raw) string literal.
# Fixes relative to the previous generated app:
#   * every NLTK corpus is checked separately and only LookupError triggers a
#     download (wordnet used to be skipped when stopwords was present);
#   * the stop-word set and lemmatizer are built once, not on every request;
#   * malformed JSON or wrongly typed "message"/"threshold" return 400
#     instead of surfacing as a 500;
#   * /predict no longer re-cleans and re-vectorizes the message just to
#     report the number of TF-IDF columns.
api_code = '''from flask import Flask, request, jsonify
import joblib
import re
import string
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import time
from datetime import datetime
from scipy.sparse import hstack, csr_matrix
import warnings
warnings.filterwarnings('ignore')

# ============================================
# INITIALISATION
# ============================================

app = Flask(__name__)

print("üöÄ Chargement des mod√®les Spam Detection...")

# Charger les mod√®les
model = joblib.load('../models/logistic_regression_model.joblib')
vectorizer = joblib.load('../models/tfidf_vectorizer.joblib')
label_encoder = joblib.load('../models/label_encoder.joblib')

print(f"‚úÖ Mod√®les charg√©s")
print(f"   ‚Ä¢ Mod√®le: {type(model).__name__}")
print(f"   ‚Ä¢ Features attendues: {model.n_features_in_}")
print(f"   ‚Ä¢ Vectorizer features: {len(vectorizer.get_feature_names_out())}")

# NLTK
# Each corpus is checked on its own; a missing one raises LookupError.
for resource in ('stopwords', 'wordnet'):
    try:
        nltk.data.find('corpora/' + resource)
    except LookupError:
        nltk.download(resource)

# Built once at import time instead of on every request
STOP_WORDS = set(stopwords.words('english'))
STOP_WORDS.update({'u', 'ur', 'im', 'gt', 'lt', 'amp', 'll', 've', 'dont', 'cant', 'wont'})
LEMMATIZER = WordNetLemmatizer()

# ============================================
# FONCTIONS DE PR√âTRAITEMENT
# ============================================

def clean_text(text):
    """Nettoie le texte"""
    text = text.lower()
    text = re.sub(r'https?://\\S+|www\\.\\S+', '', text)
    text = re.sub(r'\\S+@\\S+', '', text)
    text = re.sub(r'[\\+\\(]?[1-9][0-9 .\\-\\(\\)]{8,}[0-9]', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\\d+', '', text)
    text = re.sub(r'[^\\w\\s]', '', text)
    
    tokens = text.split()
    tokens = [word for word in tokens if word not in STOP_WORDS]
    tokens = [LEMMATIZER.lemmatize(word) for word in tokens]
    
    return ' '.join(tokens)

def extract_numeric_features(text):
    """Extrait les 16 features num√©riques"""
    # Longueur
    char_count = len(text)
    word_count = len(text.split())
    avg_word_length = char_count / max(word_count, 1)
    
    # Mots suspects
    spam_keywords = ['free', 'win', 'cash', 'prize', 'claim', 'urgent', 'offer', 'congratulations']
    keyword_features = []
    for keyword in spam_keywords:
        keyword_features.append(1 if keyword in text.lower() else 0)
    
    # Ponctuation
    exclamation_count = text.count('!')
    question_count = text.count('?')
    upper_case_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)
    
    # Flags
    is_long_message = 1 if char_count > 100 else 0
    has_punctuation = 1 if ('!' in text or '?' in text) else 0
    
    # Compiler
    features = [
        char_count,
        word_count,
        avg_word_length,
        *keyword_features,
        exclamation_count,
        question_count,
        upper_case_ratio,
        is_long_message,
        has_punctuation
    ]
    
    return np.array(features, dtype=np.float32)

def prepare_features(message):
    """Pr√©pare toutes les features pour la pr√©diction"""
    # Nettoyer
    cleaned = clean_text(message)
    
    # TF-IDF
    text_features = vectorizer.transform([cleaned])
    
    # Num√©riques
    numeric_features = extract_numeric_features(message)
    numeric_features_sparse = csr_matrix(numeric_features.reshape(1, -1))
    
    # Combiner
    all_features = hstack([text_features, numeric_features_sparse])
    
    return all_features

# ============================================
# ENDPOINTS API
# ============================================

@app.route('/')
def home():
    """Page d'accueil"""
    return jsonify({
        'api': 'Spam Detection API',
        'version': '1.0.0',
        'status': 'running',
        'model': type(model).__name__,
        'features': model.n_features_in_
    })

@app.route('/health', methods=['GET'])
def health_check():
    """Health check"""
    return jsonify({
        'status': 'healthy',
        'timestamp': datetime.now().isoformat(),
        'model': 'LogisticRegression',
        'features_ok': True
    })

@app.route('/predict', methods=['POST'])
def predict():
    """Pr√©diction d'un message"""
    try:
        # silent=True: a malformed or non-JSON body yields None (400 below)
        data = request.get_json(silent=True)
        
        if not isinstance(data, dict) or 'message' not in data:
            return jsonify({'error': 'Le champ "message" est requis'}), 400
        
        message = data['message']
        threshold = data.get('threshold', 0.5)
        
        # Reject wrongly typed input with 400 instead of failing with a 500
        if not isinstance(message, str):
            return jsonify({'error': 'Le champ "message" doit contenir du texte'}), 400
        if isinstance(threshold, bool) or not isinstance(threshold, (int, float)):
            return jsonify({'error': 'Le champ "threshold" doit contenir un nombre'}), 400
        
        start_time = time.time()
        
        # Pr√©parer les features
        features = prepare_features(message)
        
        # Pr√©dire
        prediction = model.predict(features)[0]
        probabilities = model.predict_proba(features)[0]
        
        # Appliquer seuil
        spam_prob = probabilities[1]
        if threshold != 0.5:
            prediction = 1 if spam_prob >= threshold else 0
        
        # D√©coder
        label = label_encoder.inverse_transform([prediction])[0]
        
        # R√©ponse
        return jsonify({
            'success': True,
            'message': message[:200] + "..." if len(message) > 200 else message,
            'prediction': label,
            'spam_probability': float(spam_prob),
            'ham_probability': float(probabilities[0]),
            'threshold': float(threshold),
            'confidence': 'HIGH' if max(probabilities) > 0.8 else 'MEDIUM' if max(probabilities) > 0.6 else 'LOW',
            'processing_time_ms': round((time.time() - start_time) * 1000, 2),
            'features_used': {
                # Derived from the matrix already built (total minus the 16 numeric columns)
                'tfidf': features.shape[1] - 16,
                'numeric': 16,
                'total': features.shape[1]
            },
            'timestamp': datetime.now().isoformat()
        })
        
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e),
            'timestamp': datetime.now().isoformat()
        }), 500

@app.route('/batch_predict', methods=['POST'])
def batch_predict():
    """Pr√©diction batch"""
    try:
        # silent=True: a malformed or non-JSON body yields None (400 below)
        data = request.get_json(silent=True)
        
        if not isinstance(data, dict) or 'messages' not in data:
            return jsonify({'error': 'Le champ "messages" est requis'}), 400
        
        messages = data['messages']
        threshold = data.get('threshold', 0.5)
        
        if not isinstance(messages, list):
            return jsonify({'error': '"messages" doit √™tre une liste'}), 400
        if isinstance(threshold, bool) or not isinstance(threshold, (int, float)):
            return jsonify({'error': 'Le champ "threshold" doit contenir un nombre'}), 400
        
        results = []
        for msg in messages[:20]:  # Limiter √† 20 messages
            features = prepare_features(str(msg))
            prediction = model.predict(features)[0]
            probabilities = model.predict_proba(features)[0]
            
            spam_prob = probabilities[1]
            if threshold != 0.5:
                prediction = 1 if spam_prob >= threshold else 0
            
            label = label_encoder.inverse_transform([prediction])[0]
            
            results.append({
                'message': str(msg)[:100],
                'prediction': label,
                'spam_probability': float(spam_prob),
                'confidence': 'HIGH' if max(probabilities) > 0.8 else 'MEDIUM' if max(probabilities) > 0.6 else 'LOW'
            })
        
        return jsonify({
            'success': True,
            'count': len(results),
            'results': results,
            'timestamp': datetime.now().isoformat()
        })
        
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

# ============================================
# D√âMARRAGE
# ============================================

if __name__ == '__main__':
    print("\\nüåê API Spam Detection")
    print("üì° http://localhost:5000")
    print("\\nüìã Endpoints:")
    print("   ‚Ä¢ GET  /          - Documentation")
    print("   ‚Ä¢ GET  /health    - Health check")
    print("   ‚Ä¢ POST /predict   - Pr√©dire un message")
    print("   ‚Ä¢ POST /batch_predict - Pr√©dire plusieurs messages")
    print("\\nüöÄ Serveur d√©marr√©!")
    app.run(host='0.0.0.0', port=5000, debug=False)
'''

# Write the service to disk
with open('../api/app.py', 'w', encoding='utf-8') as f:
    f.write(api_code)

print("‚úÖ API Flask compl√®te cr√©√©e : api/app.py")

# ============================================
# 5. FICHIER REQUIREMENTS
# ============================================

# Dependencies of api/app.py.
# numpy and scipy are listed explicitly because app.py imports them directly.
# numpy is capped below 2.0: scikit-learn 1.3.0 predates NumPy 2 and its
# wheels are not usable with it, while an unconstrained install resolves to
# the newest numpy.
requirements = '''Flask==2.3.3
joblib==1.3.2
scikit-learn==1.3.0
nltk==3.8.1
numpy>=1.17.3,<2.0
scipy>=1.5.0
'''

with open('../api/requirements.txt', 'w', encoding='utf-8') as f:
    f.write(requirements)

print("‚úÖ Requirements : api/requirements.txt")

# ============================================
# 6. SCRIPT DE DÉMARRAGE
# ============================================

# Windows launcher written to the project root (next to api/ and models/).
# Changes relative to the previous script:
#   * "chcp 65001" switches the console to UTF-8, matching the encoding the
#     file is written with, so the banner text is displayed as intended;
#   * cd /d "%~dp0" makes the relative "api" and "models" paths work no
#     matter where the script is launched from;
#   * the carets ending the curl example are doubled: a single trailing ^
#     is a line continuation in batch files and glued the three echo lines
#     into one garbled line.
# Backslashes are doubled because this is a regular (non-raw) string literal.
windows_script = '''@echo off
rem Switch the console to UTF-8 so the banner text below displays correctly
chcp 65001 >nul
rem Work from the folder of this script so api and models resolve from anywhere
cd /d "%~dp0"
echo ========================================
echo üöÄ API Spam Detection - Version Compl√®te
echo ========================================
echo.

echo üì¶ Installation des d√©pendances...
pip install -r api/requirements.txt

echo.
echo üîç V√©rification des mod√®les...
if not exist "models\\logistic_regression_model.joblib" (
    echo ‚ùå Mod√®le non trouv√©
    pause
    exit /b 1
)

echo ‚úÖ Mod√®les OK
echo.
echo üåê D√©marrage de l'API...
echo üì° http://localhost:5000
echo.
echo üìù Exemple d'utilisation:
echo curl -X POST http://localhost:5000/predict ^^
echo      -H "Content-Type: application/json" ^^
echo      -d "{\\"message\\": \\"Congratulations! You won!\\"}"
echo.
echo üõë Ctrl+C pour arr√™ter
echo ========================================
echo.

cd api
python app.py

pause
'''

with open('../start_api.bat', 'w', encoding='utf-8') as f:
    f.write(windows_script)

print("‚úÖ Script de d√©marrage : start_api.bat")

# ============================================
# 7. SCRIPT DE TEST
# ============================================

# Source of a standalone client script, written verbatim to ../test_api.py.
# It waits 3 seconds, then calls /health, /predict (twice) and /batch_predict
# on http://localhost:5000 and prints what it receives; nothing is asserted.
# It needs the third-party "requests" package and an API that is already
# running. Backslashes are doubled because this is a regular (non-raw) string.
test_script = '''import requests
import json
import time

print("üß™ Test de l'API Spam Detection")
print("="*50)

# Attendre le d√©marrage
print("‚è≥ Attente du d√©marrage...")
time.sleep(3)

try:
    # Test 1: Health check
    print("\\n1. üìç Health check...")
    response = requests.get("http://localhost:5000/health", timeout=5)
    print(f"   Status: {response.status_code}")
    
    # Test 2: Pr√©diction SPAM
    print("\\n2. üîÆ Pr√©diction SPAM...")
    data = {"message": "Congratulations! You won a free iPhone! Call now!"}
    response = requests.post("http://localhost:5000/predict", json=data, timeout=5)
    
    if response.status_code == 200:
        result = response.json()
        print(f"   ‚úÖ Pr√©diction: {result.get('prediction', 'N/A')}")
        print(f"   ‚úÖ Probabilit√© SPAM: {result.get('spam_probability', 0):.2%}")
        print(f"   ‚úÖ Temps: {result.get('processing_time_ms', 0)}ms")
    else:
        print(f"   ‚ùå Erreur: {response.json().get('error', 'Unknown')}")
    
    # Test 3: Pr√©diction HAM
    print("\\n3. üîÆ Pr√©diction HAM...")
    data = {"message": "Hey, are we meeting tomorrow for lunch?"}
    response = requests.post("http://localhost:5000/predict", json=data, timeout=5)
    
    if response.status_code == 200:
        result = response.json()
        print(f"   ‚úÖ Pr√©diction: {result.get('prediction', 'N/A')}")
        print(f"   ‚úÖ Probabilit√© SPAM: {result.get('spam_probability', 0):.2%}")
    
    # Test 4: Batch prediction
    print("\\n4. üì¶ Batch prediction...")
    data = {
        "messages": [
            "FREE entry to win ¬£1000",
            "What time is the meeting?",
            "URGENT: Your account needs verification"
        ]
    }
    response = requests.post("http://localhost:5000/batch_predict", json=data, timeout=10)
    
    if response.status_code == 200:
        result = response.json()
        print(f"   ‚úÖ {result.get('count', 0)} messages trait√©s")
        for i, pred in enumerate(result.get('results', []), 1):
            print(f"   {i}. {pred.get('prediction')}: {pred.get('spam_probability', 0):.2%}")
    
except requests.exceptions.ConnectionError:
    print("\\n‚ùå Impossible de se connecter √† l'API")
    print("üí° V√©rifie que l'API est d√©marr√©e: start_api.bat")
except Exception as e:
    print(f"\\n‚ùå Erreur: {e}")

print("\\n" + "="*50)
print("‚úÖ Tests termin√©s")
'''

# Write the client script to the project root
with open('../test_api.py', 'w', encoding='utf-8') as f:
    f.write(test_script)

print("‚úÖ Script de test : test_api.py")



üöÄ PHASE 4 : D√âPLOIEMENT ET PRODUCTION (CORRIG√â)

üìÇ 1. RECONSTRUCTION DU PIPELINE COMPLET
----------------------------------------------------------------------
‚úÖ Mod√®les de base charg√©s :
   ‚Ä¢ Mod√®le : LogisticRegression
   ‚Ä¢ Vectorizer : TfidfVectorizer (1000 features)
   ‚Ä¢ Classes : ['ham' 'spam']

üîç Analyse des features :
   ‚Ä¢ Mod√®le attend : 1016 features
   ‚Ä¢ Vectorizer a : 1000 features
   ‚Ä¢ Features manquantes : 16

üí° Les 16 features manquantes sont les features num√©riques
   ajout√©es pendant l'entra√Ænement (longueur, ponctuation, etc.)

üßπ 2. FONCTIONS DE PR√âTRAITEMENT COMPLET
----------------------------------------------------------------------
‚úÖ Fonctions de pr√©traitement cr√©√©es
   ‚Ä¢ clean_text() pour le texte
   ‚Ä¢ extract_numeric_features() pour les 16 features num√©riques

üîÆ 3. FONCTION DE PR√âDICTION CORRIG√âE
----------------------------------------------------------------------
‚úÖ Fonction de pr√©diction corrig√©e cr√©√