In [1]:
import sys
import os
import pandas as pd
import time
import requests

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score
from sklearn.preprocessing import StandardScaler

# Si tu veux utiliser RAPIDS GPU, décommente et installe l'environnement adapté :
# import cudf
# from cuml.linear_model import LogisticRegression as cuLogisticRegression
# from cuml.metrics import accuracy_score as cu_accuracy_score, recall_score as cu_recall_score, precision_score as cu_precision_score, roc_auc_score as cu_roc_auc_score
# from cuml.model_selection import train_test_split as cu_train_test_split

# === Paths & URLs ===
# Input dataset read by train_cpu_model(); paths are relative to the working directory.
DATA_PATH = "../data/creditcard.csv"
# CSV file where train_cpu_model() writes the CPU metrics (one row per model).
OUTPUT_CPU_CSV = "../models/resultats_auc_cpu.csv"
# CSV file intended for the RAPIDS (GPU) metrics.
OUTPUT_RAPIDS_CSV = "../models/resultats_auc_rapids.csv"

# Base URLs of the microservices probed by the status check below.
INGESTION_URL = "http://localhost:8001"
TRAIN_URL = "http://localhost:8002"
PREDICT_URL = "http://localhost:8003"
COMPARE_URL = "http://localhost:8004"

# === Microservice health check ===
# Best-effort probe of each service's /status endpoint: a failure is reported
# on stdout but never aborts the script.
print("\n🔍 Vérification des microservices :")
for name, url in [
    ("ingestion-service", INGESTION_URL),
    ("train-service", TRAIN_URL),
    ("predict-service", PREDICT_URL),
    ("compare-service", COMPARE_URL),
]:
    try:
        # Without a timeout, requests waits indefinitely on a service that
        # accepts the connection but never answers.
        res = requests.get(f"{url}/status", timeout=5)
        # Treat HTTP error statuses (4xx/5xx) as failures instead of "OK".
        res.raise_for_status()
        print(f"✅ {name} OK :", res.json())
    except (requests.RequestException, ValueError) as e:
        # RequestException: connection/timeout/HTTP errors.
        # ValueError: the response body is not valid JSON.
        print(f"❌ {name} erreur :", e)


def _evaluate_cpu_model(model, X_train, y_train, X_test, y_test):
    """Fit *model* on the training split and score it on the test split.

    Args:
        model: An unfitted classifier exposing ``fit``, ``predict`` and
            ``predict_proba``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        dict: ``auc``, ``accuracy``, ``recall``, ``precision`` computed on the
        test split, plus ``training_time`` (seconds spent in ``fit``).
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start

    y_pred = model.predict(X_test)
    # Column 1 holds the probability of the second class in model.classes_.
    y_proba = model.predict_proba(X_test)[:, 1]

    return {
        "auc": roc_auc_score(y_test, y_proba),
        "accuracy": accuracy_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "training_time": training_time,
    }


def train_cpu_model():
    """Train and evaluate the CPU (scikit-learn) models, then save their metrics.

    Reads the CSV at ``DATA_PATH``, uses every column except ``Class`` as a
    feature and ``Class`` as the target, and holds out 20% of the rows for
    evaluation (``random_state=42``). Two models are fitted:

    * ``LogisticRegression`` on standardized features;
    * ``RandomForestClassifier`` (20 trees) on the raw features.

    The metrics of both models are written to ``OUTPUT_CPU_CSV``, one row per
    model; the output directory is created if needed.

    Returns:
        dict: Model label -> dict with keys ``auc``, ``accuracy``, ``recall``,
        ``precision`` and ``training_time``.
    """
    print("📥 Chargement des données CPU depuis :", DATA_PATH)
    df = pd.read_csv(DATA_PATH)

    features = [col for col in df.columns if col != "Class"]
    X = df[features]
    y = df["Class"]

    # NOTE(review): the split is not stratified. If "Class" is heavily
    # imbalanced, stratify=y may give a more representative test set; confirm
    # before changing, since it alters the reported metrics.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # The scaler is fitted on the training split only, then applied to the
    # test split, so no test-set statistics leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    results = {}

    # Logistic Regression (CPU) on standardized features
    print("\n🚀 Entraînement LogisticRegression CPU...")
    results["CPU LogisticRegression"] = _evaluate_cpu_model(
        LogisticRegression(max_iter=5000, solver='lbfgs', random_state=42),
        X_train_scaled, y_train, X_test_scaled, y_test,
    )

    # Random Forest (CPU) on the unscaled features
    print("\n🚀 Entraînement RandomForest CPU...")
    results["CPU RandomForest"] = _evaluate_cpu_model(
        RandomForestClassifier(n_estimators=20, random_state=42),
        X_train, y_train, X_test, y_test,
    )

    # Save the results as a CSV: one row per model, metrics as columns
    rows = [{"model": model, **metrics} for model, metrics in results.items()]

    # os.makedirs("") raises, so only create a directory when the path has one.
    output_dir = os.path.dirname(OUTPUT_CPU_CSV)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    pd.DataFrame(rows).to_csv(OUTPUT_CPU_CSV, index=False)
    print(f"\n✅ Résultats CPU sauvegardés dans {OUTPUT_CPU_CSV}")

    return results


# Option RAPIDS (GPU) - à activer si environnement GPU prêt
# def train_rapids_model():
#     print("\n📥 Chargement des données RAPIDS (GPU)...")
#     df = cudf.read_csv(DATA_PATH)
#     features = [col for col in df.columns if col != "Class"]
#     X = df[features]
#     y = df["Class"]
#
#     X_train, X_test, y_train, y_test = cu_train_test_split(X, y, test_size=0.2, random_state=42)
#
#     print("\n🚀 Entraînement du modèle LogisticRegression (RAPIDS GPU)...")
#     start = time.time()
#     lr = cuLogisticRegression()
#     lr.fit(X_train, y_train)
#     end = time.time()
#
#     y_pred = lr.predict(X_test)
#     y_proba = lr.predict_proba(X_test)[:, 1]
#
#     results = {
#         "RAPIDS LogisticRegression": {
#             "auc": float(cu_roc_auc_score(y_test, y_proba)),
#             "accuracy": float(cu_accuracy_score(y_test, y_pred)),
#             "recall": float(cu_recall_score(y_test, y_pred)),
#             "precision": float(cu_precision_score(y_test, y_pred)),
#             "training_time": end - start
#         }
#     }
#
#     print("\n📈 Résultats RAPIDS :")
#     for metric, value in results["RAPIDS LogisticRegression"].items():
#         print(f"{metric}: {value:.4f}" if isinstance(value, float) else f"{metric}: {value}")
#
#     os.makedirs(os.path.dirname(OUTPUT_RAPIDS_CSV), exist_ok=True)
#     df_results = pd.DataFrame([
#         {"model": model, **metrics}
#         for model, metrics in results.items()
#     ])
#     df_results.to_csv(OUTPUT_RAPIDS_CSV, index=False)
#     print(f"✅ Sauvegarde RAPIDS terminée dans {OUTPUT_RAPIDS_CSV}")
#
#     return results


if __name__ == "__main__":
    # Run the CPU training and evaluation
    train_cpu_model()

    # Run the RAPIDS GPU training (uncomment if a GPU is available and set up)
    # train_rapids_model()



🔍 Vérification des microservices :
✅ ingestion-service OK : {'status': 'ingestion-service running'}
✅ train-service OK : {'status': 'train-service running'}
✅ predict-service OK : {'status': 'predict-service running'}
✅ compare-service OK : {'status': 'compare-service running'}
📥 Chargement des données CPU depuis : ../data/creditcard.csv

🚀 Entraînement LogisticRegression CPU...

🚀 Entraînement RandomForest CPU...

✅ Résultats CPU sauvegardés dans ../models/resultats_auc_cpu.csv


In [None]:
import cudf
from cuml.linear_model import LogisticRegression as cuLogisticRegression
from cuml.ensemble import RandomForestClassifier as cuRandomForestClassifier
from cuml.metrics import accuracy_score, recall_score, precision_score, roc_auc_score
from cuml.model_selection import train_test_split
import pandas as pd
import time
import os

def _evaluate_rapids_model(model, X_train, y_train, X_test, y_test):
    """Fit *model* on the training split and score it on the test split.

    Args:
        model: An unfitted classifier exposing ``fit``, ``predict`` and
            ``predict_proba``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        dict: ``auc``, ``accuracy``, ``recall``, ``precision`` as Python
        floats, plus ``training_time`` (seconds spent in ``fit``).
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start

    y_pred = model.predict(X_test)
    # NOTE(review): `[:, 1]` assumes predict_proba returns an object that
    # supports 2-D positional indexing; confirm this holds for cudf inputs.
    y_proba = model.predict_proba(X_test)[:, 1]

    # float() converts the metric results to plain Python floats so they
    # serialize cleanly through pandas.
    return {
        "auc": float(roc_auc_score(y_test, y_proba)),
        "accuracy": float(accuracy_score(y_test, y_pred)),
        "recall": float(recall_score(y_test, y_pred)),
        "precision": float(precision_score(y_test, y_pred)),
        "training_time": training_time,
    }


def run_rapids_models(data_path, results_path="../models/resultats_auc_rapids.csv"):
    """Train and evaluate the RAPIDS (GPU) models, then save their metrics.

    Loads ``data_path`` with cudf, uses every column except ``Class`` as a
    feature and ``Class`` as the target, and holds out 20% of the rows for
    evaluation (``random_state=42``). A logistic regression and a 20-tree
    random forest are fitted on the same split.

    Args:
        data_path: Path of the input CSV file.
        results_path: Destination CSV for the metrics (one row per model).
            Its parent directory is created if needed; a bare filename
            (no directory component) is also accepted.

    Returns:
        dict: Model label -> dict with keys ``auc``, ``accuracy``, ``recall``,
        ``precision`` and ``training_time``.
    """
    print("📥 Chargement des données RAPIDS GPU depuis :", data_path)
    df = cudf.read_csv(data_path)

    features = [col for col in df.columns if col != "Class"]
    X = df[features]
    y = df["Class"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    results = {}

    # Logistic Regression (RAPIDS)
    print("\n🚀 Entraînement LogisticRegression RAPIDS GPU...")
    results["RAPIDS LogisticRegression"] = _evaluate_rapids_model(
        cuLogisticRegression(), X_train, y_train, X_test, y_test,
    )

    # Random Forest (RAPIDS)
    print("\n🚀 Entraînement RandomForest RAPIDS GPU...")
    results["RAPIDS RandomForest"] = _evaluate_rapids_model(
        cuRandomForestClassifier(n_estimators=20, random_state=42),
        X_train, y_train, X_test, y_test,
    )

    # Save the results as a pandas-compatible CSV: one row per model
    rows = [{"model": model, **metrics} for model, metrics in results.items()]
    df_results = pd.DataFrame(rows)

    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    output_dir = os.path.dirname(results_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df_results.to_csv(results_path, index=False)
    print(f"\n✅ Résultats RAPIDS sauvegardés dans {results_path}")

    return results


# Entry point when executed directly as a standalone script
if __name__ == "__main__":
    DATA_PATH = "../data/creditcard.csv"
    run_rapids_models(DATA_PATH)
