In [None]:
import os

# Show the directory the kernel is currently running in.
current_dir = os.getcwd()
print(current_dir)


In [None]:
import os

# Switch the kernel's working directory to the project folder.
project_dir = "C:/Users/jcluy/Python/detection-anomalies-aws-mlflow"
os.chdir(project_dir)

# Print the working directory to confirm the switch took effect.
print("RÃ©pertoire actuel :", os.getcwd())

In [None]:
from src.FonctionsPerso import *
import pandas as pd

# Location of the raw CSV inside the project tree.
base_dir = "C:/Users/jcluy/Python/detection-anomalies-aws-mlflow"
relative_parts = ("data", "raw", "creditcard.csv")
file_path = os.path.join(base_dir, *relative_parts)

# Read the comma-separated file into a DataFrame.
df = pd.read_csv(file_path, delimiter=',')

# Print the first rows as a quick sanity check of the load.
preview = df.head()
print(preview)


In [None]:
# Call the dataset-information helper on the loaded frame.
# NOTE(review): the helper is not defined in this notebook; presumably it comes from the
# star import of src.FonctionsPerso — confirm what it displays.
afficher_informations_dataset(df)

In [None]:
# Number of rows per value of the 'Class' column.
df["Class"].value_counts()

In [None]:
# Count missing entries per column, then display only the columns that have at least one.
missing_values = df.isna().sum()
missing_values.loc[missing_values.gt(0)]


In [None]:
# Flag fully duplicated rows (every occurrence after the first) and report how many there are.
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f"Nombre de doublons : {duplicates}")

# Keep only the rows that are not flagged as duplicates.
df = df[~duplicate_mask]


In [None]:
# Summary statistics of the frame after duplicate removal.
df.describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One box per column of df, drawn on an explicit 12x6 figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, ax=ax)
plt.xticks(rotation=90)  # vertical tick labels
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the feature columns only. The 'Class' label (when present) is copied through
# unchanged: scaling it would turn the class codes into arbitrary floats.
feature_cols = df.columns.drop("Class", errors="ignore")

scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df[feature_cols]),
    columns=feature_cols,
    index=df.index,  # keep df's row labels so df_scaled stays aligned with df
)
if "Class" in df.columns:
    df_scaled["Class"] = df["Class"]

# Restore the original column order.
df_scaled = df_scaled[df.columns]

In [None]:
# Display the scaled frame as the cell output.
df_scaled

In [None]:
# Save the de-duplicated frame as CSV, without the index column.
# The path is relative to the current working directory; the target folder is created
# first so the write does not fail on a fresh checkout where it does not exist yet.
output_path = "data/processed/creditcard_cleaned.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)

In [None]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# Numeric columns, excluding the 'Class' label when it is present.
# (The original comment announced this exclusion but the code did not perform it.)
num_cols = df.select_dtypes(include=[np.number]).columns.drop("Class", errors="ignore")

# Grid layout: a fixed number of columns, as many rows as needed to fit every feature.
n_cols = 3
n_rows = int(np.ceil(len(num_cols) / n_cols))

# Create the grid and flatten it so axes can be addressed with a single index.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 3))
axes = axes.flatten()

# One histogram (with KDE overlay) per numeric column.
for ax, col in zip(axes, num_cols):
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f"Distribution de {col}")

# Remove the grid cells left over when len(num_cols) is not a multiple of n_cols.
# Slicing avoids depending on the loop index surviving the loop above.
for unused_ax in axes[len(num_cols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Scalers to compare, keyed by the label used in the plot legends.
scalers = {
    "StandardScaler": StandardScaler(),
    "MinMaxScaler": MinMaxScaler(),
    "RobustScaler": RobustScaler()
}

scaled_dfs = {}

# Grid layout: a fixed number of columns, as many rows as needed to fit every feature.
n_cols = 3
n_rows = int(np.ceil(len(num_cols) / n_cols))

# Fit each scaler on the numeric columns and keep the result as a DataFrame.
# df's index is carried over so the scaled rows stay label-aligned with df.
for name, scaler in scalers.items():
    scaled_df = pd.DataFrame(
        scaler.fit_transform(df[num_cols]), columns=num_cols, index=df.index
    )
    scaled_dfs[name] = scaled_df

# Create the grid and flatten it so axes can be addressed with a single index.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 3))
axes = axes.flatten()

# For each column, overlay the density of every scaled version on the same axes.
for i, col in enumerate(num_cols):
    for name, scaled_df in scaled_dfs.items():
        sns.kdeplot(scaled_df[col], ax=axes[i], label=name, fill=True)
    axes[i].set_title(f"Normalisation de {col}")
    axes[i].legend()

# Remove the empty grid cells, as the distribution plot earlier in this notebook does.
for unused_ax in axes[len(num_cols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


In [None]:
from sklearn.decomposition import PCA

# Reduce the standard-scaled data to two principal components.
# Change the dictionary key to try one of the other scalers.
pca = PCA(n_components=2)
pca_transformed = pca.fit_transform(scaled_dfs["StandardScaler"])

# Share of the total variance captured by the two components.
print(f"Variance expliquÃ©e par les deux premiÃ¨res composantes : {sum(pca.explained_variance_ratio_):.2f}")

# Scatter plot of the observations in the PC1/PC2 plane.
pc1 = pca_transformed[:, 0]
pc2 = pca_transformed[:, 1]
plt.figure(figsize=(8, 6))
plt.scatter(pc1, pc2, alpha=0.5)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("Visualisation des donnÃ©es aprÃ¨s PCA")
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Fit a PCA that keeps all components (change the key to try another scaler).
pca_full = PCA().fit(scaled_dfs["StandardScaler"])

# Running total of the explained-variance ratio, component by component.
cumulative_variance = pca_full.explained_variance_ratio_.cumsum()
component_counts = range(1, len(cumulative_variance) + 1)

# Cumulative variance against the number of components, with a red reference line at 80 %.
plt.figure(figsize=(8, 5))
plt.plot(component_counts, cumulative_variance, marker='o', linestyle='--')
plt.axhline(y=0.8, color='r', linestyle='--')
plt.xlabel("Nombre de composantes")
plt.ylabel("Variance expliquÃ©e cumulÃ©e")
plt.title("Choix du nombre de composantes pour la PCA")
plt.show()


In [None]:
# Absolute loading of every feature on the first principal component.
feature_importance = np.abs(pca_full.components_[0])
feature_names = scaled_dfs["StandardScaler"].columns

# Print the features with the largest PC1 loadings, strongest first.
# Slicing (instead of indexing 0..9) keeps this working when fewer than 10 features exist.
sorted_idx = np.argsort(feature_importance)[::-1]
top_n = 10
for idx in sorted_idx[:top_n]:
    print(f"{feature_names[idx]}: {feature_importance[idx]:.3f}")


In [None]:
# Count, per numeric column, how many values are strictly below -1.
neg_values = df[num_cols].lt(-1).sum()
print(neg_values[neg_values > 0])  # show only the columns that have at least one value < -1


In [None]:
# Work on a copy so the shift below does not modify df itself.
df_shifted = df.copy()

# For every column whose minimum is negative, add |min| + 1 to the whole column.
for col in num_cols:
    min_val = df_shifted[col].min()
    if min_val < 0:  
        df_shifted[col] += abs(min_val) + 1  # after the shift the column's smallest value is 1

# Apply log(1 + x) to the shifted columns; df_log holds only the num_cols columns.
df_log = np.log1p(df_shifted[num_cols])

In [None]:
# Log-transform the numeric columns while keeping every other column of the frame.
# The transform starts from df_shifted (whose num_cols values are all >= 0 after the shift)
# rather than from the raw df: np.log1p yields NaN for inputs below -1 and -inf at exactly
# -1, so applying it to unshifted negative values would fill df_log with invalid entries.
df_log = df_shifted.copy()
for col in num_cols:
    df_log[col] = np.log1p(df_log[col])  # log(1 + x), which is defined at x = 0


In [None]:
df_log.describe()  # summary statistics after the log transform

In [None]:
# Print the installed NumPy version.
print(np.__version__)

In [None]:
import warnings
from sklearn.preprocessing import PowerTransformer

# Expose the stdlib warnings module as np.warnings.
# NOTE(review): presumably a workaround for a library that still looks up np.warnings —
# confirm it is still required with the installed versions.
np.warnings = warnings

# Split features (X) and target (y) before transforming, so the label is left untouched.
X = df.drop(columns=["Class"])
y = df["Class"]

# Fit the power transform on the features only.
# X's index is carried over: df no longer has a contiguous index once duplicates were
# dropped, so a fresh RangeIndex here would leave X_transformed and y misaligned by label.
scaler = PowerTransformer()
X_transformed = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)


In [None]:
print(y.unique())  # expected to show only the two labels, 0 and 1

In [None]:
from sklearn.model_selection import train_test_split

# 80 % train / 20 % test, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the shape of both feature sets.
print(f"Taille du jeu d'entraÃ®nement : {X_train.shape}")
print(f"Taille du jeu de test : {X_test.shape}")


In [None]:
# Number of rows per target value.
y.value_counts()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Build and fit the baseline classifier in one step (fit returns the estimator itself).
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Predict on the held-out set.
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 report.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# pip install xgboost

In [None]:
print(y_train.unique())  # distinct label values in the training target
print(y_train.value_counts())  # number of training rows per class

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Candidate classifiers, keyed by the label used in the printed report.
models = dict(
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=42),
    XGBoost=XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
)

# Fit each candidate on the training split and print its accuracy on the test split.
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")


In [None]:
# pip install mlflow

In [None]:
import mlflow
import mlflow.sklearn

# Select the experiment that the run below is recorded under.
mlflow.set_experiment("Detection Anomalies")

with mlflow.start_run():
    # Train the forest and score it on the test split.
    model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    # Record the hyper-parameter (read back from the estimator) and the metric.
    mlflow.log_param("n_estimators", model.n_estimators)
    mlflow.log_metric("accuracy", acc)

    # Store the fitted model as a run artifact.
    mlflow.sklearn.log_model(model, "model")

print("ExpÃ©rience enregistrÃ©e dans MLflow ! ðŸŽ¯")


In [None]:
# import sys
# print(sys.executable)

In [None]:
import mlflow
import mlflow.sklearn

# Select the MLflow experiment by name; runs started afterwards are recorded under it.
mlflow.set_experiment("Detection Anomalies")

print("MLflow est activÃ© ! ðŸš€")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Hyper-parameters kept in one place so the logged value matches the one actually used.
forest_params = {"n_estimators": 100, "random_state": 42}

with mlflow.start_run():
    # Fit the classifier on the training split.
    model = RandomForestClassifier(**forest_params)
    model.fit(X_train, y_train)

    # Score it on the test split.
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    # Record the parameter and the accuracy metric.
    mlflow.log_param("n_estimators", forest_params["n_estimators"])
    mlflow.log_metric("accuracy", acc)

    # Log the model together with a one-row input example built from the first training row.
    input_example = pd.DataFrame([X_train.iloc[0]])
    mlflow.sklearn.log_model(model, "model", input_example=input_example)

print("ExpÃ©rience et modÃ¨le enregistrÃ©s avec succÃ¨s ! ðŸŽ¯")

In [None]:
print(mlflow.get_tracking_uri())  # show where MLflow records its runs


In [None]:
# List the experiments returned by MLflow and print each one's name and ID.
experiments = mlflow.search_experiments()
for exp in experiments:
    print(f"ExpÃ©rience : {exp.name} | ID : {exp.experiment_id}")


In [None]:
mlflow.sklearn.autolog()  # turn on scikit-learn auto-logging
# NOTE(review): auto-logging presumably stays active for the runs in later cells as well —
# confirm that is intended alongside their manual log_metric / log_model calls.
with mlflow.start_run():
    # Fit a random forest inside the run.
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

In [None]:
print(mlflow.get_tracking_uri())  # show where MLflow records its runs

In [None]:
import os

# Hard-coded folder of one experiment (the last path segment is the experiment ID).
# NOTE(review): this points at C:/Users/jcluy/mlruns, while the `mlflow ui` command noted
# further down in this notebook uses the mlruns folder inside the project directory —
# confirm which store is actually in use.
mlflow_path = "C:/Users/jcluy/mlruns/855462862201448649"
print(os.path.exists(mlflow_path))  # True when the folder exists


In [None]:
# Run in an Anaconda Prompt:
# mlflow ui --backend-store-uri file:///C:/Users/jcluy/Python/detection-anomalies-aws-mlflow/mlruns

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

with mlflow.start_run():
    # Train the forest and predict on the test split.
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Compute the four evaluation metrics.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Log each metric under its name, in the same order as they were computed.
    named_metrics = (
        ("accuracy", accuracy),
        ("precision", precision),
        ("recall", recall),
        ("f1_score", f1),
    )
    for metric_name, metric_value in named_metrics:
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted model as a run artifact.
    mlflow.sklearn.log_model(model, "model")

print("MÃ©triques et modÃ¨le enregistrÃ©s avec succÃ¨s ! ðŸŽ¯")

In [None]:
# Fetch the runs, most recent first, and print the ID and accuracy of the first few.
runs = mlflow.search_runs(order_by=["start_time desc"])
shown_columns = ["run_id", "metrics.accuracy"]
print(runs[shown_columns].head())

In [None]:
# mlflow models serve -m "mlruns/855462862201448649/44dc24312237424b9b341bed1ed4f903/artifacts/model" -p 5001


In [None]:
# import requests
# import json

# url = "http://127.0.0.1:5001/invocations"
# data = {"instances": [[5.1, 3.5, 1.4, 0.2]]}  # Remplace par les vraies features
# headers = {"Content-Type": "application/json"}

# response = requests.post(url, data=json.dumps(data), headers=headers)
# print(response.json())  # PrÃ©diction du modÃ¨le


In [None]:
import requests
import json

# Scoring endpoint of the locally served model.
url = "http://127.0.0.1:5001/invocations"

# Placeholder input: 30 random feature values (replace with a real dataset row).
sample_input = np.random.rand(30).tolist()

data = {"instances": [sample_input]}
headers = {"Content-Type": "application/json"}

# A timeout is set so the cell fails fast instead of hanging forever when the model
# server is not running or stops responding.
response = requests.post(url, data=json.dumps(data), headers=headers, timeout=10)
print(response.json())  # prediction returned by the server
