## Nettoyage

In [83]:
import pandas as pd
import seaborn as sns

data = sns.load_dataset("penguins")

data.head()
data.info()
data.describe()
data.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [84]:
df = data.dropna()
df.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

## Pipeline

In [85]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

#définir les colonnes catégorielles

categorical_colums = ['island', 'sex']
numerical_colums = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
target_column = 'species'

# Créer un pipeline pour les colonnes catégorielles
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers =[
        ('num',numerical_transformer, numerical_colums),
        ('cat', categorical_transformer,categorical_colums)
    ]
)

X = df.drop(columns=target_column)
y = df[[target_column]]

target_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
y_encoded = target_encoder.fit_transform(y)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])


In [86]:
print(target_encoder.categories_)

[array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)]


In [87]:
from sklearn.model_selection import train_test_split

# Diviser X et y en deux parties : 2/3 pour ML, 1/3 pour production
X_ml, X_prod, y_ml, y_prod = train_test_split(X,y_encoded,test_size=1/3, random_state =42)


# Ensuite, diviser X_ml et y_ml en train et test
X_train, X_test, y_train, y_test = train_test_split(X_ml, y_ml, test_size=0.3, random_state=42)

In [88]:
pipeline.fit(X_train)
X_train_transformed = pipeline.transform(X_train)
X_test_transformed = pipeline.transform(X_test)
X_prod_transformed = pipeline.transform(X_prod)


## Modélisation

In [89]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib


model = RandomForestClassifier(random_state=42)
model.fit(X_train_transformed, y_train)

y_pred = model.predict(X_test_transformed)

# Evaluation des performances

accuracy = accuracy_score(y_test, y_pred)
classification = classification_report(y_test, y_pred)


print(f"Accuracy sur le jeu de test : {accuracy:.2f}")
print("Rapport de classification :\n", classification)


#Sauvegarder le pipeline
joblib.dump(pipeline,'../monitoring/app/pipeline.pkl')

#Sauvegarder le modèle entraîné
joblib.dump(model, '../monitoring/app/model.pkl')

# Sauvegarder l'encodeur cible
joblib.dump(target_encoder, '../monitoring/app/target_encoder.pkl')

print("Pipeline, modèle et encodeur cible sauvegardés avec succès")



Accuracy sur le jeu de test : 1.00
Rapport de classification :
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        23

   micro avg       1.00      1.00      1.00        67
   macro avg       1.00      1.00      1.00        67
weighted avg       1.00      1.00      1.00        67
 samples avg       1.00      1.00      1.00        67

Pipeline, modèle et encodeur cible sauvegardés avec succès
