In [2]:
from data_loader import DataLoader
from preprocessor import Preprocessor
from models import ModelTrainer
from evaluator import Evaluator

In [3]:
# Load the training CSV and separate it into a feature matrix X and a target y.
# In the recorded run this produced X of shape (990, 192) and y of shape (990,)
# with 99 distinct classes.
data_loader = DataLoader('data/train.csv')
data_loader.load_data()
X, y = data_loader.split_features_target()

Chargement des données depuis data/train.csv...
Données chargées: 990 exemples, 194 colonnes
Features (X): (990, 192)
Target (y): (990,)
Nombre de classes: 99


In [4]:
# Split (X, y) into training and validation subsets using DataLoader's defaults
# (the recorded run yielded 792 training and 198 validation examples).
# NOTE(review): no split ratio or random seed is passed here — confirm that
# create_train_val_split fixes a seed internally so the split is reproducible.
X_train, X_val, y_train, y_val = data_loader.create_train_val_split(X, y)


Division train/validation:
  Train: 792 exemples
  Validation: 198 exemples


In [5]:
# Encode the class labels and standardise the features for both splits.
# The same `preprocessor` instance is reused later (its `scaler` and
# `label_encoder` attributes) to transform the test set and decode predictions.
# NOTE(review): presumably the encoder and scaler are fitted on the training
# split only and merely applied to the validation split — verify in Preprocessor.
preprocessor = Preprocessor()
y_train_encoded, y_val_encoded = preprocessor.encode_labels(y_train, y_val)
X_train_scaled, X_val_scaled = preprocessor.scale_features(X_train, X_val)


Encodage des labels...
  Nombre de classes uniques: 99
Standardisation des features...
  Moyenne des features (train): -0.0000
  Std des features (train): 1.0000


In [6]:
# Train every model known to ModelTrainer on the scaled training split, using
# 5-fold cross-validation (the recorded log shows a parameter grid search per model).
# `results` is indexed later as results[model_name]['model'] to get a trained estimator.
trainer = ModelTrainer(cv_folds=5)
results = trainer.train_all_models(X_train_scaled, y_train_encoded)


ENTRAÎNEMENT DE TOUS LES MODÈLES

Entraînement: RandomForest
Fitting 5 folds for each of 18 candidates, totalling 90 fits

Meilleurs paramètres: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Score cross-validation: 0.9773

Entraînement: SVM
Fitting 5 folds for each of 12 candidates, totalling 60 fits

Meilleurs paramètres: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Score cross-validation: 0.9836

Entraînement: KNN
Fitting 5 folds for each of 16 candidates, totalling 80 fits

Meilleurs paramètres: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Score cross-validation: 0.9697

Entraînement: LogisticRegression
Fitting 5 folds for each of 6 candidates, totalling 30 fits

Meilleurs paramètres: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
Score cross-validation: 0.9836

Entraînement: GradientBoosting
Fitting 5 folds for each of 18 candidates, totalling 90 fits

Meilleurs paramètres: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators':

In [9]:
# Score every trained model on the held-out validation split.
# `df_results` holds one row per model with the columns model_name, accuracy,
# precision, recall, f1_score and cv_score; in the recorded run the rows are
# ordered by accuracy, highest first.
# NOTE(review): confirm that evaluate_all_models guarantees this ordering — the
# best model is read from this frame further down.
evaluator = Evaluator()
df_results = evaluator.evaluate_all_models(results, X_val_scaled, y_val_encoded)


ÉVALUATION SUR L'ENSEMBLE DE VALIDATION

Résultats comparatifs:
        model_name  accuracy  precision   recall  f1_score  cv_score
               MLP  0.994949   0.996633 0.994949  0.994613  0.981060
               SVM  0.989899   0.993266 0.989899  0.989226  0.983560
LogisticRegression  0.989899   0.993266 0.989899  0.989226  0.983584
      RandomForest  0.984848   0.988215 0.984848  0.984175  0.977263
               KNN  0.979798   0.986532 0.979798  0.978451  0.969708
  GradientBoosting  0.611111   0.622166 0.611111  0.583703  0.628788

🏆 MEILLEUR MODÈLE: MLP
   Accuracy: 0.9949


## Prédictions sur le fichier de test et création du fichier de soumission

In [None]:
import pandas as pd

# Build the test-set submission: read the test CSV, apply the preprocessing
# objects created earlier in the notebook, predict with the best validation
# model, and write one (id, species) row per test example.

# Input/output locations, named once so they are easy to change.
TEST_CSV_PATH = 'data/test.csv'
SUBMISSION_PATH = 'submission.csv'

# Load the test set; 'id' is kept aside for the submission and every
# remaining column is passed to the scaler as a feature.
test_df = pd.read_csv(TEST_CSV_PATH)
test_ids = test_df['id']
X_test = test_df.drop(columns=['id'])

# Preprocess (IMPORTANT: transform only, never fit on the test data!)
X_test_scaled = preprocessor.scaler.transform(X_test)

# Select the model with the highest validation accuracy explicitly instead of
# trusting the row order of df_results. The stable sort keeps the existing
# order among ties, so the choice is unchanged when the frame is already sorted.
ranked_results = df_results.sort_values('accuracy', ascending=False, kind='stable')
best_model_name = ranked_results.iloc[0]['model_name']
best_model = results[best_model_name]['model']

y_test_pred_encoded = best_model.predict(X_test_scaled)

# Convert the encoded predictions back to species names
y_test_pred = preprocessor.label_encoder.inverse_transform(y_test_pred_encoded)

# Assemble the submission table.
# NOTE(review): this writes a single hard label per id. If the target
# competition scores per-class probabilities, the file should instead contain
# one probability column per species (predict_proba) — confirm the expected format.
submission = pd.DataFrame({
    'id': test_ids,
    'species': y_test_pred
})

# Save without the DataFrame index so the file has only the id and species columns
submission.to_csv(SUBMISSION_PATH, index=False)
print(f"Fichier {SUBMISSION_PATH} créé avec {len(submission)} prédictions!")

✓ Fichier submission.csv créé avec 594 prédictions!
