### Load librairies

In [10]:
import joblib 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

### Import data

In [11]:
X_train = joblib.load("../data/processed/X_train_final.pkl")
X_test = joblib.load("../data/processed/X_test_final.pkl")
y_train = joblib.load("../data/processed/y_train.pkl")
y_test = joblib.load("../data/processed/y_test.pkl")

## Training model

In [12]:
rf=RandomForestRegressor()
rf.fit(X_train,y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
y_pred=rf.predict(X_test)

In [14]:
mse=mean_squared_error(y_test,y_pred)
mae=mean_absolute_error(y_test,y_pred)
R2=r2_score(y_test,y_pred)

print(f'The R² score is: {R2}')
print(f'The mean absolue error is: {mae}')
print(f'The mean squared error is: {mse}')

The R² score is: 0.9861107273754457
The mean absolue error is: 1062.4951359306804
The mean squared error is: 7149327.794848486


We can see that we have a really great R² score and a better mae for this regressor than for the linear one, let's try to get a better error with GridSearch

In [15]:
Grid_rf=GridSearchCV(
    RandomForestRegressor(),
    param_grid = {
    "n_estimators": [100, 200, 300],         # Nombre d'arbres dans la forêt (ntree)
    "max_features": ["sqrt", "log2", 0.5],   # Nombre de features à considérer pour chaque split (mtry)
    "min_samples_leaf": [1, 5, 10],          # Nombre min d'échantillons dans une feuille terminale (ndsize)
    "max_depth": [None, 10, 20],             # Profondeur max des arbres (None = croissance libre)
    "bootstrap": [True, False],              # Méthode de sampling (True = bootstrap classique)
    "min_samples_split": [2, 5, 10]          # Min d’échantillons pour splitter un nœud (réduit sur-apprentissage)
    },
    scoring="neg_mean_absolute_error",
    cv=3,
    n_jobs=-1
)

In [16]:
Grid_rf.fit(X_train, y_train)

14 fits failed out of a total of 1458.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\CYTech Student\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\Local\pypoetry\Cache\virtualenvs\flight-price-prediction-f9u3PMIM-py3.12\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\CYTech Student\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\Local\pypoetry\Cache\virtualenvs\flight-price-prediction-f9u3PMIM-py3.12\Lib\site-packages\sklearn\base.py", line 1363, in wrapper
    return

0,1,2
,estimator,RandomForestRegressor()
,param_grid,"{'bootstrap': [True, False], 'max_depth': [None, 10, ...], 'max_features': ['sqrt', 'log2', ...], 'min_samples_leaf': [1, 5, ...], ...}"
,scoring,'neg_mean_absolute_error'
,n_jobs,-1
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,0.5
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False


In [17]:
print("Best params:", Grid_rf.best_params_)
print("Best MAE:", -Grid_rf.best_score_)  

Best params: {'bootstrap': False, 'max_depth': None, 'max_features': 0.5, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best MAE: 1150.4238273055591


In [18]:
best_model = Grid_rf.best_estimator_
y_pred_best = best_model.predict(X_test)

mae_test = mean_absolute_error(y_test, y_pred_best)
print("MAE on test set:", mae_test)

MAE on test set: 1084.460819063129


Now let's use this model and these parametres for our API