<a href="https://colab.research.google.com/github/kwnstantinosRoumeliwtis/ML_course/blob/main/random_forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from functools import partial

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader

# ---------------------------------------------------------------------------
# Configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------
SEED = 42                 # shared seed for the split, feature scoring and forest
N_FIT_ROWS = 50_000       # cap on training rows used by the grid search (runtime)
K_CANDIDATES = [10, 15, 20, 30, 50, 80]   # numbers of features to try keeping

# ---------------------------------------------------------------------------
# Load data
# ---------------------------------------------------------------------------
df = pd.read_csv("train_hh_features.csv")
target = pd.read_csv("train_hh_gt.csv")

# NOTE(review): features and targets are paired purely by row position below;
# this assumes both files list households in the same order -- TODO confirm
# (or merge on the household key instead).
assert len(df) == len(target), "feature and target files have different row counts"

# Identifier columns. They must be excluded from the column lists below:
# ColumnTransformer happily routes one column to several transformers, so
# listing them under "drop" alone does NOT keep them out of the "num" branch
# (previously `num__hhid` ended up among the selected features).
drop_cols = ['hhid', 'survey_id']

cat_cols = [
    col
    for col in df.select_dtypes(include=['object', 'category', 'bool']).columns
    if col not in drop_cols
]
num_cols = [
    col
    for col in df.select_dtypes(include=['number']).columns
    if col not in drop_cols
]

# ---------------------------------------------------------------------------
# Preprocessing + model pipeline
# ---------------------------------------------------------------------------
cat_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    # Categories that only appear in a validation fold / the test set are
    # encoded as -1 instead of raising during predict.
    ("encode", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
])
num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="median"))
])
preprocess = ColumnTransformer(transformers=[
    ("drop", "drop", drop_cols),
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols)
])
pipeline = Pipeline([
    ("preprocess", preprocess),
    # mutual_info_regression is randomised; seeding it makes the selected
    # feature set reproducible. partial() keeps the pipeline picklable.
    ("select", SelectKBest(score_func=partial(mutual_info_regression, random_state=SEED))),
    ("model", RandomForestRegressor(random_state=SEED))
])

# SelectKBest rejects k larger than the number of incoming features, so cap
# every candidate at the number of columns handed to the preprocessor.
n_candidate_features = len(num_cols) + len(cat_cols)
param_grid = {
    "select__k": sorted({min(k, n_candidate_features) for k in K_CANDIDATES}),
    "model__n_estimators": [100, 200],
    "model__max_depth": [10, 15, None],
    "model__min_samples_split": [2, 5],
    "model__min_samples_leaf": [1, 2]
}

grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring="r2",
    n_jobs=-1,
    verbose=2
)

# ---------------------------------------------------------------------------
# Train / evaluate
# ---------------------------------------------------------------------------
X = df.copy()
y = target["cons_ppp17"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Only the first N_FIT_ROWS training rows are used to keep the search tractable;
# .iloc makes the positional slicing explicit.
grid.fit(X_train.iloc[:N_FIT_ROWS], y_train.iloc[:N_FIT_ROWS])

print("Best parameters:", grid.best_params_)
print("Best CV R2:", grid.best_score_)

# GridSearchCV refits the best configuration, so predict() uses that model.
y_pred = grid.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# NOTE(review): MAPE is undefined (inf) if any true value is exactly zero --
# presumably consumption is strictly positive; verify.
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print("MAE:", mae)
print("RMSE:", rmse)
print("R2:", r2)
print("MAPE:",mape)


Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best parameters: {'model__max_depth': None, 'model__min_samples_leaf': 2, 'model__min_samples_split': 5, 'model__n_estimators': 200, 'select__k': 80}
Best CV R2: 0.5954012052951044
MAE: 3.5697867911167687
RMSE: 6.215877184028885
R2: 0.6154374344650716
MAPE: 35.22961138371489


In [None]:
# Recover the names of the columns that survived feature selection in the
# best pipeline found by the grid search.
best_pipe = grid.best_estimator_
fitted_steps = best_pipe.named_steps

# Names of every column emitted by the preprocessing step, in output order.
feature_names = fitted_steps["preprocess"].get_feature_names_out()

# Boolean mask over those columns: True where SelectKBest kept the feature.
mask = fitted_steps["select"].get_support()

selected_features = feature_names[mask]
print(selected_features)

['num__hhid' 'num__weight' 'num__strata' 'num__utl_exp_ppp17' 'num__hsize'
 'num__num_children5' 'num__num_children10' 'num__num_children18'
 'num__age' 'num__num_adult_female' 'num__num_adult_male'
 'num__sworkershh' 'num__share_secondary' 'num__sfworkershh'
 'num__region1' 'num__region2' 'num__region3' 'num__region5'
 'num__region6' 'num__region7' 'cat__male' 'cat__owner' 'cat__water'
 'cat__toilet' 'cat__sewer' 'cat__elect' 'cat__water_source'
 'cat__sanitation_source' 'cat__dweltyp' 'cat__employed' 'cat__educ_max'
 'cat__any_nonagric' 'cat__sector1d' 'cat__urban' 'cat__consumed100'
 'cat__consumed300' 'cat__consumed400' 'cat__consumed500'
 'cat__consumed600' 'cat__consumed700' 'cat__consumed800'
 'cat__consumed900' 'cat__consumed1000' 'cat__consumed1100'
 'cat__consumed1200' 'cat__consumed1300' 'cat__consumed1400'
 'cat__consumed1600' 'cat__consumed1700' 'cat__consumed1900'
 'cat__consumed2000' 'cat__consumed2100' 'cat__consumed2200'
 'cat__consumed2300' 'cat__consumed2400' 'cat__c

In [None]:
import joblib

# Persist the fitted best pipeline to disk. Despite the file name, the saved
# object is the whole pipeline (preprocessing, feature selection and the
# random-forest model), not only the preprocessing step.
joblib.dump(value=best_pipe, filename="preprocess_pipeline.joblib")