# Random forest 1

Dans ce notebook, on teste notre premier régresseur : un random forest basique qui nous donnera une base de départ pour comparer les futures approches.

On fait très peu de pré-processing : simplement une fenêtre géographique pour enlever quelques outliers tout en conservant plus de 98 % des données.

Pour limiter le temps de calcul, on entraîne uniquement sur les données qui ont un nombre significatif d'observations, dans un premier temps > 10 puis > 6.
On entraîne ainsi sur environ 30 % puis 40 % des données, ce qui est une grosse perte mais donne quand même de bons premiers résultats.

# random forest

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, cross_val_score

from utils import load_train_data
from utils import spearman_corr, rmse
from utils import geo_filter, total_count_filter


# Cross-validated baseline: fit a small random forest on a filtered subset of
# each training fold and score it on the complete (unfiltered) validation fold.
x, y = load_train_data()

# 5 shuffled folds with a fixed seed so the splits are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The same estimator object is re-fitted on every fold.
model = RandomForestRegressor(
    criterion="squared_error",
    n_estimators=10,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
)

spearman_scores = []
rmse_scores = []

for i, (train_index, val_index) in enumerate(kf.split(x), start=1):
    x_train, x_val = x.iloc[train_index], x.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Data selection: both masks are computed on the training fold only and
    # mark the rows that are KEPT (they are used to index x_train below), so
    # the printed ratios are fractions of rows retained.
    n_train = len(x_train)
    geo_filter_mask = geo_filter(x_train)
    print(f"geo filter mask kept {geo_filter_mask.sum() / n_train:.2%} of data")
    total_count_filter_mask = total_count_filter(x_train, n_min=10)
    print(f"total count filter mask kept {total_count_filter_mask.sum() / n_train:.2%} of data")
    filter_mask = geo_filter_mask & total_count_filter_mask
    print(f"Au total: {filter_mask.sum() / n_train:.2%} des données conservées.")

    x_train = x_train[filter_mask]
    y_train = y_train[filter_mask]
    model.fit(x_train, y_train.values.flatten())

    # The validation fold is deliberately left unfiltered.
    y_pred = model.predict(x_val)
    y_true = y_val.values.flatten()

    spearman_scores.append(spearman_corr(y_true, y_pred))
    rmse_scores.append(rmse(y_true, y_pred))
    print(f"Fold {i}:\tspearman: {spearman_scores[-1]:.4f},\trmse: {rmse_scores[-1]:.4f}\n")

# Mean and spread of each metric across the folds.
print(f"Score de corrélation de Spearman: {np.mean(spearman_scores):.4f}, std: {np.std(spearman_scores):.4f}")
print(f"RMSE: {np.mean(rmse_scores):.4f}, std: {np.std(rmse_scores):.4f}")



## Test

In [None]:
from utils import load_train_data, load_test_data
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np

from utils import geo_filter, total_count_filter

# Train the baseline forest on the filtered training set, predict the test set
# and write the predictions to a CSV file.
x_train, y_train = load_train_data()
x_test = load_test_data()

model = RandomForestRegressor(
    criterion="squared_error",
    n_estimators=10,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
)

# NOTE(review): this cell originally called
# `filter_data(x_train, n_min=5, geo_filter=True)`, a name that is neither
# imported nor defined in this notebook (NameError on a fresh kernel). The mask
# is rebuilt here from the two helpers already used in the cross-validation
# cell; this presumably matches what `filter_data` did — confirm.
filter_mask = geo_filter(x_train) & total_count_filter(x_train, n_min=5)
x_train = x_train[filter_mask]
y_train = y_train[filter_mask]

model.fit(x_train, y_train.values.flatten())
y_pred = model.predict(x_test)

# Keep the test-set index so every prediction stays attached to its row.
y_pred = pd.Series(y_pred, index=x_test.index, name="prediction")
y_pred.to_csv("output/y_pred_rdmforest2.csv", index=True, header=False)

In [None]:
# Write the predictions to the same path again, this time with a header row
# (to_csv opens the file in write mode by default, replacing its contents).
submission_path = "output/y_pred_rdmforest2.csv"
y_pred.to_csv(submission_path, header=True, index=True)