In [10]:
import warnings

import numpy as np
import pandas as pd
import scipy.stats
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [11]:
# Read the train and test CSV files (in that order) from the local Kaggle folder.
df, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "C:/Users/Ilias/documents/cours_mines/kaggle/road_accident/train.csv",
        "C:/Users/Ilias/documents/cours_mines/kaggle/road_accident/test.csv",
    )
)

# The last column of the training frame is used as the prediction target.
target = df.columns[-1]

print(df.shape)
df.head()

(517754, 14)


Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56


In [12]:
# Add two hand-crafted flags to both the training and the test frame (in place).
for dff in (df, df_test):
    # More than three reported accidents marks a "high accident zone".
    many_accidents = dff['num_reported_accidents'] > 3
    # Fog combined with night lighting.
    foggy_night = (dff['weather'] == 'foggy') & (dff['lighting'] == 'night')

    dff['is_high_accident_zone'] = many_accidents.astype(bool)
    dff['is_night_foggy'] = foggy_night.astype(int)

In [None]:
# Encode every requested column by its frequency of appearance in the training
# data, and add quantile-bin features for the numeric columns.
def create_frequency_features(train_df, test_df, cols, num, cat, target_col=None):
    """Add frequency-encoding and quantile-bin features to train and test frames.

    For each column in ``cols`` a ``<col>_freq`` column is added holding the
    relative frequency of the value *in the training frame*. Test values never
    seen in training get the mean of the training ``<col>_freq`` column.

    For each column that is also listed in ``num``, ``<col>_bin5``,
    ``<col>_bin10`` and ``<col>_bin15`` are added: quantile bins are fitted on
    the training column with ``pd.qcut`` and the same edges are applied to the
    test column. The outermost edges are opened to -inf/+inf before binning the
    test column, so test values outside the training range land in the first or
    last bin instead of becoming NaN.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Input frames; they are copied, never modified.
    cols : list of str
        Columns to frequency-encode (must exist in both frames).
    num : list of str
        Subset of ``cols`` that should also be quantile-binned.
    cat : list of str
        Columns excluded from the returned ``new_num`` list.
    target_col : str, optional
        Name of the target column, also excluded from ``new_num``. When
        omitted, the notebook-level ``target`` variable is used.

    Returns
    -------
    (train, test, new_num)
        The two augmented copies and the list of training columns that are
        neither in ``cat`` nor the target.
    """
    if target_col is None:
        # Backward-compatible default: fall back to the notebook-level name.
        target_col = target

    train, test = train_df.copy(), test_df.copy()

    for col in cols:
        freq = train[col].value_counts(normalize=True)
        train[f"{col}_freq"] = train[col].map(freq)
        test[f"{col}_freq"] = test[col].map(freq).fillna(train[f"{col}_freq"].mean())

        if col not in num:
            continue

        for q in [5, 10, 15]:
            bin_name = f"{col}_bin{q}"
            try:
                train_bins, edges = pd.qcut(
                    train[col], q=q, labels=False, retbins=True, duplicates="drop"
                )
                edges = np.array(edges, dtype=float)
                if len(edges) < 2:
                    raise ValueError("fewer than two distinct bin edges")
                # Open the outer edges so out-of-range test values are binned
                # into the first/last bin rather than turned into NaN.
                edges[0], edges[-1] = -np.inf, np.inf
                test_bins = pd.cut(test[col], bins=edges, labels=False, include_lowest=True)
            except Exception as exc:
                # Best-effort fallback kept from the original code, but no
                # longer silent: the feature is constant 0 in both frames.
                warnings.warn(
                    f"Quantile binning of column {col!r} with q={q} failed ({exc!r}); "
                    f"filling {bin_name!r} with 0.",
                    stacklevel=2,
                )
                train[bin_name] = test[bin_name] = 0
            else:
                train[bin_name] = train_bins
                test[bin_name] = test_bins

    new_num = train.drop(columns=cat + [target_col]).columns.tolist()
    return train, test, new_num

In [None]:
# Every column except the target is a candidate for frequency encoding.
cols = df.drop(columns=target).columns.tolist()

# String-like columns are treated as categorical.
cat = [col for col in cols if df[col].dtype in ["object","category"] and col != target]

# Everything that is neither string-like nor boolean (and is not the id) is numeric.
num = [col for col in cols if df[col].dtype not in ["object","category","bool"] and col not in ["id", target]]

# create_frequency_features works on copies of both frames, so no extra copy is needed here.
df, df_test, new_num = create_frequency_features(df, df_test, cols, num, cat)

# Cast the categorical columns using the *training* categories for both frames,
# so a given value maps to the same category code in train and test. Test values
# absent from the training categories become NaN.
for col in cat:
    df[col] = df[col].astype("category")
    df_test[col] = df_test[col].astype(df[col].dtype)

# Raw columns that are only kept through their derived *_freq / *_bin features,
# plus the frequency encoding of the id.
dropped_columns = ["time_of_day", "num_lanes", "road_type", "road_signs_present", "id_freq"]
df = df.drop(columns=dropped_columns)
df_test = df_test.drop(columns=dropped_columns)

# The id is removed from the training frame only (the test frame keeps it),
# then exact duplicate training rows are removed.
df = df.drop(columns="id").drop_duplicates()

In [15]:
# Show the final list of training columns.
print(list(df.columns))

['curvature', 'speed_limit', 'lighting', 'weather', 'public_road', 'holiday', 'school_season', 'num_reported_accidents', 'accident_risk', 'is_high_accident_zone', 'is_night_foggy', 'road_type_freq', 'num_lanes_freq', 'num_lanes_bin5', 'num_lanes_bin10', 'num_lanes_bin15', 'curvature_freq', 'curvature_bin5', 'curvature_bin10', 'curvature_bin15', 'speed_limit_freq', 'speed_limit_bin5', 'speed_limit_bin10', 'speed_limit_bin15', 'lighting_freq', 'weather_freq', 'road_signs_present_freq', 'public_road_freq', 'time_of_day_freq', 'holiday_freq', 'school_season_freq', 'num_reported_accidents_freq', 'num_reported_accidents_bin5', 'num_reported_accidents_bin10', 'num_reported_accidents_bin15', 'is_high_accident_zone_freq', 'is_night_foggy_freq', 'is_night_foggy_bin5', 'is_night_foggy_bin10', 'is_night_foggy_bin15']


Avant cela, j'avais fait beaucoup de feature engineering (k-means, produits, ratios...), mais cela créait pas mal de bruit ; j'ai donc décidé de ne conserver que peu de features.

In [None]:
# ici, j'ai au préalable effectué une régression linéaire sur les features que je trouvais les plus intéressantes.
# les coefficients ci-dessous sont donc ceux donnés par cette régression
import scipy

def f(X):
    """Hand-built linear risk score computed from five road features.

    ``X`` must expose the columns ``curvature``, ``lighting``, ``weather``,
    ``speed_limit`` and ``num_reported_accidents``. The score is a weighted sum
    of the curvature and of four 0/1 indicators (night lighting, non-clear
    weather, speed limit of at least 60, more than two reported accidents).
    """
    curvature_term = 0.31 * X["curvature"]
    night_term = 0.22 * (X["lighting"] == "night").astype(int)
    weather_term = 0.07 * (X["weather"] != "clear").astype(int)
    speed_term = 0.17 * (X["speed_limit"] >= 60).astype(int)
    accident_term = 0.13 * (X["num_reported_accidents"] > 2).astype(int)

    # Summed left to right, in the same order as the weights are listed above.
    return curvature_term + night_term + weather_term + speed_term + accident_term

# Instead of hard-clipping f(X) into [0, 1], treat the score as the mean of a
# normal distribution with standard deviation sigma and return the expected
# value of that distribution once clipped to [0, 1].
def clip(f, sigma=0.05):
    """Wrap ``f`` so that its output is softly clipped to the interval [0, 1].

    The wrapped function evaluates ``mu = f(X)`` and returns
    ``E[min(max(Z, 0), 1)]`` for ``Z ~ Normal(mu, sigma)``, computed in closed
    form from the standard normal cdf and pdf.

    Parameters
    ----------
    f : callable
        Function of ``X`` returning a scalar or array-like of scores.
    sigma : float, default 0.05
        Standard deviation of the normal distribution; must be > 0.

    Returns
    -------
    callable
        ``clip_f(X)`` returning the softly clipped scores, with the same
        container type arithmetic on ``f(X)`` produces.

    Raises
    ------
    ValueError
        If ``sigma`` is not strictly positive.
    """
    if sigma <= 0:
        raise ValueError("sigma must be strictly positive")

    def clip_f(X):
        mu = f(X)
        # Standardised bounds of the [0, 1] interval.
        a, b = -mu / sigma, (1 - mu) / sigma
        Phi_a, Phi_b = scipy.stats.norm.cdf(a), scipy.stats.norm.cdf(b)
        phi_a, phi_b = scipy.stats.norm.pdf(a), scipy.stats.norm.pdf(b)
        # mass inside [0, 1] contributes its truncated mean; mass above 1
        # contributes 1; mass below 0 contributes 0.
        return mu * (Phi_b - Phi_a) + sigma * (phi_a - phi_b) + 1 - Phi_b

    return clip_f

# Store the softly clipped score of f as feature "y" on both frames.
soft_f = clip(f)

z = soft_f(df)
w = soft_f(df_test)

df["y"] = z.values
df_test["y"] = w.values

In [17]:
# Preview the first five rows of the engineered training frame.
df.head(5)

Unnamed: 0,curvature,speed_limit,lighting,weather,public_road,holiday,school_season,num_reported_accidents,accident_risk,is_high_accident_zone,...,num_reported_accidents_freq,num_reported_accidents_bin5,num_reported_accidents_bin10,num_reported_accidents_bin15,is_high_accident_zone_freq,is_night_foggy_freq,is_night_foggy_bin5,is_night_foggy_bin10,is_night_foggy_bin15,y
0,0.06,35,daylight,rainy,True,False,True,0,0.13,False,...,0.404968,0,0,0,0.994884,0.899781,0,0,0,0.118153
1,0.99,35,daylight,clear,False,True,True,0,0.35,False,...,0.241947,0,0,0,0.994884,0.899781,0,0,0,0.297
2,0.63,70,dim,clear,True,True,False,0,0.3,False,...,0.28192,1,1,1,0.994884,0.899781,0,0,0,0.389
3,0.07,35,dim,rainy,True,False,False,0,0.21,False,...,0.404968,0,0,0,0.994884,0.899781,0,0,0,0.121128
4,0.58,60,daylight,foggy,False,True,False,0,0.56,False,...,0.404968,0,0,0,0.994884,0.899781,0,0,0,0.474


In [None]:
# Only the best hyper-parameters found are kept here.
cv_features = df.drop(columns=target)
dtrain = xgb.DMatrix(cv_features, label=df[target], enable_categorical=True)

xgb_params = dict(
    max_depth=10,
    learning_rate=0.011,
    subsample=0.82,
    colsample_bytree=0.81,
    min_child_weight=3,
    gamma=0.011,
    reg_alpha=0.12,
    reg_lambda=0.4,
    max_delta_step=1,
    colsample_bylevel=0.86,
    colsample_bynode=0.88,
    scale_pos_weight=0.36,
    max_bin=512,
    tree_method='hist',
    device="cuda",
    eval_metric='rmse',
    random_state=42,
)

# 5-fold cross-validation, up to 2000 rounds, stopping after 50 rounds without improvement.
cv_results = xgb.cv(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=2000,
    nfold=5,
    metrics='rmse',
    early_stopping_rounds=50,
    verbose_eval=100,
)

print(cv_results.tail())

# Row with the lowest mean validation RMSE.
test_rmse_mean = cv_results['test-rmse-mean']
best_round = test_rmse_mean.idxmin()
best_rmse = test_rmse_mean[best_round]
print(f"Best round: {best_round}, Best CV RMSE: {best_rmse:.7f}")

  return getattr(self.bst, name)(*args, **kwargs)
  return getattr(self.bst, name)(*args, **kwargs)
  self.bst.update(self.dtrain, iteration, fobj)
  self.bst.update(self.dtrain, iteration, fobj)
  self.bst.update(self.dtrain, iteration, fobj)
  self.bst.update(self.dtrain, iteration, fobj)


[0]	train-rmse:0.16480+0.00005	test-rmse:0.16480+0.00019
[100]	train-rmse:0.07593+0.00005	test-rmse:0.07612+0.00029
[200]	train-rmse:0.05817+0.00005	test-rmse:0.05859+0.00027
[300]	train-rmse:0.05573+0.00006	test-rmse:0.05627+0.00025
[400]	train-rmse:0.05539+0.00006	test-rmse:0.05600+0.00024
[500]	train-rmse:0.05531+0.00006	test-rmse:0.05596+0.00023
[600]	train-rmse:0.05528+0.00006	test-rmse:0.05595+0.00023
[700]	train-rmse:0.05527+0.00006	test-rmse:0.05595+0.00023
[800]	train-rmse:0.05527+0.00006	test-rmse:0.05595+0.00023
[900]	train-rmse:0.05527+0.00006	test-rmse:0.05595+0.00023
[1000]	train-rmse:0.05526+0.00005	test-rmse:0.05595+0.00023
[1012]	train-rmse:0.05526+0.00005	test-rmse:0.05595+0.00023
     train-rmse-mean  train-rmse-std  test-rmse-mean  test-rmse-std
958         0.055266        0.000055         0.05595       0.000231
959         0.055266        0.000055         0.05595       0.000231
960         0.055266        0.000055         0.05595       0.000231
961         0.055266

In [None]:
# Index of the last row of the CV history; the final model gets that many
# trees plus a margin of 10.
last_round = cv_results.shape[0] - 1
xgb_params.update(n_estimators=last_round + 10)

<h2> l'heure de la submission ! </h2>

In [None]:
# Fit the final model on the whole training frame.
y_train = df[target]
X_train = df.drop(columns=target)

model = XGBRegressor(enable_categorical=True, **xgb_params)
model.fit(X_train, y_train)

# Predict on the test features (everything except the id column).
X_test = df_test.drop(columns = "id")
pred = model.predict(X_test)

# Write the submission file: one row per test id with its predicted target.
sub = pd.DataFrame({"id": df_test["id"], target: pred})
sub.to_csv("submission.csv", index=False)

  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
