In [1]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error 

In [2]:
train_df = pd.read_csv('data_feather/train_ungrouped.csv', index_col=0)
test_df = pd.read_csv('data_feather/test_ungrouped.csv', index_col=0)

train_df.shape, test_df.shape

((478741, 245), (121259, 245))

In [7]:
train_df.head()

Unnamed: 0,postcode,postcode_short,Accident_ID,Police_Force,Number_of_Vehicles,Number_of_Casualties,Date,Day_of_Week,Time,Local_Authority_(District),...,171,172,173,174,175,176,177,178,179,180
0,OX3 9UP,OX3 9,1,34,2,1,2012-12-19,7,13:20,344,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,S35 4EZ,S35 4,2,5,2,1,2012-02-11,4,7:53,102,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,BN21 2XR,BN21 2,3,1,2,1,2012-02-11,4,16:00,531,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TA20 3PT,TA20 3,4,1,1,1,2012-06-05,1,16:50,7,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DN20 0QF,DN20 0,5,46,1,1,2012-06-30,3,13:25,519,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
train_df['Did_Police_Officer_Attend_Scene_of_Accident'] = train_df['Did_Police_Officer_Attend_Scene_of_Accident'].replace({'Yes': 1, 'No' : 0})

In [19]:
cat_columns = ['Day_of_Week','1st_Road_Class', '2nd_Road_Class', 'Pedestrian_Crossing-Human_Control','Pedestrian_Crossing-Physical_Facilities',	'Light_Conditions',
 'Weather_Conditions', 'Road_Surface_Conditions', 'Special_Conditions_at_Site', 'Carriageway_Hazards', 'state', 'Road_Type', 'Speed_limit', 'Urban_or_Rural_Area' , 'Time_category', 
 'hour', 'minutes', 'year', 'month']


drop_columns = cat_columns + ['postcode', 'postcode_short', 'Accident_ID', 'Number_of_Casualties', 'Date', 'Time', 'country', 'postcode_short_1', 'postcode_cleaned', 'Local_Authority_(Highway)', 'Local_Authority_(District)', 'Rural Urban']

In [21]:
train_df.drop(drop_columns, axis=1).dtypes[train_df.drop(drop_columns, axis=1).dtypes == 'object']

Series([], dtype: object)

In [22]:
X_train, X_val, y_train, y_val = train_test_split( train_df.drop(drop_columns, axis=1).values, train_df['Number_of_Casualties'].values, test_size=0.2, random_state=35)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

((382992, 214), (95749, 214), (382992,), (95749,))

In [30]:
np.save( 'data/X_train_ungrouped.npy',X_train,)
np.save( 'data/y_train_ungrouped.npy',y_train,)
np.save('data/X_val_ungrouped.npy',X_val,)
np.save('data/y_val_ungrouped.npy',y_val,)


In [23]:
idv_columns = []
for col in train_df.columns:
    if col not in drop_columns:
        idv_columns.append(col)
d_train = xgb.DMatrix(X_train, label=y_train, feature_names = idv_columns)
d_val = xgb.DMatrix(X_val, label=y_val, feature_names = idv_columns)

In [28]:
watchlist = [(d_train, 'train'), (d_val, 'eval')]

In [18]:
train_df[train_df == 'Total'].isna().sum()[train_df[train_df == 'Total'].isna().sum() < 478741]

Rural Urban    55276
dtype: int64

### Optuna

In [24]:
import optuna
import logging
import sys
import os

optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))

study_name = "xgb-study_ver4"

storage_name = "sqlite:///{}/{}.db".format(os.getcwd(), study_name) 





In [None]:
optuna.delete_study(study_name, storage_name)

In [25]:

study = optuna.create_study(
study_name=study_name,
storage=storage_name,
# load_if_exists=True,
direction="minimize",
)

[32m[I 2022-03-28 12:04:17,082][0m A new study created in RDB with name: xgb-study_ver4[0m


A new study created in RDB with name: xgb-study_ver4


In [26]:

def objective(trial):

    param = {
    "verbosity": 0, # "n_jobs": 7,
    "objective": "reg:squarederror",
    #defines booster, gblinear for linear functions.
    # "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear","dart"]), 
    # "booster": trial.suggest_categorical("booster", ["gbtree","dart"]),#
    "booster":"gbtree",
    # L2 regularization weight.
    # "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log-True),
    "lambda": trial.suggest_float("lambda",1e-5, 200, log= True), # L1 regularization weight. # "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log-True),
    "alpha": trial.suggest_float("alpha", 1e-5, 100, log=True), 
    #sampling ratio for training data.
    # "subsample": trial.suggest_float("subsample", 8.2, 1.0),
    "subsample":trial.suggest_float("subsample", 0.2, 1.0), # sampling according to each tree. # "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    'eval_metric': 'rmse',
    }

    if param["booster"] in ["gbtree", "dart"]: # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 2, 9,)
        # param["max_depth"] = trial.suggest_categorical("max_depth", [2,3]) #minimum child weight, larger the term more conservative the tree.
        #param["min_child_weight"] trial.suggest_int("min_child_weight", 2, 10) 
        param["min_child_weight"] = trial.suggest_int("min_child_weight",10, 20)
        #param["eta"] trial.suggest_float("eta", le-8, 1.0, log-True)
        param["eta"] = trial.suggest_float("eta", 1e-5, 1.0)
        #defines how selective algorithm is.
        param["gamma"] = trial.suggest_loguniform("gamma", 1e-5, 10) 
        # param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"]== "dart":

        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"]) 
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"]) 
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"]= trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    xgb_model = xgb.train(
        param,
        dtrain=d_train,
        evals = watchlist,
        early_stopping_rounds=50,
        num_boost_round = 1000, maximize = False, verbose_eval=False)
    
    if param["booster"] in  ["gbtree", "dart"]:
        trial.set_user_attr('best_iteration', xgb_model.best_iteration)
        train_predict = xgb_model.predict(d_train, iteration_range = (0, xgb_model.best_iteration))
        val_predict = xgb_model.predict(d_val, iteration_range = (0,xgb_model.best_iteration))
        # val2_predict = xgb_model.predict(d_val2, ntree_limit=xgb_model.best_ntree_limit)
    else:
        train_predict = xgb_model.predict(d_train)
        val_predict = xgb_model.predict(d_val)

    train_rmse = np.sqrt(mean_squared_error(y_train, train_predict))
    val_rmse = np.sqrt(mean_squared_error(y_val, val_predict))
    # val2_rmse = np.sqrt(mean_squared_error(y_val2, val2_predict)))

    trial.set_user_attr('train_rmse',train_rmse)
    trial.set_user_attr('val_rmse',val_rmse)

    return val_rmse


In [29]:
study.optimize(objective, n_trials = 20, show_progress_bar=True)

trials_df = study.trials_dataframe()

trials_df.to_excel(study_name + '.xlsx') 

  self._init_valid()
  0%|          | 0/20 [00:00<?, ?it/s]

Trial 1 finished with value: 0.8215740375662598 and parameters: {'lambda': 0.13603832509855437, 'alpha': 0.8619670757912751, 'subsample': 0.6255485768716286, 'colsample_bytree': 0.31175144722084325, 'max_depth': 9, 'min_child_weight': 14, 'eta': 0.9293625779070444, 'gamma': 0.0008077619386788808}. Best is trial 1 with value: 0.8215740375662598.


  5%|▌         | 1/20 [00:16<05:06, 16.13s/it]

[32m[I 2022-03-28 12:04:54,399][0m Trial 1 finished with value: 0.8215740375662598 and parameters: {'lambda': 0.13603832509855437, 'alpha': 0.8619670757912751, 'subsample': 0.6255485768716286, 'colsample_bytree': 0.31175144722084325, 'max_depth': 9, 'min_child_weight': 14, 'eta': 0.9293625779070444, 'gamma': 0.0008077619386788808}. Best is trial 1 with value: 0.8215740375662598.[0m
Trial 2 finished with value: 0.8124830606267909 and parameters: {'lambda': 0.009920560323679913, 'alpha': 6.320981146933343, 'subsample': 0.3500251484496163, 'colsample_bytree': 0.6808342789055799, 'max_depth': 8, 'min_child_weight': 14, 'eta': 0.10302247547659622, 'gamma': 0.0027178668679451842}. Best is trial 2 with value: 0.8124830606267909.


 10%|█         | 2/20 [00:57<09:12, 30.70s/it]

[32m[I 2022-03-28 12:05:35,292][0m Trial 2 finished with value: 0.8124830606267909 and parameters: {'lambda': 0.009920560323679913, 'alpha': 6.320981146933343, 'subsample': 0.3500251484496163, 'colsample_bytree': 0.6808342789055799, 'max_depth': 8, 'min_child_weight': 14, 'eta': 0.10302247547659622, 'gamma': 0.0027178668679451842}. Best is trial 2 with value: 0.8124830606267909.[0m
Trial 3 finished with value: 0.8119180678741581 and parameters: {'lambda': 30.797693008473477, 'alpha': 0.0057464557159374674, 'subsample': 0.6373236178687477, 'colsample_bytree': 0.6068115568814416, 'max_depth': 2, 'min_child_weight': 13, 'eta': 0.35270192176943177, 'gamma': 0.9309768130274891}. Best is trial 3 with value: 0.8119180678741581.


 15%|█▌        | 3/20 [01:10<06:24, 22.63s/it]

[32m[I 2022-03-28 12:05:48,315][0m Trial 3 finished with value: 0.8119180678741581 and parameters: {'lambda': 30.797693008473477, 'alpha': 0.0057464557159374674, 'subsample': 0.6373236178687477, 'colsample_bytree': 0.6068115568814416, 'max_depth': 2, 'min_child_weight': 13, 'eta': 0.35270192176943177, 'gamma': 0.9309768130274891}. Best is trial 3 with value: 0.8119180678741581.[0m
Trial 4 finished with value: 0.812281709816198 and parameters: {'lambda': 0.0015972727649116354, 'alpha': 0.4294058299457696, 'subsample': 0.8481151996493537, 'colsample_bytree': 0.8548100332843693, 'max_depth': 5, 'min_child_weight': 18, 'eta': 0.6174692840363835, 'gamma': 0.004972902955435845}. Best is trial 3 with value: 0.8119180678741581.


 20%|██        | 4/20 [01:28<05:32, 20.80s/it]

[32m[I 2022-03-28 12:06:06,317][0m Trial 4 finished with value: 0.812281709816198 and parameters: {'lambda': 0.0015972727649116354, 'alpha': 0.4294058299457696, 'subsample': 0.8481151996493537, 'colsample_bytree': 0.8548100332843693, 'max_depth': 5, 'min_child_weight': 18, 'eta': 0.6174692840363835, 'gamma': 0.004972902955435845}. Best is trial 3 with value: 0.8119180678741581.[0m
Trial 5 finished with value: 0.8125027116458333 and parameters: {'lambda': 46.774038345299445, 'alpha': 4.932152382546543e-05, 'subsample': 0.2618664782176587, 'colsample_bytree': 0.347785307287645, 'max_depth': 3, 'min_child_weight': 15, 'eta': 0.6714871583249932, 'gamma': 0.026290540364998224}. Best is trial 3 with value: 0.8119180678741581.


 25%|██▌       | 5/20 [01:36<04:04, 16.30s/it]

[32m[I 2022-03-28 12:06:14,642][0m Trial 5 finished with value: 0.8125027116458333 and parameters: {'lambda': 46.774038345299445, 'alpha': 4.932152382546543e-05, 'subsample': 0.2618664782176587, 'colsample_bytree': 0.347785307287645, 'max_depth': 3, 'min_child_weight': 15, 'eta': 0.6714871583249932, 'gamma': 0.026290540364998224}. Best is trial 3 with value: 0.8119180678741581.[0m
Trial 6 finished with value: 0.8121853293244549 and parameters: {'lambda': 0.006270627581432739, 'alpha': 0.0016259609171090598, 'subsample': 0.3770178225163684, 'colsample_bytree': 0.6763025179657867, 'max_depth': 2, 'min_child_weight': 11, 'eta': 0.8377972816484452, 'gamma': 4.023781484615927}. Best is trial 3 with value: 0.8119180678741581.


 30%|███       | 6/20 [01:46<03:17, 14.14s/it]

[32m[I 2022-03-28 12:06:24,591][0m Trial 6 finished with value: 0.8121853293244549 and parameters: {'lambda': 0.006270627581432739, 'alpha': 0.0016259609171090598, 'subsample': 0.3770178225163684, 'colsample_bytree': 0.6763025179657867, 'max_depth': 2, 'min_child_weight': 11, 'eta': 0.8377972816484452, 'gamma': 4.023781484615927}. Best is trial 3 with value: 0.8119180678741581.[0m
Trial 7 finished with value: 0.8115460662324304 and parameters: {'lambda': 0.07685830339490729, 'alpha': 18.102463789555955, 'subsample': 0.5649712112932131, 'colsample_bytree': 0.9945928090944374, 'max_depth': 4, 'min_child_weight': 16, 'eta': 0.05745250139121573, 'gamma': 1.6490718759348846e-05}. Best is trial 7 with value: 0.8115460662324304.


 35%|███▌      | 7/20 [03:16<08:27, 39.01s/it]

[32m[I 2022-03-28 12:07:54,802][0m Trial 7 finished with value: 0.8115460662324304 and parameters: {'lambda': 0.07685830339490729, 'alpha': 18.102463789555955, 'subsample': 0.5649712112932131, 'colsample_bytree': 0.9945928090944374, 'max_depth': 4, 'min_child_weight': 16, 'eta': 0.05745250139121573, 'gamma': 1.6490718759348846e-05}. Best is trial 7 with value: 0.8115460662324304.[0m
Trial 8 finished with value: 0.8140426705614422 and parameters: {'lambda': 0.0451454624971264, 'alpha': 1.6532153108314466e-05, 'subsample': 0.36155408028873703, 'colsample_bytree': 0.4390113443088639, 'max_depth': 7, 'min_child_weight': 10, 'eta': 0.3764001839084664, 'gamma': 0.03495032594357128}. Best is trial 7 with value: 0.8115460662324304.


 40%|████      | 8/20 [03:33<06:24, 32.06s/it]

[32m[I 2022-03-28 12:08:11,988][0m Trial 8 finished with value: 0.8140426705614422 and parameters: {'lambda': 0.0451454624971264, 'alpha': 1.6532153108314466e-05, 'subsample': 0.36155408028873703, 'colsample_bytree': 0.4390113443088639, 'max_depth': 7, 'min_child_weight': 10, 'eta': 0.3764001839084664, 'gamma': 0.03495032594357128}. Best is trial 7 with value: 0.8115460662324304.[0m
Trial 9 finished with value: 0.8129483240835688 and parameters: {'lambda': 0.00026211219118207977, 'alpha': 0.002313606701320722, 'subsample': 0.294788733682966, 'colsample_bytree': 0.26517312465614595, 'max_depth': 2, 'min_child_weight': 19, 'eta': 0.9329203749491676, 'gamma': 4.8271205211625415e-05}. Best is trial 7 with value: 0.8115460662324304.


 45%|████▌     | 9/20 [03:40<04:26, 24.24s/it]

[32m[I 2022-03-28 12:08:19,018][0m Trial 9 finished with value: 0.8129483240835688 and parameters: {'lambda': 0.00026211219118207977, 'alpha': 0.002313606701320722, 'subsample': 0.294788733682966, 'colsample_bytree': 0.26517312465614595, 'max_depth': 2, 'min_child_weight': 19, 'eta': 0.9329203749491676, 'gamma': 4.8271205211625415e-05}. Best is trial 7 with value: 0.8115460662324304.[0m
Trial 10 finished with value: 0.8124416224562959 and parameters: {'lambda': 0.0008132303702684552, 'alpha': 63.520066882581155, 'subsample': 0.6780742872994591, 'colsample_bytree': 0.7953677729619912, 'max_depth': 8, 'min_child_weight': 11, 'eta': 0.6332273347763263, 'gamma': 0.0006354040492304279}. Best is trial 7 with value: 0.8115460662324304.


 50%|█████     | 10/20 [04:07<04:09, 24.98s/it]

[32m[I 2022-03-28 12:08:45,661][0m Trial 10 finished with value: 0.8124416224562959 and parameters: {'lambda': 0.0008132303702684552, 'alpha': 63.520066882581155, 'subsample': 0.6780742872994591, 'colsample_bytree': 0.7953677729619912, 'max_depth': 8, 'min_child_weight': 11, 'eta': 0.6332273347763263, 'gamma': 0.0006354040492304279}. Best is trial 7 with value: 0.8115460662324304.[0m
Trial 11 finished with value: 0.8115921898715172 and parameters: {'lambda': 2.2162552244944675e-05, 'alpha': 0.22277294756390234, 'subsample': 0.9467584499759565, 'colsample_bytree': 0.9822773385238063, 'max_depth': 5, 'min_child_weight': 17, 'eta': 0.019774900449342915, 'gamma': 1.2029157698604746e-05}. Best is trial 7 with value: 0.8115460662324304.


 55%|█████▌    | 11/20 [06:09<08:13, 54.78s/it]

[32m[I 2022-03-28 12:10:48,015][0m Trial 11 finished with value: 0.8115921898715172 and parameters: {'lambda': 2.2162552244944675e-05, 'alpha': 0.22277294756390234, 'subsample': 0.9467584499759565, 'colsample_bytree': 0.9822773385238063, 'max_depth': 5, 'min_child_weight': 17, 'eta': 0.019774900449342915, 'gamma': 1.2029157698604746e-05}. Best is trial 7 with value: 0.8115460662324304.[0m
Trial 12 finished with value: 0.8116046883146444 and parameters: {'lambda': 1.726522657112054e-05, 'alpha': 94.18442475071704, 'subsample': 0.9647862469180335, 'colsample_bytree': 0.9525076365699702, 'max_depth': 5, 'min_child_weight': 17, 'eta': 0.10727515736395393, 'gamma': 1.2319727486237732e-05}. Best is trial 7 with value: 0.8115460662324304.


 60%|██████    | 12/20 [07:12<07:37, 57.22s/it]

[32m[I 2022-03-28 12:11:50,802][0m Trial 12 finished with value: 0.8116046883146444 and parameters: {'lambda': 1.726522657112054e-05, 'alpha': 94.18442475071704, 'subsample': 0.9647862469180335, 'colsample_bytree': 0.9525076365699702, 'max_depth': 5, 'min_child_weight': 17, 'eta': 0.10727515736395393, 'gamma': 1.2319727486237732e-05}. Best is trial 7 with value: 0.8115460662324304.[0m
Trial 13 finished with value: 0.8117952750514041 and parameters: {'lambda': 1.4910464217506985, 'alpha': 0.11114057768577246, 'subsample': 0.7898945399033179, 'colsample_bytree': 0.9808565592683446, 'max_depth': 4, 'min_child_weight': 17, 'eta': 0.23230764501504114, 'gamma': 8.354793290176669e-05}. Best is trial 7 with value: 0.8115460662324304.


 65%|██████▌   | 13/20 [07:34<05:26, 46.62s/it]

[32m[I 2022-03-28 12:12:13,049][0m Trial 13 finished with value: 0.8117952750514041 and parameters: {'lambda': 1.4910464217506985, 'alpha': 0.11114057768577246, 'subsample': 0.7898945399033179, 'colsample_bytree': 0.9808565592683446, 'max_depth': 4, 'min_child_weight': 17, 'eta': 0.23230764501504114, 'gamma': 8.354793290176669e-05}. Best is trial 7 with value: 0.8115460662324304.[0m
Trial 14 finished with value: 0.8115466285681189 and parameters: {'lambda': 0.8866927160669286, 'alpha': 8.048926164495047, 'subsample': 0.48775420127943503, 'colsample_bytree': 0.8535983117320931, 'max_depth': 6, 'min_child_weight': 20, 'eta': 0.011347870410005428, 'gamma': 1.143800871281607e-05}. Best is trial 7 with value: 0.8115460662324304.


 70%|███████   | 14/20 [12:23<11:58, 119.69s/it]

[32m[I 2022-03-28 12:17:01,580][0m Trial 14 finished with value: 0.8115466285681189 and parameters: {'lambda': 0.8866927160669286, 'alpha': 8.048926164495047, 'subsample': 0.48775420127943503, 'colsample_bytree': 0.8535983117320931, 'max_depth': 6, 'min_child_weight': 20, 'eta': 0.011347870410005428, 'gamma': 1.143800871281607e-05}. Best is trial 7 with value: 0.8115460662324304.[0m
Trial 15 finished with value: 0.8121961914217797 and parameters: {'lambda': 1.0828593646958744, 'alpha': 5.95282780311327, 'subsample': 0.5083384005419004, 'colsample_bytree': 0.8234038377556236, 'max_depth': 6, 'min_child_weight': 19, 'eta': 0.23162548172954944, 'gamma': 0.00015803871709743064}. Best is trial 7 with value: 0.8115460662324304.


 75%|███████▌  | 15/20 [12:50<07:38, 91.75s/it] 

[32m[I 2022-03-28 12:17:28,576][0m Trial 15 finished with value: 0.8121961914217797 and parameters: {'lambda': 1.0828593646958744, 'alpha': 5.95282780311327, 'subsample': 0.5083384005419004, 'colsample_bytree': 0.8234038377556236, 'max_depth': 6, 'min_child_weight': 19, 'eta': 0.23162548172954944, 'gamma': 0.00015803871709743064}. Best is trial 7 with value: 0.8115460662324304.[0m
Trial 16 finished with value: 0.8115299917661869 and parameters: {'lambda': 1.201480337505094, 'alpha': 6.153571625588849, 'subsample': 0.504092402681802, 'colsample_bytree': 0.8747312054558327, 'max_depth': 4, 'min_child_weight': 20, 'eta': 0.008845036072500034, 'gamma': 0.3500914156369388}. Best is trial 16 with value: 0.8115299917661869.


 80%|████████  | 16/20 [17:23<09:45, 146.29s/it]

[32m[I 2022-03-28 12:22:01,521][0m Trial 16 finished with value: 0.8115299917661869 and parameters: {'lambda': 1.201480337505094, 'alpha': 6.153571625588849, 'subsample': 0.504092402681802, 'colsample_bytree': 0.8747312054558327, 'max_depth': 4, 'min_child_weight': 20, 'eta': 0.008845036072500034, 'gamma': 0.3500914156369388}. Best is trial 16 with value: 0.8115299917661869.[0m
Trial 17 finished with value: 0.8119217059451387 and parameters: {'lambda': 7.345120937039723, 'alpha': 2.2181448283738248, 'subsample': 0.49255048150239367, 'colsample_bytree': 0.49250998150858394, 'max_depth': 4, 'min_child_weight': 20, 'eta': 0.22236947503454585, 'gamma': 0.19763008844635377}. Best is trial 16 with value: 0.8115299917661869.


 85%|████████▌ | 17/20 [17:43<05:24, 108.25s/it]

[32m[I 2022-03-28 12:22:21,295][0m Trial 17 finished with value: 0.8119217059451387 and parameters: {'lambda': 7.345120937039723, 'alpha': 2.2181448283738248, 'subsample': 0.49255048150239367, 'colsample_bytree': 0.49250998150858394, 'max_depth': 4, 'min_child_weight': 20, 'eta': 0.22236947503454585, 'gamma': 0.19763008844635377}. Best is trial 16 with value: 0.8115299917661869.[0m
Trial 18 finished with value: 0.8119037012959218 and parameters: {'lambda': 0.2092962615478824, 'alpha': 0.02107643113631663, 'subsample': 0.7267292496166566, 'colsample_bytree': 0.765423982752005, 'max_depth': 4, 'min_child_weight': 16, 'eta': 0.3991579054588867, 'gamma': 0.2190895474846613}. Best is trial 16 with value: 0.8115299917661869.


 90%|█████████ | 18/20 [17:58<02:40, 80.29s/it] 

[32m[I 2022-03-28 12:22:36,492][0m Trial 18 finished with value: 0.8119037012959218 and parameters: {'lambda': 0.2092962615478824, 'alpha': 0.02107643113631663, 'subsample': 0.7267292496166566, 'colsample_bytree': 0.765423982752005, 'max_depth': 4, 'min_child_weight': 16, 'eta': 0.3991579054588867, 'gamma': 0.2190895474846613}. Best is trial 16 with value: 0.8115299917661869.[0m
Trial 19 finished with value: 0.8117212597919131 and parameters: {'lambda': 6.352404494832408, 'alpha': 23.1998120996337, 'subsample': 0.5304922090746133, 'colsample_bytree': 0.8800662571190168, 'max_depth': 3, 'min_child_weight': 15, 'eta': 0.15050434141261737, 'gamma': 5.023145017545833}. Best is trial 16 with value: 0.8115299917661869.


 95%|█████████▌| 19/20 [18:28<01:05, 65.34s/it]

[32m[I 2022-03-28 12:23:07,010][0m Trial 19 finished with value: 0.8117212597919131 and parameters: {'lambda': 6.352404494832408, 'alpha': 23.1998120996337, 'subsample': 0.5304922090746133, 'colsample_bytree': 0.8800662571190168, 'max_depth': 3, 'min_child_weight': 15, 'eta': 0.15050434141261737, 'gamma': 5.023145017545833}. Best is trial 16 with value: 0.8115299917661869.[0m
Trial 20 finished with value: 0.8121550313107425 and parameters: {'lambda': 127.79256124648207, 'alpha': 0.061988867325612186, 'subsample': 0.42996102470843967, 'colsample_bytree': 0.7245561891667887, 'max_depth': 3, 'min_child_weight': 13, 'eta': 0.46791373137183817, 'gamma': 0.11278324524331994}. Best is trial 16 with value: 0.8115299917661869.


100%|██████████| 20/20 [18:41<00:00, 56.09s/it]

[32m[I 2022-03-28 12:23:20,134][0m Trial 20 finished with value: 0.8121550313107425 and parameters: {'lambda': 127.79256124648207, 'alpha': 0.061988867325612186, 'subsample': 0.42996102470843967, 'colsample_bytree': 0.7245561891667887, 'max_depth': 3, 'min_child_weight': 13, 'eta': 0.46791373137183817, 'gamma': 0.11278324524331994}. Best is trial 16 with value: 0.8115299917661869.[0m





In [31]:
print(study.best_params)
print(study.best_trial.user_attrs)

{'alpha': 6.153571625588849, 'colsample_bytree': 0.8747312054558327, 'eta': 0.008845036072500034, 'gamma': 0.3500914156369388, 'lambda': 1.201480337505094, 'max_depth': 4, 'min_child_weight': 20, 'subsample': 0.504092402681802}
{'best_iteration': 997, 'train_rmse': 0.8077585523513313, 'val_rmse': 0.8115299917661869}


In [35]:

params = study.best_params
watchlist = [(d_train, 'train'), (d_val, 'eval')]
xgb_model_selected = xgb.train(num_boost_round = study.best_trial.user_attrs['best_iteration'] + 1, dtrain=d_train, evals=watchlist, maximize=False, params=params, early_stopping_rounds=50)

[0]	train-rmse:1.25551	eval-rmse:1.25849
[1]	train-rmse:1.24907	eval-rmse:1.25203
[2]	train-rmse:1.24269	eval-rmse:1.24566
[3]	train-rmse:1.23640	eval-rmse:1.23938
[4]	train-rmse:1.23016	eval-rmse:1.23316
[5]	train-rmse:1.22400	eval-rmse:1.22701
[6]	train-rmse:1.21799	eval-rmse:1.22096
[7]	train-rmse:1.21198	eval-rmse:1.21497
[8]	train-rmse:1.20606	eval-rmse:1.20907
[9]	train-rmse:1.20023	eval-rmse:1.20323
[10]	train-rmse:1.19449	eval-rmse:1.19747
[11]	train-rmse:1.18881	eval-rmse:1.19181
[12]	train-rmse:1.18324	eval-rmse:1.18624
[13]	train-rmse:1.17772	eval-rmse:1.18070
[14]	train-rmse:1.17225	eval-rmse:1.17524
[15]	train-rmse:1.16685	eval-rmse:1.16984
[16]	train-rmse:1.16152	eval-rmse:1.16454
[17]	train-rmse:1.15627	eval-rmse:1.15928
[18]	train-rmse:1.15109	eval-rmse:1.15410
[19]	train-rmse:1.14597	eval-rmse:1.14898
[20]	train-rmse:1.14089	eval-rmse:1.14392
[21]	train-rmse:1.13593	eval-rmse:1.13895
[22]	train-rmse:1.13102	eval-rmse:1.13403
[23]	train-rmse:1.12616	eval-rmse:1.12919
[2

In [39]:
train_predict = xgb_model_selected.predict(d_train, iteration_range = (0, xgb_model_selected.best_iteration + 1)) #somehow plus 1 is giving better results
val_predict = xgb_model_selected.predict(d_val,iteration_range = (0, xgb_model_selected.best_iteration + 1))

train_rmse = np.sqrt(mean_squared_error(y_train, train_predict))
val_rmse = np.sqrt(mean_squared_error(y_val, val_predict))

print(train_rmse, val_rmse)

0.8077565534007738 0.8115285228484108


In [40]:
d = xgb_model_selected.get_score(importance_type='gain')
print(type(d))
# d
feature_imp_df = pd.DataFrame(d, index=[0])
feature_imp_df =  feature_imp_df.transpose()


feature_imp_df = feature_imp_df.sort_values(ascending=False, by=0)

feature_imp_df.head(20)

<class 'dict'>


Unnamed: 0,0
Police_Force,39.515789
71,30.781118
11,26.11743
Number_of_Vehicles,18.718742
77,15.56566
76,11.039309
1st_Road_Number,10.987551
8,10.748874
16,8.44001
72,8.358164


In [41]:
np.save('data_feather/Val_predict_xgb_ver5.npy',val_predict, )

In [44]:
test_df['Did_Police_Officer_Attend_Scene_of_Accident'] = test_df['Did_Police_Officer_Attend_Scene_of_Accident'].replace({'Yes': 1, 'No' : 0})

In [45]:
X_test = test_df.drop(drop_columns, axis=1).values
d_test = xgb.DMatrix(X_test, feature_names=idv_columns)
test_predict = xgb_model_selected.predict(d_test, iteration_range = (0, xgb_model_selected.best_iteration + 1))

test_predict[:10]

array([1.6596447, 1.4415164, 1.5962776, 1.5224253, 1.5141215, 1.4527061,
       1.5543197, 1.4188012, 1.5711714, 1.4794848], dtype=float32)

In [57]:
np.save('data/X_test_ungrouped.npy', X_test)

In [47]:
test_predict_df = pd.DataFrame()
test_predict_df['postcode'] = test_df['postcode']
test_predict_df['Accident_risk_index'] = test_predict
test_predict_df.shape

(121259, 2)

In [48]:
test_predict_df.head()

Unnamed: 0,postcode,Accident_risk_index
0,HX2 8WH,1.659645
1,RM8 1DD,1.441516
2,SE23 1NH,1.596278
3,HU10 7QS,1.522425
4,BD23 5JL,1.514122


In [52]:
final_df = test_predict_df.groupby('postcode')['Accident_risk_index'].mean()

In [53]:
submission_df= pd.read_csv('data/sample_submission.csv')
submission_df.head()

Unnamed: 0,postcode,Accident_risk_index
0,AB10 1AU,0
1,AB10 1PG,0
2,AB10 1TT,0
3,AB10 1YP,0
4,AB10 6LQ,0


In [54]:
submission_df['Accident_risk_index'] = final_df.values
submission_df

Unnamed: 0,postcode,Accident_risk_index
0,AB10 1AU,1.420454
1,AB10 1PG,1.388909
2,AB10 1TT,1.564415
3,AB10 1YP,1.459639
4,AB10 6LQ,1.403239
...,...,...
49767,ZE2 9LZ,1.346412
49768,ZE2 9RE,1.379653
49769,ZE2 9RJ,1.330733
49770,ZE2 9SB,1.620900


In [55]:
submission_df.to_csv('data_out/my_submission_xgb_13.csv', index=False)