## In this notebook, we build a baseline xgb model for the flood mapping competition
https://zindi.africa/competitions/inundata-mapping-floods-in-south-africa/data

### The goal is to build a reliable cross-validation workflow whose scores align with the leaderboard

In [1]:
import os

# Root directory holding the competition files (Train.csv, Test.csv).
# Set the FLOOD_DATA_DIR environment variable to run the notebook on another
# machine; the original location is kept as the default.
PATH = os.environ.get('FLOOD_DATA_DIR', '/raid/ml/flood')

import pandas as pd
from xgb_helper import XGBHelper
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedGroupKFold

### Select the correct validation strategy

In [2]:
# Read the labelled training table and the unlabelled test table from PATH,
# then preview the first rows of the training data.
train, test = (
    pd.read_csv(f'{PATH}/Train.csv'),
    pd.read_csv(f'{PATH}/Test.csv'),
)
train.head()

Unnamed: 0,event_id,precipitation,label
0,id_spictby0jfsb_X_0,0.0,0
1,id_spictby0jfsb_X_1,0.095438,0
2,id_spictby0jfsb_X_2,1.94956,0
3,id_spictby0jfsb_X_3,3.23216,0
4,id_spictby0jfsb_X_4,0.0,0


In [3]:
# Preview the first five rows of the test table (same columns as train, minus `label`).
test.head(5)

Unnamed: 0,event_id,precipitation
0,id_j7b6sokflo4k_X_0,0.0
1,id_j7b6sokflo4k_X_1,3.01864
2,id_j7b6sokflo4k_X_2,0.0
3,id_j7b6sokflo4k_X_3,16.6152
4,id_j7b6sokflo4k_X_4,2.56706


In [4]:
# event_id looks like 'id_<location>_X_<day>' (see the previews above).
# Derive two columns from it, identically for train and test:
#   day      - the trailing integer token
#   location - the leading 'id_<location>' part
# The previous code applied '_'.join to a single token, which inserted an
# underscore between every character ('s_p_i_c_...'). Joining the first two
# tokens gives a readable key and keeps the one-to-one mapping, so grouping by
# location is unaffected.
for frame in (train, test):
    frame['day'] = frame['event_id'].apply(lambda x: x.split('_')[-1]).astype(int)
    frame['location'] = frame['event_id'].apply(lambda x: '_'.join(x.split('_')[:2]))

In [5]:
# Flag test rows whose location also occurs in the training data.
mask = test.location.isin(train.location.unique())
print('# of locations in test data which can also be found in training data')
# Count distinct locations (not rows) so the number matches the message above.
test.loc[mask, 'location'].nunique()

# of locations in test data which can also be found in training data


0

#### Implications
- ##### location should not be used as a feature
- ##### train and validation should NOT have locations in common, so that validation mimics the test set (whose locations are all unseen)

In [6]:
# Group-aware, label-stratified CV: rows are grouped by location and the folds
# are stratified on the label.
# `folds` and `kf` are reused by the training cell below.
folds = 4
kf = StratifiedGroupKFold(n_splits=folds)

# Sanity check per fold: location overlap between the two parts and their
# positive rates.
for i, (tr_idx, val_idx) in enumerate(kf.split(train, train['label'], groups=train['location'])):
    tr = train.iloc[tr_idx]
    val = train.iloc[val_idx]
    mask = val.location.isin(tr.location.unique())
    # Count distinct overlapping locations (not rows) to match the message.
    n_shared = val.loc[mask, 'location'].nunique()
    print('Fold', i)
    print('# of locations in validation which can also be found in training data:', n_shared)
    print(f'mean target in validation {val.label.mean():.4f} mean target in training {tr.label.mean():.4f}')
    print()

Fold 0
# of locations in validation which can also be found in training data: 0
mean target in validation 0.0006 mean target in training 0.0006

Fold 1
# of locations in validation which can also be found in training data: 0
mean target in validation 0.0006 mean target in training 0.0006

Fold 2
# of locations in validation which can also be found in training data: 0
mean target in validation 0.0006 mean target in training 0.0006

Fold 3
# of locations in validation which can also be found in training data: 0
mean target in validation 0.0006 mean target in training 0.0006



### Train the baseline xgboost model

In [7]:
# Baseline: one model per CV fold on two features.
# - Out-of-fold (OOF) predictions are written to train['prob'] and scored with log loss.
# - Test predictions are summed over folds in test['prob'] and averaged at the end.
feas = ['precipitation', 'day']
test['prob'] = 0.0  # float accumulator for the per-fold test predictions
for i, (tr_idx, val_idx) in enumerate(kf.split(train, train['label'], groups=train['location'])):
    tr = train.iloc[tr_idx]
    val = train.iloc[val_idx]

    # XGBHelper is a project-local wrapper (xgb_helper.py, not shown here).
    # NOTE(review): presumably it early-stops on the validation pair passed to
    # fit() -- confirm against the helper.
    xgb = XGBHelper('classification', params={'max_depth': 3, 'eta':0.02,
                                              'subsample':0.5,
                                              'colsample_bytree':1},
                    num_boost_rounds=1000,
                    early_stop_rounds=100)
    xgb.fit(tr[feas], tr['label'], val[feas], val['label'])

    # kf.split yields *positional* indices, while .loc is label-based. Use the
    # labels of the validation slice so the assignment stays correct even if
    # `train` no longer has a default RangeIndex.
    val_labels = val.index
    train.loc[val_labels, 'prob'] = xgb.predict(val[feas])
    test['prob'] += xgb.predict(test[feas])
    score = log_loss(train.loc[val_labels, 'label'], train.loc[val_labels, 'prob'])
    print('fold', i, score)

# Overall OOF score across all folds.
score = log_loss(train['label'], train['prob'])
print('final', score)

# Average the fold models' test predictions and write the submission and the
# OOF predictions to the working directory.
test['label'] = test['prob']/folds
test[['event_id','label']].to_csv('sub_xgb_baseline.csv', index=False, float_format='%.10f')
train.to_csv('cv_xgb_baseline.csv', index=False, float_format='%.10f')

[0]	train-logloss:0.12587	val-logloss:0.12588
[100]	train-logloss:0.01899	val-logloss:0.01896
[200]	train-logloss:0.00587	val-logloss:0.00574
[300]	train-logloss:0.00420	val-logloss:0.00400
[400]	train-logloss:0.00397	val-logloss:0.00378
[500]	train-logloss:0.00392	val-logloss:0.00375
[600]	train-logloss:0.00388	val-logloss:0.00375
[673]	train-logloss:0.00387	val-logloss:0.00375
fold 0 0.003744041915032518
[0]	train-logloss:0.12587	val-logloss:0.12587
[100]	train-logloss:0.01896	val-logloss:0.01901
[200]	train-logloss:0.00582	val-logloss:0.00590
[300]	train-logloss:0.00413	val-logloss:0.00423
[400]	train-logloss:0.00390	val-logloss:0.00401
[500]	train-logloss:0.00384	val-logloss:0.00399
[571]	train-logloss:0.00382	val-logloss:0.00399
fold 1 0.003987633668362566
[0]	train-logloss:0.12587	val-logloss:0.12586
[100]	train-logloss:0.01898	val-logloss:0.01895
[200]	train-logloss:0.00583	val-logloss:0.00583
[300]	train-logloss:0.00415	val-logloss:0.00418
[400]	train-logloss:0.00392	val-loglos

### Submitting the predictions gives LB scores of `0.00439346` (public) and `0.004000973` (private)
The CV score of `0.00395` is close to both LB scores, so our CV strategy works.