In [1]:
import os
import sys
os.environ['CUDA_VISIBLE_DEVICES'] = "3"

In [2]:
import pandas as pd
import numpy as np

import cudf
import cuml
import cupy
import xgboost as xgb
from cuml.metrics import roc_auc_score

from sklearn.model_selection import KFold, GroupKFold
import pickle
from tqdm import tqdm

In [3]:
train = cudf.read_csv('../input/TrainingWiDS2021.csv', index_col=0).reset_index(drop=True) 
test = cudf.read_csv('../input/UnlabeledWiDS2021.csv', index_col=0)


In [4]:
test['diabetes_mellitus'] = -1
test['fold'] = -1

In [5]:
kf = KFold(5, shuffle=True, random_state=33)

train['fold'] = -1

for fold, (train_idx, val_idx) in enumerate(kf.split(train)):
    train.loc[val_idx, 'fold'] = fold

In [6]:
data = cudf.concat([train, test]).reset_index(drop=True)

In [7]:
with open('../input/hospital_map_all', 'rb') as file:
    hospital_map = pickle.load(file)

In [8]:
data['hospital_id'] = data['hospital_id'].map(hospital_map)

In [9]:
data['hospital_id'].isnull().sum()

0

In [10]:
df = data.groupby(['icu_id', 'hospital_id']).encounter_id.count().reset_index()
df = df.groupby('hospital_id').icu_id.count().reset_index()
df.columns = ['hospital_id', 'num_hospital_icu']
df

data = data.merge(df, how='left', on='hospital_id')
data

Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,...,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus,fold,num_hospital_icu
0,218612,21,65.0,18.87298988,0,Caucasian,M,193.0,Other Hospital,Operating Room / Recovery,...,0,0,0,0,0,0,0,0,2,4
1,160976,21,41.0,34.29975031,0,African American,F,172.7,Emergency Department,Accident & Emergency,...,0,0,0,0,0,0,0,0,1,4
2,269792,21,79.0,30.11958687,0,Caucasian,F,165.1,Other Hospital,Other Hospital,...,0,0,0,0,0,0,0,0,0,4
3,212104,21,50.0,53.26874559,0,Caucasian,M,165.1,Emergency Department,Accident & Emergency,...,0,0,0,0,0,0,0,0,0,4
4,234614,21,83.0,25.08040527,0,Caucasian,M,182.9,Emergency Department,Accident & Emergency,...,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140386,144195,19,50.0,30.2,0,Caucasian,M,170.0,Floor,Floor,...,0,0,0,0,0,0,0,-1,-1,4
140387,142220,19,77.0,33.8,0,Other/Unknown,F,155.0,Direct Admit,Accident & Emergency,...,0,0,0,0,0,0,0,-1,-1,4
140388,141040,19,88.0,27.1,0,Caucasian,F,155.0,,Floor,...,0,0,0,0,0,0,0,-1,-1,4
140389,143983,19,29.0,25.0,0,Caucasian,F,175.0,Emergency Department,Accident & Emergency,...,0,0,0,0,0,0,0,-1,-1,4


In [11]:
df = data.groupby(['icu_id']).encounter_id.count().reset_index()
df.columns = ['icu_id', 'num_icu_encounter']
df

data = data.merge(df, how='left', on='icu_id')
data

Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,...,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus,fold,num_hospital_icu,num_icu_encounter
0,267230,71,61.0,27.47816147,1,Caucasian,M,165.1,Recovery Room,Operating Room / Recovery,...,0,0,0,0,0,0,0,3,1,179
1,251904,99,62.0,28.45445549,0,Caucasian,F,180.3,Emergency Department,Accident & Emergency,...,0,0,0,0,0,0,0,4,1,442
2,263834,21,51.0,67.81498973,0,Caucasian,M,152.4,,Accident & Emergency,...,0,0,0,0,0,0,0,3,4,540
3,272748,21,73.0,35.1017595,0,Caucasian,M,167.6,Direct Admit,Accident & Emergency,...,0,0,0,0,0,0,0,3,4,904
4,257082,21,73.0,34.11841143,0,Caucasian,M,165.1,Floor,Floor,...,0,0,0,0,0,0,0,4,4,865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140386,136976,10056,48.0,,0,Caucasian,F,,Emergency Department,Accident & Emergency,...,0,0,0,0,0,0,-1,-1,1,8
140387,136798,10056,81.0,30.1,0,Caucasian,M,187.0,Floor,Floor,...,0,0,0,0,0,0,-1,-1,1,8
140388,144544,10056,81.0,24.2,0,Caucasian,F,160.0,,Accident & Emergency,...,0,0,0,0,0,0,-1,-1,1,8
140389,139803,10035,62.0,32.3,0,Caucasian,F,157.4,Emergency Department,Accident & Emergency,...,0,0,0,0,0,0,-1,-1,1,7


In [12]:
dico = cudf.read_csv('../input/DataDictionaryWiDS2021.csv')
dico = dico[dico['Variable Name'] != 'icu_admit_type']
dico = dico[dico['Variable Name'] != 'readmission_status']
dico

categorical = list(dico[dico['Data Type'] == 'string']['Variable Name'].values_host)[1:]
categorical

integer = list(dico[dico['Data Type'] == 'integer']['Variable Name'].values_host)[1:]
categorical, integer

(['ethnicity',
  'gender',
  'hospital_admit_source',
  'icu_admit_source',
  'icu_stay_type',
  'icu_type',
  'apache_2_diagnosis',
  'apache_3j_diagnosis'],
 ['hospital_id',
  'icu_id',
  'gcs_eyes_apache',
  'gcs_motor_apache',
  'gcs_verbal_apache'])

In [13]:
for c in dico.loc[dico['Data Type'] == 'string', 'Variable Name'].values_host:
    if c == 'bmi':
        continue
    data[c] = data[c].factorize()[0]

In [14]:
data.groupby('fold').fold.count()

fold
-1    10234
 0    26032
 1    26032
 2    26031
 3    26031
 4    26031
Name: fold, dtype: int32

In [15]:
min_cols = [c for c in list(dico['Variable Name'].values_host) if c.endswith('_min')]
for c_min in min_cols:
    c_max = c_min[:-3] + 'max'
    filter_ = ((data[c_min] > data[c_max])).values
    if cupy.sum(filter_) > 1:
        print(c_min, cupy.sum(filter_), flush=True)
        tmp = data[c_min].copy()
        data.loc[filter_, c_min] = data.loc[filter_, c_max]
        data.loc[filter_, c_max] = tmp.loc[filter_]

d1_diasbp_invasive_min 6
d1_diasbp_min 19
d1_diasbp_noninvasive_min 18
d1_heartrate_min 37
d1_mbp_invasive_min 13
d1_mbp_min 14
d1_mbp_noninvasive_min 26
d1_resprate_min 9
d1_sysbp_invasive_min 11
d1_sysbp_min 30
d1_sysbp_noninvasive_min 29
d1_temp_min 95
h1_diasbp_invasive_min 7
h1_diasbp_min 45
h1_diasbp_noninvasive_min 43
h1_heartrate_min 59
h1_mbp_invasive_min 2
h1_mbp_min 28
h1_mbp_noninvasive_min 46
h1_resprate_min 87
h1_sysbp_invasive_min 5
h1_sysbp_min 39
h1_sysbp_noninvasive_min 42
h1_temp_min 123
d1_albumin_min 129
d1_bun_min 30
d1_calcium_min 70
d1_creatinine_min 5
d1_hco3_min 185
d1_hemaglobin_min 76
d1_hematocrit_min 19
d1_inr_min 3
d1_lactate_min 4
d1_platelets_min 9
d1_potassium_min 84
d1_sodium_min 95
h1_calcium_min 19
h1_hemaglobin_min 15
h1_hematocrit_min 4
h1_inr_min 3
h1_lactate_min 2
h1_potassium_min 21
d1_arterial_pco2_min 11
d1_arterial_ph_min 43
h1_arterial_ph_min 19
h1_pao2fio2ratio_min 2


In [16]:
NUMERIC_COLS = ['bmi'] + list(dico.loc[dico['Data Type'] == 'numeric', 'Variable Name'].values_host) 
NUMERIC_COLS

MISSING_COLS = []

for c in NUMERIC_COLS:
    if data[c].isnull().sum() > 0:
        print(c, data[c].isnull().sum())
        data['unknown_' + c] = 1 * data[c].isnull()
        MISSING_COLS.append('unknown_' + c)
        data.loc[data[c].isnull(), c] = data.loc[data[c].notnull(), c].mean()


bmi 5505
age 4988
height 2378
weight 4371
albumin_apache 84218
bilirubin_apache 88870
bun_apache 27472
creatinine_apache 26925
fio2_apache 107355
glucose_apache 15906
heart_rate_apache 339
hematocrit_apache 28872
map_apache 472
paco2_apache 107355
paco2_for_ph_apache 107355
pao2_apache 107355
ph_apache 107355
resprate_apache 872
sodium_apache 26512
temp_apache 7282
urineoutput_apache 68357
wbc_apache 31851
d1_diasbp_invasive_max 102393
d1_diasbp_invasive_min 102393
d1_diasbp_max 300
d1_diasbp_min 300
d1_diasbp_noninvasive_max 1785
d1_diasbp_noninvasive_min 1785
d1_heartrate_max 286
d1_heartrate_min 286
d1_mbp_invasive_max 102183
d1_mbp_invasive_min 102183
d1_mbp_max 364
d1_mbp_min 364
d1_mbp_noninvasive_max 2440
d1_mbp_noninvasive_min 2440
d1_resprate_max 738
d1_resprate_min 738
d1_spo2_max 594
d1_spo2_min 594
d1_sysbp_invasive_max 102360
d1_sysbp_invasive_min 102360
d1_sysbp_max 294
d1_sysbp_min 294
d1_sysbp_noninvasive_max 1771
d1_sysbp_noninvasive_min 1771
d1_temp_max 4924
d1_temp_m

In [17]:
delta_cols = []
for c_min in min_cols:
    c_max = c_min[:-3] + 'max'
    c_delta = c_min[:-3] + 'delta'
    data[c_delta] = (data[c_max] - data[c_min])
    delta_cols.append(c_delta)


In [18]:
base_features = []

for base in ['icu_id',
             #'ethnicity',
            #'gender',
            #'hospital_admit_source',
            #'icu_admit_source',
            #'icu_stay_type',
            #'icu_type',
            'apache_2_diagnosis',
            'apache_3j_diagnosis'
            ]:
    
    print(base, flush=True)
    gr = data.groupby(base)
    for c in tqdm(NUMERIC_COLS):
        df = gr[c].mean().reset_index()
        new_col = '_'.join([c, base, 'mean'])
        df.columns = [base, new_col]
        data = data.merge(df, how='left', on=base)
        base_features.append(new_col)


icu_id


100%|██████████| 151/151 [00:30<00:00,  4.93it/s]

apache_2_diagnosis



100%|██████████| 151/151 [00:44<00:00,  3.41it/s]

apache_3j_diagnosis



100%|██████████| 151/151 [00:57<00:00,  2.65it/s]


In [19]:
ohe = cuml.preprocessing.OneHotEncoder(sparse=False)
x = ohe.fit_transform(data[['apache_3j_diagnosis', 'apache_2_diagnosis']])

In [20]:
ohe_features = ['ohe_%d' % i for i in range(x.shape[1])]

for i in range(x.shape[1]):
    data['ohe_%d' % i] = x[:, i]
    

In [21]:
icu_train = set(data.loc[data.fold >= 0]['icu_id'].values_host) - set(data.loc[data.fold < 0]['icu_id'].values_host)
icu_test = set(data.loc[data.fold < 0]['icu_id'].values_host) - set(data.loc[data.fold >= 0]['icu_id'].values_host)

len(icu_train,), len(icu_test)

icu_max = data['icu_id'].max()

data.loc[data['icu_id'].isin(icu_train), 'icu_id'] = icu_max + 1
data.loc[data['icu_id'].isin(icu_test), 'icu_id'] = icu_max + 1

In [22]:
data['hospital_admit_source'].max()

14

In [23]:
set(data.loc[data.fold >= 0]['hospital_admit_source'].values_host) - set(data.loc[data.fold < 0]['hospital_admit_source'].values_host)

{0, 5, 7, 9, 12}

In [24]:
data.loc[data['hospital_admit_source'].isin([0, 5, 7, 9, 12]), 'hospital_admit_source'] = 0

In [25]:
hospital_train = set(data.loc[data.fold >= 0]['hospital_id'].values_host) - set(data.loc[data.fold < 0]['hospital_id'].values_host)
hospital_test = set(data.loc[data.fold < 0]['hospital_id'].values_host) - set(data.loc[data.fold >= 0]['hospital_id'].values_host)

len(hospital_train,), len(hospital_test)

hospital_max = data['hospital_id'].max()

data.loc[data['hospital_id'].isin(hospital_train), 'hospital_id'] = hospital_max + 1
data.loc[data['hospital_id'].isin(hospital_test), 'hospital_id'] = hospital_max + 1

In [26]:
train = data[data.fold >=0 ].reset_index(drop=True)
test = data[data.fold < 0].reset_index(drop=True)

In [27]:
features = (list(dico['Variable Name'].values_host)[1:-1] + MISSING_COLS + ohe_features + delta_cols \
            + ['num_hospital_icu', 'num_icu_encounter'] + base_features)
#features = [f for f in features if f not in ['hospital_admit_source', 'icu_admit_source', 'icu_id',]]
import cuml

NFOLDS = 5
NSEEDS = 5

params =  {
  "objective"           : "binary:logistic",
  "eval_metric"         : "auc",
  "eta"                 : 0.025,
  "max_depth"           : 5,
  "min_child_weight"    : 10,
  "gamma"               : 2,
  "subsample"           : 0.95,
  "colsample_bytree"    : 0.5,
  "alpha"               : 2,
  #"lambda"              : 1,
    'tree_method'       : 'gpu_hist',
    'scale_pos_weight' : 2,
    'num_parallel_tree' : 2,
    #'max_delta_step' : 1,
    'seed':0,
    'max_bin' : 512,
}

num_boost_round = 10000
early_stopping_rounds = 100
smooth = 0.01
verbose_eval = 100
monitor = xgb.callback.print_evaluation(period=verbose_eval)

scores = []
feat_gain = []

x_test = test[features]
d_test = xgb.DMatrix(x_test)
test_pred = 0
oof_pred = cupy.zeros(train.shape[0])

for fold in range(NFOLDS):
    x_train = train.loc[(train['fold'] != fold), features].reset_index(drop=True)
    x_valid = train.loc[(train['fold'] == fold), features]
    y_train = train.loc[(train['fold'] != fold), 'diabetes_mellitus'].reset_index(drop=True)
    y_valid = train.loc[(train['fold'] == fold), 'diabetes_mellitus']

    if 0: #for c in categorical + integer:
        enc = cuml.preprocessing.TargetEncoder(smooth=smooth)
        x_train[c] = enc.fit_transform(x_train[c], y_train)
        x_valid[c] = enc.transform(x_valid[c])
    d_train = xgb.DMatrix(x_train, label=y_train)
    d_valid = xgb.DMatrix(x_valid, label=y_valid)
    
    y_pred = 0
    n_trees = 0
    for seed in range(NSEEDS):
        params['seed'] = seed
        bst = xgb.train(params, d_train, num_boost_round=num_boost_round,
                        evals=[(d_train, 'train'), (d_valid, 'valid')],
                        #feval=eval_auc,
                        #maximize=True,
                        early_stopping_rounds=early_stopping_rounds,
                        verbose_eval=False,
                        callbacks=[monitor],

                       )
        y_pred = y_pred + bst.predict(d_valid, ntree_limit=bst.best_ntree_limit) / NSEEDS
        test_pred = test_pred + bst.predict(d_test, ntree_limit=bst.best_ntree_limit) / NSEEDS / NFOLDS
        n_trees = n_trees + bst.best_ntree_limit /  NSEEDS
        print(bst.best_ntree_limit)
        fscore = bst.get_score(importance_type='total_gain')
        df = cudf.DataFrame()
        df['feature'] = fscore.keys()
        df['gain'] = fscore.values()
        feat_gain.append(df)
        del bst
    score = roc_auc_score(y_valid, y_pred)
    print('fold ', fold, 'ntrees %4d' % int(n_trees), '%0.5f' % score)
    scores.append(score)
    oof_pred[(train['fold'] == fold).values] = y_pred

    del x_train, x_valid
    
print('all score %0.5f' % np.mean(scores))

feat_gain = cudf.concat(feat_gain)

feat_gain = feat_gain.groupby('feature')['gain'].sum().reset_index()
feat_gain = feat_gain.sort_values(by='gain', ascending=False).reset_index(drop=True)
feat_gain



[0]	train-auc:0.81420	valid-auc:0.81383
[100]	train-auc:0.85965	valid-auc:0.85525
[200]	train-auc:0.87682	valid-auc:0.86700
[300]	train-auc:0.88710	valid-auc:0.87209
[400]	train-auc:0.89411	valid-auc:0.87476
[500]	train-auc:0.89973	valid-auc:0.87643
[600]	train-auc:0.90431	valid-auc:0.87759
[700]	train-auc:0.90832	valid-auc:0.87837
[800]	train-auc:0.91199	valid-auc:0.87894
[900]	train-auc:0.91544	valid-auc:0.87936
[1000]	train-auc:0.91866	valid-auc:0.87968
[1100]	train-auc:0.92145	valid-auc:0.87992
[1200]	train-auc:0.92429	valid-auc:0.88007
[1300]	train-auc:0.92701	valid-auc:0.88016
[1400]	train-auc:0.92964	valid-auc:0.88024
[1500]	train-auc:0.93217	valid-auc:0.88028
[1600]	train-auc:0.93460	valid-auc:0.88036
3192
[0]	train-auc:0.81786	valid-auc:0.81741
[100]	train-auc:0.86008	valid-auc:0.85557
[200]	train-auc:0.87660	valid-auc:0.86680
[300]	train-auc:0.88699	valid-auc:0.87194
[400]	train-auc:0.89420	valid-auc:0.87476
[500]	train-auc:0.89968	valid-auc:0.87641
[600]	train-auc:0.90430	va

[1200]	train-auc:0.92436	valid-auc:0.87814
[1300]	train-auc:0.92699	valid-auc:0.87832
[1400]	train-auc:0.92947	valid-auc:0.87850
[1500]	train-auc:0.93200	valid-auc:0.87863
[1600]	train-auc:0.93452	valid-auc:0.87871
[1700]	train-auc:0.93690	valid-auc:0.87877
[1800]	train-auc:0.93920	valid-auc:0.87887
[1900]	train-auc:0.94140	valid-auc:0.87886
3658
[0]	train-auc:0.82003	valid-auc:0.81566
[100]	train-auc:0.86101	valid-auc:0.85165
[200]	train-auc:0.87748	valid-auc:0.86376
[300]	train-auc:0.88755	valid-auc:0.86963
[400]	train-auc:0.89442	valid-auc:0.87272
[500]	train-auc:0.89971	valid-auc:0.87452
[600]	train-auc:0.90427	valid-auc:0.87561
[700]	train-auc:0.90829	valid-auc:0.87639
[800]	train-auc:0.91201	valid-auc:0.87704
[900]	train-auc:0.91531	valid-auc:0.87749
[1000]	train-auc:0.91857	valid-auc:0.87787
[1100]	train-auc:0.92152	valid-auc:0.87818
[1200]	train-auc:0.92435	valid-auc:0.87833
[1300]	train-auc:0.92707	valid-auc:0.87851
[1400]	train-auc:0.92971	valid-auc:0.87861
[1500]	train-auc:0

[300]	train-auc:0.88755	valid-auc:0.86779
[400]	train-auc:0.89463	valid-auc:0.87094
[500]	train-auc:0.90015	valid-auc:0.87279
[600]	train-auc:0.90478	valid-auc:0.87412
[700]	train-auc:0.90878	valid-auc:0.87490
[800]	train-auc:0.91251	valid-auc:0.87564
[900]	train-auc:0.91586	valid-auc:0.87613
[1000]	train-auc:0.91909	valid-auc:0.87648
[1100]	train-auc:0.92201	valid-auc:0.87682
[1200]	train-auc:0.92491	valid-auc:0.87707
[1300]	train-auc:0.92767	valid-auc:0.87726
[1400]	train-auc:0.93028	valid-auc:0.87742
[1500]	train-auc:0.93280	valid-auc:0.87762
[1600]	train-auc:0.93524	valid-auc:0.87774
[1700]	train-auc:0.93762	valid-auc:0.87780
[1800]	train-auc:0.93994	valid-auc:0.87790
[1900]	train-auc:0.94214	valid-auc:0.87796
[2000]	train-auc:0.94429	valid-auc:0.87800
[2100]	train-auc:0.94635	valid-auc:0.87808
[2200]	train-auc:0.94837	valid-auc:0.87810
4292
fold  3 ntrees 4428 0.87834
[0]	train-auc:0.81294	valid-auc:0.80796
[100]	train-auc:0.86042	valid-auc:0.85227
[200]	train-auc:0.87752	valid-au

Unnamed: 0,feature,gain
0,d1_glucose_max,1.572970e+07
1,d1_glucose_delta,8.359605e+06
2,glucose_apache,4.913733e+06
3,bmi,2.911526e+06
4,age,2.339651e+06
...,...,...
933,ohe_66,5.142188e+00
934,ohe_38,4.976558e+00
935,ohe_35,4.741472e+00
936,ohe_410,4.661305e+00


In [29]:
sub = data.loc[(data['fold'] < 0)][['encounter_id']].reset_index(drop=True)
sub

sub['diabetes_mellitus'] = test_pred
sub

sub = sub.sort_values(by='encounter_id').reset_index(drop=True)
sub

sub.to_csv('../submissions/sub_053.csv', index=False)
sub

Unnamed: 0,encounter_id,diabetes_mellitus
0,135000,0.162588
1,135001,0.020004
2,135002,0.051992
3,135003,0.519523
4,135004,0.007455
...,...,...
10229,145996,0.343718
10230,145997,0.403666
10231,145998,0.105970
10232,145999,0.050435


In [30]:
cupy.save('../oofs/oof_053.npy', oof_pred)