In [12]:
import pandas as pd
import numpy as np

### Load data

In [13]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [14]:
train_df.describe()

Unnamed: 0,Id,encounter_id,patient_nbr,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,76324.0,76324.0,76324.0,76324.0,76324.0,76324.0,76324.0,76324.0,76324.0,76324.0,76324.0
mean,50921.899193,165342200.0,54262410.0,4.390716,43.099982,1.342238,16.021907,0.371272,0.197461,0.634584,7.422724
std,29368.666069,102671200.0,38595710.0,2.979276,19.6638,1.707989,8.139105,1.28452,0.898641,1.258929,1.936613
min,1.0,12522.0,135.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,25417.75,84936700.0,23420050.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,50926.5,152450200.0,45343940.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0
75%,76325.25,230272000.0,87536760.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0
max,101763.0,443854100.0,189481500.0,14.0,132.0,6.0,81.0,42.0,64.0,19.0,16.0


In [15]:
train_df.sample(5)

Unnamed: 0,Id,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
46780,58774,166204104,44194896,AfricanAmerican,Female,[50-60),?,Emergency,Discharged/transferred to another rehab fac in...,Emergency Room,...,No,Down,No,No,No,No,No,Ch,Yes,NO
49258,90411,295399166,50721516,Caucasian,Male,[60-70),?,Elective,Discharged/transferred to SNF,Physician Referral,...,No,Steady,No,No,No,No,No,Ch,Yes,>30
18285,94143,344815940,42335721,AfricanAmerican,Male,[50-60),?,Emergency,Discharged to home,Emergency Room,...,No,Up,No,No,No,No,No,Ch,Yes,NO
25028,40868,126786576,24441066,Caucasian,Male,[70-80),?,Elective,Discharged to home,Physician Referral,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
16177,79585,244334316,99601452,Caucasian,Female,[20-30),?,Urgent,Discharged/transferred to home with home healt...,Physician Referral,...,No,Steady,No,No,No,No,No,No,Yes,>30


### Prepare data

In [16]:
train_df.drop(columns=['Id', 'encounter_id', 'patient_nbr', 'weight', 'payer_code', 'max_glu_serum'], inplace=True)
test_df.drop(columns=['encounter_id', 'patient_nbr', 'weight', 'payer_code', 'max_glu_serum'], inplace=True)

In [17]:
train_df['race'].replace('?', 'Caucasian', inplace=True)
test_df['race'].replace('?', 'Caucasian', inplace=True)

In [18]:
train_df = pd.get_dummies(train_df, columns=['race'], prefix='race')
test_df = pd.get_dummies(test_df, columns=['race'], prefix='race')

In [19]:
train_df['gender'].replace('Unknown/Invalid', 'Female', inplace=True)

train_df = pd.get_dummies(train_df, columns=['gender'], prefix='gender')
test_df = pd.get_dummies(test_df, columns=['gender'], prefix='gender')

In [20]:
cleanup_age = {
    '[0-10)': 5,
    '[10-20)': 15,
    '[20-30)': 25,
    '[30-40)': 35,
    '[40-50)': 45,
    '[50-60)': 55,
    '[60-70)': 65,
    '[70-80)': 75,
    '[80-90)': 85,
    '[90-100)': 95,
}

train_df['age'].replace(cleanup_age, inplace=True)
test_df['age'].replace(cleanup_age, inplace=True)

In [21]:
train_df['admission_type_id'].replace(['Not Available', 'Not Mapped', 'Newborn',
        'Trauma Center'], 'Emergency', inplace=True)
test_df['admission_type_id'].replace(['Not Available', 'Not Mapped', 'Newborn',
        'Trauma Center'], 'Emergency', inplace=True)

train_df = pd.get_dummies(train_df, columns=['admission_type_id'], prefix='ati')
test_df = pd.get_dummies(test_df, columns=['admission_type_id'], prefix='ati')

In [None]:
train_df.loc[train_df['discharge_disposition_id'].value_counts()[train_df['discharge_disposition_id']].values < 500, 'discharge_disposition_id'] = 'Other'
train_df['discharge_disposition_id'].replace(['?', 'Not Mapped'], 'Discharged to home', inplace=True)
train_df = pd.get_dummies(train_df, columns=['discharge_disposition_id'], prefix='ddi')

test_df.loc[test_df['discharge_disposition_id'].value_counts()[test_df['discharge_disposition_id']].values < 200, 'discharge_disposition_id'] = 'Other'
test_df['discharge_disposition_id'].replace(['?', 'Not Mapped'], 'Discharged to home', inplace=True)
test_df = pd.get_dummies(test_df, columns=['discharge_disposition_id'], prefix='ddi')

In [None]:
train_df['admission_source_id'].replace(['Not mapped', 'Not Available'], 'Emergency Room', inplace=True)
train_df.loc[train_df['admission_source_id'].value_counts()[train_df['admission_source_id']].values < 200, 'admission_source_id'] = 'Other'
train_df = pd.get_dummies(train_df, columns=['admission_source_id'], prefix='asi')

In [None]:
test_df['admission_source_id'].replace(['Not mapped', 'Not Available'], 'Emergency Room', inplace=True)
test_df.loc[test_df['admission_source_id'].value_counts()[test_df['admission_source_id']].values < 100, 'admission_source_id'] = 'Other'
test_df = pd.get_dummies(test_df, columns=['admission_source_id'], prefix='asi')

In [None]:
train_df.loc[train_df['medical_specialty'].value_counts()[train_df['medical_specialty']].values < 800, 'medical_specialty'] = 'Other'
train_df['medical_specialty'].replace('?', 'Unknown', inplace=True)
train_df = pd.get_dummies(train_df, columns=['medical_specialty'], prefix='ms')

In [None]:
test_df.loc[test_df['medical_specialty'].value_counts()[test_df['medical_specialty']].values < 300, 'medical_specialty'] = 'Other'
test_df['medical_specialty'].replace('?', 'Unknown', inplace=True)
test_df = pd.get_dummies(test_df, columns=['medical_specialty'], prefix='ms')

In [None]:
diagnoses = ['diag_1', 'diag_2', 'diag_3']
cleanup_diag = {
    1: 'iapd',
    2: 'n',
    3: 'enamdaid',
    4: 'dobabfo',
    5: 'md',
    6: 'donsaso',
    7: 'dotcs',
    8: 'dotrs',
    9: 'dotds',
    10: 'dotgs',
    11: 'copcatp',
    12: 'dotsast',
    13: 'dotmsact',
    14: 'ca',
    15: 'ccoitpp',
    16: 'ssa3dc',
    17: 'iap',
    1000: 'sfihsacwhs',
    1001: 'dotcs'
}

replace_ranges = {
    (1, 139.9),
    (140, 239.9),
    (240, 279.9),
    (280, 289.9),
    (290, 319),
    (320, 389.9),
    (390, 459.9),
    (460, 519.9),
    (520, 579.9),
    (580, 629.9),
    (630, 679.9),
    (680, 709.9),
    (710, 739.9),
    (740, 759.9),
    (760, 779.9),
    (780, 799.9),
    (800, 999.9)
}

for diag in diagnoses:
    train_df[diag].replace('^.*V.*$', 1000, regex=True, inplace=True)
    train_df[diag].replace('^.*E.*$', 1000, regex=True, inplace=True)
    train_df[diag].replace('?', 1001, inplace=True)

    train_df[diag] = train_df[diag].astype(float)
    
    test_df[diag].replace('^.*V.*$', 1000, regex=True, inplace=True)
    test_df[diag].replace('^.*E.*$', 1000, regex=True, inplace=True)
    test_df[diag].replace('?', 1001, inplace=True)

    test_df[diag] = test_df[diag].astype(float)
    
    for idx, range in enumerate(replace_ranges):
        train_df[diag] = np.where(train_df[diag].between(range[0], range[1]), idx, train_df[diag])
        test_df[diag] = np.where(test_df[diag].between(range[0], range[1]), idx, test_df[diag])

    train_df[diag] = train_df[diag].astype(int)

    train_df[diag].replace(cleanup_diag, inplace=True)
    train_df = pd.get_dummies(train_df, columns=[diag], prefix=diag)

    test_df[diag] = test_df[diag].astype(int)

    test_df[diag].replace(cleanup_diag, inplace=True)
    test_df = pd.get_dummies(test_df, columns=[diag], prefix=diag)

In [None]:
train_df = pd.get_dummies(train_df, columns=['insulin'], prefix='insulin')
test_df = pd.get_dummies(test_df, columns=['insulin'], prefix='insulin')

In [None]:
cleanup_change = {
    'No': 0,
    'Ch': 1
}

train_df['change'].replace(cleanup_change, inplace=True)
test_df['change'].replace(cleanup_change, inplace=True)

In [None]:
cleanup_med = {
    'Yes': 1,
    'No': 0
}

train_df['diabetesMed'].replace(cleanup_med, inplace=True)
test_df['diabetesMed'].replace(cleanup_med, inplace=True)

In [None]:
col = [ 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'acetohexamide',
 'glipizide',
 'glyburide',
 'tolbutamide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'troglitazone',
 'tolazamide',
 'examide',
 'citoglipton',
 'glyburide-metformin',
 'glipizide-metformin',
 'glimepiride-pioglitazone',
 'metformin-rosiglitazone',
 'metformin-pioglitazone',
 ]

train_df.drop(columns=col, inplace=True)
test_df.drop(columns=col, inplace=True)

In [None]:
cleanup_readmitted = {
    'NO': 0,
    '>30': 1,
    '<30': 2
}

train_df.replace(cleanup_readmitted, inplace=True)

### Train model

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
        train_df.drop(columns=['readmitted']),
        train_df['readmitted'],
        test_size=0.25, 
        random_state=12)

In [None]:
from sklearn.preprocessing import MinMaxScaler

min_max = MinMaxScaler()
min_max.fit(X_train)

X_train = pd.DataFrame(min_max.transform(X_train), columns=X_train.columns)
X_valid = pd.DataFrame(min_max.transform(X_valid), columns=X_valid.columns)

In [None]:
import xgboost as xgb

model = xgb.XGBClassifier(
        eta = 0.145,
        min_child_weight = 6.5,
        subsample = 0.7
        )

model.fit(X_train, y_train)

from sklearn.metrics import accuracy_score

y_pred = model.predict(X_valid)
y_pred2 = model.predict(X_train)
print(f'Train: {accuracy_score(y_train, y_pred2)}')
print(f'Test: {accuracy_score(y_valid, y_pred)}')

Train: 0.6240064287336443
Test: 0.5948325559457052


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

model = xgb.XGBClassifier(
        eta = 0.145,
        min_child_weight = 6.5,
        subsample = 0.7,
        max_depth = 6
        )

model.fit(train_df.drop(columns='readmitted'), train_df['readmitted'])

# kfold = StratifiedKFold(n_splits=4)
# results = cross_val_score(model, train_df.drop(columns = ['readmitted']), train_df['readmitted'], cv=kfold)
# print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
results = pd.DataFrame(test_df['Id'])
test_df.drop(columns='Id', inplace=True)

preds = model.predict(test_df)
results['readmitted'] = preds

In [None]:
results.to_csv('submission.csv', index=False)