In [1]:
import matplotlib.pyplot as plot
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
pd.options.display.width = 200
pd.options.display.max_columns = 50

In [3]:
train = pd.read_csv('../data/in/train.csv', delimiter=';')
test = pd.read_csv('../data/in/test.csv', delimiter=';')

In [4]:
def cleanup_and_generate(train, y, test, use_hand_labels = False):
    ntrain = len(train)
    df_all = pd.concat([train, test]).reset_index(drop=True)

    for c in ['active', 'alco', 'smoke']:
        replacement = test.ix[test[c] != 'None', c].astype('float32').mean()
        df_all.ix[df_all[c]=='None', c] = replacement
        df_all[c] = df_all[c].astype('float32')
    del c, replacement

    df_all['bad_height'] = (df_all.height < 130).values * 1
    df_all['bad_weight'] = (df_all.weight + 120 < df_all.height).values * 1

    # now cleanup height and weight
    df_all['r_height'] = df_all.height
    df_all['r_weight'] = df_all.weight
    df_all.ix[df_all.height < 95, 'r_height'] += 100
    df_all.ix[df_all.bad_weight > 0, 'r_weight'] += 100
    df_all.ix[(df_all.height < 100).values * (df_all.weight > 90).values, 'r_height'] += 100

    df_all['BWI'] = df_all.weight / (df_all.height / 100) / (df_all.height / 100)
    df_all['bad_bwi'] = (df_all.BWI > 60).values * 1 + (df_all.BWI < 10).values * 1
    df_all['r_BWI'] = df_all.r_weight / (df_all.r_height / 100) / (df_all.r_height / 100)

    df_all['bad_ap_hi'] = 0
    df_all.ix[(df_all.ap_hi < 80).values + (df_all.ap_hi > 220).values, 'bad_ap_hi'] = 1
    df_all['bad_ap_lo'] = 0
    df_all.ix[(df_all.ap_lo < 40).values + (df_all.ap_lo > 200).values, 'bad_ap_lo'] = 1

    df_all['r_ap_hi'] = np.abs(df_all.ap_hi)
    df_all['r_ap_lo'] = np.abs(df_all.ap_lo)

    # now cleanup ap_hi, ap_lo
    df_all.ix[df_all.r_ap_hi > 250, 'r_ap_hi'] /= 10
    df_all.ix[df_all.r_ap_hi > 250, 'r_ap_hi'] /= 10
    df_all.ix[df_all.r_ap_hi < 25, 'r_ap_hi'] *= 10
    df_all.ix[df_all.r_ap_hi < 25, 'r_ap_hi'] *= 10
    df_all.ix[df_all.r_ap_lo > 250, 'r_ap_lo'] /= 10
    df_all.ix[df_all.r_ap_lo > 250, 'r_ap_lo'] /= 10
    df_all.ix[df_all.r_ap_lo < 25, 'r_ap_lo'] *= 10
    df_all.ix[df_all.r_ap_lo > 10000, 'r_ap_lo'] /= 100

    df_all['t1'] = np.maximum(df_all.r_ap_hi, df_all.r_ap_lo)
    df_all['t2'] = np.minimum(df_all.r_ap_hi, df_all.r_ap_lo)
    df_all.r_ap_hi = df_all.t1
    df_all.r_ap_lo = df_all.t2
    df_all.drop(['t1', 't2'], axis=1, inplace=True)

    df_all['ap_diff'] = df_all.ap_hi - df_all.ap_lo
    df_all['r_ap_diff'] = df_all.r_ap_hi - df_all.r_ap_lo

    df_all['bad_data_count'] = (df_all.bad_bwi + df_all.bad_height + df_all.bad_weight + df_all.bad_ap_hi + df_all.bad_ap_lo).values
    df_all['has_bad_data'] = df_all.bad_data_count > 0

    df_all['w_div_h'] = df_all.weight / df_all.height
    df_all['h_sub_w'] = df_all.height - df_all.weight
    df_all['ap_hi_sub_w'] = df_all.ap_hi - df_all.weight

    df_all['r_w_div_h'] = df_all.r_weight / df_all.r_height
    df_all['r_h_sub_w'] = df_all.r_height - df_all.r_weight
    df_all['r_ap_hi_sub_w'] = df_all.r_ap_hi - df_all.r_weight

    df_all['age_months'] = df_all.age // 30
    df_all['age_years'] = df_all.age // 365

    return df_all[:ntrain].reindex(), y, df_all[ntrain:].reindex()

train2, y, test2 = cleanup_and_generate(train.drop('cardio', axis=1), train['cardio'], test)

train3 = train2.copy()
test3 = test2.copy()
train3['y'] = y
test3['y'] = -1
df_all = pd.concat([train3, test3])
del train3, test3
df_all.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,bad_height,bad_weight,r_height,r_weight,BWI,bad_bwi,r_BWI,bad_ap_hi,bad_ap_lo,r_ap_hi,r_ap_lo,ap_diff,r_ap_diff,bad_data_count,w_div_h,h_sub_w,ap_hi_sub_w,r_w_div_h,r_h_sub_w,r_ap_hi_sub_w,age_months,age_years,y
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,49999.5,19464.86459,1.35082,164.34814,74.180167,129.25349,96.34799,1.36722,1.22533,0.088041,0.053981,0.804388,0.00127,0.0023,164.39314,74.410167,27.55658,0.00107,27.568265,0.00368,0.01442,127.090317,81.582822,32.9055,45.507495,0.02274,0.451348,90.167973,55.073323,0.452344,89.982973,52.680149,648.34433,52.82934,0.04979
std,28867.657797,2470.428376,0.477229,8.211187,14.379494,173.639687,181.437372,0.679922,0.570543,0.27904,0.222501,0.39092,0.035615,0.047903,8.044514,14.800297,6.427456,0.032694,5.412803,0.060552,0.119215,17.06273,9.854409,249.311858,11.842163,0.156599,0.086058,14.315356,173.884723,0.085785,14.375283,19.542637,82.349132,6.774937,0.80456
min,0.0,8865.0,1.0,50.0,10.0,-150.0,-90.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,96.0,28.0,3.471784,0.0,13.053439,0.0,0.0,30.9,0.0,-10800.0,0.0,0.0,0.059172,-125.0,-214.0,0.206667,-61.0,-90.0,295.0,24.0,-1.0
25%,24999.75,17662.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,159.0,65.0,23.875115,0.0,23.875115,0.0,0.0,120.0,80.0,40.0,40.0,0.0,0.393939,82.0,41.0,0.393939,82.0,41.0,588.0,48.0,-1.0
50%,49999.5,19700.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,165.0,72.0,26.370238,0.0,26.397977,0.0,0.0,120.0,80.0,40.0,40.0,0.0,0.436047,93.0,52.0,0.436364,93.0,52.0,656.0,53.0,0.0
75%,74999.25,21324.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,170.0,82.0,30.120482,0.0,30.297784,0.0,0.0,140.0,90.0,50.0,50.0,0.0,0.496774,100.0,63.0,0.496894,100.0,63.0,710.0,58.0,1.0
max,99999.0,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0,1.0,287.0,200.0,543.995244,1.0,152.551775,1.0,1.0,240.0,200.0,15950.0,160.0,3.0,3.155172,170.0,15962.0,1.586538,176.0,178.0,790.0,64.0,1.0


In [5]:
df_all.ix[(df_all.r_ap_hi > 150).values * (df_all.y == 0).values]

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,bad_height,bad_weight,r_height,r_weight,BWI,bad_bwi,r_BWI,bad_ap_hi,bad_ap_lo,r_ap_hi,r_ap_lo,ap_diff,r_ap_diff,bad_data_count,has_bad_data,w_div_h,h_sub_w,ap_hi_sub_w,r_w_div_h,r_h_sub_w,r_ap_hi_sub_w,age_months,age_years,y
93,126,22821,2,168,80.0,160,100,1,1,0.0,0.0,1.0,0,0,168,80.0,28.344671,0,28.344671,0,0,160.0,100.0,60,60.0,0,False,0.476190,88.0,80.0,0.476190,88.0,80.0,760,62,0
101,136,18718,1,167,80.0,190,90,2,1,0.0,1.0,0.0,0,0,167,80.0,28.685145,0,28.685145,0,0,190.0,90.0,100,100.0,0,False,0.479042,87.0,110.0,0.479042,87.0,110.0,623,51,0
248,342,21727,2,158,53.0,160,90,1,1,0.0,0.0,1.0,0,0,158,53.0,21.230572,0,21.230572,0,0,160.0,90.0,70,70.0,0,False,0.335443,105.0,107.0,0.335443,105.0,107.0,724,59,0
463,663,20286,1,156,64.0,160,90,1,1,0.0,0.0,1.0,0,0,156,64.0,26.298488,0,26.298488,0,0,160.0,90.0,70,70.0,0,False,0.410256,92.0,96.0,0.410256,92.0,96.0,676,55,0
645,923,20284,1,158,86.0,160,100,2,1,0.0,0.0,1.0,0,0,158,86.0,34.449607,0,34.449607,0,0,160.0,100.0,60,60.0,0,False,0.544304,72.0,74.0,0.544304,72.0,74.0,676,55,0
778,1083,16884,2,174,68.0,160,100,1,1,0.0,0.0,1.0,0,0,174,68.0,22.460034,0,22.460034,0,0,160.0,100.0,60,60.0,0,False,0.390805,106.0,92.0,0.390805,106.0,92.0,562,46,0
880,1227,23190,2,168,80.0,160,90,1,1,0.0,0.0,1.0,0,0,168,80.0,28.344671,0,28.344671,0,0,160.0,90.0,70,70.0,0,False,0.476190,88.0,80.0,0.476190,88.0,80.0,773,63,0
886,1236,22562,1,160,72.0,170,90,1,2,0.0,0.0,0.0,0,0,160,72.0,28.125000,0,28.125000,0,0,170.0,90.0,80,80.0,0,False,0.450000,88.0,98.0,0.450000,88.0,98.0,752,61,0
1328,1863,21063,2,174,106.0,220,120,3,3,0.0,0.0,1.0,0,0,174,106.0,35.011230,0,35.011230,0,0,220.0,120.0,100,100.0,0,False,0.609195,68.0,114.0,0.609195,68.0,114.0,702,57,0
1434,2014,21143,2,169,85.0,180,100,1,1,1.0,1.0,0.0,0,0,169,85.0,29.760863,0,29.760863,0,0,180.0,100.0,80,80.0,0,False,0.502959,84.0,95.0,0.502959,84.0,95.0,704,57,0
