In [1]:
import pandas as pd
import numpy as np
import scipy.stats

In [2]:
# Read the three dataset splits from CSV in one pass; the tuple order below
# (train, validation, test) matches the unpacking order on the left.
# NOTE(review): the train split is read from a "*_basic" file while the
# validation/test splits come from "*_constant" files — confirm this is intended.
train_df, val_df, test_df = map(
    pd.read_csv,
    (
        '../data/train_set_basic.csv',
        '../data/val_set_constant.csv',
        '../data/test_set_constant.csv',
    ),
)
# Display the (rows, columns) shape of every split.
tuple(split.shape for split in (train_df, val_df, test_df))

((50400, 18), (5600, 18), (14000, 18))

In [3]:
# Stack the train, validation and test splits into one frame so the summary
# statistics below describe the whole dataset.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it, each
# split keeps the default 0-based index that pd.read_csv assigned, so row
# labels repeat across splits and label-based access (e.g. full_df.loc[0])
# would silently return several rows.
full_df = pd.concat([train_df, val_df, test_df], ignore_index=True)
# Display the combined (rows, columns) shape.
full_df.shape

(70000, 18)

In [4]:
# Preview the first five rows of the combined frame.
full_df.head(n=5)

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,14.728733,-1.0,3.170892,-1.0,-1.0,-1.0,-1.0,-1.0,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,44.1862,-1.0,0
1,10.405752,9.634615,5.659537,-1.0,-1.0,77.413788,212.671838,4.032519,0,0.88713,96.311597,-1.0,43.218595,-1.0,83.207518,31.217256,-1.0,4
2,15.132737,358.914888,1.842252,3.797487,315.102272,80.500314,-1.0,5.639507,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,45.398211,-1.0,0
3,11.340169,-1.0,1.662209,2.441767,-1.0,97.033963,102.079062,3.506041,1,1.020527,127.281715,-1.0,20.847013,-1.0,62.210273,34.020508,-1.0,6
4,6.691485,-1.0,3.337971,-1.0,-1.0,99.838438,24.119564,2.010694,0,1.957666,34.633063,-1.0,34.612121,-1.0,112.411298,20.074456,-1.0,5


In [5]:
# Convert every -1 entry in the frame to NaN so the statistics computed in
# the following cells skip those entries instead of averaging them in.
# NOTE(review): this treats -1 as a "missing value" sentinel in every column —
# confirm no column can legitimately hold -1.
full_df = full_df.replace(to_replace=-1, value=np.nan)
# Preview the first five rows after the replacement.
full_df.head(n=5)

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,14.728733,,3.170892,,,,,,1,,,,,,,44.1862,,0
1,10.405752,9.634615,5.659537,,,77.413788,212.671838,4.032519,0,0.88713,96.311597,,43.218595,,83.207518,31.217256,,4
2,15.132737,358.914888,1.842252,3.797487,315.102272,80.500314,,5.639507,0,,,,,,,45.398211,,0
3,11.340169,,1.662209,2.441767,,97.033963,102.079062,3.506041,1,1.020527,127.281715,,20.847013,,62.210273,34.020508,,6
4,6.691485,,3.337971,,,99.838438,24.119564,2.010694,0,1.957666,34.633063,,34.612121,,112.411298,20.074456,,5


In [6]:
# Column-wise mean over all rows (NaN entries are skipped by default).
# `label` is used as a class id when the frame is split per class, so its
# mean is not a meaningful statistic.
full_df.mean(axis='index')

hemoglobin                10.238527
ferritin                 209.967975
ret_count                  2.821155
segmented_neutrophils      2.930058
tibc                     334.275990
mcv                       89.997911
serum_iron               135.029508
rbc                        3.348216
gender                     0.546686
creatinine                 1.103034
cholestrol                74.878204
copper                    80.095237
ethanol                   39.887102
folate                    15.261930
glucose                   90.038851
hematocrit                30.715580
tsat                      49.601271
label                      3.334957
dtype: float64

In [7]:
# First and third quartile of every numeric column (computed down the rows).
full_df.quantile([0.25, 0.75], numeric_only=True)

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0.25,8.066994,69.870252,1.270463,0.762051,222.107602,78.917524,77.792509,2.640591,0.0,0.650572,37.388104,55.181755,19.876469,7.832469,65.12755,24.200981,23.102518,1.0
0.75,12.102028,343.337402,4.342276,4.898243,457.942384,101.092544,192.644818,3.936131,1.0,1.551825,112.243694,105.245232,59.748904,22.715223,115.077189,36.306084,62.608308,5.0


In [8]:
# Number of rows for each gender code, most frequent first.
full_df['gender'].value_counts()

1    38268
0    31732
Name: gender, dtype: int64

In [9]:
# Percentage of rows whose gender code is 0.
# Computed from the data rather than from counts typed in by hand, so the
# figure stays correct if the underlying CSV files change.
(full_df.gender == 0).mean() * 100

45.331428571428575

In [10]:
# Sanity check: the two rounded gender percentages should add up to 100.
54.669 + 45.331

100.0

#### Statistics per class

In [11]:
# Split the full frame into one sub-frame per value of `label` (0 through 7).
# The generator yields the subsets in ascending label order, so each name on
# the left receives the rows of the label noted beside it.
# NOTE(review): what each label means is only implied by the variable names;
# confirm the mapping against the dataset documentation.
(
    no_df,        # label == 0
    vit_df,       # label == 1
    unspec_df,    # label == 2
    acd_df,       # label == 3
    ida_df,       # label == 4
    hem_df,       # label == 5
    aplastic_df,  # label == 6
    inconc_df,    # label == 7
) = (full_df[full_df.label == class_id] for class_id in range(8))

In [19]:
# Column-wise mean for the rows with label == 4 (NaN entries are skipped).
ida_df.mean(axis='index')

hemoglobin                 9.538985
ferritin                  48.653927
ret_count                  2.975389
segmented_neutrophils      3.582051
tibc                     452.223117
mcv                       77.527442
serum_iron               135.624682
rbc                        3.692585
gender                     0.567397
creatinine                 1.102940
cholestrol                74.700252
copper                    79.763902
ethanol                   39.500795
folate                    15.462156
glucose                   90.152610
hematocrit                28.616955
tsat                      32.674612
label                      4.000000
dtype: float64

In [26]:
# First and third quartile of every numeric column for the rows with label == 5.
hem_df.quantile([0.25, 0.75], numeric_only=True)

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0.25,7.740918,123.988165,3.079065,1.819555,203.255852,84.919796,76.467197,2.580171,0.0,0.656044,37.21617,55.200801,20.429634,7.854063,65.136764,23.222754,24.902247,5.0
0.75,11.261577,380.349538,5.00694,5.271293,418.237383,94.93761,191.046589,3.7606,1.0,1.537992,112.418366,105.698463,60.38896,22.746152,115.555001,33.784731,69.688499,5.0


In [68]:
# Number of rows per gender code among the rows with label == 7.
inconc_df['gender'].value_counts()

1    3855
0    2866
Name: gender, dtype: int64

In [69]:
# Gender split among the rows with label == 7, as percentage strings:
# relative frequencies scaled to 0-100, rounded to two decimals, then
# suffixed with a percent sign for display.
gender_pct = inconc_df['gender'].value_counts(normalize=True) * 100
gender_pct.round(2).astype(str) + '%'

1    57.36%
0    42.64%
Name: gender, dtype: object