In [2]:
import pandas as pd
import numpy as np
import random
import sys
sys.path.append('../..')
from modules.many_features import utils, constants

In [3]:
# SEED = 42
# random.seed(SEED)
# np.random.seed(SEED)

In [4]:
feat_arr = [{'name':'hemoglobin', 'lower':12, 'upper':17.2, 'mean':10.3, 'std':2.3, 'min':(0, 5)},
            {'name':'ferritin', 'lower':10, 'upper':263, 'mean':697, 'std':3305, 'min':(1.6, 8)},
            {'name':'ret_count', 'lower':0.5, 'upper':2.5, 'mean':2.6, 'std':2.5, 'min':(0, 0.5)},
            {'name':'segmented_neutrophils', 'lower':0, 'upper':0, 'mean':0, 'std':2.4, 'min':(0, 0)},
            {'name':'tibc', 'lower':250, 'upper':450, 'mean':242, 'std':90, 'min':(3, 100)},
            {'name':'mcv', 'lower':80, 'upper':100, 'mean':90, 'std':8, 'min':(0, 65)}]

anem_feat_arr = [{'name':'hemoglobin', 'lower':12, 'upper':17.2, 'abnorm_lower':6, 'abnorm_upper':18},
                 {'name':'ferritin', 'lower':10, 'upper':263, 'abnorm_lower':0, 'abnorm_upper': 500},
                 {'name':'ret_count', 'lower':0.5, 'upper':2.5, 'abnorm_lower':0, 'abnorm_upper':6},
                 {'name':'segmented_neutrophils', 'lower':0, 'upper':0, 'abnorm_lower':0.1, 'abnorm_upper':7},
                 {'name':'tibc', 'lower':250, 'upper':450, 'abnorm_lower': 100, 'abnorm_upper':500},
                 {'name':'mcv', 'lower':80, 'upper':100, 'abnorm_lower': 75, 'abnorm_upper':105},
                 {'name':'gender', 'lower':0, 'upper':1}, #0=Female 1=male
                ]

#### The data

In [5]:
df= pd.read_csv('../../data/more_features/more_feats_new_labels_0.1.csv')
df = df.fillna(-1)
df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,age,gender,indirect_bilirubin,transferrin,creatinine,cholestrol,copper,ethanol,folate,glucose,label
0,11.183192,187.573466,4.951674,1.661383,316.593436,95.006424,181.242992,6.758433,60.586525,1,2.976104,282.291951,-1.0,28.040619,96.721542,49.530542,23.186628,-1.0,Hemolytic anemia
1,14.387445,-1.0,-1.0,-1.0,-1.0,-1.0,125.249617,6.350652,86.964793,1,2.968983,158.844879,-1.0,98.357508,112.758764,62.464566,7.16892,-1.0,No anemia
2,12.749357,5.012158,3.5028,6.179371,498.418768,76.759285,159.834784,5.018156,67.065069,1,2.705719,442.293823,1.022939,56.850479,75.739552,72.072041,20.600875,44.872138,Iron deficiency anemia
3,11.50887,197.180945,1.200125,0.0,457.033309,102.900301,131.177927,6.656823,66.403181,1,1.869338,223.896404,-1.0,111.220307,66.999185,18.353272,14.132423,-1.0,Unspecified anemia
4,9.456656,427.952052,-1.0,0.660252,-1.0,104.543774,-1.0,-1.0,-1.0,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,Vitamin B12/Folate deficiency anemia


In [6]:
df.label.unique()

array(['Hemolytic anemia', 'No anemia', 'Iron deficiency anemia',
       'Unspecified anemia', 'Vitamin B12/Folate deficiency anemia',
       'Aplastic anemia', 'Anemia of chronic disease',
       'Inconclusive diagnosis'], dtype=object)

#### Some relevant functions

In [7]:
def noisy_feature(df, feature, frac, lower, upper, seed):
    df[feature] = df[feature].sample(frac = 1-frac, random_state = seed)
    null_indices_list = df[df[feature].isnull()].index.tolist()
    df[feature] = df[feature].fillna(random.uniform(lower, upper))
    return df[feature]

In [8]:
def make_noisy(anemia):
    noisy_df = hb_df[hb_df.label == anemia]
    if anemia == 'Unspecified anemia':
         noisy_df['hemoglobin'] = noisy_feature(noisy_df, 'hemoglobin', 0.2, 3, 12, 1)
    elif anemia == 'Hemolytic anemia':
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.1, 60, 79, 2)
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.1, 101, 108, 3)
        noisy_df['ret_count'] = noisy_feature(noisy_df, 'ret_count', 0.2, 0.1, 2, 4)
    elif anemia == 'Aplastic anemia':
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.1, 60, 79, 5)
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.1, 101, 108, 6)
        noisy_df['ret_count'] = noisy_feature(noisy_df, 'ret_count', 0.2, 2.1, 6, 7)
    elif anemia == 'Iron deficiency anemia':
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.2, 80, 110, 8)
        noisy_df['ferritin'] = noisy_feature(noisy_df, 'ferritin', 0.2, 100.1, 120, 9)
    elif anemia == 'Vitamin B12/Folate deficiency anemia':
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.2, 60, 99, 10)
        noisy_df['segmented_neutrophils'] = noisy_feature(noisy_df, 'segmented_neutrophils', 0.2, 0, 0, 11)
    elif anemia == 'Anemia of chronic disease':
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.2, 80, 110, 12)
        noisy_df['ferritin'] = noisy_feature(noisy_df, 'ferritin', 0.2, 10, 30, 13)
    else:
        print('What the hell is this?')
    return noisy_df

#### 1 - Vitamin B12/Folate deficiecny anemia

In [9]:
vit_df = df[df.label=='Vitamin B12/Folate deficiency anemia']
vit_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,age,gender,indirect_bilirubin,transferrin,creatinine,cholestrol,copper,ethanol,folate,glucose,label
4,9.456656,427.952052,-1.0,0.660252,-1.0,104.543774,-1.0,-1.0,-1.0,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,Vitamin B12/Folate deficiency anemia
6,6.939906,433.022945,-1.0,3.831731,-1.0,102.390026,-1.0,3.5885,52.360187,1,2.896957,-1.0,-1.0,-1.0,-1.0,5.377878,-1.0,-1.0,Vitamin B12/Folate deficiency anemia
13,10.821902,134.578931,4.438313,3.02213,482.327091,101.919603,125.867191,4.693595,49.337676,0,0.81005,282.426797,-1.0,100.431364,109.07836,63.046145,25.636901,-1.0,Vitamin B12/Folate deficiency anemia
20,11.778437,191.937028,3.918479,1.881728,389.8359,101.602359,169.283019,5.828224,70.656023,1,0.864321,447.608725,0.807224,22.503145,98.530413,35.562075,12.959532,136.43449,Vitamin B12/Folate deficiency anemia
25,9.232404,-1.0,-1.0,0.7097,-1.0,101.375414,109.743201,6.129433,46.584281,0,2.600773,321.032867,-1.0,48.672338,67.69159,34.493253,26.806454,-1.0,Vitamin B12/Folate deficiency anemia


In [12]:
for i in ['mcv', 'segmented_neutrophils']:
    vit_df[i] = vit_df[i].sample(frac=1-0.05)
vit_df.isna().sum()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



hemoglobin                 0
ferritin                   0
ret_count                  0
segmented_neutrophils    900
tibc                       0
mcv                      900
serum_iron                 0
rbc                        0
age                        0
gender                     0
indirect_bilirubin         0
transferrin                0
creatinine                 0
cholestrol                 0
copper                     0
ethanol                    0
folate                     0
glucose                    0
label                      0
dtype: int64

In [16]:
vit_df['mcv'] = vit_df['mcv'].fillna(random.uniform(95, 100))
vit_df['segmented_neutrophils'] = vit_df['segmented_neutrophils'].fillna(-1)
vit_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,age,gender,indirect_bilirubin,transferrin,creatinine,cholestrol,copper,ethanol,folate,glucose,label
4,9.456656,427.952052,-1.0,0.660252,-1.0,104.543774,-1.0,-1.0,-1.0,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,Vitamin B12/Folate deficiency anemia
6,6.939906,433.022945,-1.0,3.831731,-1.0,102.390026,-1.0,3.5885,52.360187,1,2.896957,-1.0,-1.0,-1.0,-1.0,5.377878,-1.0,-1.0,Vitamin B12/Folate deficiency anemia
13,10.821902,134.578931,4.438313,-1.0,482.327091,101.919603,125.867191,4.693595,49.337676,0,0.81005,282.426797,-1.0,100.431364,109.07836,63.046145,25.636901,-1.0,Vitamin B12/Folate deficiency anemia
20,11.778437,191.937028,3.918479,1.881728,389.8359,101.602359,169.283019,5.828224,70.656023,1,0.864321,447.608725,0.807224,22.503145,98.530413,35.562075,12.959532,136.43449,Vitamin B12/Folate deficiency anemia
25,9.232404,-1.0,-1.0,0.7097,-1.0,98.197134,109.743201,6.129433,46.584281,0,2.600773,321.032867,-1.0,48.672338,67.69159,34.493253,26.806454,-1.0,Vitamin B12/Folate deficiency anemia


In [18]:
vit_df.describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,age,gender,indirect_bilirubin,transferrin,creatinine,cholestrol,copper,ethanol,folate,glucose
count,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0
mean,9.486498,179.891842,0.819654,3.074073,108.191059,102.074346,84.761175,3.666338,45.005189,0.569444,0.814556,189.771251,-0.056218,47.301827,50.570266,28.600774,9.219,23.57415
std,2.014827,165.540411,2.310386,2.324427,160.812083,1.877177,83.967674,2.541389,30.282889,0.495181,1.337818,160.248353,1.100644,50.362186,45.549528,26.814576,10.323252,43.131487
min,6.000016,-1.0,-1.0,-1.0,-1.0,98.197134,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,7.73761,-1.0,-1.0,1.261426,-1.0,100.860634,-1.0,2.842644,23.030772,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
50%,9.458762,154.702565,-1.0,3.098316,-1.0,102.214962,68.447223,4.220283,46.574134,1.0,0.928346,213.797783,-1.0,31.751622,51.047896,24.564453,6.548631,-1.0
75%,11.252786,325.713112,2.738242,5.043542,224.345084,103.612196,158.798852,5.622469,70.88452,1.0,1.978088,329.420784,1.002219,90.816188,91.399064,51.879411,18.159897,47.067286
max,12.998632,499.921689,5.995977,6.999155,499.950779,104.999973,249.919719,6.999964,94.997996,1.0,2.999253,449.976306,1.999873,149.997358,129.98535,79.975425,29.999063,139.777727


#### 2 - Unspecified anemia

In [21]:
unspec_df = df[df.label=='Unspecified anemia']
unspec_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,age,gender,indirect_bilirubin,transferrin,creatinine,cholestrol,copper,ethanol,folate,glucose,label
3,11.50887,197.180945,1.200125,0.0,457.033309,102.900301,131.177927,6.656823,66.403181,1,1.869338,223.896404,-1.0,111.220307,66.999185,18.353272,14.132423,-1.0,Unspecified anemia
9,11.134721,-1.0,-1.0,0.0,-1.0,101.289788,130.274561,5.200105,89.07782,1,0.519895,246.999277,-1.0,100.012213,51.498563,17.313546,10.708672,-1.0,Unspecified anemia
14,11.26794,-1.0,2.339196,0.0,-1.0,102.08677,30.273153,3.301928,74.199225,0,0.769457,431.549854,0.409947,113.581821,109.53382,4.584599,14.719737,-1.0,Unspecified anemia
19,9.683009,80.159398,0.128461,0.0,263.123104,100.680435,220.7293,6.598076,88.110782,1,2.703993,374.628666,0.27146,77.682348,129.169116,64.095456,15.580448,43.865898,Unspecified anemia
23,9.488839,484.020043,2.394415,0.0,201.992233,104.383813,231.105033,2.853901,23.54165,1,0.333596,412.65679,-1.0,33.461756,125.852474,16.680947,21.532891,-1.0,Unspecified anemia


In [22]:
for i in ['mcv', 'segmented_neutrophils']:
    unspec_df[i] = unspec_df[i].sample(frac=1-0.05)
unspec_df.isna().sum()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



hemoglobin                 0
ferritin                   0
ret_count                  0
segmented_neutrophils    452
tibc                       0
mcv                      452
serum_iron                 0
rbc                        0
age                        0
gender                     0
indirect_bilirubin         0
transferrin                0
creatinine                 0
cholestrol                 0
copper                     0
ethanol                    0
folate                     0
glucose                    0
label                      0
dtype: int64

In [23]:
unspec_df['mcv'] = unspec_df['mcv'].fillna(random.uniform(95, 100))
unspec_df['segmented_neutrophils'] = unspec_df['segmented_neutrophils'].fillna(-1)
unspec_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,age,gender,indirect_bilirubin,transferrin,creatinine,cholestrol,copper,ethanol,folate,glucose,label
3,11.50887,197.180945,1.200125,0.0,457.033309,96.375147,131.177927,6.656823,66.403181,1,1.869338,223.896404,-1.0,111.220307,66.999185,18.353272,14.132423,-1.0,Unspecified anemia
9,11.134721,-1.0,-1.0,0.0,-1.0,101.289788,130.274561,5.200105,89.07782,1,0.519895,246.999277,-1.0,100.012213,51.498563,17.313546,10.708672,-1.0,Unspecified anemia
14,11.26794,-1.0,2.339196,0.0,-1.0,102.08677,30.273153,3.301928,74.199225,0,0.769457,431.549854,0.409947,113.581821,109.53382,4.584599,14.719737,-1.0,Unspecified anemia
19,9.683009,80.159398,0.128461,0.0,263.123104,100.680435,220.7293,6.598076,88.110782,1,2.703993,374.628666,0.27146,77.682348,129.169116,64.095456,15.580448,43.865898,Unspecified anemia
23,9.488839,484.020043,2.394415,0.0,201.992233,104.383813,231.105033,2.853901,23.54165,1,0.333596,412.65679,-1.0,33.461756,125.852474,16.680947,21.532891,-1.0,Unspecified anemia


In [24]:
unspec_df.describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,age,gender,indirect_bilirubin,transferrin,creatinine,cholestrol,copper,ethanol,folate,glucose
count,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0
mean,9.486151,159.637895,1.873962,-0.050039,79.238365,102.215769,84.249717,3.640017,44.67483,0.559615,0.799378,186.134669,-0.0673,46.459278,49.546791,28.567087,9.235675,23.019935
std,2.005637,167.254559,2.326746,0.218037,144.648693,1.940082,84.877219,2.5222,30.25299,0.496461,1.339211,160.29662,1.097424,50.092761,45.406533,26.7915,10.384553,42.53887
min,6.001011,-1.0,-1.0,-1.0,-1.0,96.375147,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,7.748757,-1.0,-1.0,0.0,-1.0,101.086698,-1.0,2.82811,22.834029,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
50%,9.475567,109.999868,1.78125,0.0,-1.0,102.399167,65.040805,4.192058,45.972543,1.0,0.916048,210.267636,-1.0,30.514809,49.657314,24.797382,6.640985,-1.0
75%,11.199575,305.127886,3.9135,0.0,131.314287,103.694398,157.997541,5.559776,70.536261,1.0,1.948408,326.227438,0.995074,89.874579,89.9497,51.789469,18.241435,46.127012
max,12.998757,499.561158,5.999136,0.0,499.738036,104.999237,249.883733,6.999948,94.946984,1.0,2.999584,449.978925,1.999928,149.998603,129.99797,79.990797,29.996488,139.970184


In [25]:
unspec_df.isna().sum()

hemoglobin               0
ferritin                 0
ret_count                0
segmented_neutrophils    0
tibc                     0
mcv                      0
serum_iron               0
rbc                      0
age                      0
gender                   0
indirect_bilirubin       0
transferrin              0
creatinine               0
cholestrol               0
copper                   0
ethanol                  0
folate                   0
glucose                  0
label                    0
dtype: int64

#### 3 - Anemia of chronic disease

In [26]:
acd_df = df[df.label=='Anemia of chronic disease']
acd_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,age,gender,indirect_bilirubin,transferrin,creatinine,cholestrol,copper,ethanol,folate,glucose,label
10,11.135318,428.398182,-1.0,-1.0,209.146497,78.688528,164.515328,2.613933,52.107458,1,2.27889,247.051304,1.597835,89.407362,33.988481,35.890618,2.241372,58.013949,Anemia of chronic disease
17,10.199092,396.300771,-1.0,0.383121,244.485759,76.122486,96.58518,4.259322,39.657844,1,1.755565,216.120079,-1.0,132.068015,94.175429,42.028555,14.40951,-1.0,Anemia of chronic disease
22,12.126436,211.275726,0.038403,3.319368,206.580036,75.160106,78.565027,4.97535,26.650942,1,1.509415,235.474768,0.657055,129.380934,111.086028,38.396648,19.505327,101.374596,Anemia of chronic disease
27,6.535154,240.978155,-1.0,3.690522,148.114505,78.366106,193.077593,6.764819,18.866912,1,1.678183,352.632729,0.372056,127.141915,102.427321,24.8677,11.286109,109.990523,Anemia of chronic disease
33,9.080539,118.287498,-1.0,6.426623,257.434925,79.861861,-1.0,6.83596,-1.0,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,Anemia of chronic disease


In [27]:
for i in ['mcv', 'ferritin', 'tibc']:
    acd_df[i] = acd_df[i].sample(frac=1-0.033)
acd_df.isna().sum()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



hemoglobin                 0
ferritin                 322
ret_count                  0
segmented_neutrophils      0
tibc                     322
mcv                      322
serum_iron                 0
rbc                        0
age                        0
gender                     0
indirect_bilirubin         0
transferrin                0
creatinine                 0
cholestrol                 0
copper                     0
ethanol                    0
folate                     0
glucose                    0
label                      0
dtype: int64

In [28]:
acd_df['mcv'] = acd_df['mcv'].fillna(random.uniform(80, 85))
acd_df['ferritin'] = acd_df['ferritin'].fillna(random.uniform(27, 30))
acd_df['tibc'] = acd_df['tibc'].fillna(random.uniform(450, 460))
acd_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,age,gender,indirect_bilirubin,transferrin,creatinine,cholestrol,copper,ethanol,folate,glucose,label
10,11.135318,428.398182,-1.0,-1.0,209.146497,78.688528,164.515328,2.613933,52.107458,1,2.27889,247.051304,1.597835,89.407362,33.988481,35.890618,2.241372,58.013949,Anemia of chronic disease
17,10.199092,396.300771,-1.0,0.383121,244.485759,76.122486,96.58518,4.259322,39.657844,1,1.755565,216.120079,-1.0,132.068015,94.175429,42.028555,14.40951,-1.0,Anemia of chronic disease
22,12.126436,211.275726,0.038403,3.319368,206.580036,75.160106,78.565027,4.97535,26.650942,1,1.509415,235.474768,0.657055,129.380934,111.086028,38.396648,19.505327,101.374596,Anemia of chronic disease
27,6.535154,240.978155,-1.0,3.690522,148.114505,78.366106,193.077593,6.764819,18.866912,1,1.678183,352.632729,0.372056,127.141915,102.427321,24.8677,11.286109,109.990523,Anemia of chronic disease
33,9.080539,118.287498,-1.0,6.426623,257.434925,79.861861,-1.0,6.83596,-1.0,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,Anemia of chronic disease


In [29]:
acd_df.describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,age,gender,indirect_bilirubin,transferrin,creatinine,cholestrol,copper,ethanol,folate,glucose
count,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0
mean,9.545389,234.267264,0.414354,2.28634,259.086031,77.630279,83.26239,3.665466,44.390412,0.568881,0.775897,187.393763,-0.068955,46.222987,50.358691,27.885954,9.125749,23.178494
std,2.016049,152.461105,2.158694,2.655859,125.47209,1.549744,83.695479,2.54139,30.547835,0.495258,1.353074,160.425471,1.101041,50.121552,45.694139,26.898381,10.323096,42.985072
min,6.000049,-1.0,-1.0,-1.0,-1.0,75.000936,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,7.840677,101.701194,-1.0,-1.0,168.377631,76.322233,-1.0,2.846329,22.305801,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
50%,9.53654,232.880576,-1.0,2.196146,263.442321,77.587504,65.671875,4.2339,45.801206,1.0,0.89361,212.828437,-1.0,29.247477,51.303291,23.097128,6.342442,-1.0
75%,11.300983,366.129426,1.777969,4.632642,362.477985,78.869283,156.791453,5.609224,70.309686,1.0,1.960765,329.779614,0.985734,89.160722,91.055977,51.334097,18.257961,45.436594
max,12.998543,499.98322,5.995416,6.999746,456.766995,81.116054,249.886965,6.999592,94.980383,1.0,2.999715,449.808088,1.999809,149.972185,129.998641,79.996134,29.995874,139.948106


In [30]:
acd_df.isna().sum()

hemoglobin               0
ferritin                 0
ret_count                0
segmented_neutrophils    0
tibc                     0
mcv                      0
serum_iron               0
rbc                      0
age                      0
gender                   0
indirect_bilirubin       0
transferrin              0
creatinine               0
cholestrol               0
copper                   0
ethanol                  0
folate                   0
glucose                  0
label                    0
dtype: int64

#### 4 - Iron deficiency anemia

In [31]:
ida_df = df[df.label=='Iron deficiency anemia']
ida_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,age,gender,indirect_bilirubin,transferrin,creatinine,cholestrol,copper,ethanol,folate,glucose,label
2,12.749357,5.012158,3.5028,6.179371,498.418768,76.759285,159.834784,5.018156,67.065069,1,2.705719,442.293823,1.022939,56.850479,75.739552,72.072041,20.600875,44.872138,Iron deficiency anemia
7,7.299399,57.923466,4.354479,-1.0,455.930031,78.721302,-1.0,4.753823,87.508815,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,Iron deficiency anemia
30,8.383559,5.983171,4.89252,3.444228,470.758184,77.466536,183.104614,4.15962,51.654358,1,0.004,405.461045,-1.0,55.449844,54.555321,4.529687,4.68084,-1.0,Iron deficiency anemia
34,7.381669,64.669719,-1.0,-1.0,473.419233,78.938753,-1.0,5.301884,88.433827,0,2.564396,-1.0,-1.0,-1.0,-1.0,55.942383,-1.0,-1.0,Iron deficiency anemia
48,12.577713,54.8205,5.418057,-1.0,457.004738,75.536541,61.734053,3.767722,62.866585,1,0.316076,297.909421,1.14508,84.23805,71.089879,56.237142,12.416872,48.479169,Iron deficiency anemia


In [32]:
for i in ['mcv', 'ferritin', 'tibc']:
    ida_df[i] = acd_df[i].sample(frac=1-0.033)
ida_df.isna().sum()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



hemoglobin                  0
ferritin                 9267
ret_count                   0
segmented_neutrophils       0
tibc                     9267
mcv                      9267
serum_iron                  0
rbc                         0
age                         0
gender                      0
indirect_bilirubin          0
transferrin                 0
creatinine                  0
cholestrol                  0
copper                      0
ethanol                     0
folate                      0
glucose                     0
label                       0
dtype: int64

In [33]:
ida_df['mcv'] = ida_df['mcv'].fillna(random.uniform(80, 85))
ida_df['ferritin'] = ida_df['ferritin'].fillna(random.uniform(100, 130))
ida_df['tibc'] = ida_df['tibc'].fillna(random.uniform(350, 450))
ida_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,age,gender,indirect_bilirubin,transferrin,creatinine,cholestrol,copper,ethanol,folate,glucose,label
2,12.749357,102.608165,3.5028,6.179371,392.192182,84.460898,159.834784,5.018156,67.065069,1,2.705719,442.293823,1.022939,56.850479,75.739552,72.072041,20.600875,44.872138,Iron deficiency anemia
7,7.299399,102.608165,4.354479,-1.0,392.192182,84.460898,-1.0,4.753823,87.508815,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,Iron deficiency anemia
30,8.383559,102.608165,4.89252,3.444228,392.192182,84.460898,183.104614,4.15962,51.654358,1,0.004,405.461045,-1.0,55.449844,54.555321,4.529687,4.68084,-1.0,Iron deficiency anemia
34,7.381669,102.608165,-1.0,-1.0,392.192182,84.460898,-1.0,5.301884,88.433827,0,2.564396,-1.0,-1.0,-1.0,-1.0,55.942383,-1.0,-1.0,Iron deficiency anemia
48,12.577713,102.608165,5.418057,-1.0,392.192182,84.460898,61.734053,3.767722,62.866585,1,0.316076,297.909421,1.14508,84.23805,71.089879,56.237142,12.416872,48.479169,Iron deficiency anemia


In [34]:
acd_df.describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,age,gender,indirect_bilirubin,transferrin,creatinine,cholestrol,copper,ethanol,folate,glucose
count,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0,9756.0
mean,9.545389,234.267264,0.414354,2.28634,259.086031,77.630279,83.26239,3.665466,44.390412,0.568881,0.775897,187.393763,-0.068955,46.222987,50.358691,27.885954,9.125749,23.178494
std,2.016049,152.461105,2.158694,2.655859,125.47209,1.549744,83.695479,2.54139,30.547835,0.495258,1.353074,160.425471,1.101041,50.121552,45.694139,26.898381,10.323096,42.985072
min,6.000049,-1.0,-1.0,-1.0,-1.0,75.000936,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,7.840677,101.701194,-1.0,-1.0,168.377631,76.322233,-1.0,2.846329,22.305801,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
50%,9.53654,232.880576,-1.0,2.196146,263.442321,77.587504,65.671875,4.2339,45.801206,1.0,0.89361,212.828437,-1.0,29.247477,51.303291,23.097128,6.342442,-1.0
75%,11.300983,366.129426,1.777969,4.632642,362.477985,78.869283,156.791453,5.609224,70.309686,1.0,1.960765,329.779614,0.985734,89.160722,91.055977,51.334097,18.257961,45.436594
max,12.998543,499.98322,5.995416,6.999746,456.766995,81.116054,249.886965,6.999592,94.980383,1.0,2.999715,449.808088,1.999809,149.972185,129.998641,79.996134,29.995874,139.948106


In [35]:
acd_df.isna().sum()

hemoglobin               0
ferritin                 0
ret_count                0
segmented_neutrophils    0
tibc                     0
mcv                      0
serum_iron               0
rbc                      0
age                      0
gender                   0
indirect_bilirubin       0
transferrin              0
creatinine               0
cholestrol               0
copper                   0
ethanol                  0
folate                   0
glucose                  0
label                    0
dtype: int64

#### 5 - Hemolytic anemia

In [None]:
hem_df = df[df.label=='Hemolytic anemia']
hem_df.head()

In [None]:
for i in ['mcv', 'ret_count']:
    hem_df[i] = hem_df[i].sample(frac=1-0.05)
hem_df.isna().sum()

In [None]:
hem_df['mcv'] = hem_df['mcv'].fillna(random.uniform(75, 80))
hem_df['ret_count'] = hem_df['ferritin'].fillna(random.uniform(1.5, 2))
hem_df.head()

In [None]:
hem_df.describe()

In [None]:
hem_df.isna().sum()

#### 6 - Aplastic anemia

In [None]:
aplastic_df = df[df.label=='Aplastic anemia']
aplastic_df.head()

In [None]:
for i in ['mcv', 'ret_count']:
    aplastic_df[i] = aplastic_df[i].sample(frac=1-0.05)
aplastic_df.isna().sum()

In [None]:
aplastic_df['mcv'] = hem_df['mcv'].fillna(random.uniform(75, 80))
aplastic_df['ret_count'] = hem_df['ferritin'].fillna(random.uniform(1.5, 2))
hem_df.head()