In [2]:
import pandas as pd
import numpy as np
import random
import joblib
import sys
sys.path.append('../..')
from modules.many_features import utils, constants
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# One seed shared by both random sources used in this notebook: the stdlib
# `random` module (used in add_nans) and NumPy's global generator (used for
# all feature sampling). SEED is also passed as `random_state` in add_nans.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [4]:
# CLASS_DICT comes from the project's constants module (contents not shown here).
# NOTE(review): class_dict is not referenced again in the cells visible in this
# notebook - confirm whether it is still needed.
class_dict = constants.CLASS_DICT

### The anemia features dataset

In [5]:
def create_label(row):
    """Return the rule-based diagnosis label for one patient record.

    `row` is any mapping-like object (dict or DataFrame row) indexed by
    'hemoglobin', 'gender', 'mcv', 'ferritin', 'tibc', 'ret_count' and
    'segmented_neutrophils'. Keys are only read when the rule that needs
    them is reached.

    Rules, in order:
      * hemoglobin > 13, or hemoglobin > 12 with gender == 0 -> 'No anemia'
      * mcv < 80: decided by ferritin first, then by tibc
      * mcv <= 100: decided by ret_count
      * mcv > 100: decided by segmented_neutrophils

    Comparisons against NaN are False, so a missing deciding value falls
    through to 'Inconclusive diagnosis'.
    """
    hemoglobin = row['hemoglobin']
    if hemoglobin > 13:
        return 'No anemia'
    # Bitwise `&` is kept on purpose: both operands are always evaluated.
    if (hemoglobin > 12) & (row['gender']==0):
        return 'No anemia'

    mcv = row['mcv']

    if mcv < 80:
        ferritin = row['ferritin']
        if ferritin < 30:
            return 'Iron deficiency anemia'
        if ferritin > 100:
            return 'Anemia of chronic disease'
        # Ferritin is between 30 and 100 (or missing): tibc decides.
        tibc = row['tibc']
        if tibc < 450:
            return 'Anemia of chronic disease'
        if tibc >= 450:
            return 'Iron deficiency anemia'
        return 'Inconclusive diagnosis'

    if mcv <= 100:
        ret_count = row['ret_count']
        if ret_count <= 2:
            return 'Aplastic anemia'
        if ret_count > 2:
            return 'Hemolytic anemia'
        return 'Inconclusive diagnosis'

    if mcv > 100:
        neutrophils = row['segmented_neutrophils']
        if neutrophils > 0:
            return 'Vitamin B12/Folate deficiency anemia'
        if neutrophils == 0:
            return 'Unspecified anemia'
        return 'Inconclusive diagnosis'

    # Only reachable when mcv is NaN (every comparison above was False).
    return 'Inconclusive diagnosis'

In [6]:
def uniform_dist(df, feat_dict, num=None):
    """Draw uniform random values for one feature.

    Args:
        df: Unused. Kept in the signature so existing call sites keep working.
        feat_dict: Mapping with 'abnorm_lower' and 'abnorm_upper' keys giving
            the sampling bounds.
        num: Forwarded as the `size` argument of `np.random.uniform`; with
            None a single float is returned instead of an array.

    Returns:
        Samples drawn from NumPy's global generator between
        feat_dict['abnorm_lower'] and feat_dict['abnorm_upper'].

    Raises:
        KeyError: if either bound is missing from `feat_dict`.
    """
    # The previous version also read feat_dict['name'] into an unused local,
    # which made a 'name' key mandatory for no reason.
    return np.random.uniform(feat_dict['abnorm_lower'], feat_dict['abnorm_upper'], num)

In [7]:
def add_nans(df, exclude_col_list=None):
    """Blank out a random fraction of the values in each non-excluded column.

    For every column not listed in `exclude_col_list`, a fraction between 0.1
    and 0.7 (rounded to one decimal) is drawn with the stdlib `random` module,
    and that fraction of rows is set to NaN.

    Args:
        df: DataFrame to modify. It is mutated in place and also returned.
        exclude_col_list: Column names to leave untouched. Defaults to no
            exclusions.

    Returns:
        The same DataFrame object that was passed in.

    NOTE(review): the row sample always uses random_state=SEED, so every
    column samples from the same seeded state. The notebook outputs show the
    same rows masked across columns (e.g. rows 0 and 3 are NaN in every
    masked column) - confirm this correlated missingness is intended.
    """
    # `None` instead of a mutable `[]` default, which would be shared between calls.
    excluded = () if exclude_col_list is None else exclude_col_list
    for col in df.columns:
        if col in excluded:
            continue
        col_frac = round(random.uniform(0.1, 0.7), 1)
        masked_rows = df.sample(frac=col_frac, random_state=SEED).index
        df.loc[masked_rows, col] = np.nan
    return df

In [8]:
def get_feature_dict(col_name, feat_arr):
    """Return the first feature spec in `feat_arr` whose 'name' equals `col_name`.

    Args:
        col_name: Feature name to look up.
        feat_arr: Iterable of dicts, each with a 'name' key.

    Returns:
        The matching dict itself (not a copy).

    Raises:
        IndexError: if no entry matches. The exception type is the one the
            previous list-indexing implementation raised; the message now
            names the missing feature.
    """
    # Stop at the first match instead of building a list of every match.
    for feat in feat_arr:
        if feat['name'] == col_name:
            return feat
    raise IndexError(f"no feature named {col_name!r} in feat_arr")

In [9]:
def populate_columns(df, col_list, feat_arr, sample_num=None):
    """Fill each column in `col_list` with uniform samples from its feature spec.

    The spec for a column is looked up by name in `feat_arr` and sampled via
    uniform_dist(); `sample_num` is forwarded as the number of samples.
    `df` is modified in place and returned.
    """
    for column in col_list:
        df[column] = uniform_dist(df, get_feature_dict(column, feat_arr), sample_num)
    return df

In [10]:
# Feature specs for the columns that drive the anemia label.
# 'abnorm_lower'/'abnorm_upper' are the bounds sampled by uniform_dist();
# 'lower'/'upper' are read directly by the per-class cells below
# (presumably the normal reference range - TODO confirm).
# 'gender' has no abnorm_* keys, so it cannot be passed to uniform_dist().
anem_feat_arr = [{'name':'hemoglobin', 'lower':12, 'upper':17.2, 'abnorm_lower':6, 'abnorm_upper':18},
                 {'name':'ferritin', 'lower':10, 'upper':263, 'abnorm_lower':0, 'abnorm_upper': 500},
                 {'name':'ret_count', 'lower':0.5, 'upper':2.5, 'abnorm_lower':0, 'abnorm_upper':6},
                 {'name':'segmented_neutrophils', 'lower':0, 'upper':0, 'abnorm_lower':0.1, 'abnorm_upper':7},
                 {'name':'tibc', 'lower':250, 'upper':450, 'abnorm_lower': 100, 'abnorm_upper':520},
                 {'name':'mcv', 'lower':80, 'upper':100, 'abnorm_lower': 75, 'abnorm_upper':105},
                 {'name':'gender', 'lower':0, 'upper':1}, #0=Female 1=male
                ]

#### 0 - No anemia

In [11]:
# "No anemia" cohort (10000 rows). Hemoglobin is drawn between the hemoglobin
# spec's 'lower' and 'upper' values (12 and 17.2).
no_df = pd.DataFrame()
hb_dict = get_feature_dict('hemoglobin', anem_feat_arr)
gender_dict = get_feature_dict('gender', anem_feat_arr)
no_df['hemoglobin'] = np.random.uniform(hb_dict['lower'], hb_dict['upper'], 10000)
no_df['gender'] = np.random.choice([gender_dict['lower'], gender_dict['upper']], 10000)
# create_label() only accepts hemoglobin in (12, 13] as non-anemic when
# gender == 0, so those rows are forced to female.
no_df.loc[no_df.hemoglobin <= 13, 'gender'] = 0
# The remaining features are never read by create_label() for this class:
# fill them with uniform noise, then mask part of them with NaN.
cols_to_populate = ['ferritin', 'ret_count', 'segmented_neutrophils', 'tibc', 'mcv']
no_df = populate_columns(no_df, cols_to_populate, anem_feat_arr, len(no_df) )
no_df = add_nans(no_df, ['hemoglobin', 'gender'])
no_df['label'] = 'No anemia'
no_df.head()

Unnamed: 0,hemoglobin,gender,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
0,13.947609,0,,,,,,No anemia
1,16.943714,1,373.373386,0.364522,0.368434,443.030136,75.431234,No anemia
2,15.806368,1,281.33339,3.625152,4.324596,517.638263,97.032331,No anemia
3,15.113024,1,,,,,,No anemia
4,12.811297,0,92.790118,3.016328,5.001375,245.380029,101.330348,No anemia


In [12]:
# Sanity check: every row with hemoglobin <= 13 should be female (gender == 0).
no_df.loc[no_df['hemoglobin'] <= 13, 'gender'].value_counts()

0    1964
Name: gender, dtype: int64

In [13]:
# NaN count per column; hemoglobin and gender were excluded from masking.
no_df.isnull().sum()

hemoglobin                  0
gender                      0
ferritin                 5000
ret_count                1000
segmented_neutrophils    3000
tibc                     2000
mcv                      5000
label                       0
dtype: int64

#### 1 - Vitamin B12/Folate deficiency anemia

In [14]:
# "Vitamin B12/Folate deficiency anemia" cohort (10000 rows).
vitamin_df = pd.DataFrame()
mcv_dict = get_feature_dict('mcv', anem_feat_arr)
neutrophils_dict = get_feature_dict('segmented_neutrophils', anem_feat_arr)
# Hemoglobin stays below 13 so the first 'No anemia' rule never fires.
vitamin_df['hemoglobin'] = np.random.uniform(hb_dict['abnorm_lower'], 13, 10000)
vitamin_df['gender'] = np.random.choice([gender_dict['lower'], gender_dict['upper']], 10000)
# A female with hemoglobin > 12 would be labelled 'No anemia', so rows at or
# above 12 are forced to male.
vitamin_df.loc[vitamin_df.hemoglobin >= 12, 'gender'] = 1
# mcv between the spec's 'upper' (100) and 'abnorm_upper' (105) targets the
# `mcv > 100` branch of create_label(); neutrophils start at 0.1, i.e. > 0.
vitamin_df['mcv'] = np.random.uniform(mcv_dict['upper'], mcv_dict['abnorm_upper'], 10000)
vitamin_df['segmented_neutrophils'] = np.random.uniform(neutrophils_dict['abnorm_lower'], neutrophils_dict['abnorm_upper'], 10000)
# Features not read by create_label() on this path: uniform noise, partly masked.
cols_to_populate = ['ferritin', 'ret_count', 'tibc']
vitamin_df = populate_columns(vitamin_df, cols_to_populate, anem_feat_arr, len(vitamin_df) )
vitamin_df = add_nans(vitamin_df, ['hemoglobin', 'gender', 'mcv', 'segmented_neutrophils'])
vitamin_df['label'] = 'Vitamin B12/Folate deficiency anemia'
vitamin_df.head()

Unnamed: 0,hemoglobin,gender,mcv,segmented_neutrophils,ferritin,ret_count,tibc,label
0,9.269562,0,104.090822,6.152398,,,,Vitamin B12/Folate deficiency anemia
1,8.327591,1,100.726349,6.35949,263.485825,0.653504,326.684881,Vitamin B12/Folate deficiency anemia
2,11.164463,1,104.732319,2.710769,175.518475,,358.462258,Vitamin B12/Folate deficiency anemia
3,6.85752,1,104.216121,1.698252,,,,Vitamin B12/Folate deficiency anemia
4,11.251758,1,104.594266,1.422461,182.548318,0.449292,317.390831,Vitamin B12/Folate deficiency anemia


In [15]:
# Sanity check: every row with hemoglobin > 12 should be male (gender == 1).
vitamin_df.loc[vitamin_df['hemoglobin'] > 12, 'gender'].value_counts()

1    1378
Name: gender, dtype: int64

In [16]:
# Summary statistics; a count below 10000 means the column contains NaNs.
vitamin_df.describe()

Unnamed: 0,hemoglobin,gender,mcv,segmented_neutrophils,ferritin,ret_count,tibc
count,10000.0,10000.0,10000.0,10000.0,5000.0,4000.0,8000.0
mean,9.506998,0.5667,102.505217,3.518018,251.616909,3.01292,306.856057
std,2.00363,0.495556,1.458266,1.985366,143.760443,1.740609,121.033976
min,6.001058,0.0,100.000644,0.100203,0.089257,0.000165,100.023894
25%,7.775635,0.0,101.219134,1.789674,126.847341,1.509914,202.407273
50%,9.500934,1.0,102.521189,3.526537,253.112849,3.027165,305.073081
75%,11.264747,1.0,103.79497,5.221714,374.549804,4.541689,410.96443
max,12.999126,1.0,104.99996,6.999834,499.887714,5.999175,519.926256


#### 2 - Unspecified anemia

In [17]:
# "Unspecified anemia" cohort (10000 rows): same hemoglobin, gender and mcv
# setup as the vitamin B12 cohort, but segmented_neutrophils is exactly 0,
# which is what create_label() maps to 'Unspecified anemia' when mcv > 100.
unspec_df = pd.DataFrame()
unspec_df['hemoglobin'] = np.random.uniform(hb_dict['abnorm_lower'], 13, 10000)
unspec_df['gender'] = np.random.choice([gender_dict['lower'], gender_dict['upper']], 10000)
# Rows at or above 12 are forced to male; a female above 12 would be 'No anemia'.
unspec_df.loc[unspec_df.hemoglobin >= 12, 'gender'] = 1
unspec_df['mcv'] = np.random.uniform(mcv_dict['upper'], mcv_dict['abnorm_upper'], 10000)
unspec_df['segmented_neutrophils'] = 0
# Features not read by create_label() on this path: uniform noise, partly masked.
cols_to_populate = ['ferritin', 'ret_count', 'tibc']
unspec_df = populate_columns(unspec_df, cols_to_populate, anem_feat_arr, len(unspec_df) )
unspec_df = add_nans(unspec_df, ['hemoglobin', 'gender', 'mcv', 'segmented_neutrophils'])
unspec_df['label'] = 'Unspecified anemia'
unspec_df.head()

Unnamed: 0,hemoglobin,gender,mcv,segmented_neutrophils,ferritin,ret_count,tibc,label
0,8.897282,1,100.186242,0,,,,Unspecified anemia
1,6.387051,1,101.063104,0,219.772768,1.94141,429.74776,Unspecified anemia
2,10.407141,1,102.646372,0,42.283279,3.35404,300.573225,Unspecified anemia
3,7.477302,0,100.842741,0,,,,Unspecified anemia
4,8.376418,1,101.236387,0,358.155498,3.13425,172.914617,Unspecified anemia


In [18]:
# Sanity check: every row with hemoglobin > 12 should be male (gender == 1).
unspec_df.loc[unspec_df['hemoglobin'] > 12, 'gender'].value_counts()

1    1474
Name: gender, dtype: int64

In [19]:
# Summary statistics; a count below 10000 means the column contains NaNs.
unspec_df.describe()

Unnamed: 0,hemoglobin,gender,mcv,segmented_neutrophils,ferritin,ret_count,tibc
count,10000.0,10000.0,10000.0,10000.0,6000.0,9000.0,8000.0
mean,9.531576,0.5744,102.517243,0.0,249.695172,3.001376,311.53302
std,2.026889,0.494458,1.440068,0.0,144.05915,1.721664,120.802324
min,6.000688,0.0,100.000033,0.0,0.013121,2.9e-05,100.016626
25%,7.761613,0.0,101.270239,0.0,124.413597,1.519737,209.096483
50%,9.546192,1.0,102.517298,0.0,249.580882,3.019181,309.866299
75%,11.300111,1.0,103.757773,0.0,372.891696,4.46625,414.345418
max,12.999343,1.0,104.999261,0.0,499.806736,5.999898,519.962172


#### 3 - Anemia of chronic disease

In [20]:
# "Anemia of chronic disease" cohort (10000 rows).
acd_df = pd.DataFrame()
ferritin_dict = get_feature_dict('ferritin', anem_feat_arr)
tibc_dict = get_feature_dict('tibc', anem_feat_arr)
acd_df['hemoglobin'] = np.random.uniform(hb_dict['abnorm_lower'], 13, 10000)
acd_df['gender'] = np.random.choice([gender_dict['lower'], gender_dict['upper']], 10000)
# Rows at or above 12 are forced to male; a female above 12 would be 'No anemia'.
acd_df.loc[acd_df.hemoglobin >= 12, 'gender'] = 1

# mcv between 'abnorm_lower' (75) and 'lower' (80) targets the `mcv < 80`
# branch of create_label(). Ferritin starts at 30, so the `ferritin < 30`
# iron-deficiency rule is never hit.
acd_df['mcv'] = np.random.uniform(mcv_dict['abnorm_lower'], mcv_dict['lower'], 10000)
acd_df['ferritin'] = np.random.uniform(30, ferritin_dict['abnorm_upper'], 10000)

def define_tibc_acd(row):
    """Pick a TIBC value consistent with 'Anemia of chronic disease'.

    With ferritin > 100 the label no longer depends on TIBC, so the value is
    either a draw from the full abnormal range or NaN (chosen at random).
    Otherwise TIBC is drawn between 'abnorm_lower' and 'upper' of the TIBC
    spec (100 and 450), the range that keeps it under the 450 cut-off used by
    create_label().
    """
    tibc_floor = tibc_dict['abnorm_lower']
    # `not (... > 100)` rather than `<= 100` so a NaN ferritin still takes this path.
    if not row['ferritin'] > 100:
        return np.random.uniform(tibc_floor, tibc_dict['upper'])
    candidate = np.random.uniform(tibc_floor, tibc_dict['abnorm_upper'])
    return np.random.choice([candidate, np.nan])
# TIBC is generated row by row because its allowed range depends on ferritin.
acd_df['tibc'] = acd_df.apply(define_tibc_acd, axis=1)
# Features not read by create_label() on this path: uniform noise, partly masked.
cols_to_populate = ['segmented_neutrophils', 'ret_count']
acd_df = populate_columns(acd_df, cols_to_populate, anem_feat_arr, sample_num=len(acd_df))
acd_df = add_nans(acd_df, exclude_col_list=['hemoglobin', 'gender', 'mcv', 'ferritin', 'tibc'])
acd_df['label'] = 'Anemia of chronic disease'
acd_df.head()

Unnamed: 0,hemoglobin,gender,mcv,ferritin,tibc,segmented_neutrophils,ret_count,label
0,9.006692,1,76.41294,197.854151,430.375081,,,Anemia of chronic disease
1,11.215429,0,77.293383,460.495274,207.339958,1.453271,3.808731,Anemia of chronic disease
2,12.714183,1,75.496077,491.686175,,4.751321,4.13262,Anemia of chronic disease
3,7.1752,1,77.234185,484.700078,,,,Anemia of chronic disease
4,9.93212,1,76.015407,325.287898,,4.646368,3.546949,Anemia of chronic disease


In [21]:
# Sanity check: every row with hemoglobin > 12 should be male (gender == 1).
acd_df.loc[acd_df['hemoglobin'] > 12, 'gender'].value_counts()

1    1438
Name: gender, dtype: int64

In [22]:
# Summary statistics; a count below 10000 means the column contains NaNs.
acd_df.describe()

Unnamed: 0,hemoglobin,gender,mcv,ferritin,tibc,segmented_neutrophils,ret_count
count,10000.0,10000.0,10000.0,10000.0,5742.0,6000.0,9000.0
mean,9.516764,0.5738,77.475436,266.739027,302.418116,3.576772,2.98388
std,2.022136,0.494548,1.448824,136.214971,118.49676,2.003097,1.73079
min,6.001025,0.0,75.000243,30.016259,100.033023,0.100697,0.001621
25%,7.752493,0.0,76.203111,149.274362,199.117521,1.841976,1.490006
50%,9.541962,1.0,77.474494,268.467278,303.4689,3.582921,2.971043
75%,11.262215,1.0,78.745885,384.461853,403.291022,5.299657,4.479369
max,12.997036,1.0,79.999875,499.990253,519.949468,6.999424,5.997868


#### 4 - Iron deficiency anemia

In [23]:
# "Iron deficiency anemia" cohort (10000 rows). Reuses ferritin_dict and
# tibc_dict defined in the chronic-disease cell.
ida_df = pd.DataFrame()
ida_df['hemoglobin'] = np.random.uniform(hb_dict['abnorm_lower'], 13, 10000)
ida_df['gender'] = np.random.choice([gender_dict['lower'], gender_dict['upper']], 10000)
# Rows at or above 12 are forced to male; a female above 12 would be 'No anemia'.
ida_df.loc[ida_df.hemoglobin >= 12, 'gender'] = 1

# mcv between 'abnorm_lower' (75) and 'lower' (80) targets the `mcv < 80`
# branch of create_label(). Ferritin is capped at 100, so the
# `ferritin > 100` chronic-disease rule is never hit.
ida_df['mcv'] = np.random.uniform(mcv_dict['abnorm_lower'], mcv_dict['lower'], 10000)
ida_df['ferritin'] = np.random.uniform(ferritin_dict['abnorm_lower'], 100, 10000)

def define_tibc_ida(row):
    """Pick a TIBC value consistent with 'Iron deficiency anemia'.

    With ferritin < 30 the label no longer depends on TIBC, so the value is
    either a draw from the full abnormal range or NaN (chosen at random).
    Otherwise TIBC is drawn between 'upper' and 'abnorm_upper' of the TIBC
    spec (450 and 520), matching the `tibc >= 450` rule in create_label().
    """
    tibc_ceiling = tibc_dict['abnorm_upper']
    if row['ferritin'] < 30:
        candidate = np.random.uniform(tibc_dict['abnorm_lower'], tibc_ceiling)
        return np.random.choice([candidate, np.nan])
    return np.random.uniform(tibc_dict['upper'], tibc_ceiling)
# TIBC is generated row by row because its allowed range depends on ferritin.
ida_df['tibc'] = ida_df.apply(define_tibc_ida, axis=1)
# Features not read by create_label() on this path: uniform noise, partly masked.
cols_to_populate = ['segmented_neutrophils', 'ret_count']
ida_df = populate_columns(ida_df, cols_to_populate, anem_feat_arr, sample_num=len(ida_df))
ida_df = add_nans(ida_df, exclude_col_list=['hemoglobin', 'gender', 'mcv', 'ferritin', 'tibc'])
ida_df['label'] = 'Iron deficiency anemia'
ida_df.head()

Unnamed: 0,hemoglobin,gender,mcv,ferritin,tibc,segmented_neutrophils,ret_count,label
0,8.739591,0,75.499873,10.223622,,,,Iron deficiency anemia
1,12.308411,1,75.04587,27.010797,,6.821118,1.120181,Iron deficiency anemia
2,7.329874,0,76.344396,23.971183,169.052418,4.316831,0.556877,Iron deficiency anemia
3,9.591604,0,76.30374,23.003296,201.826714,,,Iron deficiency anemia
4,9.157939,1,76.145714,9.571018,,3.820393,1.102272,Iron deficiency anemia


In [26]:
# Sanity check: every row with hemoglobin > 12 should be male (gender == 1).
ida_df.loc[ida_df['hemoglobin'] > 12, 'gender'].value_counts()

1    1451
Name: gender, dtype: int64

In [27]:
# Summary statistics; a count below 10000 means the column contains NaNs.
ida_df.describe()

Unnamed: 0,hemoglobin,gender,mcv,ferritin,tibc,segmented_neutrophils,ret_count
count,10000.0,10000.0,10000.0,10000.0,8452.0,8000.0,5000.0
mean,9.522538,0.5685,77.525815,49.84068,452.248005,3.552903,2.96074
std,2.009474,0.49531,1.440955,28.973932,88.27709,1.998648,1.731056
min,6.000102,0.0,75.000284,0.025086,100.142381,0.101881,0.000819
25%,7.801548,0.0,76.298169,24.293553,457.958816,1.830958,1.440847
50%,9.510291,1.0,77.545498,50.006449,478.467001,3.528856,2.942783
75%,11.252608,1.0,78.79638,74.841577,498.846032,5.305122,4.477652
max,12.999124,1.0,79.99938,99.991081,519.999167,6.999498,5.998827


#### 5 - Hemolytic anemia

In [28]:
# "Hemolytic anemia" cohort (10000 rows).
hemolytic_df = pd.DataFrame()
ret_dict = get_feature_dict('ret_count', anem_feat_arr)
hemolytic_df['hemoglobin'] = np.random.uniform(hb_dict['abnorm_lower'], 13, 10000)
hemolytic_df['gender'] = np.random.choice([gender_dict['lower'], gender_dict['upper']], 10000)
# Rows at or above 12 are forced to male; a female above 12 would be 'No anemia'.
hemolytic_df.loc[hemolytic_df.hemoglobin >= 12, 'gender'] = 1

# mcv between 'lower' (80) and 'upper' (100) targets the `mcv <= 100` branch
# of create_label(); ret_count starts at 2.1, i.e. above the cut-off of 2.
hemolytic_df['mcv'] = np.random.uniform(mcv_dict['lower'], mcv_dict['upper'], 10000)
hemolytic_df['ret_count'] = np.random.uniform(2.1, ret_dict['abnorm_upper'], 10000)
# Features not read by create_label() on this path: uniform noise, partly masked.
cols_to_populate = ['ferritin', 'segmented_neutrophils', 'tibc']
hemolytic_df = populate_columns(hemolytic_df, cols_to_populate, anem_feat_arr, len(hemolytic_df) )
hemolytic_df = add_nans(hemolytic_df, ['hemoglobin', 'gender', 'mcv', 'ret_count'])
hemolytic_df['label'] = 'Hemolytic anemia'
hemolytic_df.head()

Unnamed: 0,hemoglobin,gender,mcv,ret_count,ferritin,segmented_neutrophils,tibc,label
0,12.793732,1,81.666105,2.957869,,,,Hemolytic anemia
1,11.379106,1,99.461658,4.226354,230.198204,3.878443,367.377833,Hemolytic anemia
2,9.762204,1,88.048182,5.082087,467.824495,0.780075,484.973091,Hemolytic anemia
3,12.201397,1,91.315328,4.158392,,,,Hemolytic anemia
4,12.875781,1,80.346447,4.5589,76.335966,5.588108,498.08804,Hemolytic anemia


In [29]:
# Sanity check: every row with hemoglobin > 12 should be male (gender == 1).
hemolytic_df.loc[hemolytic_df['hemoglobin'] > 12, 'gender'].value_counts()

1    1482
Name: gender, dtype: int64

In [30]:
# Summary statistics; a count below 10000 means the column contains NaNs.
hemolytic_df.describe()

Unnamed: 0,hemoglobin,gender,mcv,ret_count,ferritin,segmented_neutrophils,tibc
count,10000.0,10000.0,10000.0,10000.0,6000.0,8000.0,5000.0
mean,9.511801,0.5742,89.944778,4.04932,251.330987,3.548285,309.097978
std,2.018278,0.494488,5.762967,1.125629,145.920329,1.998267,122.657731
min,6.000156,0.0,80.003731,2.100888,0.050552,0.101228,100.150008
25%,7.744496,0.0,84.942035,3.079984,124.737485,1.806383,202.471507
50%,9.504062,1.0,89.88689,4.059641,250.794316,3.559358,308.611262
75%,11.261889,1.0,94.922445,5.00762,380.457586,5.27563,417.291024
max,12.999962,1.0,99.992973,5.999721,499.938474,6.99473,519.939216


#### 6 - Aplastic anemia

In [31]:
# "Aplastic anemia" cohort (10000 rows). Reuses ret_dict defined in the
# hemolytic cell.
aplastic_df = pd.DataFrame()
aplastic_df['hemoglobin'] = np.random.uniform(hb_dict['abnorm_lower'], 13, 10000)
aplastic_df['gender'] = np.random.choice([gender_dict['lower'], gender_dict['upper']], 10000)
# Rows at or above 12 are forced to male; a female above 12 would be 'No anemia'.
aplastic_df.loc[aplastic_df.hemoglobin >= 12, 'gender'] = 1

# mcv between 'lower' (80) and 'upper' (100) targets the `mcv <= 100` branch
# of create_label(); ret_count is capped at 2, matching the `ret_count <= 2` rule.
aplastic_df['mcv'] = np.random.uniform(mcv_dict['lower'], mcv_dict['upper'], 10000)
aplastic_df['ret_count'] = np.random.uniform(ret_dict['abnorm_lower'], 2, 10000)
# Features not read by create_label() on this path: uniform noise, partly masked.
cols_to_populate = ['ferritin', 'segmented_neutrophils', 'tibc']
aplastic_df = populate_columns(aplastic_df, cols_to_populate, anem_feat_arr, len(aplastic_df) )
aplastic_df = add_nans(aplastic_df, ['hemoglobin', 'gender', 'mcv', 'ret_count'])
aplastic_df['label'] = 'Aplastic anemia'
aplastic_df.head()

Unnamed: 0,hemoglobin,gender,mcv,ret_count,ferritin,segmented_neutrophils,tibc,label
0,10.671367,1,88.126494,1.518342,,,,Aplastic anemia
1,9.124143,0,85.846038,1.921482,251.073623,2.191953,488.138603,Aplastic anemia
2,12.054159,1,96.104986,0.384968,,1.133875,,Aplastic anemia
3,11.763536,0,82.230693,1.58994,,,,Aplastic anemia
4,11.07139,1,96.139974,0.426865,279.222554,1.420518,230.365854,Aplastic anemia


In [32]:
# Sanity check: every row with hemoglobin > 12 should be male (gender == 1).
aplastic_df.loc[aplastic_df['hemoglobin'] > 12, 'gender'].value_counts()

1    1390
Name: gender, dtype: int64

In [33]:
# Summary statistics; a count below 10000 means the column contains NaNs.
aplastic_df.describe()

Unnamed: 0,hemoglobin,gender,mcv,ret_count,ferritin,segmented_neutrophils,tibc
count,10000.0,10000.0,10000.0,10000.0,4000.0,9000.0,4000.0
mean,9.510379,0.5644,90.003377,1.0038,246.931925,3.523916,308.802999
std,2.005302,0.49586,5.815534,0.575735,144.992487,1.984865,121.3173
min,6.00028,0.0,80.002849,0.000188,0.040196,0.100215,100.143863
25%,7.785462,0.0,85.019381,0.508853,119.367208,1.796033,203.658095
50%,9.527222,1.0,90.053757,1.005512,245.906619,3.520492,307.574579
75%,11.243505,1.0,95.01818,1.50059,373.276279,5.207357,413.376843
max,12.999876,1.0,99.99968,1.999882,499.926461,6.99991,519.696706


In [34]:
# Stack the seven per-class cohorts into one frame.
total_anem_df = pd.concat(
    [no_df, vitamin_df, unspec_df, acd_df, ida_df, hemolytic_df, aplastic_df],
    axis=0,
)
# Re-derive every label with the rule function and require it to match the
# label assigned at generation time, including after NaN masking.
total_anem_df['new_label'] = total_anem_df.apply(create_label, axis=1)
assert total_anem_df['label'].tolist() == total_anem_df['new_label'].tolist()
total_anem_df = total_anem_df.drop(columns=['new_label'])
# Shuffle the rows and rebuild a clean 0..n-1 index.
total_anem_df = total_anem_df.sample(frac=1).reset_index(drop=True)
total_anem_df.head()

Unnamed: 0,hemoglobin,gender,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
0,11.87567,0,161.10407,3.981397,,358.191297,77.370085,Anemia of chronic disease
1,9.296779,0,457.149834,1.874231,5.544915,,78.717345,Anemia of chronic disease
2,9.780377,1,483.21655,4.041312,5.515052,304.155693,100.750651,Vitamin B12/Folate deficiency anemia
3,9.131688,0,57.578776,5.778568,1.976049,207.684776,96.85522,Hemolytic anemia
4,11.39568,0,260.627627,3.892581,1.320481,318.250045,102.206351,Vitamin B12/Folate deficiency anemia


### The other features dataset

In [35]:
# Feature specs for the additional columns. None of these is read by
# create_label(); create_dataset() samples them between 'abnorm_lower' and
# 'abnorm_upper' via uniform_dist().
# NOTE(review): for 'copper', abnorm_upper (130) is below upper (140), unlike
# every other entry - confirm the intended bound.
# NOTE(review): 'cholestrol' is misspelled, but it is the column name written
# to the output CSV, so renaming it would change the dataset schema.
add_feat_arr = [{'name':'serum_iron', 'lower':60, 'upper':170, 'abnorm_lower':20, 'abnorm_upper':250},
                {'name':'rbc', 'lower':3.8, 'upper':5.9, 'abnorm_lower': 2.5, 'abnorm_upper':7},
                {'name':'age', 'lower':20, 'upper':90, 'abnorm_lower':18, 'abnorm_upper':95},
                {'name':'indirect_bilirubin', 'lower':0.2, 'upper':1.2, 'abnorm_lower':0, 'abnorm_upper':3},
                {'name':'transferrin', 'lower':204, 'upper':360, 'abnorm_lower':150, 'abnorm_upper':450},
                {'name':'creatinine', 'lower':0.6, 'upper':1.3, 'abnorm_lower':0.2, 'abnorm_upper':2},
                {'name':'cholestrol', 'lower':30, 'upper':100, 'abnorm_lower':0, 'abnorm_upper':150},
                {'name':'copper', 'lower':62, 'upper':140, 'abnorm_lower':30, 'abnorm_upper':130},
                {'name':'ethanol', 'lower':0, 'upper':50, 'abnorm_lower':0, 'abnorm_upper':80},
                {'name':'folate', 'lower':2.7, 'upper':17, 'abnorm_lower':0.5, 'abnorm_upper':30},
                {'name':'glucose', 'lower':70, 'upper':100, 'abnorm_lower':40, 'abnorm_upper':140}
               ]

In [36]:
def create_dataset(sample_num, feat_arr, dist_type='uniform'):
    """Build a DataFrame with one randomly generated column per feature spec.

    Args:
        sample_num: Number of rows to generate.
        feat_arr: Iterable of feature-spec dicts; each needs a 'name' key.
        dist_type: 'normal' routes non-gender features to normal_dist();
            any other value uses uniform_dist().

    Returns:
        A new DataFrame whose columns follow the order of `feat_arr`.

    A feature named 'gender' is always a random choice between its 'lower'
    and 'upper' values, whatever `dist_type` is.
    """
    dataset = pd.DataFrame()
    use_normal = dist_type == 'normal'
    for feat_dict in feat_arr:
        name = feat_dict['name']
        if name == 'gender':
            values = np.random.choice([feat_dict['lower'], feat_dict['upper']], sample_num)
        elif use_normal:
            # NOTE(review): normal_dist is not defined in the cells visible in
            # this notebook; this path raises NameError unless it is defined
            # elsewhere - confirm.
            values = normal_dist(dataset, feat_dict, sample_num)
        else:
            values = uniform_dist(dataset, feat_dict, sample_num)
        dataset[name] = values
    return dataset

In [37]:
# One row of additional features per anemia row, then NaN masking on every
# column (no exclusions are passed to add_nans).
add_ft_df = create_dataset(len(total_anem_df), add_feat_arr)
add_ft_df = add_nans(add_ft_df)
# NaNs are left in place here; the earlier fill step below is disabled.
#add_ft_df = add_ft_df.fillna(-1)
add_ft_df.head()

Unnamed: 0,serum_iron,rbc,age,indirect_bilirubin,transferrin,creatinine,cholestrol,copper,ethanol,folate,glucose
0,,6.918734,74.271409,,441.651953,0.508604,86.888802,,,,
1,,3.99921,70.112738,,414.333999,0.784442,37.659905,,,,
2,118.428782,6.273777,87.682835,2.705387,350.319553,1.129411,140.917968,47.714319,62.722981,20.039071,46.973234
3,,3.773108,39.42505,,353.517996,1.569724,26.393979,,,,
4,,,,,,,,,,,


In [38]:
# Join the two frames side by side. total_anem_df had its index reset above
# and add_ft_df was built with the same number of rows.
# NOTE(review): assumes both carry the same default 0..n-1 index so rows
# align - confirm.
total_df = pd.concat([total_anem_df, add_ft_df], axis=1)
# Fix the column order of the final dataset, with 'label' last.
total_df = total_df[['hemoglobin', 'ferritin', 'ret_count', 'segmented_neutrophils', 'tibc', 'mcv', 'serum_iron', 'rbc', 'age', 
                     'gender', 'indirect_bilirubin', 'transferrin', 'creatinine', 'cholestrol', 'copper', 'ethanol', 'folate', 
                     'glucose', 'label']]
total_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,age,gender,indirect_bilirubin,transferrin,creatinine,cholestrol,copper,ethanol,folate,glucose,label
0,11.87567,161.10407,3.981397,,358.191297,77.370085,,6.918734,74.271409,0,,441.651953,0.508604,86.888802,,,,,Anemia of chronic disease
1,9.296779,457.149834,1.874231,5.544915,,78.717345,,3.99921,70.112738,0,,414.333999,0.784442,37.659905,,,,,Anemia of chronic disease
2,9.780377,483.21655,4.041312,5.515052,304.155693,100.750651,118.428782,6.273777,87.682835,1,2.705387,350.319553,1.129411,140.917968,47.714319,62.722981,20.039071,46.973234,Vitamin B12/Folate deficiency anemia
3,9.131688,57.578776,5.778568,1.976049,207.684776,96.85522,,3.773108,39.42505,0,,353.517996,1.569724,26.393979,,,,,Hemolytic anemia
4,11.39568,260.627627,3.892581,1.320481,318.250045,102.206351,,,,0,,,,,,,,,Vitamin B12/Folate deficiency anemia


In [39]:
# Write the combined dataset to CSV without the index column. The path is
# relative to the kernel's working directory.
total_df.to_csv('../../final/data/dataset_by_type.csv', index=False)

### Adding Missing Values

See the notebook on different variations of missing data.