In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
SEED = 20
random.seed(SEED)
np.random.seed(SEED)

In [9]:
feat_arr = [{'name':'hemoglobin', 'lower':12.1, 'upper':17.2, 'mean':10.3, 'std':2.3, 'min':(0, 5)},
             {'name':'ferritin', 'lower':10, 'upper':263, 'mean':697, 'std':3305, 'min':(1.6, 8)},
             {'name':'ret_count', 'lower':0.5, 'upper':2.5, 'mean':2.6, 'std':2.5, 'min':(0, 0.5)},
             {'name':'segmented_neutrophils', 'lower':0, 'upper':0, 'mean':0, 'std':2.4, 'min':(0, 0)},
             {'name':'iron', 'lower':60, 'upper':170, 'mean':50, 'std':44, 'min':(0.9, 40)},
             {'name':'tibc', 'lower':250, 'upper':450, 'mean':242, 'std':90, 'min':(3, 100)},
             {'name':'rbc', 'lower':3.92, 'upper':5.65, 'mean':3.5, 'std':0.8, 'min':(0, 3)},
             {'name':'mcv', 'lower':80, 'upper':100, 'mean':90, 'std':8, 'min':(0, 65)}]

In [10]:
def clip_value(val, feat_dict):
    if val<=0:
        new_val = np.random.uniform(feat_dict['min'][0], feat_dict['min'][1])
    else:
        new_val = val
    return new_val                                                         

In [11]:
def normal_dist(df, col_name, num=None, cond=None):
    for feat_dict in feat_arr:
        if feat_dict['name'] == col_name:
            if cond is None:
                df[col_name] = np.random.normal(feat_dict['mean'], feat_dict['std'], num)
            else:
                df[col_name] = np.random.normal(feat_dict['mean'], feat_dict['std'] if cond else np.nan)
            df[col_name] = [np.random.uniform(feat_dict['min'][0], feat_dict['min'][1]) if val<0 else val for val in df[col_name]]
    return df

In [12]:
def less_mcv(df):
    irons = []
    ferritins = []
    tibcs = []
    for i, row in df.iterrows():
        if row['mcv']<80:
            iron = clip_value(np.random.normal(feat_arr[4]['mean'], feat_arr[4]['std']), feat_arr[4])
            ferritin = clip_value(np.random.normal(feat_arr[1]['mean'], feat_arr[1]['std']), feat_arr[1])
            tibc = clip_value(np.random.normal(feat_arr[5]['mean'], feat_arr[5]['std']), feat_arr[5])
        else:
            iron = ferritin = tibc = np.nan
        irons.append(iron)
        ferritins.append(ferritin)
        tibcs.append(tibc)
    return irons, ferritins, tibcs

In [13]:
def normal_mcv(row):
    if (row['mcv']>=80) & (row['mcv'] <=100):
        return clip_value(np.random.normal(feat_arr[2]['mean'], feat_arr[2]['std']), feat_arr[2])
    else:
        return np.nan

In [14]:
def more_mcv(row):
    if row['mcv']> 100:
        return clip_value(np.random.normal(feat_arr[3]['mean'], feat_arr[3]['std']), feat_arr[3])
    else:
        return np.nan

In [15]:
synth_df= pd.DataFrame(columns = [feat['name'] for feat in feat_arr if feat['name']!='hemoglobin'], dtype='object')
# synth_df = normal_dist(synth_df, 'mcv', 30000)
# synth_df = normal_dist(synth_df, 'rbc', 30000)
synth_df = normal_dist(synth_df, 'mcv', 60000)
synth_df = normal_dist(synth_df, 'rbc', 60000)
synth_df['mentzer_index'] = synth_df['mcv']/synth_df['rbc']
synth_df['iron'], synth_df['ferritin'], synth_df['tibc'] = less_mcv(synth_df)
synth_df['ret_count'] = synth_df.apply(lambda row: normal_mcv(row), axis=1)
synth_df['segmented_neutrophils'] = synth_df.apply(lambda row: more_mcv(row), axis=1)
synth_df.head()

Unnamed: 0,ferritin,ret_count,segmented_neutrophils,iron,tibc,rbc,mcv,mentzer_index
0,,7.419683,,,,3.268986,97.071145,29.694573
1,,0.768225,,,,5.228592,91.56692,17.512729
2,,0.095205,,,,2.698679,92.860292,34.409535
3,5.773862,,,65.675268,272.340699,3.632368,71.253905,19.616379
4,,1.831053,,,,3.509544,81.321339,23.171482


In [16]:
synth_df.describe()

Unnamed: 0,ferritin,ret_count,segmented_neutrophils,iron,tibc,rbc,mcv,mentzer_index
count,6338.0,47274.0,6388.0,6338.0,6338.0,60000.0,60000.0,60000.0
mean,1665.447625,2.840236,0.973489,54.591765,244.060611,3.504759,90.03863,27.333883
std,2150.026915,2.142463,1.412296,35.425152,89.192841,0.800871,8.033262,8.577876
min,1.60559,5.7e-05,0.0,0.041118,0.4418,0.19927,59.27154,10.88774
25%,5.343627,0.91786,0.0,26.512426,183.325951,2.965273,84.62854,21.988112
50%,610.183715,2.613531,0.072052,49.164302,242.022214,3.501786,90.019109,25.706606
75%,2891.00713,4.287805,1.634197,78.569418,303.406348,4.044705,95.423225,30.637738
max,13040.444622,12.229931,9.058435,222.002964,593.019934,7.035352,122.280231,415.455275


In [17]:
def create_label(row):
    if row['mcv']<80 :
        if row['mentzer_index']<13:
            return 'Thalassemia'
        elif row['ferritin']<30:
            return 'Iron deficiency anemia'
        elif row['ferritin']>100:
            return 'Anemia of chronic disease'
        elif row['tibc']<450:
            return 'Anemia of chronic disease'
        elif row['tibc']>=450:
            return 'Iron deficiency anemia'
        else:
            return 'Inconclusive diagnosis'
        
    elif row['mcv']<=100:
        if row['ret_count'] <=2:
            return 'Aplastic anemia'
        elif row['ret_count']>2:
            return 'Hemolytic anemia'
        else:
            return 'Inconclusive diagnosis'
        
    elif row['mcv']> 100:
        if row['segmented_neutrophils']>0:
            return 'Vitamin B12/Folate deficiency anemia'
        elif row['segmented_neutrophils'] == 0:
            return 'Unspecified anemia'
        else:
            return 'Inconclusive diagnosis'
    else:
        return 'Inconclusive diagnosis'

In [18]:
synth_df['label'] = synth_df.apply(lambda row: create_label(row), axis=1)
synth_df.head()

Unnamed: 0,ferritin,ret_count,segmented_neutrophils,iron,tibc,rbc,mcv,mentzer_index,label
0,,7.419683,,,,3.268986,97.071145,29.694573,Hemolytic anemia
1,,0.768225,,,,5.228592,91.56692,17.512729,Aplastic anemia
2,,0.095205,,,,2.698679,92.860292,34.409535,Aplastic anemia
3,5.773862,,,65.675268,272.340699,3.632368,71.253905,19.616379,Iron deficiency anemia
4,,1.831053,,,,3.509544,81.321339,23.171482,Aplastic anemia


In [19]:
synth_df.isna().sum()

ferritin                 53662
ret_count                12726
segmented_neutrophils    53612
iron                     53662
tibc                     53662
rbc                          0
mcv                          0
mentzer_index                0
label                        0
dtype: int64

In [14]:
# creating the unspecified anemia dataset
# unspecified_anem_df = synth_df[synth_df.label.isna()]
# unspecified_anem_df['label'] = 'Unspecified anemia'
# unspecified_anem_df.fillna(0, inplace=True)
# print(len(unspecified_anem_df))
# unspecified_anem_df.to_csv(f'data/unspecified_anemia_dataset.csv', index=False)

In [21]:
#synth_df = synth_df[synth_df.label.notna()]
synth_df = synth_df.sample(frac=1, random_state=SEED).reset_index(drop=True)
synth_df.isna().sum()

ferritin                 53662
ret_count                12726
segmented_neutrophils    53612
iron                     53662
tibc                     53662
rbc                          0
mcv                          0
mentzer_index                0
label                        0
dtype: int64

In [37]:
synth_df['ferritin'].isna().sum()

26775

In [38]:
len(synth_df)

30000

In [24]:
synth_df.label.value_counts()

Hemolytic anemia                        28222
Aplastic anemia                         19052
Anemia of chronic disease                3607
Vitamin B12/Folate deficiency anemia     3278
Unspecified anemia                       3110
Iron deficiency anemia                   2701
Thalassemia                                30
Name: label, dtype: int64

#### Saving the data

In [17]:
#synth_df.to_csv('data/anemia_synth_dataset.csv', index=False)

#### Randomly replacing some nulls with "normal" values

In [22]:
def replace_nans(df, cols, frac): #can implement where frac is a list
    for col_name in cols:
        for feat_dict in feat_arr:
            if feat_dict['name'] == col_name:
                nan_num = df[col_name].isna().sum()
                fill_num = int(np.ceil(frac*nan_num))
                generated_nums = np.random.uniform(feat_dict['lower'], feat_dict['upper'], fill_num)
                indices = list(df[df[col_name].isna()].index)
                new_indices = random.sample(indices, fill_num)
                df.loc[new_indices, col_name] = generated_nums
    return df

In [19]:
# new_synth_df = pd.read_csv('data/anemia_synth_dataset.csv')
# new_synth_df.isna().sum()

In [40]:
# to delete
# synth_df['label'] = synth_df['label'].fillna('Unspecified anemia')
# new_synth_df = synth_df.copy()
# new_synth_df.label.value_counts()

Hemolytic anemia                        14146
Aplastic anemia                          9450
Anemia of chronic disease                1869
Unspecified anemia                       1604
Vitamin B12/Folate deficiency anemia     1575
Iron deficiency anemia                   1343
Thalassemia                                13
Name: label, dtype: int64

In [23]:
cols = ['ferritin', 'ret_count', 'segmented_neutrophils', 'iron', 'tibc']
filled_synth_df = replace_nans(new_synth_df, cols, 0.30)
filled_synth_df.isna().sum()

NameError: name 'new_synth_df' is not defined

In [42]:
filled_synth_df.head()

Unnamed: 0,ferritin,ret_count,segmented_neutrophils,iron,tibc,rbc,mcv,mentzer_index,label
0,,0.253194,,,,4.052988,88.164375,21.752932,Aplastic anemia
1,99.558391,0.087604,,156.830616,,4.220903,87.223193,20.66458,Aplastic anemia
2,134.020515,9.093607,,68.112529,,3.682854,82.5254,22.408002,Hemolytic anemia
3,,0.292759,0.0,96.968292,,4.234232,96.353048,22.755732,Aplastic anemia
4,129.481445,1.717516,1.698862,,,3.814345,103.899232,27.239074,Vitamin B12/Folate deficiency anemia


In [43]:
filled_synth_df.to_csv('data/anemia_synth_dataset_with_unspecified.csv', index=False)

In [None]:
#filled_synth_df.to_csv('data/anemia_synth_dataset_some_filled.csv', index=False)

#### Data analysis

In [None]:
analytic_df = pd.read_csv('data/anemia_synth_dataset.csv')
analytic_df.head()

In [None]:
plt.xticks(rotation=90)
sns.countplot(x='label', data=analytic_df, palette = 'Set1')

In [None]:
isna_series = analytic_df.isna().sum()

In [None]:
def plot_feature_frequencies(df):
    output_df  = pd.DataFrame()
    for col in df.columns:
        if col != 'label':
            missing_num = df.isna().sum()[col]
            present_num = df.notna().sum()[col]
            col_dict = {'feature name':col, 'present': present_num, 'missing':missing_num,}
            output_df = output_df.append(col_dict, ignore_index=True)
    output_df = output_df[['feature name', 'present', 'missing']]
    output_df.plot(x='feature name', kind='bar', figsize=(8,6), stacked=True)
    return output_df

In [None]:
output_df = plot_feature_frequencies(analytic_df)

In [None]:
output_df