In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
import os
import torch
import sys
sys.path.append('../..')
from modules.many_features import utils, constants
import matplotlib.pyplot as plt
%matplotlib inline

In [18]:
# Seed Python's `random`, NumPy and PyTorch with one shared value so the run is repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so assigning it here
# does not change hash randomization in this kernel; it is only inherited by child processes.
os.environ['PYTHONHASHSEED']=str(SEED)
torch.manual_seed(SEED)
# Ask PyTorch to use deterministic implementations only (ops without one raise an error).
torch.use_deterministic_algorithms(True)

In [19]:
def create_label(row):
    """Assign an anemia diagnosis label to one record using fixed lab thresholds.

    Parameters
    ----------
    row : mapping-like (e.g. the ``pandas.Series`` passed by ``DataFrame.apply(..., axis=1)``)
        Indexed by the keys ``hemoglobin``, ``gender``, ``mcv``, ``ferritin``,
        ``tibc``, ``ret_count`` and ``segmented_neutrophils``. Only the keys on
        the branch actually taken are read.

    Returns
    -------
    str
        One of ``'No anemia'``, ``'Iron deficiency anemia'``,
        ``'Anemia of chronic disease'``, ``'Aplastic anemia'``,
        ``'Hemolytic anemia'``, ``'Vitamin B12/Folate deficiency anemia'``,
        ``'Unspecified anemia'`` or ``'Inconclusive diagnosis'``.

    Notes
    -----
    Missing values are handled implicitly: every comparison against NaN is
    False, so each pair of complementary tests below (``< x`` followed by
    ``>= x``) is deliberately kept -- when both fail the value is missing and
    the result is ``'Inconclusive diagnosis'``.

    NOTE(review): a missing ``ferritin`` does not yield an inconclusive result;
    it falls through to the TIBC test. A missing ``hemoglobin`` is treated as
    anemic. Confirm both are intended.
    """
    hemoglobin = row['hemoglobin']

    # Hemoglobin cut-off: 13 in general, 12 for gender code 0
    # (presumably female -- confirm against the dataset encoding).
    if hemoglobin > 13:
        return 'No anemia'
    if hemoglobin > 12 and row['gender'] == 0:
        return 'No anemia'

    mcv = row['mcv']

    if mcv < 80:
        # Low MCV: ferritin decides first, TIBC breaks the tie.
        ferritin = row['ferritin']
        if ferritin < 30:
            return 'Iron deficiency anemia'
        if ferritin > 100:
            return 'Anemia of chronic disease'
        # Reached when 30 <= ferritin <= 100, or when ferritin is NaN.
        tibc = row['tibc']
        if tibc < 450:
            return 'Anemia of chronic disease'
        if tibc >= 450:
            return 'Iron deficiency anemia'
        return 'Inconclusive diagnosis'  # tibc is NaN

    if mcv <= 100:
        # 80 <= MCV <= 100: reticulocyte count decides.
        ret_count = row['ret_count']
        if ret_count <= 2:
            return 'Aplastic anemia'
        if ret_count > 2:
            return 'Hemolytic anemia'
        return 'Inconclusive diagnosis'  # ret_count is NaN

    if mcv > 100:
        # High MCV: segmented neutrophils decide.
        segmented = row['segmented_neutrophils']
        if segmented > 0:
            return 'Vitamin B12/Folate deficiency anemia'
        if segmented == 0:
            return 'Unspecified anemia'
        return 'Inconclusive diagnosis'  # negative or NaN

    # mcv is NaN: none of the three range tests matched.
    return 'Inconclusive diagnosis'

In [20]:
# Alternative input file, kept for reference but not used in this run:
#df = pd.read_csv('../../data/anemia_synth_dataset_some_nans_unspecified_more_feats.csv')
# Load the synthetic dataset that the rest of the notebook works on.
df = pd.read_csv('../../data/more_features/synth_dataset_by_type_more_feats.csv')
# Missing values are left as NaN here; the -1 fill below is disabled.
# df = df.fillna(-1)
df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,age,gender,indirect_bilirubin,transferrin,creatinine,cholestrol,copper,ethanol,folate,glucose,label
0,11.183192,187.573466,4.951674,1.661383,316.593436,95.006424,181.242992,6.758433,60.586525,1,2.976104,282.291951,,28.040619,96.721542,49.530542,23.186628,,Hemolytic anemia
1,14.387445,,,,,,125.249617,6.350652,86.964793,1,2.968983,158.844879,,98.357508,112.758764,62.464566,7.16892,,No anemia
2,12.749357,5.012158,3.5028,6.179371,498.418768,76.759285,159.834784,5.018156,67.065069,1,2.705719,442.293823,1.022939,56.850479,75.739552,72.072041,20.600875,44.872138,Iron deficiency anemia
3,11.50887,197.180945,1.200125,0.0,457.033309,102.900301,131.177927,6.656823,66.403181,1,1.869338,223.896404,,111.220307,66.999185,18.353272,14.132423,,Unspecified anemia
4,9.456656,427.952052,,0.660252,,104.543774,,,,0,,,,,,,,,Vitamin B12/Folate deficiency anemia


In [21]:
# Class balance of the labels as loaded from disk.
df.label.value_counts()

Vitamin B12/Folate deficiency anemia    10000
Unspecified anemia                      10000
No anemia                               10000
Aplastic anemia                         10000
Anemia of chronic disease               10000
Hemolytic anemia                        10000
Iron deficiency anemia                  10000
Name: label, dtype: int64

In [22]:
# Per-column missing-value counts before any additional NaNs are injected.
df.isna().sum()

hemoglobin                   0
ferritin                 20000
ret_count                20000
segmented_neutrophils    17000
tibc                     21000
mcv                       5000
serum_iron               21000
rbc                       7000
age                      14000
gender                       0
indirect_bilirubin       14000
transferrin              21000
creatinine               35000
cholestrol               21000
copper                   21000
ethanol                  14000
folate                   21000
glucose                  49000
label                        0
dtype: int64

In [23]:
# Baseline score on the untouched dataset.
# NOTE(review): the helper returns a 3-tuple; confirm in modules.many_features.utils
# which metrics these are and in what order.
utils.get_dt_performance(df)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations



(0.9995714285714286, 0.9995713411666153, 0.99975)

In [24]:
# Columns handed to the NaN-injection step: every column except the ones named here.
_excluded_from_nans = ('hemoglobin', 'gender', 'age', 'mcv', 'label')
col_list = [name for name in df.columns if name not in _excluded_from_nans]

In [25]:
# nan_frac is reused below in the output file names.
# NOTE(review): presumably the fraction of currently non-missing values blanked per
# column in col_list -- confirm against utils.generate_nans.
nan_frac = 0.1
# Work on a copy so the loaded `df` keeps its original values.
trial_df = utils.generate_nans(df.copy(), col_list, nan_frac)

In [26]:
# Missing-value counts after injection; columns left out of col_list should be unchanged.
trial_df.isna().sum()

hemoglobin                   0
ferritin                 25000
ret_count                25000
segmented_neutrophils    22300
tibc                     25900
mcv                       5000
serum_iron               25900
rbc                      13300
age                      14000
gender                       0
indirect_bilirubin       19600
transferrin              25900
creatinine               38500
cholestrol               25900
copper                   25900
ethanol                  19600
folate                   25900
glucose                  51100
label                        0
dtype: int64

In [27]:
# Spot-check the first rows of the NaN-augmented frame.
trial_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,age,gender,indirect_bilirubin,transferrin,creatinine,cholestrol,copper,ethanol,folate,glucose,label
0,11.183192,187.573466,4.951674,1.661383,316.593436,95.006424,181.242992,6.758433,60.586525,1,2.976104,282.291951,,28.040619,96.721542,49.530542,23.186628,,Hemolytic anemia
1,14.387445,,,,,,125.249617,6.350652,86.964793,1,2.968983,158.844879,,98.357508,112.758764,62.464566,7.16892,,No anemia
2,12.749357,5.012158,3.5028,6.179371,498.418768,76.759285,159.834784,5.018156,67.065069,1,2.705719,442.293823,1.022939,56.850479,75.739552,72.072041,20.600875,44.872138,Iron deficiency anemia
3,11.50887,197.180945,1.200125,0.0,457.033309,102.900301,131.177927,6.656823,66.403181,1,1.869338,223.896404,,111.220307,66.999185,18.353272,14.132423,,Unspecified anemia
4,9.456656,427.952052,,0.660252,,104.543774,,,,0,,,,,,,,,Vitamin B12/Folate deficiency anemia


In [28]:
# `label` was not in col_list, so the class counts should match the loaded data.
trial_df.label.value_counts()

Vitamin B12/Folate deficiency anemia    10000
Unspecified anemia                      10000
No anemia                               10000
Aplastic anemia                         10000
Anemia of chronic disease               10000
Hemolytic anemia                        10000
Iron deficiency anemia                  10000
Name: label, dtype: int64

In [29]:
# Save the NaN-augmented features with their original labels; nan_frac is part of the file name.
trial_df.to_csv(f'../../data/more_features/more_feats_{nan_frac}.csv', index=False)

In [30]:
# Score on the NaN-augmented data while it still carries the original labels.
utils.get_dt_performance(trial_df)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations



(0.9735238095238096, 0.9735286083517992, 0.9845555555555556)

In [31]:
# Recompute every label from the NaN-augmented features with create_label, so rows whose
# deciding lab value is now missing come back as 'Inconclusive diagnosis'.
# This overwrites `label` in place; the original-label version was already written to CSV above.
trial_df['label'] = trial_df.apply(create_label, axis=1)
utils.get_dt_performance(trial_df)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations



(0.9991428571428571, 0.999034983412542, 0.9993843570360474)

In [32]:
# Save the relabelled frame under a separate name so the original-label CSV is not overwritten.
trial_df.to_csv(f'../../data/more_features/more_feats_new_labels_{nan_frac}.csv', index=False)

In [33]:
# Class counts after relabelling; 'Inconclusive diagnosis' now appears as an extra class.
trial_df.label.value_counts()

No anemia                               10000
Anemia of chronic disease                9756
Iron deficiency anemia                   9267
Unspecified anemia                       9033
Aplastic anemia                          9020
Vitamin B12/Folate deficiency anemia     9000
Hemolytic anemia                         8976
Inconclusive diagnosis                   4948
Name: label, dtype: int64