In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
import os
import torch
import sys
sys.path.append('../..')
from modules.many_features import utils, constants
import matplotlib.pyplot as plt
%matplotlib inline

In [21]:
# One seed for every random number generator this notebook touches, so that
# Restart & Run All reproduces the same results.
SEED = 42

# NOTE: the running interpreter fixed its hash seed at startup; exporting
# PYTHONHASHSEED here is only seen by processes launched from this kernel.
os.environ['PYTHONHASHSEED'] = str(SEED)

# Seed the stdlib, NumPy and PyTorch generators with the same value.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

# Ask PyTorch to use deterministic algorithms (it raises on ops that have none).
torch.use_deterministic_algorithms(True)

In [22]:
def _label_low_mcv(row):
    """Label an anemic row whose MCV is below 80, using ferritin and then TIBC."""
    ferritin = row['ferritin']
    if ferritin < 30:
        return 'Iron deficiency anemia'
    if ferritin > 100:
        return 'Anemia of chronic disease'

    # Ferritin is now either within [30, 100] or missing; TIBC decides.
    tibc = row['tibc']
    ferritin_at_least_30 = ferritin >= 30  # False when ferritin is NaN
    if ferritin_at_least_30 and tibc < 450:
        return 'Anemia of chronic disease'
    if ferritin_at_least_30 and tibc >= 450:
        return 'Iron deficiency anemia'
    # Missing ferritin, or ferritin in range but TIBC missing.
    return 'Inconclusive diagnosis'


def _label_mid_mcv(row):
    """Label an anemic row whose MCV lies in [80, 100], using the reticulocyte count."""
    ret_count = row['ret_count']
    if ret_count <= 2:
        return 'Aplastic anemia'
    if ret_count > 2:
        return 'Hemolytic anemia'
    # Both comparisons are False only when ret_count is NaN.
    return 'Inconclusive diagnosis'


def _label_high_mcv(row):
    """Label an anemic row whose MCV is above 100, using segmented neutrophils."""
    neutrophils = row['segmented_neutrophils']
    if neutrophils > 0:
        return 'Vitamin B12/Folate deficiency anemia'
    if neutrophils == 0:
        return 'Unspecified anemia'
    # Missing (NaN) or negative value.
    return 'Inconclusive diagnosis'


def create_label(row):
    """Derive the diagnosis label for one record from its lab values.

    Rules, applied in order:
      1. hemoglobin > 13, or hemoglobin > 12 with gender == 0 -> 'No anemia'.
      2. Otherwise the record is split on ``mcv`` (< 80, <= 100, > 100) and
         labelled from ferritin/TIBC, reticulocyte count, or segmented
         neutrophils respectively.
      3. When the value a rule needs is NaN, every comparison on it is False
         and the result is 'Inconclusive diagnosis'.

    Args:
        row: Mapping-like record (for example a DataFrame row) indexed by
            column name. Only the keys needed by the branch taken are read.

    Returns:
        str: One of the diagnosis label strings.
    """
    hemoglobin = row['hemoglobin']
    if hemoglobin > 13:
        return 'No anemia'

    gender_is_zero = row['gender'] == 0
    if hemoglobin > 12 and gender_is_zero:
        return 'No anemia'

    mcv = row['mcv']
    if mcv < 80:
        return _label_low_mcv(row)
    if mcv <= 100:
        return _label_mid_mcv(row)
    if mcv > 100:
        return _label_high_mcv(row)
    # All three comparisons are False only when mcv is NaN.
    return 'Inconclusive diagnosis'

In [23]:
# Load the dataset used for the rest of the notebook and preview it.
# Missing values stay as NaN (no sentinel fill such as fillna(-1)).
# Alternative input files (currently disabled):
#   ../../data/anemia_synth_dataset_some_nans_unspecified_more_feats.csv
#   ../../data/more_features/synth_dataset_by_type_more_feats.csv
dataset_path = '../../final/data/dataset_by_type.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,age,gender,indirect_bilirubin,transferrin,creatinine,cholestrol,copper,ethanol,folate,glucose,label
0,11.87567,161.10407,3.981397,,358.191297,77.370085,,6.918734,74.271409,0,,441.651953,0.508604,86.888802,,,,,Anemia of chronic disease
1,9.296779,457.149834,1.874231,5.544915,,78.717345,,3.99921,70.112738,0,,414.333999,0.784442,37.659905,,,,,Anemia of chronic disease
2,9.780377,483.21655,4.041312,5.515052,304.155693,100.750651,118.428782,6.273777,87.682835,1,2.705387,350.319553,1.129411,140.917968,47.714319,62.722981,20.039071,46.973234,Vitamin B12/Folate deficiency anemia
3,9.131688,57.578776,5.778568,1.976049,207.684776,96.85522,,3.773108,39.42505,0,,353.517996,1.569724,26.393979,,,,,Hemolytic anemia
4,11.39568,260.627627,3.892581,1.320481,318.250045,102.206351,,,,0,,,,,,,,,Vitamin B12/Folate deficiency anemia


In [24]:
# Rows per diagnosis label in the dataset as loaded.
df.label.value_counts()

Aplastic anemia                         10000
Hemolytic anemia                        10000
Unspecified anemia                      10000
Vitamin B12/Folate deficiency anemia    10000
No anemia                               10000
Anemia of chronic disease               10000
Iron deficiency anemia                  10000
Name: label, dtype: int64

In [25]:
# Missing-value count per column in the dataset as loaded.
df.isna().sum()

hemoglobin                   0
ferritin                 24000
ret_count                14000
segmented_neutrophils    12000
tibc                     22806
mcv                       5000
serum_iron               35000
rbc                      21000
age                      14000
gender                       0
indirect_bilirubin       49000
transferrin              21000
creatinine               14000
cholestrol               14000
copper                   42000
ethanol                  35000
folate                   42000
glucose                  35000
label                        0
dtype: int64

In [26]:
# Baseline evaluation on the data as loaded. The helper returns a 4-tuple of
# three floats and a timedelta; presumably decision-tree metrics plus elapsed
# time — TODO confirm against modules.many_features.utils.
utils.get_dt_performance(df)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations



(0.9997857142857143,
 0.9997857142723214,
 0.9998750000000001,
 datetime.timedelta(microseconds=2000))

In [27]:
# Columns handed to the NaN generator: every column except the ones listed
# here (original column order is preserved).
col_list = [
    name
    for name in df.columns
    if name not in {'hemoglobin', 'gender', 'age', 'mcv', 'label'}
]

In [28]:
# Fraction handed to utils.generate_nans. Going by the isna() counts before and
# after this cell, roughly this share of each listed column's non-missing values
# is blanked — TODO confirm against utils.generate_nans.
nan_frac = 0.1
# The helper receives a copy of `df`, not the loaded frame itself.
trial_df = utils.generate_nans(df.copy(), col_list, nan_frac)

In [29]:
# Missing-value count per column after the extra NaNs were generated.
trial_df.isna().sum()

hemoglobin                   0
ferritin                 28600
ret_count                19600
segmented_neutrophils    17800
tibc                     27525
mcv                       5000
serum_iron               38500
rbc                      25900
age                      14000
gender                       0
indirect_bilirubin       51100
transferrin              25900
creatinine               19600
cholestrol               19600
copper                   44800
ethanol                  38500
folate                   44800
glucose                  38500
label                        0
dtype: int64

In [30]:
# Preview the first rows of the frame with the extra NaNs.
trial_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,age,gender,indirect_bilirubin,transferrin,creatinine,cholestrol,copper,ethanol,folate,glucose,label
0,11.87567,161.10407,3.981397,,358.191297,77.370085,,6.918734,74.271409,0,,441.651953,0.508604,86.888802,,,,,Anemia of chronic disease
1,9.296779,457.149834,1.874231,5.544915,,78.717345,,3.99921,70.112738,0,,414.333999,0.784442,37.659905,,,,,Anemia of chronic disease
2,9.780377,483.21655,4.041312,5.515052,304.155693,100.750651,118.428782,6.273777,87.682835,1,2.705387,350.319553,1.129411,140.917968,47.714319,62.722981,20.039071,46.973234,Vitamin B12/Folate deficiency anemia
3,9.131688,57.578776,5.778568,1.976049,207.684776,96.85522,,3.773108,39.42505,0,,353.517996,1.569724,26.393979,,,,,Hemolytic anemia
4,11.39568,,,1.320481,318.250045,102.206351,,,,0,,,,,,,,,Vitamin B12/Folate deficiency anemia


In [31]:
# Label counts after NaN generation; 'label' is excluded from col_list, so the
# labels here are still the ones read from the CSV.
trial_df.label.value_counts()

Aplastic anemia                         10000
Hemolytic anemia                        10000
Unspecified anemia                      10000
Vitamin B12/Folate deficiency anemia    10000
No anemia                               10000
Anemia of chronic disease               10000
Iron deficiency anemia                  10000
Name: label, dtype: int64

In [32]:
# trial_df.to_csv(f'../../final/data/more_feats_{nan_frac}.csv', index=False)

In [33]:
# Same evaluation on the frame with extra NaNs, still using the labels read
# from the CSV (they may no longer match what the remaining values imply).
utils.get_dt_performance(trial_df)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations



(0.9647857142857142,
 0.9647924366247957,
 0.9794583333333333,
 datetime.timedelta(microseconds=3001))

In [34]:
# Recompute every label from the values that survived NaN generation, so rows
# that lost a deciding value become 'Inconclusive diagnosis', then re-evaluate.
# This overwrites trial_df['label'] in place; later cells rely on that.
trial_df['label'] = trial_df.apply(create_label, axis=1)
utils.get_dt_performance(trial_df)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations



(0.9995,
 0.9994990407834472,
 0.9997316430824832,
 datetime.timedelta(microseconds=1995))

In [35]:
# Persist the relabelled frame (without the index column) for later use.
trial_df.to_csv(
    '../../final/data/dataset_with_inconclusive.csv',
    index=False,
)

In [36]:
# Label distribution after relabelling, now including 'Inconclusive diagnosis'.
trial_df.label.value_counts()

No anemia                               10000
Aplastic anemia                          9029
Hemolytic anemia                         9025
Vitamin B12/Folate deficiency anemia     9004
Unspecified anemia                       8964
Anemia of chronic disease                8859
Iron deficiency anemia                   8398
Inconclusive diagnosis                   6721
Name: label, dtype: int64

In [37]:
# Row count of the relabelled frame (sanity check against the loaded dataset).
len(trial_df)

70000