In [2]:
import pandas as pd
import numpy as np
import random
import sys
sys.path.append('../..')
from modules.many_features import utils, constants
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Noise settings per feature: 'thresh' lists the values that injected noise is
# centred on, 'std' is the standard deviation of that noise.
feat_thresh_arr = [
    {'name': feat, 'thresh': centres, 'std': spread}
    for feat, centres, spread in [
        ('hemoglobin', [12], 2),
        ('ferritin', [30, 100], 2),
        ('ret_count', [2], 0.2),
        ('segmented_neutrophils', [0], 2),
        ('tibc', [450], 50),
        ('mcv', [80, 100], 2),
    ]
]

# For every anemia class, the features that receive noise. A fresh list is
# built per class so no two classes share the same list object.
anemias_features_dict = {
    anemia: list(feats)
    for anemia, feats in [
        ('Vitamin B12/Folate deficiency anemia', ('mcv', 'segmented_neutrophils')),
        ('Unspecified anemia', ('mcv', 'segmented_neutrophils')),
        ('Anemia of chronic disease', ('mcv', 'ferritin', 'tibc')),
        ('Iron deficiency anemia', ('mcv', 'ferritin', 'tibc')),
        ('Hemolytic anemia', ('mcv', 'ret_count')),
        ('Aplastic anemia', ('mcv', 'ret_count')),
    ]
}

#### The data

In [4]:
# Read the source dataset and encode every missing measurement as -1,
# the sentinel the rest of the notebook tests against.
df = pd.read_csv('../../data/more_features/more_feats_correlated_0.1.csv').fillna(-1)
df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,11.183192,187.573466,4.951674,1.661383,316.593436,95.006424,181.242992,3.531295,1,-1.0,28.040619,96.721542,49.530542,23.186628,-1.0,33.549575,57.247868,Hemolytic anemia
1,14.387445,-1.0,-1.0,-1.0,-1.0,-1.0,125.249617,-1.0,1,-1.0,98.357508,112.758764,62.464566,7.16892,-1.0,43.162335,-1.0,No anemia
2,12.749357,5.012158,3.5028,6.179371,498.418768,76.759285,159.834784,4.982859,1,1.022939,56.850479,75.739552,72.072041,20.600875,44.872138,38.248071,32.068372,Iron deficiency anemia
3,11.50887,197.180945,1.200125,0.0,457.033309,102.900301,131.177927,3.355346,1,-1.0,111.220307,66.999185,18.353272,14.132423,-1.0,34.526609,28.70205,Unspecified anemia
4,9.456656,427.952052,-1.0,0.660252,-1.0,104.543774,-1.0,2.713693,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,28.369968,-1.0,Vitamin B12/Folate deficiency anemia


In [5]:
# Class balance of the source dataset before any noise is injected.
df['label'].value_counts()

No anemia                               10000
Anemia of chronic disease                9756
Iron deficiency anemia                   9267
Unspecified anemia                       9033
Aplastic anemia                          9020
Vitamin B12/Folate deficiency anemia     9000
Hemolytic anemia                         8976
Inconclusive diagnosis                   4948
Name: label, dtype: int64

In [6]:
def get_dict(dict_arr, feat_name):
    """Look up a feature's settings by name.

    Returns the first dictionary in ``dict_arr`` whose ``'name'`` entry equals
    ``feat_name``, or ``None`` when no dictionary matches.
    """
    candidates = (entry for entry in dict_arr if entry['name'] == feat_name)
    return next(candidates, None)

In [7]:
def feat_noisiness(df, feat_name, frac, mean, std, random_state=None):
    """Return a noisy version of ``df[feat_name]`` without modifying ``df``.

    A randomly chosen ``frac`` share of the rows has its value replaced; the
    remaining rows keep their original value. For ``'segmented_neutrophils'``
    the replacement is the missing-value sentinel ``-1``; for every other
    feature it is a draw from ``Normal(mean, std)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature column. It is only read, never written to.
    feat_name : str
        Name of the column to add noise to.
    frac : float
        Share of rows (between 0 and 1) whose value is replaced.
    mean, std : float
        Centre and standard deviation of the Gaussian replacement values
        (unused for ``'segmented_neutrophils'``).
    random_state : int, numpy.random.RandomState or None, optional
        Seed or generator for reproducible noise. ``None`` (the default) keeps
        using NumPy's global random state.

    Returns
    -------
    pandas.Series
        The noisy column, indexed like ``df``.

    Raises
    ------
    ValueError
        If ``frac`` is outside ``[0, 1]``.
    """
    if not 0 <= frac <= 1:
        raise ValueError(f'frac must be between 0 and 1, got {frac}')

    if random_state is None:
        # The np.random module functions draw from the global state, which is
        # also what pandas' sample() uses when random_state is None.
        rng, sample_state = np.random, None
    elif isinstance(random_state, np.random.RandomState):
        rng = sample_state = random_state
    else:
        rng = sample_state = np.random.RandomState(random_state)

    column = df[feat_name]
    kept = column.sample(frac=1 - frac, random_state=sample_state)
    # Re-aligning to the full index leaves NaN on every row that was not kept;
    # those are the rows that receive the replacement value.
    noisy = kept.reindex(column.index)

    if feat_name == 'segmented_neutrophils':
        return noisy.fillna(-1)

    replaced = noisy.isna()
    noisy.loc[replaced] = rng.normal(mean, std, size=int(replaced.sum()))
    return noisy

In [8]:
# Total share of each anemia class's rows targeted for feature noise; the loop
# below splits it across the class's features and thresholds.
total_frac = 0.6

# Seed the global RNGs so that Restart & Run All regenerates the same dataset:
# pandas' .sample() and np.random.normal both draw from NumPy's global state
# when no explicit random_state is given.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [9]:
# Build one noisy frame per anemia class. 'No anemia' and 'Inconclusive
# diagnosis' are skipped here and added back untouched in a later cell.
noisy_anemia_dfs_list = []
for anemia in df.label.unique():
    print(anemia.upper())
    if anemia not in ['No anemia', 'Inconclusive diagnosis']:
        # .copy() makes this an independent frame, so the column assignments
        # below never write into a slice of df (SettingWithCopy).
        anemia_df = df[df.label==anemia].copy()
        feature_num = len(anemias_features_dict[anemia])
        # total_frac is split evenly across the features of this class ...
        frac = total_frac/feature_num
        for feat in anemias_features_dict[anemia]:
            feat_info = get_dict(feat_thresh_arr, feat)
            threshes, std = feat_info['thresh'], feat_info['std']
            print(f'feat:{feat}, threshes:{threshes}, std:{std}')
            # ... and again across the thresholds of each feature. Every call
            # samples from the whole column, so rows noised around one
            # threshold can be re-noised around the next.
            for thresh in threshes:
                anemia_df[feat] = feat_noisiness(anemia_df, feat, frac/len(threshes), thresh, std)
        noisy_anemia_dfs_list.append(anemia_df)

HEMOLYTIC ANEMIA
feat:mcv, threshes:[80, 100], std:2
feat:ret_count, threshes:[2], std:0.2
NO ANEMIA
IRON DEFICIENCY ANEMIA
feat:mcv, threshes:[80, 100], std:2
feat:ferritin, threshes:[30, 100], std:2
feat:tibc, threshes:[450], std:50
UNSPECIFIED ANEMIA
feat:mcv, threshes:[80, 100], std:2
feat:segmented_neutrophils, threshes:[0], std:2
VITAMIN B12/FOLATE DEFICIENCY ANEMIA
feat:mcv, threshes:[80, 100], std:2
feat:segmented_neutrophils, threshes:[0], std:2
APLASTIC ANEMIA
feat:mcv, threshes:[80, 100], std:2
feat:ret_count, threshes:[2], std:0.2
ANEMIA OF CHRONIC DISEASE
feat:mcv, threshes:[80, 100], std:2
feat:ferritin, threshes:[30, 100], std:2
feat:tibc, threshes:[450], std:50
INCONCLUSIVE DIAGNOSIS


In [10]:
# Sanity check: summary statistics of the first noised class frame (the first
# anemia class encountered in df.label.unique() order).
noisy_anemia_dfs_list[0].describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat
count,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0
mean,9.50073,110.430343,3.44151,0.638844,162.5096,90.253161,86.187883,3.177761,0.576315,-0.050466,46.883783,50.446564,28.819956,9.366428,23.455645,28.502189,17.936511
std,2.024398,157.668881,1.341431,2.49478,173.564923,7.303839,84.598208,0.711503,0.494169,1.098499,50.03804,45.228159,27.080726,10.400227,42.993476,6.073194,34.550425
min,6.000214,-1.0,1.228725,-1.0,-1.0,72.835875,-1.0,1.805443,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,18.000641,-1.0
25%,7.721399,-1.0,2.165211,-1.0,-1.0,83.582233,-1.0,2.565631,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,23.164198,-1.0
50%,9.489891,-1.0,3.221504,-1.0,129.648213,90.404025,70.876841,3.164961,1.0,-1.0,30.613549,51.246214,24.942002,6.795334,-1.0,28.469674,-1.0
75%,11.273937,218.951524,4.630136,2.266321,318.520326,97.005647,161.452599,3.74729,1.0,1.003649,90.121746,90.569093,52.199525,18.514494,47.277566,33.82181,28.674039
max,12.99857,499.897137,5.999972,6.999613,499.967563,106.202373,249.978161,4.844795,1.0,1.999669,149.968003,129.999938,79.998067,29.99698,139.995018,38.995711,234.107919


In [11]:
# The two classes that receive no feature noise are taken straight from df.
no_df = df.loc[df['label'] == 'No anemia']
inconc_df = df.loc[df['label'] == 'Inconclusive diagnosis']
(len(no_df), len(inconc_df))

(10000, 4948)

In [12]:
# Add the untouched classes after the noised ones (order: no anemia, then
# inconclusive). Re-running this cell on its own appends them a second time.
noisy_anemia_dfs_list.extend([no_df, inconc_df])
len(noisy_anemia_dfs_list)

8

In [13]:
# Stack all class frames, shuffle the rows and renumber the index from 0.
combined_df = (
    pd.concat(noisy_anemia_dfs_list, axis=0)
    .sample(frac=1)
    .reset_index(drop=True)
)
combined_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,10.246017,23.079085,-1.0,-1.0,319.72267,81.172047,-1.0,3.942489,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,30.738052,-1.0,Iron deficiency anemia
1,11.182131,405.470307,3.8042,-1.0,461.725618,99.729919,-1.0,3.363724,0,1.267152,-1.0,-1.0,64.077503,-1.0,-1.0,33.546392,-1.0,Hemolytic anemia
2,6.57067,119.277868,-1.0,-1.0,286.060016,101.248843,32.417732,2.483761,0,-1.0,135.281025,127.409158,-1.0,20.203261,-1.0,19.712009,11.332493,Anemia of chronic disease
3,15.256346,-1.0,2.464129,-1.0,-1.0,-1.0,107.864968,-1.0,1,1.608654,140.306421,45.005908,-1.0,3.73705,73.111942,45.769037,-1.0,No anemia
4,10.505849,467.024053,-1.0,-1.0,414.069202,78.404772,-1.0,4.019851,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,31.517548,-1.0,Anemia of chronic disease


#### The correlated features

In [14]:
def get_rbc(row):
    """Derive the red blood cell count of one row as ``30 * hemoglobin / mcv``.

    ``row`` is any mapping-like object (e.g. a DataFrame row) with
    ``'hemoglobin'`` and ``'mcv'`` entries. Returns the sentinel ``-1`` when
    either input is missing (``-1``) or when ``mcv`` is 0, so a missing input
    never produces a negative or undefined count.
    """
    mcv = row['mcv']
    if mcv == -1 or mcv == 0:
        return -1
    hemoglobin = row['hemoglobin']
    if hemoglobin == -1:
        # Mirror get_tsat: a missing numerator also makes the result missing.
        return -1
    return (30*hemoglobin)/mcv

def get_tsat(row):
    """Derive the transferrin saturation of one row as ``serum_iron / tibc * 100``.

    ``row`` is any mapping-like object (e.g. a DataFrame row) with
    ``'serum_iron'`` and ``'tibc'`` entries. Returns the sentinel ``-1`` when
    either input is missing (``-1``) or when ``tibc`` is 0.
    """
    serum_iron, tibc = row['serum_iron'], row['tibc']
    # Logical `or` instead of bitwise `|`: these are plain boolean tests.
    if serum_iron == -1 or tibc == -1 or tibc == 0:
        return -1
    return (serum_iron/tibc)*100

In [15]:
# Recompute the derived columns row by row so they agree with the noised
# inputs (mcv for rbc, tibc for tsat).
combined_df['rbc'] = combined_df.apply(get_rbc, axis=1)
combined_df['tsat'] = combined_df.apply(get_tsat, axis=1)
combined_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,10.246017,23.079085,-1.0,-1.0,319.72267,81.172047,-1.0,3.786778,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,30.738052,-1.0,Iron deficiency anemia
1,11.182131,405.470307,3.8042,-1.0,461.725618,99.729919,-1.0,3.363724,0,1.267152,-1.0,-1.0,64.077503,-1.0,-1.0,33.546392,-1.0,Hemolytic anemia
2,6.57067,119.277868,-1.0,-1.0,286.060016,101.248843,32.417732,1.946887,0,-1.0,135.281025,127.409158,-1.0,20.203261,-1.0,19.712009,11.332493,Anemia of chronic disease
3,15.256346,-1.0,2.464129,-1.0,-1.0,-1.0,107.864968,-1.0,1,1.608654,140.306421,45.005908,-1.0,3.73705,73.111942,45.769037,-1.0,No anemia
4,10.505849,467.024053,-1.0,-1.0,414.069202,78.404772,-1.0,4.019851,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,31.517548,-1.0,Anemia of chronic disease


#### Label noise: randomly relabelling 10% of the anemic/inconclusive rows as 'No anemia'

In [16]:
# Split by label. .copy() makes both frames independent of combined_df, so the
# label edits in the next cell never write into a slice (SettingWithCopy).
anemic_df = combined_df[combined_df.label != 'No anemia'].copy()
non_anemic_df = combined_df[combined_df.label == 'No anemia'].copy()
len(anemic_df), len(non_anemic_df)

(60000, 10000)

In [17]:
# Share of the rows in anemic_df whose label is overwritten with 'No anemia'.
no_anemia_frac = 0.1

# Keep a random (1 - no_anemia_frac) share of the labels; re-aligning to the
# full index leaves NaN on the remaining rows, which become 'No anemia'.
kept_labels = anemic_df['label'].sample(frac=1-no_anemia_frac)
relabelled = kept_labels.reindex(anemic_df.index).fillna('No anemia')
# assign() builds a new frame instead of writing into a slice of combined_df.
anemic_df = anemic_df.assign(label=relabelled)
anemic_df.label.value_counts()

Anemia of chronic disease               8828
Iron deficiency anemia                  8331
Unspecified anemia                      8104
Aplastic anemia                         8093
Hemolytic anemia                        8089
Vitamin B12/Folate deficiency anemia    8082
No anemia                               6000
Inconclusive diagnosis                  4473
Name: label, dtype: int64

#### Finalizing 

In [18]:
# Recombine both groups, shuffle the rows and renumber the index from 0.
final_df = (
    pd.concat([non_anemic_df, anemic_df], axis=0)
    .sample(frac=1)
    .reset_index(drop=True)
)
final_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,12.322384,163.121429,4.121959,-1.0,433.145097,100.147359,51.037057,3.691276,1,0.819482,147.693827,59.199141,41.958432,20.792161,101.383932,36.967153,11.782901,Unspecified anemia
1,8.298889,-1.0,2.07695,-1.0,483.617753,98.431076,-1.0,2.52935,0,-1.0,-1.0,-1.0,36.118322,-1.0,-1.0,24.896668,-1.0,Hemolytic anemia
2,12.696391,3.393723,-1.0,-1.0,451.933132,79.486542,85.001345,4.791902,1,-1.0,4.852168,89.831485,44.946238,0.965963,-1.0,38.089174,18.80839,Iron deficiency anemia
3,12.705102,-1.0,2.305379,-1.0,-1.0,81.057541,135.371313,4.702253,1,1.32414,32.717943,76.524319,-1.0,27.439316,-1.0,38.115305,-1.0,Aplastic anemia
4,8.211543,29.622561,-1.0,0.93619,479.914773,78.38848,-1.0,3.142634,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,24.634629,-1.0,Iron deficiency anemia


In [19]:
# Class balance of the final dataset, after the label flip.
final_df['label'].value_counts()

No anemia                               16000
Anemia of chronic disease                8828
Iron deficiency anemia                   8331
Unspecified anemia                       8104
Aplastic anemia                          8093
Hemolytic anemia                         8089
Vitamin B12/Folate deficiency anemia     8082
Inconclusive diagnosis                   4473
Name: label, dtype: int64

In [20]:
# Evaluate the final noisy dataset with the project helper.
# NOTE(review): the helper's internals are not visible in this notebook; the
# recorded output is a 4-tuple of three floats and a timedelta — confirm what
# each value means in modules.many_features.utils.
utils.get_dt_performance(final_df)

(0.6596428571428572,
 0.6536656960257498,
 0.8005238687826254,
 datetime.timedelta(microseconds=3980))

In [21]:
# Write the final dataset without the row index.
# NOTE(review): the '_6' suffix appears to mirror total_frac = 0.6 — confirm,
# and rename the file when total_frac changes.
final_df.to_csv('../../data/more_features/more_feats_correlated_noisy_6.csv', index=False)