In [1]:
import pandas as pd
import numpy as np
import random
import sys
sys.path.append('../..')
from modules.many_features import utils, constants
import warnings
warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


In [2]:
# Per-feature noise settings. Each entry names a feature, lists the
# threshold value(s) used as the mean of the injected Gaussian noise, and
# gives the standard deviation of that noise.
feat_thresh_arr = [
    dict(name='hemoglobin', thresh=[12], std=2),
    dict(name='ferritin', thresh=[30, 100], std=2),
    dict(name='ret_count', thresh=[2], std=0.2),
    dict(name='segmented_neutrophils', thresh=[0], std=2),
    dict(name='tibc', thresh=[450], std=50),
    dict(name='mcv', thresh=[80, 100], std=2),
]

# For every anemia label, the features that receive noise for rows of that
# label. Labels not listed here are not noised.
anemias_features_dict = {
    'Vitamin B12/Folate deficiency anemia': ['mcv', 'segmented_neutrophils'],
    'Unspecified anemia': ['mcv', 'segmented_neutrophils'],
    'Anemia of chronic disease': ['mcv', 'ferritin', 'tibc'],
    'Iron deficiency anemia': ['mcv', 'ferritin', 'tibc'],
    'Hemolytic anemia': ['mcv', 'ret_count'],
    'Aplastic anemia': ['mcv', 'ret_count'],
}

#### The data

In [3]:
# Load the source dataset and encode every missing value with the -1 sentinel
# that the rest of the notebook tests against.
df = pd.read_csv('../../data/more_features/more_feats_correlated_0.1.csv').fillna(-1)
df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,11.183192,187.573466,4.951674,1.661383,316.593436,95.006424,181.242992,3.531295,1,-1.0,28.040619,96.721542,49.530542,23.186628,-1.0,33.549575,57.247868,Hemolytic anemia
1,14.387445,-1.0,-1.0,-1.0,-1.0,-1.0,125.249617,-1.0,1,-1.0,98.357508,112.758764,62.464566,7.16892,-1.0,43.162335,-1.0,No anemia
2,12.749357,5.012158,3.5028,6.179371,498.418768,76.759285,159.834784,4.982859,1,1.022939,56.850479,75.739552,72.072041,20.600875,44.872138,38.248071,32.068372,Iron deficiency anemia
3,11.50887,197.180945,1.200125,0.0,457.033309,102.900301,131.177927,3.355346,1,-1.0,111.220307,66.999185,18.353272,14.132423,-1.0,34.526609,28.70205,Unspecified anemia
4,9.456656,427.952052,-1.0,0.660252,-1.0,104.543774,-1.0,2.713693,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,28.369968,-1.0,Vitamin B12/Folate deficiency anemia


In [4]:
# Class balance of the loaded data, before any noise is injected.
df.label.value_counts()

No anemia                               10000
Anemia of chronic disease                9756
Iron deficiency anemia                   9267
Unspecified anemia                       9033
Aplastic anemia                          9020
Vitamin B12/Folate deficiency anemia     9000
Hemolytic anemia                         8976
Inconclusive diagnosis                   4948
Name: label, dtype: int64

In [5]:
def get_dict(dict_arr, feat_name):
    """Return the first dictionary in ``dict_arr`` whose ``'name'`` equals ``feat_name``.

    Args:
        dict_arr: iterable of dictionaries, each holding a ``'name'`` key.
        feat_name: the name to look up.

    Returns:
        The matching dictionary object itself (not a copy), or ``None`` when
        no entry matches.
    """
    matches = (entry for entry in dict_arr if entry['name'] == feat_name)
    return next(matches, None)

In [6]:
def feat_noisiness(df, feat_name, frac, mean, std, random_state=None):
    """Return a noised copy of one feature column, leaving ``df`` untouched.

    A random ``frac`` share of the rows is selected and their values are
    replaced:

    * for ``'segmented_neutrophils'`` the selected rows are set to ``-1``
      (``mean`` and ``std`` are ignored for this feature);
    * for every other feature the selected rows get draws from a normal
      distribution with the given ``mean`` and ``std``.

    Values that are already NaN in the input column are treated like the
    selected rows and are replaced as well.

    Args:
        df: DataFrame holding the column; it is only read, never modified.
        feat_name: name of the column to noise.
        frac: fraction of rows (0..1) whose value is replaced.
        mean: mean of the Gaussian replacement values.
        std: standard deviation of the Gaussian replacement values.
        random_state: optional seed (anything ``numpy.random.RandomState``
            accepts). When ``None`` (default) NumPy's global generator is
            used, exactly as before, so ``np.random.seed`` still controls
            the result.

    Returns:
        A new Series with the same index and name as ``df[feat_name]``.
    """
    rng = np.random.RandomState(random_state) if random_state is not None else None

    column = df[feat_name]
    # Rows drawn here keep their original value; every other row is replaced.
    kept_index = column.sample(frac=1-frac, random_state=rng).index
    noisy = column.where(column.index.isin(kept_index))

    if feat_name == 'segmented_neutrophils':
        return noisy.fillna(-1)

    to_fill = noisy.isna()
    draw_normal = rng.normal if rng is not None else np.random.normal
    noisy.loc[to_fill] = draw_normal(mean, std, size=int(to_fill.sum()))
    return noisy

In [7]:
# Total share of each anemia class's rows that is noised. The noise loop
# divides it evenly across that class's features, and again across each
# feature's thresholds.
total_frac = 0.8

# Seed NumPy's global generator so a fresh "Restart & Run All" reproduces the
# same dataset: the notebook's `.sample()` calls (no random_state given) and
# `np.random.normal` all draw from this global state.
SEED = 42
np.random.seed(SEED)

In [8]:
# Build one noised DataFrame per anemia class. 'No anemia' and
# 'Inconclusive diagnosis' rows are skipped here and left un-noised.
noisy_anemia_dfs_list = []
for anemia in df.label.unique():
    print(anemia.upper())
    if anemia in ['No anemia', 'Inconclusive diagnosis']:
        continue

    # Explicit copy: the per-class frame is written to below, and assigning
    # into a bare boolean-indexed slice of `df` is chained assignment
    # (SettingWithCopyWarning, hidden by the blanket warnings filter).
    anemia_df = df[df.label==anemia].copy()

    # Split the total noise budget evenly across this class's features.
    feature_num = len(anemias_features_dict[anemia])
    frac = total_frac/feature_num
    for feat in anemias_features_dict[anemia]:
        feat_info = get_dict(feat_thresh_arr, feat)
        threshes, std = feat_info['thresh'], feat_info['std']
        print(f'feat:{feat}, threshes:{threshes}, std:{std}')
        # One pass per threshold, each taking an equal share of the feature's
        # budget and centring the noise on that threshold.
        # NOTE(review): every pass samples from the whole column, so a later
        # pass can overwrite rows noised by an earlier one - confirm intended.
        for thresh in threshes:
            anemia_df[feat] = feat_noisiness(anemia_df, feat, frac/len(threshes), thresh, std)
    noisy_anemia_dfs_list.append(anemia_df)

HEMOLYTIC ANEMIA
feat:mcv, threshes:[80, 100], std:2
feat:ret_count, threshes:[2], std:0.2
NO ANEMIA
IRON DEFICIENCY ANEMIA
feat:mcv, threshes:[80, 100], std:2
feat:ferritin, threshes:[30, 100], std:2
feat:tibc, threshes:[450], std:50
UNSPECIFIED ANEMIA
feat:mcv, threshes:[80, 100], std:2
feat:segmented_neutrophils, threshes:[0], std:2
VITAMIN B12/FOLATE DEFICIENCY ANEMIA
feat:mcv, threshes:[80, 100], std:2
feat:segmented_neutrophils, threshes:[0], std:2
APLASTIC ANEMIA
feat:mcv, threshes:[80, 100], std:2
feat:ret_count, threshes:[2], std:0.2
ANEMIA OF CHRONIC DISEASE
feat:mcv, threshes:[80, 100], std:2
feat:ferritin, threshes:[30, 100], std:2
feat:tibc, threshes:[450], std:50
INCONCLUSIVE DIAGNOSIS


In [9]:
# Summary statistics of the first noised per-class frame. In the recorded run
# this is the 'Hemolytic anemia' subset (first class printed by the loop,
# 8976 rows); the order follows df.label.unique().
noisy_anemia_dfs_list[0].describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat
count,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0,8976.0
mean,9.50073,110.430343,3.243719,0.638844,162.5096,90.4492,86.187883,3.177761,0.576315,-0.050466,46.883783,50.446564,28.819956,9.366428,23.455645,28.502189,17.936511
std,2.024398,157.668881,1.345959,2.49478,173.564923,7.691207,84.598208,0.711503,0.494169,1.098499,50.03804,45.228159,27.080726,10.400227,42.993476,6.073194,34.550425
min,6.000214,-1.0,1.410527,-1.0,-1.0,72.835875,-1.0,1.805443,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,18.000641,-1.0
25%,7.721399,-1.0,2.062962,-1.0,-1.0,83.173228,-1.0,2.565631,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,23.164198,-1.0
50%,9.489891,-1.0,2.763761,-1.0,129.648213,90.735453,70.876841,3.164961,1.0,-1.0,30.613549,51.246214,24.942002,6.795334,-1.0,28.469674,-1.0
75%,11.273937,218.951524,4.385839,2.266321,318.520326,97.712083,161.452599,3.74729,1.0,1.003649,90.121746,90.569093,52.199525,18.514494,47.277566,33.82181,28.674039
max,12.99857,499.897137,5.998728,6.999613,499.967563,106.130949,249.978161,4.844795,1.0,1.999669,149.968003,129.999938,79.998067,29.99698,139.995018,38.995711,234.107919


In [10]:
# The two classes that receive no feature noise are taken straight from `df`.
no_df = df.loc[df['label'] == 'No anemia']
inconc_df = df.loc[df['label'] == 'Inconclusive diagnosis']
(len(no_df), len(inconc_df))

(10000, 4948)

In [11]:
# Add the two un-noised classes to the list of per-class frames.
# Note: re-running this cell adds them a second time.
noisy_anemia_dfs_list.extend([no_df, inconc_df])
len(noisy_anemia_dfs_list)

8

In [12]:
# Stack all per-class frames, shuffle the rows, and renumber the index.
combined_df = (
    pd.concat(noisy_anemia_dfs_list, axis=0)
    .sample(frac=1)
    .reset_index(drop=True)
)
combined_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,12.041851,-1.0,4.362182,-1.0,-1.0,104.756576,150.619047,3.448524,1,1.717046,21.416768,38.990527,25.496142,22.891907,-1.0,36.125552,-1.0,Inconclusive diagnosis
1,6.424855,286.327939,3.412889,6.556005,287.112383,103.742265,167.158408,1.857928,1,1.199705,68.085817,64.556363,43.105853,21.109134,64.981084,19.274565,58.22055,Vitamin B12/Folate deficiency anemia
2,7.855726,-1.0,4.450887,6.94165,204.711218,78.765215,-1.0,2.99208,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,23.567179,-1.0,Anemia of chronic disease
3,6.341001,-1.0,2.338293,-1.0,-1.0,85.021833,-1.0,2.237426,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,19.023002,-1.0,Hemolytic anemia
4,8.025471,-1.0,-1.0,0.68453,-1.0,100.485637,-1.0,2.396005,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,24.076413,-1.0,Vitamin B12/Folate deficiency anemia


#### The correlated features

In [13]:
def get_rbc(row):
    """Derive the RBC value for one row as ``30 * hemoglobin / mcv``.

    ``-1`` is the notebook's missing-value sentinel. If either input is
    missing the result is ``-1`` as well, so a missing hemoglobin can no
    longer produce a negative RBC. This matches how ``get_tsat`` guards both
    of its inputs.

    Args:
        row: mapping / Series with ``'hemoglobin'`` and ``'mcv'`` entries.

    Returns:
        The derived RBC value, or ``-1`` when an input is missing.
    """
    hemoglobin, mcv = row['hemoglobin'], row['mcv']
    if mcv == -1 or hemoglobin == -1:
        return -1
    return (30*hemoglobin)/mcv

def get_tsat(row):
    """Derive TSAT for one row as ``serum_iron / tibc * 100``.

    Returns ``-1`` (the missing-value sentinel) when either ``serum_iron``
    or ``tibc`` is ``-1``.
    """
    iron, capacity = row['serum_iron'], row['tibc']
    if iron == -1 or capacity == -1:
        return -1
    return (iron/capacity)*100

In [14]:
# Recompute the derived columns row by row so they agree with the noised
# inputs they are calculated from.
combined_df['rbc'] = combined_df.apply(get_rbc, axis=1)
combined_df['tsat'] = combined_df.apply(get_tsat, axis=1)
combined_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,12.041851,-1.0,4.362182,-1.0,-1.0,104.756576,150.619047,3.448524,1,1.717046,21.416768,38.990527,25.496142,22.891907,-1.0,36.125552,-1.0,Inconclusive diagnosis
1,6.424855,286.327939,3.412889,6.556005,287.112383,103.742265,167.158408,1.857928,1,1.199705,68.085817,64.556363,43.105853,21.109134,64.981084,19.274565,58.22055,Vitamin B12/Folate deficiency anemia
2,7.855726,-1.0,4.450887,6.94165,204.711218,78.765215,-1.0,2.99208,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,23.567179,-1.0,Anemia of chronic disease
3,6.341001,-1.0,2.338293,-1.0,-1.0,85.021833,-1.0,2.237426,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,19.023002,-1.0,Hemolytic anemia
4,8.025471,-1.0,-1.0,0.68453,-1.0,100.485637,-1.0,2.396005,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,24.076413,-1.0,Vitamin B12/Folate deficiency anemia


#### Randomly relabeling 10% of the anemic rows as "No anemia"

In [15]:
# Partition the combined data on whether the label is 'No anemia'.
is_no_anemia = combined_df['label'] == 'No anemia'
anemic_df = combined_df[~is_no_anemia]
non_anemic_df = combined_df[is_no_anemia]
(len(anemic_df), len(non_anemic_df))

(60000, 10000)

In [16]:
# Share of the anemic rows whose label is flipped to 'No anemia' (label noise).
relabel_frac = 0.1

# Explicit copy: `anemic_df` came from boolean-indexing `combined_df`, so
# assigning a column into it directly is chained assignment.
anemic_df = anemic_df.copy()

# Rows drawn by the sample keep their label; the rest become 'No anemia'.
# Note: re-running this cell relabels a further share of the rows.
kept_labels = anemic_df['label'].sample(frac=1-relabel_frac)
anemic_df['label'] = kept_labels.reindex(anemic_df.index).fillna('No anemia')
anemic_df.label.value_counts()

Anemia of chronic disease               8801
Iron deficiency anemia                  8362
Unspecified anemia                      8134
Aplastic anemia                         8097
Vitamin B12/Folate deficiency anemia    8085
Hemolytic anemia                        8068
No anemia                               6000
Inconclusive diagnosis                  4453
Name: label, dtype: int64

#### Finalizing 

In [17]:
# Recombine the untouched 'No anemia' rows with the relabeled anemic rows,
# shuffle, and renumber the index.
final_df = (
    pd.concat([non_anemic_df, anemic_df], axis=0)
    .sample(frac=1)
    .reset_index(drop=True)
)
final_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,11.31768,-1.0,2.088832,-1.0,-1.0,90.986272,113.064122,3.731666,1,1.276701,81.145964,78.146183,29.513433,26.029977,105.808765,33.95304,-1.0,Aplastic anemia
1,10.109186,-1.0,-1.0,-1.0,-1.0,104.941225,107.239455,2.889957,0,1.191488,86.444945,103.72873,32.453257,20.009164,90.105559,30.327558,-1.0,Unspecified anemia
2,6.289434,-1.0,4.156,-1.0,-1.0,86.013564,-1.0,2.193642,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,18.868301,-1.0,Hemolytic anemia
3,7.077683,97.195746,0.261727,0.142422,263.47846,94.058079,102.335651,2.25744,1,-1.0,114.702341,64.345736,20.102258,3.865156,-1.0,21.233048,38.840234,No anemia
4,9.717874,101.889169,-1.0,3.937127,475.993105,78.25893,-1.0,3.725277,0,1.367858,-1.0,-1.0,38.308559,-1.0,-1.0,29.153622,-1.0,No anemia


In [18]:
# Class balance of the final dataset, after feature noise and relabeling.
final_df.label.value_counts()

No anemia                               16000
Anemia of chronic disease                8801
Iron deficiency anemia                   8362
Unspecified anemia                       8134
Aplastic anemia                          8097
Vitamin B12/Folate deficiency anemia     8085
Hemolytic anemia                         8068
Inconclusive diagnosis                   4453
Name: label, dtype: int64

In [19]:
# Score the final dataset with the project helper. In the recorded run it
# returns a 4-tuple: three floats and a datetime.timedelta.
# NOTE(review): presumably decision-tree metrics plus a timing - confirm the
# meaning of each element in modules.many_features.utils.
utils.get_dt_performance(final_df)

(0.6207142857142857,
 0.6122228660331646,
 0.7775530397555184,
 datetime.timedelta(microseconds=7982))

In [21]:
# Persist the noised dataset without the row index.
# NOTE(review): the "_8" suffix presumably encodes total_frac = 0.8 - keep the
# filename in sync if total_frac changes.
final_df.to_csv('../../data/more_features/more_feats_correlated_noisy_8.csv', index=False)