In [1]:
import pandas as pd
import numpy as np
import random
import sys
sys.path.append('../../')
from modules.many_features import constants
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [3]:
feat_thresh_arr = [{'name':'hemoglobin', 'thresh':[12], 'std':2},
                   {'name':'ferritin', 'thresh':[30, 100], 'std':2},
                   {'name':'ret_count', 'thresh':[2], 'std':0.2},
                   {'name':'segmented_neutrophils', 'thresh':[0], 'std':2},
                   {'name':'tibc', 'thresh':[450], 'std':50},
                   {'name':'mcv', 'thresh':[80,100], 'std':2}]

anemias_features_dict = {'Vitamin B12/Folate deficiency anemia': ['mcv', 'segmented_neutrophils'], 
                         'Unspecified anemia': ['mcv', 'segmented_neutrophils'], 
                         'Anemia of chronic disease': ['mcv', 'ferritin', 'tibc'], 
                         'Iron deficiency anemia': ['mcv', 'ferritin', 'tibc'], 
                         'Hemolytic anemia': ['mcv', 'ret_count'], 
                         'Aplastic anemia': ['mcv', 'ret_count']}

#### The data

In [4]:
training_df = pd.read_csv('../../../anemia_ml4hc/data/train_set_basic.csv')
training_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,14.728733,-1.0,3.170892,-1.0,-1.0,-1.0,-1.0,-1.0,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,44.1862,-1.0,0
1,10.405752,9.634615,5.659537,-1.0,-1.0,77.413788,212.671838,4.032519,0,0.88713,96.311597,-1.0,43.218595,-1.0,83.207518,31.217256,-1.0,4
2,15.132737,358.914888,1.842252,3.797487,315.102272,80.500314,-1.0,5.639507,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,45.398211,-1.0,0
3,11.340169,-1.0,1.662209,2.441767,-1.0,97.033963,102.079062,3.506041,1,1.020527,127.281715,-1.0,20.847013,-1.0,62.210273,34.020508,-1.0,6
4,6.691485,-1.0,3.337971,-1.0,-1.0,99.838438,24.119564,2.010694,0,1.957666,34.633063,-1.0,34.612121,-1.0,112.411298,20.074456,-1.0,5


#### Noisiness functions

In [5]:
def get_dict(dict_arr, feat_name):
    for dictionary in dict_arr:
        if dictionary['name']==feat_name:
            return dictionary

In [6]:
def feat_noisiness(df, feat_name, frac, mean, std):
    df[feat_name] = df[feat_name].sample(frac=1-frac)
    if feat_name ==  'segmented_neutrophils': #how was this a good idea
        df[feat_name] = df[feat_name].fillna(-1)
    else:
        nan_index = df[df[feat_name].isna()].index
        fill_values = np.random.normal(mean, std, size=len(nan_index))
        df[feat_name] = df[feat_name].fillna(pd.Series(fill_values, index=nan_index))
    return df[feat_name]

In [7]:
total_frac = 0.5

In [8]:
noisy_anemia_dfs_list = []
for label in training_df.label.unique():
    anemia = constants.ACTION_SPACE[label]
    print(f'{label} - {constants.ACTION_SPACE[label]}')
    if anemia not in ['No anemia', 'Inconclusive diagnosis']:
        anemia_df = training_df[training_df.label==label]
        feature_num = len(anemias_features_dict[anemia])
        frac = total_frac/feature_num
        for feat in anemias_features_dict[anemia]:
            feat_info = get_dict(feat_thresh_arr, feat)
            threshes, std = feat_info['thresh'], feat_info['std']
            print(f'feat:{feat}, threshes:{threshes}, std:{std}')
            for thresh in threshes:
                anemia_df[feat] = feat_noisiness(anemia_df, feat, frac/len(threshes), thresh, std)
        noisy_anemia_dfs_list.append(anemia_df)

0 - No anemia
4 - Iron deficiency anemia
feat:mcv, threshes:[80, 100], std:2
feat:ferritin, threshes:[30, 100], std:2
feat:tibc, threshes:[450], std:50
6 - Aplastic anemia
feat:mcv, threshes:[80, 100], std:2
feat:ret_count, threshes:[2], std:0.2
5 - Hemolytic anemia
feat:mcv, threshes:[80, 100], std:2
feat:ret_count, threshes:[2], std:0.2
1 - Vitamin B12/Folate deficiency anemia
feat:mcv, threshes:[80, 100], std:2
feat:segmented_neutrophils, threshes:[0], std:2
2 - Unspecified anemia
feat:mcv, threshes:[80, 100], std:2
feat:segmented_neutrophils, threshes:[0], std:2
3 - Anemia of chronic disease
feat:mcv, threshes:[80, 100], std:2
feat:ferritin, threshes:[30, 100], std:2
feat:tibc, threshes:[450], std:50
7 - Inconclusive diagnosis


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/st

In [9]:
noisy_anemia_dfs_list[0].describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
count,6047.0,6047.0,6047.0,6047.0,6047.0,6047.0,6047.0,6047.0,6047.0,6047.0,6047.0,6047.0,6047.0,6047.0,6047.0,6047.0,6047.0,6047.0
mean,9.553448,51.59611,0.773839,2.299295,383.629914,79.60409,61.88549,3.697393,0.566231,0.527926,54.101697,28.169224,17.736612,4.915172,40.638368,28.660345,11.877155,4.0
std,2.006699,31.070573,2.286687,2.661967,178.377627,6.381504,81.815121,0.780505,0.495635,1.039331,50.142165,42.50127,25.601081,9.393202,49.347205,6.020096,21.699164,0.0
min,6.000102,0.025086,-1.0,-1.0,-1.0,74.793726,-1.0,2.268296,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,18.000307,-1.0,4.0
25%,7.821592,26.322517,-1.0,-1.0,379.702903,76.525757,-1.0,3.024995,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,23.464777,-1.0,4.0
50%,9.570429,48.789431,-1.0,2.22238,466.817237,77.973372,-1.0,3.695122,1.0,0.778761,46.115166,-1.0,-1.0,-1.0,-1.0,28.711286,-1.0,4.0
75%,11.277294,79.96655,2.609402,4.657963,493.730394,79.298394,127.64221,4.36451,1.0,1.391334,98.061,60.469977,36.033318,9.662013,85.845222,33.831881,22.448203,4.0
max,12.998672,108.958169,5.998827,6.999498,630.141607,105.946151,249.985193,5.182304,1.0,1.999573,149.991041,129.984563,79.99563,29.998753,139.996922,38.996015,221.088779,4.0


In [10]:
no_df = training_df[training_df.label==constants.CLASS_DICT['No anemia']]
inconc_df = training_df[training_df.label==constants.CLASS_DICT['Inconclusive diagnosis']]
len(no_df), len(inconc_df)

(7200, 4839)

In [11]:
noisy_anemia_dfs_list.append(no_df)
noisy_anemia_dfs_list.append(inconc_df)
len(noisy_anemia_dfs_list)

8

In [12]:
combined_df = pd.concat(noisy_anemia_dfs_list, axis=0)
combined_df = combined_df.sample(frac=1).reset_index(drop=True)
combined_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,10.136999,86.355412,-1.0,0.0,340.460623,100.677779,194.566279,3.020627,1,0.214955,41.371985,129.614786,32.235362,24.002288,109.425265,30.410997,57.147954,2
1,9.005782,204.206458,-1.0,2.60002,-1.0,75.958075,-1.0,3.556876,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,27.017347,-1.0,3
2,6.582586,463.019129,2.089211,0.861768,212.055942,79.530326,-1.0,2.343294,0,1.084375,85.757822,-1.0,-1.0,-1.0,-1.0,19.747757,-1.0,5
3,10.139186,-1.0,2.230087,3.205156,-1.0,91.022268,-1.0,3.341771,1,1.491569,132.526449,129.007443,-1.0,27.00198,-1.0,30.417558,-1.0,6
4,14.913256,-1.0,3.879377,1.135883,351.320848,-1.0,-1.0,-1.0,0,1.779421,104.164639,-1.0,-1.0,-1.0,-1.0,44.739767,-1.0,0


#### The correlated features

In [13]:
def get_rbc(row):
    if row['mcv'] == -1:
        return -1
    else:
        return (30*row['hemoglobin'])/row['mcv']

def get_tsat(row):
    if (row['serum_iron']==-1) | (row['tibc']==-1):
        return -1
    else:
        return (row['serum_iron']/row['tibc'])*100

In [14]:
combined_df['rbc'] = combined_df.apply(lambda row: get_rbc(row), axis=1)
combined_df['tsat'] = combined_df.apply(lambda row: get_tsat(row), axis=1)
combined_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,10.136999,86.355412,-1.0,0.0,340.460623,100.677779,194.566279,3.020627,1,0.214955,41.371985,129.614786,32.235362,24.002288,109.425265,30.410997,57.147954,2
1,9.005782,204.206458,-1.0,2.60002,-1.0,75.958075,-1.0,3.556876,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,27.017347,-1.0,3
2,6.582586,463.019129,2.089211,0.861768,212.055942,79.530326,-1.0,2.483047,0,1.084375,85.757822,-1.0,-1.0,-1.0,-1.0,19.747757,-1.0,5
3,10.139186,-1.0,2.230087,3.205156,-1.0,91.022268,-1.0,3.341771,1,1.491569,132.526449,129.007443,-1.0,27.00198,-1.0,30.417558,-1.0,6
4,14.913256,-1.0,3.879377,1.135883,351.320848,-1.0,-1.0,-1.0,0,1.779421,104.164639,-1.0,-1.0,-1.0,-1.0,44.739767,-1.0,0


#### Randomly modifying to no anemia

In [15]:
anemic_df = combined_df[combined_df.label != constants.CLASS_DICT['No anemia']]
non_anemic_df = combined_df[combined_df.label == constants.CLASS_DICT['No anemia']]
len(anemic_df), len(non_anemic_df)

(43200, 7200)

In [16]:
anemic_df.label.value_counts()

6    6501
5    6498
1    6483
2    6454
3    6378
4    6047
7    4839
Name: label, dtype: int64

In [17]:
anemic_df['label'] = anemic_df['label'].sample(frac=1-0.1)
anemic_df['label'] = anemic_df['label'].fillna(constants.CLASS_DICT['No anemia'])
anemic_df.label.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


5.0    5864
6.0    5856
1.0    5812
2.0    5806
3.0    5738
4.0    5457
7.0    4347
0.0    4320
Name: label, dtype: int64

#### Finalizing

In [18]:
final_df = pd.concat([non_anemic_df, anemic_df], axis=0)
final_df = final_df.sample(frac=1).reset_index(drop=True)
final_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,11.477922,381.698702,1.950432,2.253608,469.882849,84.435196,-1.0,4.078129,1,1.841648,56.668105,-1.0,-1.0,-1.0,-1.0,34.433766,-1.0,5.0
1,7.506117,487.267338,2.108645,1.356243,-1.0,100.267833,212.502959,2.24582,1,1.845189,58.402404,111.582936,71.008045,4.662531,134.348979,22.518352,-1.0,5.0
2,10.140281,284.315199,1.557134,5.741366,318.122861,96.44091,-1.0,3.15435,0,1.409696,147.252007,-1.0,-1.0,-1.0,-1.0,30.420842,-1.0,6.0
3,9.54303,-1.0,3.708461,-1.0,146.414041,78.021163,127.203703,3.6694,1,1.838578,58.287221,90.434585,20.130092,28.278097,98.54076,28.629089,86.879443,2.0
4,7.32125,29.243319,-1.0,1.400555,491.611233,77.997942,-1.0,2.815939,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,21.963749,-1.0,4.0


In [19]:
final_df.label.value_counts()

0.0    11520
5.0     5864
6.0     5856
1.0     5812
2.0     5806
3.0     5738
4.0     5457
7.0     4347
Name: label, dtype: int64

In [20]:
final_df.to_csv(f'../../../anemia_ml4hc/data/train_set_noisiness_{total_frac}.csv', index=False)