In [18]:
import pandas as pd
import numpy as np
import random
import sys
sys.path.append('..')
from modules import constants

In [19]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [2]:
feat_thresh_arr = [{'name':'hemoglobin', 'thresh':[12], 'std':2},
                   {'name':'ferritin', 'thresh':[30, 100], 'std':2},
                   {'name':'ret_count', 'thresh':[2], 'std':0.2},
                   {'name':'segmented_neutrophils', 'thresh':[0], 'std':2},
                   {'name':'tibc', 'thresh':[450], 'std':50},
                   {'name':'mcv', 'thresh':[80,100], 'std':2}]

anemias_features_dict = {'Vitamin B12/Folate deficiency anemia': ['mcv', 'segmented_neutrophils'], 
                         'Unspecified anemia': ['mcv', 'segmented_neutrophils'], 
                         'Anemia of chronic disease': ['mcv', 'ferritin', 'tibc'], 
                         'Iron deficiency anemia': ['mcv', 'ferritin', 'tibc'], 
                         'Hemolytic anemia': ['mcv', 'ret_count'], 
                         'Aplastic anemia': ['mcv', 'ret_count']}

In [3]:
training_df = pd.read_csv('../../data/train_set_basic.csv')
training_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,14.728733,-1.0,3.170892,-1.0,-1.0,-1.0,-1.0,-1.0,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,44.1862,-1.0,0
1,10.405752,9.634615,5.659537,-1.0,-1.0,77.413788,212.671838,4.032519,0,0.88713,96.311597,-1.0,43.218595,-1.0,83.207518,31.217256,-1.0,4
2,15.132737,358.914888,1.842252,3.797487,315.102272,80.500314,-1.0,5.639507,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,45.398211,-1.0,0
3,11.340169,-1.0,1.662209,2.441767,-1.0,97.033963,102.079062,3.506041,1,1.020527,127.281715,-1.0,20.847013,-1.0,62.210273,34.020508,-1.0,6
4,6.691485,-1.0,3.337971,-1.0,-1.0,99.838438,24.119564,2.010694,0,1.957666,34.633063,-1.0,34.612121,-1.0,112.411298,20.074456,-1.0,5


#### Noisiness functions

In [4]:
def get_dict(dict_arr, feat_name):
    for dictionary in dict_arr:
        if dictionary['name']==feat_name:
            return dictionary

In [5]:
def feat_noisiness(df, feat_name, frac, mean, std):
    df[feat_name] = df[feat_name].sample(frac=1-frac)
    if feat_name ==  'segmented_neutrophils':
        df[feat_name] = df[feat_name].fillna(-1)
    else:
        nan_index = df[df[feat_name].isna()].index
        fill_values = np.random.normal(mean, std, size=len(nan_index))
        df[feat_name] = df[feat_name].fillna(pd.Series(fill_values, index=nan_index))
    return df[feat_name]

#### Adding noise

In [6]:
total_frac = 0.1

In [7]:
noisy_anemia_dfs_list = []
for label in training_df.label.unique():
    anemia = constants.ACTION_SPACE[label]
    print(f'{label} - {constants.ACTION_SPACE[label]}')
    if anemia not in ['No anemia', 'Inconclusive diagnosis']:
        anemia_df = training_df[training_df.label==label]
        feature_num = len(anemias_features_dict[anemia])
        frac = total_frac/feature_num
        for feat in anemias_features_dict[anemia]:
            feat_info = get_dict(feat_thresh_arr, feat)
            threshes, std = feat_info['thresh'], feat_info['std']
            #print(f'feat:{feat}, threshes:{threshes}, std:{std}')
            for thresh in threshes:
                anemia_df[feat] = feat_noisiness(anemia_df, feat, frac/len(threshes), thresh, std)
        noisy_anemia_dfs_list.append(anemia_df)

0 - No anemia
4 - Iron deficiency anemia
6 - Aplastic anemia
5 - Hemolytic anemia
1 - Vitamin B12/Folate deficiency anemia
2 - Unspecified anemia
3 - Anemia of chronic disease
7 - Inconclusive diagnosis


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/st

In [8]:
no_df = training_df[training_df.label==constants.CLASS_DICT['No anemia']]
inconc_df = training_df[training_df.label==constants.CLASS_DICT['Inconclusive diagnosis']]
len(no_df), len(inconc_df)

(7200, 4839)

In [9]:
noisy_anemia_dfs_list.append(no_df)
noisy_anemia_dfs_list.append(inconc_df)
len(noisy_anemia_dfs_list)

8

In [10]:
combined_df = pd.concat(noisy_anemia_dfs_list, axis=0)
combined_df = combined_df.sample(frac=1).reset_index(drop=True)
combined_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,7.954131,-1.0,1.397309,3.115946,-1.0,91.015855,117.63127,2.621784,0,1.654975,14.409476,106.199411,5.382789,21.896702,91.812818,23.862394,-1.0,6
1,11.255705,292.322085,2.029159,4.652351,249.132073,101.418432,-1.0,3.329485,1,0.616276,47.236837,-1.0,-1.0,-1.0,-1.0,33.767114,-1.0,1
2,6.015772,67.371304,-1.0,5.779808,458.559291,76.171774,221.264748,2.369292,0,0.771681,56.195341,36.43253,71.778319,26.715737,126.493966,18.047315,48.252157,4
3,12.466442,-1.0,5.674294,1.16342,-1.0,82.866447,-1.0,4.513205,1,0.275695,120.312553,-1.0,-1.0,-1.0,-1.0,37.399327,-1.0,5
4,6.517585,302.289161,5.042147,6.494386,-1.0,78.519658,78.678805,2.490173,1,0.543184,87.515001,-1.0,28.216398,-1.0,68.527284,19.552756,-1.0,3


#### The correlated features

In [11]:
def get_rbc(row):
    if row['mcv'] == -1:
        return -1
    else:
        return (30*row['hemoglobin'])/row['mcv']

def get_tsat(row):
    if (row['serum_iron']==-1) | (row['tibc']==-1):
        return -1
    else:
        return (row['serum_iron']/row['tibc'])*100

In [12]:
combined_df['rbc'] = combined_df.apply(lambda row: get_rbc(row), axis=1)
combined_df['tsat'] = combined_df.apply(lambda row: get_tsat(row), axis=1)
combined_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,7.954131,-1.0,1.397309,3.115946,-1.0,91.015855,117.63127,2.621784,0,1.654975,14.409476,106.199411,5.382789,21.896702,91.812818,23.862394,-1.0,6
1,11.255705,292.322085,2.029159,4.652351,249.132073,101.418432,-1.0,3.329485,1,0.616276,47.236837,-1.0,-1.0,-1.0,-1.0,33.767114,-1.0,1
2,6.015772,67.371304,-1.0,5.779808,458.559291,76.171774,221.264748,2.369292,0,0.771681,56.195341,36.43253,71.778319,26.715737,126.493966,18.047315,48.252157,4
3,12.466442,-1.0,5.674294,1.16342,-1.0,82.866447,-1.0,4.513205,1,0.275695,120.312553,-1.0,-1.0,-1.0,-1.0,37.399327,-1.0,5
4,6.517585,302.289161,5.042147,6.494386,-1.0,78.519658,78.678805,2.490173,1,0.543184,87.515001,-1.0,28.216398,-1.0,68.527284,19.552756,-1.0,3


#### Randomly labeling anemic samples as no anemia

In [13]:
anemic_df = combined_df[combined_df.label != constants.CLASS_DICT['No anemia']]
non_anemic_df = combined_df[combined_df.label == constants.CLASS_DICT['No anemia']]
len(anemic_df), len(non_anemic_df)

(43200, 7200)

In [14]:
anemic_df['label'] = anemic_df['label'].sample(frac=1-0.1)
anemic_df['label'] = anemic_df['label'].fillna(constants.CLASS_DICT['No anemia'])
anemic_df.label.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


6.0    5856
1.0    5838
2.0    5830
5.0    5823
3.0    5750
4.0    5426
7.0    4357
0.0    4320
Name: label, dtype: int64

#### Finalizing

In [15]:
final_df = pd.concat([non_anemic_df, anemic_df], axis=0)
final_df = final_df.sample(frac=1).reset_index(drop=True)
final_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,6.255332,29.05275,-1.0,0.970195,-1.0,79.65328,-1.0,2.35596,0,1.70485,56.276373,-1.0,-1.0,-1.0,-1.0,18.765995,-1.0,4.0
1,12.946859,41.436393,3.048505,-1.0,208.072179,76.004166,-1.0,5.110322,1,1.979505,135.198105,-1.0,-1.0,-1.0,-1.0,38.840577,-1.0,3.0
2,6.988266,486.192203,0.713708,-1.0,353.314146,103.189113,96.612183,2.031687,1,1.81325,136.77643,80.264365,2.920875,25.65852,76.123786,20.964799,27.344556,7.0
3,11.381682,-1.0,-1.0,5.450651,-1.0,104.021128,-1.0,3.282511,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,34.145047,-1.0,1.0
4,10.894667,-1.0,4.603604,-1.0,-1.0,92.413723,71.126445,3.536704,0,0.812053,135.190366,-1.0,41.314898,-1.0,86.941023,32.684,-1.0,5.0


In [16]:
final_df.label.value_counts()

0.0    11520
6.0     5856
1.0     5838
2.0     5830
5.0     5823
3.0     5750
4.0     5426
7.0     4357
Name: label, dtype: int64

In [17]:
# final_df.to_csv(f'../../data/train_set_noisiness_{total_frac}.csv', index=False)