In [26]:
import pandas as pd
import numpy as np
import random

In [27]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [28]:
feat_arr = [{'name':'hemoglobin', 'lower':12.1, 'upper':17.2, 'mean':10.3, 'std':2.3, 'min':(0, 5)},
            {'name':'ferritin', 'lower':10, 'upper':263, 'mean':697, 'std':3305, 'min':(1.6, 8)},
            {'name':'ret_count', 'lower':0.5, 'upper':2.5, 'mean':2.6, 'std':2.5, 'min':(0, 0.5)},
            {'name':'segmented_neutrophils', 'lower':0, 'upper':0, 'mean':0, 'std':2.4, 'min':(0, 0)},
            {'name':'tibc', 'lower':250, 'upper':450, 'mean':242, 'std':90, 'min':(3, 100)},
            {'name':'mcv', 'lower':80, 'upper':100, 'mean':90, 'std':8, 'min':(0, 65)}]

In [29]:
anem_arr = {
    'No anemia': ['hemoglobin'],
    'Hemolytic anemia': ['mcv', 'ret_count'],
    'aplastic anemia': ['mcv', 'ret_count'],
    'Iron deficiency anemia': ['mcv', 'ferritin', 'tibc'],
    'Vitamin B12/Folate deficiency anemia': ['mcv', 'segmented_neutrophils'],
    'Anemia of chronic disease': ['mcv', 'ferritin', 'tibc']
         }

In [30]:
hb_df = pd.read_csv('data/anemia_synth_dataset_hb_some_nans.csv')
hb_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
0,12.176139,13.829445,0.0,0.0,0.0,82.648043,No anemia
1,2.691116,0.0,5.445103,0.0,334.67435,95.635788,Hemolytic anemia
2,14.139714,44.223073,0.0,0.0,0.0,94.631075,No anemia
3,11.946911,202.228906,4.032461,0.0,0.0,99.628591,Hemolytic anemia
4,6.543366,0.0,3.420648,0.0,421.689813,99.829839,Hemolytic anemia


In [31]:
hb_df.label.unique()

array(['No anemia', 'Hemolytic anemia', 'Aplastic anemia',
       'Iron deficiency anemia', 'Vitamin B12/Folate deficiency anemia',
       'Anemia of chronic disease'], dtype=object)

#### Replacing some values with other values

In [32]:
def noisy_feature(df, feature, frac, lower, upper, seed): #op is the operation eg < or >
    df[feature] = df[feature].sample(frac = 1-frac, random_state = seed)
    null_indices_list = df[df[feature].isnull()].index.tolist()
    #print(f'{feature}: {null_indices_list[:10]}')
    df[feature] = df[feature].fillna(random.uniform(lower, upper))
    return df[feature]

In [33]:
def make_noisy(anemia):
    noisy_df = hb_df[hb_df.label == anemia]
    if anemia == 'No anemia':
         noisy_df['hemoglobin'] = noisy_feature(noisy_df, 'hemoglobin', 0.2, 3, 12, 1)
    elif anemia == 'Hemolytic anemia':
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.1, 60, 79, 2)
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.1, 101, 108, 3)
        noisy_df['ret_count'] = noisy_feature(noisy_df, 'ret_count', 0.2, 0.1, 2, 4)
    elif anemia == 'Aplastic anemia':
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.1, 60, 79, 5)
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.1, 101, 108, 6)
        noisy_df['ret_count'] = noisy_feature(noisy_df, 'ret_count', 0.2, 2.1, 6, 7)
    elif anemia == 'Iron deficiency anemia':
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.2, 80, 110, 8)
        noisy_df['ferritin'] = noisy_feature(noisy_df, 'ferritin', 0.2, 100.1, 120, 9)
    elif anemia == 'Vitamin B12/Folate deficiency anemia':
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.2, 60, 99, 10)
        noisy_df['segmented_neutrophils'] = noisy_feature(noisy_df, 'segmented_neutrophils', 0.2, 0, 0, 11)
    elif anemia == 'Anemia of chronic disease':
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.2, 80, 110, 12)
        noisy_df['ferritin'] = noisy_feature(noisy_df, 'ferritin', 0.2, 10, 30, 13)
    else:
        print('What the hell is this?')
    return noisy_df

In [34]:
noisy_df = pd.DataFrame()
for anemia in hb_df.label.unique():
    noisy_anem_df = make_noisy(anemia)
    noisy_df = pd.concat([noisy_df, noisy_anem_df], axis=0)
noisy_df = noisy_df.sample(frac=1)#.reset_index(drop=True)
noisy_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: 

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
4448,2.134359,0.0,5.5795,0.0,401.043739,73.992953,Aplastic anemia
4267,4.109353,0.0,0.5241,0.0,0.0,102.925205,Hemolytic anemia
2972,6.844196,254.92673,0.381276,0.0,0.0,105.736896,Aplastic anemia
30851,13.149565,0.0,0.0,0.0,0.0,0.0,No anemia
15529,2.424722,0.0,1.642175,0.0,312.420905,92.343651,Aplastic anemia


In [35]:
len(noisy_df), len(hb_df)

(38383, 38383)

In [36]:
hb_df.describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
count,38383.0,38383.0,38383.0,38383.0,38383.0,38383.0
mean,8.623734,192.658819,1.927337,0.079374,143.793072,84.733269
std,4.577406,790.525233,2.107686,0.482283,171.250096,21.057232
min,1.000135,0.0,0.0,0.0,0.0,0.0
25%,4.701877,0.0,0.0,0.0,0.0,83.455972
50%,8.419577,0.0,1.328451,0.0,0.0,89.040477
75%,12.31048,118.337954,3.193592,0.0,317.779924,94.292515
max,17.199709,12833.435346,13.647977,7.690749,612.766224,122.280231


In [37]:
noisy_df.describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
count,38383.0,38383.0,38383.0,38383.0,38383.0,38383.0
mean,7.544002,92.220106,1.792007,0.024051,143.793072,83.930725
std,3.565684,425.515037,2.198187,0.265534,171.250096,23.74485
min,1.000135,0.0,0.0,0.0,0.0,0.0
25%,4.701877,0.0,0.0,0.0,0.0,80.15328
50%,8.419577,0.0,0.5241,0.0,0.0,89.806376
75%,9.444923,108.496244,3.107521,0.0,317.779924,98.802435
max,17.198309,10807.602001,13.647977,6.226783,612.766224,116.597542


In [38]:
hb_df[hb_df.label == 'No anemia'].describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,14.678389,81.681328,0.454578,0.0,210.077415,72.009645
std,1.475565,87.291245,0.763093,0.0,177.37618,36.377297
min,12.101036,0.0,0.0,0.0,0.0,0.0
25%,13.404991,0.0,0.0,0.0,0.0,81.218736
50%,14.696121,53.261843,0.0,0.0,283.313886,87.450743
75%,15.974306,156.521874,0.83573,0.0,365.979001,93.76274
max,17.199709,262.927512,2.497851,0.0,449.993007,99.998546


In [39]:
noisy_df[noisy_df.label == 'No anemia'].describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,10.534052,81.681328,0.454578,0.0,210.077415,72.009645
std,2.835663,87.291245,0.763093,0.0,177.37618,36.377297
min,8.754841,0.0,0.0,0.0,0.0,0.0
25%,8.754841,0.0,0.0,0.0,0.0,81.218736
50%,8.754841,53.261843,0.0,0.0,283.313886,87.450743
75%,12.949147,156.521874,0.83573,0.0,365.979001,93.76274
max,17.198309,262.927512,2.497851,0.0,449.993007,99.998546


In [40]:
hb_df[hb_df.label == 'Hemolytic anemia'].describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
count,14146.0,14146.0,14146.0,14146.0,14146.0,14146.0
mean,6.486196,42.304322,4.234729,0.0,105.682935,90.019646
std,3.181763,75.374954,1.650527,0.0,163.433765,5.181653
min,1.000135,0.0,2.000104,0.0,0.0,80.000726
25%,3.751119,0.0,2.922755,0.0,0.0,85.870555
50%,6.477627,0.0,3.915298,0.0,0.0,90.01442
75%,9.230706,57.825166,5.206471,0.0,283.420563,94.133759
max,11.999148,262.992477,13.647977,0.0,449.806439,99.996267


In [41]:
noisy_df[noisy_df.label == 'Hemolytic anemia'].describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
count,14146.0,14146.0,14146.0,14146.0,14146.0,14146.0
mean,6.486196,42.304322,1.625941,0.0,105.682935,87.761774
std,3.181763,75.374954,1.901241,0.0,163.433765,16.235069
min,1.000135,0.0,0.5241,0.0,0.0,60.475204
25%,3.751119,0.0,0.5241,0.0,0.0,81.605681
50%,6.477627,0.0,0.5241,0.0,0.0,92.22276
75%,9.230706,57.825166,2.610254,0.0,283.420563,102.925205
max,11.999148,262.992477,13.647977,0.0,449.806439,102.925205


In [42]:
hb_df[hb_df.label == 'Aplastic anemia'].describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
count,9450.0,9450.0,9450.0,9450.0,9450.0,9450.0
mean,6.462678,39.381952,0.784227,0.0,103.532711,89.985259
std,3.185101,73.487066,0.616173,0.0,163.331908,5.223266
min,1.001366,0.0,1e-05,0.0,0.0,80.000131
25%,3.676539,0.0,0.258196,0.0,0.0,85.847949
50%,6.455401,0.0,0.514264,0.0,0.0,89.963489
75%,9.239299,43.334938,1.339659,0.0,283.944428,94.186309
max,11.998428,262.757084,1.99979,0.0,449.999706,99.998615


In [43]:
noisy_df[noisy_df.label == 'Aplastic anemia'].describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
count,9450.0,9450.0,9450.0,9450.0,9450.0,9450.0
mean,6.462678,39.381952,4.139733,0.0,103.532711,91.830207
std,3.185101,73.487066,2.2249,0.0,163.331908,12.402515
min,1.001366,0.0,0.000646,0.0,0.0,73.992953
25%,3.676539,0.0,1.554958,0.0,0.0,81.326772
50%,6.455401,0.0,5.5795,0.0,0.0,92.431615
75%,9.239299,43.334938,5.5795,0.0,283.944428,105.736896
max,11.998428,262.757084,5.5795,0.0,449.999706,105.736896


In [44]:
hb_df[hb_df.label == 'Iron deficiency anemia'].describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
count,1343.0,1343.0,1343.0,1343.0,1343.0,1343.0
mean,6.439992,4.931263,0.45421,0.0,241.714704,76.160756
std,3.153254,3.184475,0.738642,0.0,89.232757,3.188135
min,1.001787,0.757007,0.0,0.0,7.637018,62.186572
25%,3.715423,3.192551,0.0,0.0,182.739086,74.542229
50%,6.452017,4.799012,0.0,0.0,244.96353,76.878642
75%,9.14021,6.336819,0.890889,0.0,298.260571,78.634781
max,11.993901,86.532986,2.492994,0.0,612.766224,79.994818


In [45]:
noisy_df[noisy_df.label == 'Iron deficiency anemia'].describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
count,1343.0,1343.0,1343.0,1343.0,1343.0,1343.0
mean,6.439992,77.510397,0.45421,0.0,241.714704,80.663734
std,3.153254,47.412034,0.738642,0.0,89.232757,3.495749
min,1.001787,1.610635,0.0,0.0,7.637018,62.186572
25%,3.715423,6.857315,0.0,0.0,182.739086,79.168907
50%,6.452017,108.496244,0.0,0.0,244.96353,82.608165
75%,9.14021,108.496244,0.890889,0.0,298.260571,82.608165
max,11.993901,108.496244,2.492994,0.0,612.766224,82.608165


In [46]:
hb_df[hb_df.label == 'Vitamin B12/Folate deficiency anemia'].describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
count,1575.0,1575.0,1575.0,1575.0,1575.0,1575.0
mean,6.601809,39.488512,0.447821,1.934359,109.000233,104.012731
std,3.176771,73.6566,0.755023,1.442703,166.004426,3.44553
min,1.000511,0.0,0.0,0.003293,0.0,100.003852
25%,3.93573,0.0,0.0,0.757268,0.0,101.322214
50%,6.554787,0.0,0.0,1.612612,0.0,103.068642
75%,9.403432,41.941863,0.804594,2.845712,292.666533,105.778911
max,11.9979,262.186946,2.489278,7.690749,449.838897,122.280231


In [47]:
noisy_df[noisy_df.label == 'Vitamin B12/Folate deficiency anemia'].describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
count,1575.0,1575.0,1575.0,1575.0,1575.0,1575.0
mean,6.601809,39.488512,0.447821,0.586118,109.000233,74.087068
std,3.176771,73.6566,0.755023,1.178855,166.004426,19.828519
min,1.000511,0.0,0.0,0.0,0.0,61.162092
25%,3.93573,0.0,0.0,0.0,0.0,61.162092
50%,6.554787,0.0,0.0,0.0,0.0,61.162092
75%,9.403432,41.941863,0.804594,0.52889,292.666533,100.942916
max,11.9979,262.186946,2.489278,6.226783,449.838897,116.597542


In [48]:
hb_df[hb_df.label == 'Anemia of chronic disease'].describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
count,1869.0,1869.0,1869.0,1869.0,1869.0,1869.0
mean,6.606795,2963.401118,0.428281,0.0,240.108196,76.157325
std,3.133907,2154.665916,0.737979,0.0,90.039854,3.330631
min,1.004266,34.166553,0.0,0.0,3.444409,59.27154
25%,4.022541,1223.57696,0.0,0.0,179.037655,74.506716
50%,6.630468,2562.854431,0.0,0.0,240.471361,77.035119
75%,9.311093,4277.281366,0.769981,0.0,304.879608,78.703718
max,11.996571,12833.435346,2.496488,0.0,573.833319,79.998863


In [49]:
noisy_df[noisy_df.label == 'Anemia of chronic disease'].describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
count,1869.0,1869.0,1869.0,1869.0,1869.0,1869.0
mean,6.606795,848.573454,0.428281,0.0,240.108196,89.41923
std,3.133907,1730.476176,0.737979,0.0,90.039854,8.960891
min,1.004266,10.530719,0.0,0.0,3.444409,62.124858
25%,4.022541,10.530719,0.0,0.0,179.037655,79.132513
50%,6.630468,10.530719,0.0,0.0,240.471361,95.160659
75%,9.311093,768.490855,0.769981,0.0,304.879608,95.160659
max,11.996571,10807.602001,2.496488,0.0,573.833319,95.160659


#### saving the noisy data

In [50]:
noisy_df.to_csv(f'data/noisy_dataset_0.7.csv', index=False)