In [1]:
import pandas as pd
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [3]:
feat_arr = [{'name':'hemoglobin', 'lower':12.1, 'upper':17.2, 'mean':10.3, 'std':2.3, 'abnorm_lower':6, 'abnorm_upper':18, 'min':(0, 5)},
            {'name':'ferritin', 'lower':10, 'upper':263, 'mean':697, 'std':3305, 'abnorm_lower':0, 'abnorm_upper': 500, 'min':(1.6, 8)},
            {'name':'ret_count', 'lower':0.5, 'upper':2.5, 'mean':2.6, 'std':2.5, 'abnorm_lower':0, 'abnorm_upper':6, 'min':(0, 0.5)},
            {'name':'segmented_neutrophils', 'lower':0, 'upper':0, 'mean':0, 'std':2.4, 'abnorm_lower':0.01, 'abnorm_upper':7, 'min':(0, 0)},
            {'name':'tibc', 'lower':250, 'upper':450, 'mean':242, 'std':90, 'abnorm_lower': 100, 'abnorm_upper':500, 'min':(3, 100)},
            {'name':'mcv', 'lower':80, 'upper':100, 'mean':90, 'std':5, 'abnorm_lower': 75, 'abnorm_upper':105, 'min':(0, 65)}]

#### Utils functions

In [4]:
def noisy_feature(df, feature, frac, lower, upper, seed):
    '''a frac of the feature data is made to be between lower and upper'''
    df[feature] = df[feature].sample(frac = 1-frac, random_state = seed)
    null_indices_list = df[df[feature].isnull()].index.tolist()
    df[feature] = df[feature].fillna(random.uniform(lower, upper))
    return df[feature]

In [5]:
def clip_value(val, feat_dict):
    '''prevents negative values in data following a normal distribution'''
    if val<=0:
        new_val = np.random.uniform(feat_dict['min'][0], feat_dict['min'][1])
    else:
        new_val = val
    return new_val                                                         

In [6]:
def normal_dist(df, col_name, num=None, cond=None):
    for feat_dict in feat_arr:
        if feat_dict['name'] == col_name:
            if cond is None:
                df[col_name] = np.random.normal(feat_dict['mean'], feat_dict['std'], num)
            else:
                df[col_name] = np.random.normal(feat_dict['mean'], feat_dict['std'] if cond else np.nan)
            df[col_name] = [np.random.uniform(feat_dict['min'][0], feat_dict['min'][1]) if val<0 else val for val in df[col_name]]
    return df

In [7]:
def uniform_dist(df, col_name):
    for feat_dict in feat_arr:
        if feat_dict['name'] == col_name:
            df[col_name] = np.random.uniform(feat_dict['abnorm_lower'], feat_dict['abnorm_upper'], len(df))
    return df

In [8]:
def less_mcv(df):
    irons = []
    ferritins = []
    tibcs = []
    for i, row in df.iterrows():
        if row['mcv']<80:
            #ferritin = clip_value(np.random.normal(feat_arr[1]['mean'], feat_arr[1]['std']), feat_arr[1])
            #tibc = clip_value(np.random.normal(feat_arr[5]['mean'], feat_arr[5]['std']), feat_arr[5])
            ferritin = np.random.uniform(feat_arr[1]['abnorm_lower'], feat_arr[1]['abnorm_upper'])
            tibc = np.random.uniform(feat_arr[5]['abnorm_lower'], feat_arr[5]['abnorm_upper'])
        else:
            iron = ferritin = tibc = np.nan
        irons.append(iron)
        ferritins.append(ferritin)
        tibcs.append(tibc)
    return ferritins, tibcs

In [9]:
def normal_mcv(row):
    if (row['mcv']>=80) & (row['mcv'] <=100):
        #return clip_value(np.random.normal(feat_arr[2]['mean'], feat_arr[2]['std']), feat_arr[2])
        return np.random.uniform(feat_arr[2]['abnorm_lower'], feat_arr[2]['abnorm_upper'])
    else:
        return np.nan

In [10]:
def more_mcv(row):
    if row['mcv']> 100:
        #return clip_value(np.random.normal(feat_arr[3]['mean'], feat_arr[3]['std']), feat_arr[3])
        return np.random.uniform(0, feat_arr[3]['abnorm_upper'])
    else:
        return np.nan

In [11]:
def create_label(row):
    if row['mcv']<80 :
        if row['ferritin']<30:
            return 'Iron deficiency anemia'
        elif row['ferritin']>100:
            return 'Anemia of chronic disease'
        elif row['tibc']<450:
            return 'Anemia of chronic disease'
        elif row['tibc']>=450:
            return 'Iron deficiency anemia'
        else:
            pass
    elif row['mcv']<=100:
        if row['ret_count'] <=2:
            return 'Aplastic anemia'
        else:
            return 'Hemolytic anemia'
        
    elif row['mcv']> 100:
        if row['segmented_neutrophils']>0:
            return 'Vitamin B12/Folate deficiency anemia'
        else:
            pass
    else:
        pass

In [12]:
def make_noisy(df, anemia):
    noisy_df = df[df.label == anemia]
    if anemia == 'No anemia':
         noisy_df['hemoglobin'] = noisy_feature(noisy_df, 'hemoglobin', 0.3, 3, 12, 1)
    elif anemia == 'Hemolytic anemia':
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.2, 60, 79, 2)
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.2, 101, 108, 3)
        noisy_df['ret_count'] = noisy_feature(noisy_df, 'ret_count', 0.2, 0.1, 2, 4)
    elif anemia == 'Aplastic anemia':
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.2, 60, 79, 5)
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.2, 101, 108, 6)
        noisy_df['ret_count'] = noisy_feature(noisy_df, 'ret_count', 0.3, 2.1, 6, 7)
    elif anemia == 'Iron deficiency anemia':
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.3, 80, 110, 8)
        noisy_df['ferritin'] = noisy_feature(noisy_df, 'ferritin', 0.3, 100.1, 120, 9)
    elif anemia == 'Vitamin B12/Folate deficiency anemia':
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.3, 60, 99, 10)
        noisy_df['segmented_neutrophils'] = noisy_feature(noisy_df, 'segmented_neutrophils', 0.3, 0, 0, 11)
    elif anemia == 'Anemia of chronic disease':
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.3, 80, 110, 12)
        noisy_df['ferritin'] = noisy_feature(noisy_df, 'ferritin', 0.3, 10, 30, 13)
    else:
        print('What the hell is this?')
    return noisy_df

In [13]:
def replace_nans(df, cols, fracs):#fracs is a list
    for feat_dict in feat_arr:
        feat_name = feat_dict['name']
        if feat_name in cols:
            idx = cols.index(feat_name)
            nan_num = df[feat_name].isna().sum()
            fill_num = int(np.ceil(fracs[idx]*nan_num))
            generated_nums = np.random.uniform(feat_dict['abnorm_lower'], feat_dict['abnorm_upper'], fill_num)
            indices = list(df[df[feat_name].isna()].index)
            new_indices = random.sample(indices, fill_num)
            df.loc[new_indices, feat_name] = generated_nums
    return df

#### No anemia dataset

In [14]:
cols_to_use = [feat['name'] for feat in feat_arr]
present_fractions = [1.0, 0.6, 0.3, 0.1, 0.6, 1.0]

no_df = pd.DataFrame()
for idx, feat in enumerate(feat_arr):
    no_df[feat['name']] = np.random.uniform(feat['lower'], feat['upper'], 10000)
    no_df[feat['name']] = no_df[feat['name']].sample(frac=present_fractions[idx])
no_df['label'] = 'No anemia'
no_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
0,14.010155,,,,345.481644,91.442937,No anemia
1,16.948643,,,,,97.958187,No anemia
2,15.833169,171.449291,0.800313,,363.985025,96.683212,No anemia
3,15.153158,,,,,85.347757,No anemia
4,12.895695,96.569765,,,271.622851,99.016812,No anemia


In [15]:
no_df.describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
count,10000.0,6000.0,3000.0,1000.0,6000.0,10000.0
mean,14.620214,137.008678,1.510928,0.0,349.515381,89.976241
std,1.466914,73.190153,0.577518,0.0,57.136086,5.747731
min,12.100059,10.002352,0.500406,0.0,250.026002,80.006622
25%,13.356277,73.978997,1.023732,0.0,299.915333,84.957872
50%,14.611896,136.345683,1.501286,0.0,349.882057,90.041805
75%,15.874032,201.072785,2.01725,0.0,399.100208,94.913663
max,17.19856,262.972439,2.498595,0.0,449.97344,99.99824


In [16]:
# randomly place 30% of hb vaues below 12.1
noisy_no_df = no_df.copy()
noisy_no_df['hemoglobin'] = noisy_feature(noisy_no_df, 'hemoglobin', 0.3, 3, 12, 1)
noisy_no_df.describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
count,10000.0,6000.0,3000.0,1000.0,6000.0,10000.0
mean,12.869912,137.008678,1.510928,0.0,349.515381,89.976241
std,2.963361,73.190153,0.577518,0.0,57.136086,5.747731
min,8.754841,10.002352,0.500406,0.0,250.026002,80.006622
25%,8.754841,73.978997,1.023732,0.0,299.915333,84.957872
50%,13.530014,136.345683,1.501286,0.0,349.882057,90.041805
75%,15.35294,201.072785,2.01725,0.0,399.100208,94.913663
max,17.197744,262.972439,2.498595,0.0,449.97344,99.99824


#### Anemic dataset

In [21]:
anem_df= pd.DataFrame(columns = cols_to_use, dtype='object')
anem_df['hemoglobin'] = np.random.uniform(1, 12.1, 50000) #stopped here
#anem_df = normal_dist(anem_df, 'mcv', len(anem_df))
anem_df = uniform_dist(anem_df, 'mcv')
anem_df.describe()

Unnamed: 0,hemoglobin,mcv
count,50000.0,50000.0
mean,6.551247,90.042099
std,3.195902,8.64593
min,1.000006,75.000211
25%,3.773567,82.617594
50%,6.556797,89.988133
75%,9.321788,97.506846
max,12.099981,104.998961


In [22]:
anem_df.isna().sum()

hemoglobin                   0
ferritin                 50000
ret_count                50000
segmented_neutrophils    50000
tibc                     50000
mcv                          0
dtype: int64

In [23]:
anem_df['ferritin'], anem_df['tibc'] = less_mcv(anem_df)
anem_df['ret_count'] = anem_df.apply(lambda row: normal_mcv(row), axis=1)
anem_df['segmented_neutrophils'] = anem_df.apply(lambda row: more_mcv(row), axis=1)
anem_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
0,8.414592,,3.622525,,,84.069601
1,5.128417,,0.969421,,,85.061891
2,5.436468,,,0.0,,103.182944
3,6.14854,,4.609716,,,97.831239
4,10.895308,,4.094586,,,95.890273


In [24]:
anem_df.describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
count,50000.0,8250.0,33412.0,8338.0,8250.0,50000.0
mean,6.551247,1651.844429,2.809462,0.970683,89.950276,90.042099
std,3.195902,2140.534052,2.140823,1.42808,4.969173,8.64593
min,1.000006,1.555388,0.000275,0.0,71.905418,75.000211
25%,3.773567,5.265555,0.881799,0.0,86.548351,82.617594
50%,6.556797,626.682711,2.569909,0.0,89.922237,89.988133
75%,9.321788,2820.768703,4.266184,1.653212,93.293404,97.506846
max,12.099981,13797.147997,12.632006,8.66275,109.904298,104.998961


In [25]:
# assigning labels to the dataset
anem_df['label'] = anem_df.apply(lambda row: create_label(row), axis=1)
anem_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
0,8.414592,,3.622525,,,84.069601,Hemolytic anemia
1,5.128417,,0.969421,,,85.061891,Aplastic anemia
2,5.436468,,,0.0,,103.182944,
3,6.14854,,4.609716,,,97.831239,Hemolytic anemia
4,10.895308,,4.094586,,,95.890273,Hemolytic anemia


In [26]:
anem_df.label.value_counts()

Hemolytic anemia                        19687
Aplastic anemia                         13725
Anemia of chronic disease                4723
Vitamin B12/Folate deficiency anemia     4121
Iron deficiency anemia                   3527
Name: label, dtype: int64

In [27]:
anem_df.isna().sum()

hemoglobin                   0
ferritin                 41750
ret_count                16588
segmented_neutrophils    41662
tibc                     41750
mcv                          0
label                     4217
dtype: int64

In [28]:
anem_df = anem_df[anem_df.label.notna()]
anem_df.isna().sum()

hemoglobin                   0
ferritin                 37533
ret_count                12371
segmented_neutrophils    41662
tibc                     37533
mcv                          0
label                        0
dtype: int64

In [31]:
noisy_anem_df = pd.DataFrame()
for anemia in anem_df.label.unique():
    #print(anemia)
    one_anem_df = make_noisy(anem_df, anemia)
    noisy_anem_df = pd.concat([noisy_anem_df, one_anem_df], axis=0)
noisy_anem_df = noisy_anem_df.sample(frac=1).reset_index(drop=True)
noisy_anem_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value inste

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
0,6.343735,,0.104366,,,90.422936,Aplastic anemia
1,5.817056,,4.755657,,,90.40778,Hemolytic anemia
2,8.274027,,3.312978,,,90.506908,Hemolytic anemia
3,11.995761,,0.468324,,,95.208135,Aplastic anemia
4,9.032229,100.628066,,,86.204741,79.064724,Iron deficiency anemia


In [32]:
noisy_anem_df.isna().sum()

hemoglobin                   0
ferritin                 37533
ret_count                12371
segmented_neutrophils    41662
tibc                     37533
mcv                          0
label                        0
dtype: int64

In [34]:
cols_to_fill = ['ferritin', 'ret_count', 'segmented_neutrophils', 'tibc']
fracs_to_use = [0.4, 0.2, 0.1, 0.4]

In [35]:
filled_noisy_anem_df = replace_nans(noisy_anem_df, cols_to_fill, fracs_to_use)
filled_noisy_anem_df.isna().sum()

hemoglobin                   0
ferritin                 22519
ret_count                 9896
segmented_neutrophils    37495
tibc                     22519
mcv                          0
label                        0
dtype: int64

#### Merging the two datasets

In [36]:
final_df = pd.concat([noisy_no_df, filled_noisy_anem_df], axis=0)
final_df = final_df.sample(frac=1, random_state = 42).reset_index(drop=True)
final_df = final_df.fillna(0)
final_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
0,6.927945,299.116534,0.526298,0.0,334.360231,87.383481,Aplastic anemia
1,3.40945,130.313903,0.5241,0.0,0.0,102.925205,Hemolytic anemia
2,4.647288,0.0,0.5241,0.0,0.0,102.925205,Hemolytic anemia
3,4.827671,0.0,4.914933,0.0,0.0,90.50278,Hemolytic anemia
4,6.874963,477.981773,0.945871,0.0,226.382582,99.588399,Aplastic anemia


In [37]:
final_df.label.value_counts()

Hemolytic anemia                        19687
Aplastic anemia                         13725
No anemia                               10000
Anemia of chronic disease                4723
Vitamin B12/Folate deficiency anemia     4121
Iron deficiency anemia                   3527
Name: label, dtype: int64

In [38]:
final_df.to_csv('data/noisy_data_uniform_mcv_30_08_22.csv', index=False)

#### All uniform

In [18]:
uni_anem_df= pd.DataFrame(columns = cols_to_use, dtype='object')
uni_anem_df['hemoglobin'] = np.random.uniform(1, 12.1, 50000) #stopped here
uni_anem_df = uniform_dist(uni_anem_df, 'mcv')
uni_anem_df.describe()

Unnamed: 0,hemoglobin,mcv
count,50000.0,50000.0
mean,6.55505,90.011831
std,3.20423,8.676632
min,1.000053,75.000069
25%,3.778279,82.478025
50%,6.553701,89.980234
75%,9.314626,97.5129
max,12.099811,104.999378


In [20]:
uni_anem_df['ferritin'], uni_anem_df['tibc'] = less_mcv(uni_anem_df)
uni_anem_df['ret_count'] = uni_anem_df.apply(lambda row: normal_mcv(row), axis=1)
uni_anem_df['segmented_neutrophils'] = uni_anem_df.apply(lambda row: more_mcv(row), axis=1)
uni_anem_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
0,8.514534,,5.30563,,,92.784811
1,8.683968,,5.912183,,,88.422045
2,4.401023,,,3.666286,,103.819302
3,7.304413,52.006671,,,102.432429,75.479003
4,8.80586,,4.900134,,,80.148334


In [21]:
uni_anem_df['label'] = uni_anem_df.apply(lambda row: create_label(row), axis=1)
uni_anem_df = uni_anem_df[uni_anem_df.label.notna()]

In [22]:
uni_noisy_anem_df = pd.DataFrame()
for anemia in uni_anem_df.label.unique():
    #print(anemia)
    uni_one_anem_df = make_noisy(uni_anem_df, anemia)
    uni_noisy_anem_df = pd.concat([uni_noisy_anem_df, uni_one_anem_df], axis=0)
uni_noisy_anem_df = uni_noisy_anem_df.sample(frac=1).reset_index(drop=True)
uni_noisy_anem_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value inste

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
0,2.295617,,2.303685,,,90.763193,Hemolytic anemia
1,9.86956,,0.278021,,,91.359367,Aplastic anemia
2,3.753498,11.738777,,,98.357173,78.525034,Anemia of chronic disease
3,9.346194,,1.821827,,,68.016515,Aplastic anemia
4,6.350371,,,0.0,,103.571371,Vitamin B12/Folate deficiency anemia


In [23]:
uni_cols_to_fill = ['ferritin', 'ret_count', 'segmented_neutrophils', 'tibc']
uni_fracs_to_use = [0.4, 0.2, 0.1, 0.4]

In [24]:
uni_filled_noisy_anem_df = replace_nans(uni_noisy_anem_df, uni_cols_to_fill, uni_fracs_to_use)
uni_filled_noisy_anem_df.isna().sum()

hemoglobin                   0
ferritin                 25017
ret_count                13386
segmented_neutrophils    37414
tibc                     25017
mcv                          0
label                        0
dtype: int64

In [26]:
uni_final_df = pd.concat([noisy_no_df, uni_filled_noisy_anem_df], axis=0)
uni_final_df = uni_final_df.sample(frac=1, random_state = 42).reset_index(drop=True)
uni_final_df = uni_final_df.fillna(0)
uni_final_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
0,3.656078,0.0,4.214187,0.0,0.0,98.080036,Hemolytic anemia
1,3.132632,466.228476,5.800653,0.0,0.0,102.925205,Hemolytic anemia
2,11.391199,240.486248,0.0,0.0,82.894768,75.874792,Anemia of chronic disease
3,8.754841,0.0,1.07056,0.0,0.0,93.135502,No anemia
4,8.754841,11.517467,0.0,0.0,0.0,97.609014,No anemia


In [27]:
uni_final_df.label.value_counts()

Hemolytic anemia                        22110
Aplastic anemia                         11157
No anemia                               10000
Vitamin B12/Folate deficiency anemia     8428
Anemia of chronic disease                7831
Iron deficiency anemia                    474
Name: label, dtype: int64

In [28]:
uni_final_df.to_csv('data/noisy_data_uniform_all_30_08_22.csv', index=False)