In [1]:
import pandas as pd
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [3]:
feat_arr = [{'name':'hemoglobin', 'lower':12.1, 'upper':17.2, 'mean':10.3, 'std':2.3, 'abnorm_lower':6, 'abnorm_upper':18, 'min':(0, 5)},
            {'name':'ferritin', 'lower':10, 'upper':263, 'mean':697, 'std':3305, 'abnorm_lower':0, 'abnorm_upper': 500, 'min':(1.6, 8)},
            {'name':'ret_count', 'lower':0.5, 'upper':2.5, 'mean':2.6, 'std':2.5, 'abnorm_lower':0, 'abnorm_upper':6, 'min':(0, 0.5)},
            {'name':'segmented_neutrophils', 'lower':0, 'upper':0, 'mean':0, 'std':2.4, 'abnorm_lower':0.01, 'abnorm_upper':7, 'min':(0, 0)},
            {'name':'tibc', 'lower':250, 'upper':450, 'mean':242, 'std':90, 'abnorm_lower': 100, 'abnorm_upper':500, 'min':(3, 100)},
            {'name':'mcv', 'lower':80, 'upper':100, 'mean':90, 'std':5, 'abnorm_lower': 75, 'abnorm_upper':105, 'min':(0, 65)}]

#### Utils functions

In [4]:
def noisy_feature(df, feature, frac, lower, upper, seed):
    '''a frac of the feature data is made to be between lower and upper'''
    df[feature] = df[feature].sample(frac = 1-frac, random_state = seed)
    null_indices_list = df[df[feature].isnull()].index.tolist()
    df[feature] = df[feature].fillna(random.uniform(lower, upper))
    return df[feature]

In [5]:
def clip_value(val, feat_dict):
    '''prevents negative values in data following a normal distribution'''
    if val<=0:
        new_val = np.random.uniform(feat_dict['min'][0], feat_dict['min'][1])
    else:
        new_val = val
    return new_val                                                         

In [6]:
def normal_dist(df, col_name, num=None, cond=None):
    for feat_dict in feat_arr:
        if feat_dict['name'] == col_name:
            if cond is None:
                df[col_name] = np.random.normal(feat_dict['mean'], feat_dict['std'], num)
            else:
                df[col_name] = np.random.normal(feat_dict['mean'], feat_dict['std'] if cond else np.nan)
            df[col_name] = [np.random.uniform(feat_dict['min'][0], feat_dict['min'][1]) if val<0 else val for val in df[col_name]]
    return df

In [7]:
def less_mcv(df):
    irons = []
    ferritins = []
    tibcs = []
    for i, row in df.iterrows():
        if row['mcv']<80:
            ferritin = clip_value(np.random.normal(feat_arr[1]['mean'], feat_arr[1]['std']), feat_arr[1])
            tibc = clip_value(np.random.normal(feat_arr[5]['mean'], feat_arr[5]['std']), feat_arr[5])
        else:
            iron = ferritin = tibc = np.nan
        irons.append(iron)
        ferritins.append(ferritin)
        tibcs.append(tibc)
    return ferritins, tibcs

In [8]:
def normal_mcv(row):
    if (row['mcv']>=80) & (row['mcv'] <=100):
        return clip_value(np.random.normal(feat_arr[2]['mean'], feat_arr[2]['std']), feat_arr[2])
    else:
        return np.nan

In [9]:
def more_mcv(row):
    if row['mcv']> 100:
        return clip_value(np.random.normal(feat_arr[3]['mean'], feat_arr[3]['std']), feat_arr[3])
    else:
        return np.nan

In [10]:
def create_label(row):
    if row['mcv']<80 :
        if row['ferritin']<30:
            return 'Iron deficiency anemia'
        elif row['ferritin']>100:
            return 'Anemia of chronic disease'
        elif row['tibc']<450:
            return 'Anemia of chronic disease'
        elif row['tibc']>=450:
            return 'Iron deficiency anemia'
        else:
            pass
    elif row['mcv']<=100:
        if row['ret_count'] <=2:
            return 'Aplastic anemia'
        else:
            return 'Hemolytic anemia'
        
    elif row['mcv']> 100:
        if row['segmented_neutrophils']>0:
            return 'Vitamin B12/Folate deficiency anemia'
        else:
            pass
    else:
        pass

#### No anemia dataset

In [11]:
cols_to_use = [feat['name'] for feat in feat_arr]
present_fractions = [1.0, 0.6, 0.3, 0.1, 0.6, 1.0]

no_df = pd.DataFrame()
for idx, feat in enumerate(feat_arr):
    no_df[feat['name']] = np.random.uniform(feat['lower'], feat['upper'], 10000)
    no_df[feat['name']] = no_df[feat['name']].sample(frac=present_fractions[idx])
no_df['label'] = 'No anemia'
no_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
0,14.010155,,,,345.481644,91.442937,No anemia
1,16.948643,,,,,97.958187,No anemia
2,15.833169,171.449291,0.800313,,363.985025,96.683212,No anemia
3,15.153158,,,,,85.347757,No anemia
4,12.895695,96.569765,,,271.622851,99.016812,No anemia


In [12]:
no_df.describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
count,10000.0,6000.0,3000.0,1000.0,6000.0,10000.0
mean,14.620214,137.008678,1.510928,0.0,349.515381,89.976241
std,1.466914,73.190153,0.577518,0.0,57.136086,5.747731
min,12.100059,10.002352,0.500406,0.0,250.026002,80.006622
25%,13.356277,73.978997,1.023732,0.0,299.915333,84.957872
50%,14.611896,136.345683,1.501286,0.0,349.882057,90.041805
75%,15.874032,201.072785,2.01725,0.0,399.100208,94.913663
max,17.19856,262.972439,2.498595,0.0,449.97344,99.99824


In [13]:
# randomly place 30% of hb vaues below 12.1
noisy_no_df = no_df.copy()
noisy_no_df['hemoglobin'] = noisy_feature(noisy_no_df, 'hemoglobin', 0.3, 3, 12, 1)
noisy_no_df.describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
count,10000.0,6000.0,3000.0,1000.0,6000.0,10000.0
mean,12.869912,137.008678,1.510928,0.0,349.515381,89.976241
std,2.963361,73.190153,0.577518,0.0,57.136086,5.747731
min,8.754841,10.002352,0.500406,0.0,250.026002,80.006622
25%,8.754841,73.978997,1.023732,0.0,299.915333,84.957872
50%,13.530014,136.345683,1.501286,0.0,349.882057,90.041805
75%,15.35294,201.072785,2.01725,0.0,399.100208,94.913663
max,17.197744,262.972439,2.498595,0.0,449.97344,99.99824


#### Anemic dataset

In [14]:
anem_df= pd.DataFrame(columns = cols_to_use, dtype='object')
anem_df['hemoglobin'] = np.random.uniform(1, 12.1, 50000) #stopped here
anem_df = normal_dist(anem_df, 'mcv', len(anem_df))
anem_df.describe()

Unnamed: 0,hemoglobin,mcv
count,50000.0,50000.0
mean,6.569678,90.026833
std,3.205954,5.006698
min,1.000175,69.595835
25%,3.774922,86.665957
50%,6.579877,89.998775
75%,9.347484,93.369162
max,12.099882,111.491392


In [15]:
anem_df.isna().sum()

hemoglobin                   0
ferritin                 50000
ret_count                50000
segmented_neutrophils    50000
tibc                     50000
mcv                          0
dtype: int64

In [16]:
anem_df['ferritin'], anem_df['tibc'] = less_mcv(anem_df)
anem_df['ret_count'] = anem_df.apply(lambda row: normal_mcv(row), axis=1)
anem_df['segmented_neutrophils'] = anem_df.apply(lambda row: more_mcv(row), axis=1)
anem_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
0,5.270236,,0.972796,,,95.926449
1,2.395295,,0.938043,,,95.455899
2,4.802576,,0.225422,,,93.126124
3,5.786772,,5.107302,,,81.095111
4,2.78824,,3.004652,,,94.73574


In [17]:
anem_df.describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv
count,50000.0,1112.0,47714.0,1174.0,1112.0,50000.0
mean,6.569678,1651.056511,2.833326,0.943822,90.037595,90.026833
std,3.205954,2091.954091,2.139131,1.412888,4.893362,5.006698
min,1.000175,1.600221,5e-05,0.0,75.629038,69.595835
25%,3.774922,5.548739,0.917366,0.0,86.650977,86.665957
50%,6.579877,714.055661,2.609434,0.051461,90.173287,89.998775
75%,9.347484,2806.721883,4.297352,1.497811,93.345126,93.369162
max,12.099882,9313.488705,11.892821,8.640888,103.705859,111.491392


In [18]:
# assigning labels to the dataset
anem_df['label'] = anem_df.apply(lambda row: create_label(row), axis=1)
anem_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
0,5.270236,,0.972796,,,95.926449,Aplastic anemia
1,2.395295,,0.938043,,,95.455899,Aplastic anemia
2,4.802576,,0.225422,,,93.126124,Aplastic anemia
3,5.786772,,5.107302,,,81.095111,Hemolytic anemia
4,2.78824,,3.004652,,,94.73574,Hemolytic anemia


In [19]:
anem_df.isna().sum()

hemoglobin                   0
ferritin                 48888
ret_count                 2286
segmented_neutrophils    48826
tibc                     48888
mcv                          0
label                      577
dtype: int64

In [20]:
anem_df = anem_df[anem_df.label.notna()]
anem_df.isna().sum()

hemoglobin                   0
ferritin                 48311
ret_count                 1709
segmented_neutrophils    48826
tibc                     48311
mcv                          0
label                        0
dtype: int64

In [21]:
def make_noisy(df, anemia):
    noisy_df = df[df.label == anemia]
    if anemia == 'No anemia':
         noisy_df['hemoglobin'] = noisy_feature(noisy_df, 'hemoglobin', 0.3, 3, 12, 1)
    elif anemia == 'Hemolytic anemia':
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.2, 60, 79, 2)
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.2, 101, 108, 3)
        noisy_df['ret_count'] = noisy_feature(noisy_df, 'ret_count', 0.2, 0.1, 2, 4)
    elif anemia == 'Aplastic anemia':
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.2, 60, 79, 5)
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.2, 101, 108, 6)
        noisy_df['ret_count'] = noisy_feature(noisy_df, 'ret_count', 0.3, 2.1, 6, 7)
    elif anemia == 'Iron deficiency anemia':
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.3, 80, 110, 8)
        noisy_df['ferritin'] = noisy_feature(noisy_df, 'ferritin', 0.3, 100.1, 120, 9)
    elif anemia == 'Vitamin B12/Folate deficiency anemia':
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.3, 60, 99, 10)
        noisy_df['segmented_neutrophils'] = noisy_feature(noisy_df, 'segmented_neutrophils', 0.3, 0, 0, 11)
    elif anemia == 'Anemia of chronic disease':
        noisy_df['mcv'] = noisy_feature(noisy_df, 'mcv', 0.3, 80, 110, 12)
        noisy_df['ferritin'] = noisy_feature(noisy_df, 'ferritin', 0.3, 10, 30, 13)
    else:
        print('What the hell is this?')
    return noisy_df

In [22]:
noisy_anem_df = pd.DataFrame()
for anemia in anem_df.label.unique():
    #print(anemia)
    one_anem_df = make_noisy(anem_df, anemia)
    noisy_anem_df = pd.concat([noisy_anem_df, one_anem_df], axis=0)
noisy_anem_df = noisy_anem_df.sample(frac=1).reset_index(drop=True)
noisy_anem_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Tr

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
0,11.820498,,5.5939,,,81.405442,Hemolytic anemia
1,4.207335,,1.601331,,,60.475204,Aplastic anemia
2,11.69728,,4.334316,,,97.15562,Hemolytic anemia
3,11.73615,,5.931547,,,88.93092,Hemolytic anemia
4,9.6631,,3.016481,,,86.161449,Hemolytic anemia


In [23]:
noisy_anem_df.isna().sum()

hemoglobin                   0
ferritin                 48311
ret_count                 1709
segmented_neutrophils    48826
tibc                     48311
mcv                          0
label                        0
dtype: int64

In [25]:
def replace_nans2(df, cols, fracs):#fracs is a list
    for feat_dict in feat_arr:
        feat_name = feat_dict['name']
        if feat_name in cols:
            idx = cols.index(feat_name)
            nan_num = df[feat_name].isna().sum()
            fill_num = int(np.ceil(fracs[idx]*nan_num))
            generated_nums = np.random.uniform(feat_dict['abnorm_lower'], feat_dict['abnorm_upper'], fill_num)
            indices = list(df[df[feat_name].isna()].index)
            new_indices = random.sample(indices, fill_num)
            df.loc[new_indices, feat_name] = generated_nums
    return df

In [26]:
cols_to_fill = ['ferritin', 'ret_count', 'segmented_neutrophils', 'tibc']
fracs_to_use = [0.4, 0.2, 0.1, 0.4]

In [27]:
filled_noisy_anem_df = replace_nans2(noisy_anem_df, cols_to_fill, fracs_to_use)
filled_noisy_anem_df.isna().sum()

hemoglobin                   0
ferritin                 28986
ret_count                 1367
segmented_neutrophils    43943
tibc                     28986
mcv                          0
label                        0
dtype: int64

#### Merging the two datasets

In [28]:
final_df = pd.concat([noisy_no_df, filled_noisy_anem_df], axis=0)
final_df = final_df.sample(frac=1, random_state = 42).reset_index(drop=True)
final_df = final_df.fillna(0)
final_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
0,11.24388,29.371438,1.795141,0.0,159.663273,105.736896,Hemolytic anemia
1,9.083643,261.807395,0.054047,0.0,158.845165,93.985779,Aplastic anemia
2,7.15021,24.151978,4.094634,0.0,0.0,91.987864,Hemolytic anemia
3,4.297103,217.221095,0.72009,0.0,263.999772,88.331155,Aplastic anemia
4,8.460786,4.551094,1.795141,0.0,479.026287,86.478161,Hemolytic anemia


In [29]:
final_df.label.value_counts()

Hemolytic anemia                        28362
Aplastic anemia                         19352
No anemia                               10000
Anemia of chronic disease                 649
Vitamin B12/Folate deficiency anemia      597
Iron deficiency anemia                    463
Name: label, dtype: int64

In [30]:
final_df.to_csv('data/noisy_data_nomal_30_08_22.csv', index=False)

In [None]:
# def create_data(anemia, norm_col_list=[], abnorm_low_col_list=[], abnorm_high_col_list=[], abnorm_frac=None, rand_frac=None, df_len):
#     '''
#     norm_col_list = all values in column are normal
#     abnorm_col_list = 1-frac are normal, frac are abnormal
#     abnormal_frac = percentage of abnormal e.g. 0.3
#     df_len = length of synthentic_df
#     '''
#     if len(list(set(norm_col_list).intersection(abnorm_col_list))) !=0:
#         print('A feature can only be an element of one and only one list')
#         return
#     anemia_df = pd.DataFrame()
#     if len(norm_col_list) != 0:
#         #Only normal values
#         for norm_col in norm_col_list:
#             col_dict = [feat_dict for feat_dict in feat_arr if feat_dict['name']==norm_col][0]
#             anemia_df[norm_col] =  np.random.uniform(col_dict['lower'], col_dict['upper'], df_len)
#     if len(abnorm_low_col_list) != 0:
#         #70% normal values 30% lower values
#         for abnorm_low_col in abnorm_low_col_list:
#             col_dict = [feat_dict for feat_dict in feat_arr if feat_dict['name']==abnorm_low_col][0]
#             norm_low_list = list(np.random.uniform(col_dict['lower'], col_dict['upper'], df_len*(1-abnorm_frac)))
#             abnorm_low_list = list(np.random.uniform(col_dict['abnorm_lower'], col_dict['lower'], df_len*abnorm_frac))
#             low_list = norm_low_list + abnorm_low_list
#             anemia_df[abnorm_low_col] = random.sample(low_list, len(low_list))
#     if len(abnorm_high_col_list) != 0:
#         #70% normal value 30% lower values
#         for abnorm_high_col in abnorm_high_col_list:
#             col_dict = [feat_dict for feat_dict in feat_arr if feat_dict['name']==abnorm_high_col][0]
#             norm_high_list = list(np.random.uniform(col_dict['lower'], col_dict['upper'], df_len*(1-abnorm_frac)))
#             abnorm_high_list = list(np.random.uniform(col_dict['upper'], col_dict['abnorm_upper'], df_len*abnorm_frac))
#             high_list = norm_high_list + abnorm_high_list
#             #random.shuffle(high_list)
#             anemia_df[abnorm_high_col] = random.sample(high_list, len(high_list))
            
#     #rest of the columns        
#     all_features = [i['name'] for i in feat_arr]
#     rest_col_list = list(set(all_features) - set(norm_col_list) - set(abnorm_low_col_list) - set(abnorm_high_col_list))
#     if len(rest_col_list) != 0:
#         for rest_col in rest_col_list:
#             col_dict = [feat_dict for feat_dict in feat_arr if feat_dict['name']==rest_col][0]
#             anemia_df[rest_col] = np.random.uniform(col_dict['abnorm_lower'], col_dict['abnorm_upper'], df_len)
#             anemia_df[rest_col] = anemia_df.loc[anemia_df.sample(frac=rand_frac)index, rest_col]=np.nan #add rand state?       
#     return anemia_df

In [None]:
# def create_column(df_len, col_name, norm_frac, upper_frac, lower_frac, lower_bd=None, upper_bd=None):
#     #norm frac, upper_frac, lower_frac should add up to 1
#     col_dict = [feat_dict for feat_dict in feat_arr if feat_dict['name'] == col_name][0]
#     if lower_bd and upper_bd:
#         norm_list = list(np.random.uniform(lower_bd, upper_bd, int(df_len*norm_frac)))
#         upper_list = list(np.random.uniform(upper_bd, col_dict['abnorm_upper'], int(df_len*upper_frac)))
#         lower_list = abnorm_low_list = list(np.random.uniform(col_dict['abnorm_lower'], lower_bd, int(df_len*lower_frac)))
#     elif lower_bd:
#         norm_list = list(np.random.uniform(lower_bd, col_dict['upper'], int(df_len*norm_frac)))
#         upper_list = list(np.random.uniform(col_dict['upper'], col_dict['abnorm_upper'], int(df_len*upper_frac)))
#         lower_list = abnorm_low_list = list(np.random.uniform(col_dict['abnorm_lower'], lower_bd, int(df_len*lower_frac)))
#     elif upper_bd:
#         norm_list = list(np.random.uniform(col_dict['lower'], upper_bd, int(df_len*norm_frac)))
#         upper_list = list(np.random.uniform(upper_bd, col_dict['abnorm_upper'], int(df_len*upper_frac)))
#         lower_list = abnorm_low_list = list(np.random.uniform(col_dict['abnorm_lower'], col_dict['lower'], int(df_len*lower_frac)))
#     else:
#         norm_list = list(np.random.uniform(col_dict['lower'], col_dict['upper'], int(df_len*norm_frac)))
#         upper_list = list(np.random.uniform(col_dict['upper'], col_dict['abnorm_upper'], int(df_len*upper_frac)))
#         lower_list = abnorm_low_list = list(np.random.uniform(col_dict['abnorm_lower'], col_dict['lower'], int(df_len*lower_frac)))
        
#     whole_list = norm_list + upper_list + lower_list
#     if len(whole_list) < df_len:
#         for i in range(df_len - len(whole_list)):
#             whole_list.append(np.random.uniform(col_dict['abnorm_lower'], col_dict['abnorm_upper']))
#     elif len(whole_list) > df_len:
#         for i in range(len(whole_list) - df_len):
#             whole_list.pop()
#     else:
#         pass
#     shuffled_whole_list = random.sample(whole_list, len(whole_list))
#     return shuffled_whole_list

In [None]:
# #### No anemia
# df_len = 10000
# no_df = pd.DataFrame()
# no_df['hemoglobin'] = create_column(df_len, 'hemoglobin', 0.7, 0, 0.3)


In [None]:
# #### Hemolytic anemia
# df_len = 10000
# hem_df = pd.DataFrame()
# hem_df['hemoglobin'] = create_column(10000, 'hemoglobin', 0, 0, 1)
# hem_df['mcv'] = create_column(10000, 'mcv', 0.7, 0.15, 0.15)
# hem_df['ret_count'] = create_column(10000, 'ret_count', 0.15, 0.55, 0.3, lower_bd=2)
# df['ferritin'] = create_column(10000, 'ferritin', )
# df['segmented_neutrophils']
# df['tibc']
# hem_df.describe()