In [None]:
import pandas as pd
import numpy as np
import random
import joblib
import sys
sys.path.append('../..')
from modules.many_features import utils, constants
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# One seed for every random source used below, so a fresh "Run All"
# regenerates the same synthetic features and the same NaN pattern.
SEED = 42
np.random.seed(SEED)  # NumPy global generator (np.random.uniform / np.random.choice)
random.seed(SEED)     # stdlib generator (random.uniform in add_nans)

In [None]:
# Base synthetic anemia dataset. Missing values are kept as NaN
# (no sentinel fill) so that create_label can fall through on them.
df = pd.read_csv(filepath_or_buffer='../../data/anemia_synth_dataset_hb_with_unspecified.csv')
df.head()

In [None]:
# Alias for the project's CLASS_DICT constant (modules.many_features.constants).
# NOTE(review): not referenced by any other cell visible in this notebook.
class_dict = constants.CLASS_DICT

In [None]:
# Per-feature bounds for the anemia-related lab values that already exist in
# the base dataset. Presumably 'lower'/'upper' is the normal range and
# 'abnorm_lower'/'abnorm_upper' the wider range allowed for abnormal values.
# NOTE(review): this list is not referenced by any cell visible here.
# NOTE(review): segmented_neutrophils has lower == upper == 0 and an
# abnorm_lower of 0.01, unlike the other entries - confirm this is intended.
anem_feat_arr = [
    dict(name='hemoglobin', lower=12.1, upper=17.2, abnorm_lower=6, abnorm_upper=18),
    dict(name='ferritin', lower=10, upper=263, abnorm_lower=0, abnorm_upper=500),
    dict(name='ret_count', lower=0.5, upper=2.5, abnorm_lower=0, abnorm_upper=6),
    dict(name='segmented_neutrophils', lower=0, upper=0, abnorm_lower=0.01, abnorm_upper=7),
    dict(name='tibc', lower=250, upper=450, abnorm_lower=100, abnorm_upper=500),
    dict(name='mcv', lower=80, upper=100, abnorm_lower=75, abnorm_upper=105),
]

In [None]:
# Additional features that are synthesised and appended to the base dataset.
# create_dataset samples every non-gender feature uniformly from
# [abnorm_lower, abnorm_upper]; 'lower'/'upper' are only used for gender,
# where they are the two category codes.
# The 'name' values become column names in the exported CSV, so the existing
# spelling 'cholestrol' is kept on purpose to avoid changing the schema.
add_feat_arr = [{'name':'serum_iron', 'lower':60, 'upper':170, 'abnorm_lower':20, 'abnorm_upper':250},
                {'name':'rbc', 'lower':3.8, 'upper':5.9, 'abnorm_lower': 2.5, 'abnorm_upper':7},
                {'name':'age', 'lower':20, 'upper':90, 'abnorm_lower':18, 'abnorm_upper':95},
                {'name':'gender', 'lower':0, 'upper':1}, #0=Female 1=male
                {'name':'indirect_bilirubin', 'lower':0.2, 'upper':1.2, 'abnorm_lower':0, 'abnorm_upper':3},
                {'name':'transferrin', 'lower':204, 'upper':360, 'abnorm_lower':150, 'abnorm_upper':450},
                {'name':'creatinine', 'lower':0.6, 'upper':1.3, 'abnorm_lower':0.2, 'abnorm_upper':2},
                {'name':'cholestrol', 'lower':30, 'upper':100, 'abnorm_lower':0, 'abnorm_upper':150},
                # NOTE(review): abnorm_upper was 130, i.e. below the normal upper bound of 140,
                # so sampled copper values could never reach the top of the normal range.
                # Widened to 180 so the sampling range encloses the normal range like every
                # other feature - confirm the intended clinical value.
                {'name':'copper', 'lower':62, 'upper':140, 'abnorm_lower':30, 'abnorm_upper':180},
                {'name':'ethanol', 'lower':0, 'upper':50, 'abnorm_lower':0, 'abnorm_upper':80},
                {'name':'folate', 'lower':2.7, 'upper':17, 'abnorm_lower':0.5, 'abnorm_upper':30},
                {'name':'glucose', 'lower':70, 'upper':100, 'abnorm_lower':40, 'abnorm_upper':140}
               ]

In [None]:
def create_label(row):
    """Assign an anemia diagnosis label to one record.

    `row` is any mapping (e.g. a DataFrame row) providing 'hemoglobin',
    'gender', 'mcv', 'ferritin', 'tibc', 'ret_count' and
    'segmented_neutrophils'; only the keys on the path taken are read.

    Decision rules:
      * hemoglobin > 13, or hemoglobin > 12 with gender == 0 -> 'No anemia'
      * mcv < 80:   ferritin < 30 -> iron deficiency; ferritin > 100 -> chronic
                    disease; otherwise tibc < 450 -> chronic disease,
                    tibc >= 450 -> iron deficiency
      * mcv <= 100: ret_count <= 2 -> aplastic; ret_count > 2 -> hemolytic
      * mcv > 100:  segmented_neutrophils > 0 -> B12/folate deficiency;
                    == 0 -> unspecified

    Comparisons against NaN are always False, so a missing value on the path
    falls through to 'Inconclusive diagnosis'.
    """
    inconclusive = 'Inconclusive diagnosis'

    hb = row['hemoglobin']
    if hb > 13:
        return 'No anemia'
    is_female = row['gender'] == 0
    if (hb > 12) & is_female:
        return 'No anemia'

    mcv = row['mcv']

    # Microcytic: split on iron stores, then on binding capacity.
    if mcv < 80:
        ferritin = row['ferritin']
        if ferritin < 30:
            return 'Iron deficiency anemia'
        if ferritin > 100:
            return 'Anemia of chronic disease'
        tibc = row['tibc']
        if tibc < 450:
            return 'Anemia of chronic disease'
        if tibc >= 450:
            return 'Iron deficiency anemia'
        return inconclusive

    # Normocytic: split on reticulocyte count.
    if mcv <= 100:
        reticulocytes = row['ret_count']
        if reticulocytes <= 2:
            return 'Aplastic anemia'
        if reticulocytes > 2:
            return 'Hemolytic anemia'
        return inconclusive

    # Macrocytic: split on segmented neutrophils.
    if mcv > 100:
        neutrophils = row['segmented_neutrophils']
        if neutrophils > 0:
            return 'Vitamin B12/Folate deficiency anemia'
        if neutrophils == 0:
            return 'Unspecified anemia'
        return inconclusive

    # Reached only when mcv is NaN (every comparison above was False).
    return inconclusive

In [None]:
def uniform_dist(df, feat_dict, num=None):
    """Draw values uniformly from a feature's abnormal range.

    Parameters
    ----------
    df : unused; kept so the signature matches how create_dataset calls it.
    feat_dict : mapping with 'abnorm_lower' and 'abnorm_upper' keys giving the
        low (inclusive) and high (exclusive) ends of the sampling interval.
    num : number of values to draw. With the default None a single scalar is
        returned instead of an array.

    Returns
    -------
    numpy array of `num` floats (or one float when `num` is None), drawn from
    NumPy's global generator, so results depend on np.random.seed.

    Raises
    ------
    KeyError if either bound is missing from `feat_dict`.
    """
    # The previous version also looked up feat_dict['name'] into an unused
    # local; that dead read has been removed.
    return np.random.uniform(feat_dict['abnorm_lower'], feat_dict['abnorm_upper'], num)

In [None]:
def create_dataset(sample_num, dist_type='uniform'):
    """Build a DataFrame with one synthetic column per entry of add_feat_arr.

    Parameters
    ----------
    sample_num : number of rows to generate.
    dist_type : 'normal' routes non-gender features to normal_dist; any other
        value (default 'uniform') samples them with uniform_dist.

    Returns
    -------
    DataFrame with `sample_num` rows, columns in add_feat_arr order.
    'gender' is drawn with np.random.choice from its 'lower'/'upper' codes.
    """
    dataset = pd.DataFrame()
    for spec in add_feat_arr:
        feature = spec['name']
        if feature == 'gender':
            # Categorical: pick one of the two codes per row.
            column = np.random.choice([spec['lower'], spec['upper']], sample_num)
        elif dist_type == 'normal':
            # NOTE(review): normal_dist is not defined in any cell visible in
            # this notebook; unless it exists elsewhere this branch raises
            # NameError.
            column = normal_dist(dataset, spec, sample_num)
        else:
            column = uniform_dist(dataset, spec, sample_num)
        dataset[feature] = column
    return dataset

In [None]:
def add_nans(df):
    """Blank out a random share of every column except 'gender' and 'age'.

    For each eligible column a fraction between 0.1 and 0.7 (rounded to one
    decimal) is drawn with the stdlib `random` module, that fraction of rows
    is sampled, and the column is set to NaN on those rows.

    `df` is modified in place and also returned.

    Each column gets its own sampling seed, itself drawn from `random`, so the
    result is still reproducible after random.seed(...). Previously every
    column used the same fixed random_state, which makes DataFrame.sample
    return prefixes of one row permutation: the NaN rows of a column with a
    smaller fraction were always a subset of those of a column with a larger
    one, so missingness was nested across columns rather than independent.
    """
    for col in df.columns:
        if col in ('gender', 'age'):
            continue
        col_frac = round(random.uniform(0.1, 0.7), 1)
        # Fresh, reproducible seed per column (valid range for random_state is 0..2**32-1).
        col_seed = random.randrange(2**32)
        nan_rows = df.sample(frac=col_frac, random_state=col_seed).index
        df.loc[nan_rows, col] = np.nan
    return df

In [None]:
# One synthetic row of extra features per row of the base dataset, with a
# random share of each column (except gender/age) replaced by NaN.
add_ft_df = add_nans(create_dataset(len(df)))
add_ft_df.head()

In [None]:
# Both frames must have the same number of rows before the column-wise concat.
df.shape[0], add_ft_df.shape[0]

#### Concatenate the base dataset and the additional-feature DataFrame

In [None]:
# Column-wise join (rows are matched on the index), then replace the original
# 'label' column with one recomputed by create_label from the combined row.
# The recomputed label ends up as the last column.
new_df = pd.concat([df, add_ft_df], axis=1)
new_df = new_df.drop(columns=['label']).assign(label=new_df.apply(create_label, axis=1))
new_df.head()

In [None]:
# Missing-value count per column.
new_df.isnull().sum()

In [None]:
# Class balance of the recomputed labels.
new_df['label'].value_counts()

In [None]:
# Persist the extended dataset; the index is dropped so the file holds only feature and label columns.
new_df.to_csv(path_or_buf='../../data/anemia_synth_dataset_some_nans_unspecified_more_feats.csv', index=False)