In [2]:
import pandas as pd
import numpy as np
import random
import sys
sys.path.append('../..')
from modules.many_features import utils, constants
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [3]:
feat_thresh_arr = [{'name':'hemoglobin', 'thresh':[12], 'std':2},
                   {'name':'ferritin', 'thresh':[30, 100], 'std':2},
                   {'name':'ret_count', 'thresh':[2], 'std':0.2},
                   {'name':'segmented_neutrophils', 'thresh':[0], 'std':2},
                   {'name':'tibc', 'thresh':[450], 'std':50},
                   {'name':'mcv', 'thresh':[80,100], 'std':2}]

anemias_features_dict = {'Vitamin B12/Folate deficiency anemia': ['mcv', 'segmented_neutrophils'], 
                         'Unspecified anemia': ['mcv', 'segmented_neutrophils'], 
                         'Anemia of chronic disease': ['mcv', 'ferritin', 'tibc'], 
                         'Iron deficiency anemia': ['mcv', 'ferritin', 'tibc'], 
                         'Hemolytic anemia': ['mcv', 'ret_count'], 
                         'Aplastic anemia': ['mcv', 'ret_count']}

#### The data

In [4]:
all_df= pd.read_csv('../../data/more_features/more_feats_correlated_0.1.csv')
all_df = all_df.fillna(-1)
all_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,11.183192,187.573466,4.951674,1.661383,316.593436,95.006424,181.242992,3.531295,1,-1.0,28.040619,96.721542,49.530542,23.186628,-1.0,33.549575,57.247868,Hemolytic anemia
1,14.387445,-1.0,-1.0,-1.0,-1.0,-1.0,125.249617,-1.0,1,-1.0,98.357508,112.758764,62.464566,7.16892,-1.0,43.162335,-1.0,No anemia
2,12.749357,5.012158,3.5028,6.179371,498.418768,76.759285,159.834784,4.982859,1,1.022939,56.850479,75.739552,72.072041,20.600875,44.872138,38.248071,32.068372,Iron deficiency anemia
3,11.50887,197.180945,1.200125,0.0,457.033309,102.900301,131.177927,3.355346,1,-1.0,111.220307,66.999185,18.353272,14.132423,-1.0,34.526609,28.70205,Unspecified anemia
4,9.456656,427.952052,-1.0,0.660252,-1.0,104.543774,-1.0,2.713693,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,28.369968,-1.0,Vitamin B12/Folate deficiency anemia


In [5]:
utils.get_dt_performance(all_df)

(0.9996428571428572,
 0.9996136838530153,
 0.9997821980021458,
 datetime.timedelta(microseconds=1987))

In [6]:
class_dict = constants.CLASS_DICT
#print(all_df.label.unique())
all_df['label'] = all_df['label'].replace(class_dict)
X = all_df.iloc[:, 0:-1]
y = all_df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train_df = pd.concat([X_test, y_test], axis=1)
X_test_df = pd.concat([X_test, y_test], axis=1)
X_test_df = X_test_df.reset_index(drop=True)
X_test_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,11.391136,-1.0,1.504298,5.058881,467.197112,96.252926,217.860499,3.550376,1,-1.0,36.70381,122.310168,49.897027,20.353251,-1.0,34.173407,46.631388,6
1,10.355048,272.506966,-1.0,0.0,128.706486,101.912313,-1.0,3.048223,1,-1.0,-1.0,-1.0,29.327349,-1.0,-1.0,31.065144,-1.0,2
2,11.159385,-1.0,-1.0,0.0,-1.0,103.395616,77.212369,3.23787,0,-1.0,8.690122,65.554731,19.167967,7.239049,-1.0,33.478155,-1.0,2
3,8.179735,140.876632,5.484515,-1.0,303.740826,99.877458,155.573175,2.456931,0,-1.0,41.123526,119.43384,39.384848,6.394235,-1.0,24.539204,51.219053,5
4,9.916825,-1.0,3.90981,-1.0,-1.0,90.543986,189.347916,3.285748,0,1.501433,34.905667,59.177001,58.538524,22.843594,139.245204,29.750475,-1.0,5


In [7]:
# X_test_df.to_csv('../../data/more_features/train_sets/test_set_constant.csv', index=False)

In [8]:
def get_dict(dict_arr, feat_name):
    for dictionary in dict_arr:
        if dictionary['name']==feat_name:
            return dictionary

In [9]:
def feat_noisiness(df, feat_name, frac, mean, std):
    df[feat_name] = df[feat_name].sample(frac=1-frac)
    if feat_name ==  'segmented_neutrophils': #how was this a good idea
        df[feat_name] = df[feat_name].fillna(-1)
    else:
        nan_index = df[df[feat_name].isna()].index
        fill_values = np.random.normal(mean, std, size=len(nan_index))
        df[feat_name] = df[feat_name].fillna(pd.Series(fill_values, index=nan_index))
    return df[feat_name]

In [10]:
total_frac = 0.1

In [11]:
noisy_anemia_dfs_list = []
for label in X_train_df.label.unique():
    anemia = constants.ACTION_SPACE[label]
    print(f'{label} - {constants.ACTION_SPACE[label]}')
    if anemia not in ['No anemia', 'Inconclusive diagnosis']:
        anemia_df = X_train_df[X_train_df.label==label]
        feature_num = len(anemias_features_dict[anemia])
        frac = total_frac/feature_num
        for feat in anemias_features_dict[anemia]:
            feat_info = get_dict(feat_thresh_arr, feat)
            threshes, std = feat_info['thresh'], feat_info['std']
            print(f'feat:{feat}, threshes:{threshes}, std:{std}')
            for thresh in threshes:
                anemia_df[feat] = feat_noisiness(anemia_df, feat, frac/len(threshes), thresh, std)
        noisy_anemia_dfs_list.append(anemia_df)

6 - Aplastic anemia
feat:mcv, threshes:[80, 100], std:2
feat:ret_count, threshes:[2], std:0.2
2 - Unspecified anemia
feat:mcv, threshes:[80, 100], std:2
feat:segmented_neutrophils, threshes:[0], std:2
5 - Hemolytic anemia
feat:mcv, threshes:[80, 100], std:2
feat:ret_count, threshes:[2], std:0.2
0 - No anemia
4 - Iron deficiency anemia
feat:mcv, threshes:[80, 100], std:2
feat:ferritin, threshes:[30, 100], std:2
feat:tibc, threshes:[450], std:50
3 - Anemia of chronic disease
feat:mcv, threshes:[80, 100], std:2
feat:ferritin, threshes:[30, 100], std:2
feat:tibc, threshes:[450], std:50
7 - Inconclusive diagnosis
1 - Vitamin B12/Folate deficiency anemia
feat:mcv, threshes:[80, 100], std:2
feat:segmented_neutrophils, threshes:[0], std:2


In [12]:
noisy_anemia_dfs_list[0].describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
count,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0,1804.0
mean,9.52818,120.738323,1.059264,2.714868,210.585193,89.816145,85.069516,3.197021,0.577051,0.000395,47.001392,50.364464,29.366471,9.356574,25.165446,28.584541,23.786132,6.0
std,2.039479,162.754674,0.611086,2.496026,167.623291,6.092881,83.747525,0.716963,0.494164,1.111089,50.103219,44.819183,26.998699,10.353845,44.500772,6.118437,37.686457,0.0
min,6.012741,-1.0,0.000472,-1.0,-1.0,76.005177,-1.0,1.840245,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,18.038222,-1.0,6.0
25%,7.729755,-1.0,0.534371,0.704494,-1.0,84.359639,-1.0,2.588521,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,23.189266,-1.0,6.0
50%,9.54656,-1.0,1.056295,2.748082,213.308271,89.696297,67.048163,3.18323,1.0,-1.0,31.611403,51.266659,26.677087,6.877341,-1.0,28.639679,-1.0,6.0
75%,11.323036,246.669088,1.602119,4.85918,357.269658,95.139899,156.621345,3.784719,1.0,1.060526,90.747333,89.685525,52.975697,18.417896,52.488947,33.969109,42.39114,6.0
max,12.99876,499.713675,2.499364,6.996337,499.97806,104.825231,249.899027,4.852152,1.0,1.998975,149.935213,129.906626,79.990568,29.953309,139.902597,38.996281,215.032858,6.0


In [13]:
no_df = X_train_df[X_train_df.label==constants.CLASS_DICT['No anemia']]
inconc_df = X_train_df[X_train_df.label==constants.CLASS_DICT['Inconclusive diagnosis']]
len(no_df), len(inconc_df)

(2000, 990)

In [14]:
noisy_anemia_dfs_list.append(no_df)
noisy_anemia_dfs_list.append(inconc_df)
len(noisy_anemia_dfs_list)

8

In [15]:
combined_df = pd.concat(noisy_anemia_dfs_list, axis=0)
combined_df = combined_df.sample(frac=1).reset_index(drop=True)
combined_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,12.927352,317.93186,-1.0,6.696206,395.154399,75.891614,24.378607,5.11019,1,-1.0,69.2024,116.158696,78.143398,3.049132,-1.0,38.782056,6.169388,3
1,8.868128,39.753584,-1.0,2.969229,481.114564,77.238889,157.579958,3.444429,1,-1.0,108.509584,37.598654,8.284188,29.553581,-1.0,26.604385,32.753105,4
2,11.959294,80.956898,4.930771,6.757627,498.99416,76.683822,229.80609,4.678677,1,0.263527,121.07646,125.288923,-1.0,7.868289,59.048433,35.877881,46.053864,4
3,12.66765,326.565335,4.421412,3.038288,379.948313,90.073832,141.347497,4.219089,0,0.916064,146.297079,43.620029,77.967802,23.885814,-1.0,38.002951,37.20177,0
4,12.39588,238.452713,3.006709,3.844979,-1.0,77.802603,79.224562,4.779743,0,1.317751,61.811659,62.067772,57.26326,16.033517,130.555051,37.187641,-1.0,0


#### The correlated features

In [16]:
def get_rbc(row):
    if row['mcv'] == -1:
        return -1
    else:
        return (30*row['hemoglobin'])/row['mcv']

def get_tsat(row):
    if (row['serum_iron']==-1) | (row['tibc']==-1):
        return -1
    else:
        return (row['serum_iron']/row['tibc'])*100

In [17]:
combined_df['rbc'] = combined_df.apply(lambda row: get_rbc(row), axis=1)
combined_df['tsat'] = combined_df.apply(lambda row: get_tsat(row), axis=1)
combined_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,12.927352,317.93186,-1.0,6.696206,395.154399,75.891614,24.378607,5.11019,1,-1.0,69.2024,116.158696,78.143398,3.049132,-1.0,38.782056,6.169388,3
1,8.868128,39.753584,-1.0,2.969229,481.114564,77.238889,157.579958,3.444429,1,-1.0,108.509584,37.598654,8.284188,29.553581,-1.0,26.604385,32.753105,4
2,11.959294,80.956898,4.930771,6.757627,498.99416,76.683822,229.80609,4.678677,1,0.263527,121.07646,125.288923,-1.0,7.868289,59.048433,35.877881,46.053864,4
3,12.66765,326.565335,4.421412,3.038288,379.948313,90.073832,141.347497,4.219089,0,0.916064,146.297079,43.620029,77.967802,23.885814,-1.0,38.002951,37.20177,0
4,12.39588,238.452713,3.006709,3.844979,-1.0,77.802603,79.224562,4.779743,0,1.317751,61.811659,62.067772,57.26326,16.033517,130.555051,37.187641,-1.0,0


#### Randomly modifying to no anemia

In [18]:
anemic_df = combined_df[combined_df.label != constants.CLASS_DICT['No anemia']]
non_anemic_df = combined_df[combined_df.label == constants.CLASS_DICT['No anemia']]
len(anemic_df), len(non_anemic_df)

(12000, 2000)

In [19]:
anemic_df.label.value_counts()

3    1951
4    1853
2    1807
6    1804
1    1800
5    1795
7     990
Name: label, dtype: int64

In [20]:
anemic_df['label'] = anemic_df['label'].sample(frac=1-0.1)
anemic_df['label'] = anemic_df['label'].fillna(constants.CLASS_DICT['No anemia'])
anemic_df.label.value_counts()

3.0    1749
4.0    1661
1.0    1637
6.0    1632
5.0    1623
2.0    1608
0.0    1200
7.0     890
Name: label, dtype: int64

#### Finalizing 

In [21]:
final_df = pd.concat([non_anemic_df, anemic_df], axis=0)
final_df = final_df.sample(frac=1).reset_index(drop=True)
final_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,6.18278,382.662823,1.398232,4.577306,372.155901,87.645846,-1.0,2.116283,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,18.54834,-1.0,6.0
1,12.038808,-1.0,0.401422,2.040508,221.499167,-1.0,-1.0,-1.0,0,-1.0,-1.0,-1.0,30.887269,-1.0,-1.0,36.116425,-1.0,0.0
2,8.924011,3.029055,-1.0,-1.0,482.48927,75.195947,-1.0,3.560303,0,-1.0,-1.0,-1.0,53.878588,-1.0,-1.0,26.772034,-1.0,4.0
3,12.066383,-1.0,-1.0,-1.0,-1.0,102.470973,146.81654,3.532625,1,0.299036,86.782115,101.947605,22.933228,3.788646,-1.0,36.199149,-1.0,7.0
4,8.49575,-1.0,1.87095,2.983976,105.351976,88.069236,-1.0,2.894002,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,25.48725,-1.0,6.0


In [22]:
final_df.label.value_counts()

0.0    3200
3.0    1749
4.0    1661
1.0    1637
6.0    1632
5.0    1623
2.0    1608
7.0     890
Name: label, dtype: int64

In [23]:
utils.get_dt_performance(final_df, 'numeric')

TypeError: Cannot compare types 'ndarray(dtype=float64)' and 'str'

In [None]:
final_df.to_csv('../../data/more_features/train_sets/train_set_noisy_1.csv', index=False)