In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import sys
sys.path.append('../')
from modules import utils, constants

In [2]:
# Names of the feature columns, one per line.
# (The next cell rebuilds this same list from feat_arr.)
features = [
    'ana',
    'fever',
    'leukopenia',
    'thrombocytopenia',
    'auto_immune_hemolysis',
    'delirium',
    'psychosis',
    'seizure',
]

In [3]:
# Sampling range of every feature, written as (name, lower, upper) and
# expanded into the list-of-dicts layout used when the dataset is generated.
feat_arr = [
    {'name': name, 'lower': lower, 'upper': upper}
    for name, lower, upper in (
        ('ana', 0, 25),
        ('fever', 10, 36),
        ('leukopenia', 0.5, 5.5),
        ('thrombocytopenia', 0, 1),  # 0 absent
        ('auto_immune_hemolysis', 60, 95),
        ('delirium', 240, 450),
        ('psychosis', 4.8, 10.3),
        ('seizure', 80, 100),
    )
]
# Feature names in the same order as feat_arr.
features = [spec['name'] for spec in feat_arr]

In [4]:
def get_label(row, threshold=15):
    """Derive the binary label of one row from its ``ana`` value.

    Parameters
    ----------
    row : object exposing an ``ana`` attribute
        For example one row handed over by ``DataFrame.apply(..., axis=1)``.
    threshold : number, default 15
        Cut-off compared against ``row.ana``. The default reproduces the
        previously hard-coded value.

    Returns
    -------
    int or None
        0 when ``row.ana >= threshold`` and 1 when ``row.ana < threshold``.
        When neither comparison holds (e.g. ``ana`` is NaN) the message
        'Unknown ana value' is printed and None is returned.
    """
    ana = row.ana
    if ana >= threshold:
        return 0
    if ana < threshold:
        return 1
    # Only reachable when ana is neither >= nor < threshold (e.g. NaN).
    print('Unknown ana value')
    return None

#### Generating the synthetic dataset

In [5]:
# Build the synthetic dataset: every feature is drawn independently and
# uniformly from its [lower, upper) range given in feat_arr.
# A dedicated, seeded generator makes the draws repeatable, so re-running the
# notebook from a fresh kernel regenerates the same data (the unseeded global
# np.random state produced a different dataset on every run).
n_samples = 70000
rng = np.random.RandomState(42)  # same seed value as the shuffle below
simple_df = pd.DataFrame()
for feat in feat_arr:
    simple_df[feat['name']] = rng.uniform(feat['lower'], feat['upper'], n_samples)
# simple_df['ana'] = [1]*35000 + [0]*35000
# Label every row from its `ana` value via get_label.
simple_df['label'] = simple_df.apply(get_label, axis=1)
# Shuffle the rows with a fixed seed and renumber the index from 0.
simple_df = simple_df.sample(frac=1, random_state=42).reset_index(drop=True)
simple_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,label
0,19.7363,14.269858,3.104222,0.050291,66.144911,247.375466,8.905914,98.982852,0
1,11.673771,13.325764,2.8815,0.506983,83.933367,307.519239,5.21436,92.100978,1
2,9.567493,16.870439,5.099898,0.129086,78.469891,256.218377,9.791354,97.992413,1
3,21.421192,14.841025,0.652897,0.126821,69.280091,299.680233,4.957622,94.478144,0
4,24.113361,21.280716,3.125061,0.586256,81.000651,342.139488,10.146988,82.189625,0


In [6]:
# Number of rows per label value (class balance of the generated data).
simple_df['label'].value_counts()

1    41966
0    28034
Name: label, dtype: int64

In [7]:
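# Disabled experiment, kept for reference: it would overwrite every feature
# column with a random 0/1 value (0 with probability 0.75, 1 with 0.25).
# for i in features:
#     simple_df[i] = np.random.choice([0, 1], len(simple_df), p=[0.75, 0.25])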

In [8]:
# Sanity check: get_label assigns 1 whenever ana < 15, so no row may combine
# ana < 15 with label 0 -- the count displayed here should be 0.
simple_df.loc[(simple_df['ana'] < 15) & (simple_df['label'] == 0)].shape[0]

0

In [9]:
# Persist the complete labelled dataset (row index not written).
simple_df.to_csv(
    '../data/28_jan/very_simple_dataset.csv',
    index=False,
)

#### Splitting the dataset into train and test sets

In [10]:
# Every column except the last is a feature; the last column is the target.
X = simple_df.iloc[:, :-1]
y = simple_df.iloc[:, -1]
# Hold out 20% of the rows for testing, stratified on the target so both
# parts keep the same class proportions; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [11]:
# Re-attach the target column to each split and renumber the rows from 0.
train_df, test_df = (
    pd.concat([X_part, y_part], axis=1).reset_index(drop=True)
    for X_part, y_part in ((X_train, y_train), (X_test, y_test))
)
train_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,label
0,2.355559,23.275588,0.992151,0.031649,83.800891,424.05439,8.955005,97.923367,1
1,2.827644,10.440244,3.646265,0.245557,85.635244,314.385329,8.02326,93.09423,1
2,23.976005,10.394025,3.712749,0.293129,87.204209,433.242106,5.032835,91.21516,0
3,21.637678,14.643438,5.439854,0.556186,93.790617,426.112817,7.066064,92.727468,0
4,13.290648,26.657688,3.624939,0.952937,67.543757,366.875785,7.553432,82.498625,1


In [12]:
# Write both splits to CSV (row index not written).
train_df.to_csv(
    '../data/28_jan/train_set_basic.csv',
    index=False,
)
test_df.to_csv(
    '../data/28_jan/test_set_basic.csv',
    index=False,
)