In [None]:
import os
import requests
import csv
import emc
import logging 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

from imblearn.over_sampling import SMOTE, RandomOverSampler

from sklearn import preprocessing 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score

from matplotlib.ticker import StrMethodFormatter

In [None]:
# Load the raw stroke dataset; the path is relative to the working directory.
stroke_csv_path = 'healthcare-dataset-stroke-data.csv'
df = pd.read_csv(stroke_csv_path)

In [None]:
# Preview the first three rows to sanity-check the load.
df.head(3)

In [None]:
# Count how many rows carry each value of the 'stroke' label.
df['stroke'].value_counts()

# Observation from the counts displayed by this cell: the label is imbalanced.

In [None]:
# Impute missing BMI values with the mean BMI of the row's age quartile.
q1 = df['age'].quantile(0.25)
q2 = df['age'].median()
q3 = df['age'].quantile(0.75)

# Four disjoint age bands: (.., q1], (q1, q2], (q2, q3], (q3, ..)
age = df['age']
age_band_masks = [
    age <= q1,
    (age > q1) & (age <= q2),
    (age > q2) & (age <= q3),
    age > q3,
]

# Compute every band mean (rounded to one decimal) before writing anything
# back, so no imputed value can influence a mean.
band_bmi_means = [round(df.loc[band, 'bmi'].mean(), 1) for band in age_band_masks]

# Fill only the rows of each band whose BMI is missing.
for band, band_mean in zip(age_band_masks, band_bmi_means):
    needs_fill = band & df['bmi'].isna()
    df.loc[needs_fill, 'bmi'] = band_mean

### Categorical data

gender, ever_married, Residence_type => binary

work_type => one-hot (Nominal)

smoking_status => integer (Ordinal)

In [None]:
# Remove the rows whose gender is "Other" so that gender can be encoded as binary.
other_gender_rows = df.index[df['gender'] == 'Other']
df.drop(index=other_gender_rows, inplace=True)

In [None]:
# One-hot encode "work_type" using a fixed category list so the dummy columns
# always appear in the same order.
work_type_levels = ['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked']
work_type_cat = pd.Categorical(df['work_type'], categories=work_type_levels)
work_type_dummies = pd.get_dummies(work_type_cat).astype(int)

# Replace the original work_type column with its dummy columns. Both frames are
# re-indexed from 0 so that the column-wise concat aligns rows by position.
base_df = df.drop('work_type', axis=1).reset_index(drop=True)
numerical_df = pd.concat([base_df, work_type_dummies.reset_index(drop=True)], axis=1)

# Move the 'stroke' label to the last position; the stable sort keeps the
# relative order of every other column.
stroke_last_order = sorted(numerical_df.columns, key=lambda col: col == 'stroke')
numerical_df = numerical_df[stroke_last_order]

# Encode the two-valued text columns as integer codes.
# Codes follow order of first appearance in the data (pd.factorize).
for binary_col in ('gender', 'ever_married', 'Residence_type'):
    codes, _ = pd.factorize(numerical_df[binary_col])
    numerical_df[binary_col] = codes

In [None]:
# Map smoking status onto integer codes that follow the explicit order below
# (position in the list = encoded value).
smoking_order = ['never smoked', 'Unknown', 'formerly smoked', 'smokes']
smoke_encoder = preprocessing.OrdinalEncoder(categories=[smoking_order])

# The encoder expects a 2-D input, hence the single-column (n, 1) array.
smoking_column = numerical_df[['smoking_status']].to_numpy()
num_smoke_data = smoke_encoder.fit_transform(smoking_column)

numerical_df['smoking_status'] = num_smoke_data.astype(int)

In [None]:
# Cast age to integer (any fractional part is truncated), then preview the frame.
numerical_df = numerical_df.astype({'age': int})
numerical_df.head(3)

In [None]:
# Separate the label from the features before any scaling.
is_stroke = numerical_df['stroke'].to_numpy()

# Build the feature frame WITHOUT mutating numerical_df, so this cell gives the
# same result when re-run. 'id' is a row identifier, not a predictor: it is
# still present in numerical_df at this point and must not reach the scaler.
features_df = (
    numerical_df
    .drop(columns=['stroke'])
    .drop(columns=['id'], errors='ignore')
)

# Feature normalization: log-transform the two continuous measurements ...
features_df = features_df.assign(
    bmi=np.log(features_df['bmi']),
    avg_glucose_level=np.log(features_df['avg_glucose_level']),
)

# ... then standardize every feature column.
scaler = preprocessing.StandardScaler().fit(features_df)
medical_data = scaler.transform(features_df)

In [None]:
# Pull the id column out of df as a NumPy array; pop() also removes the
# column from df in place.
sample_ids = df.pop('id').to_numpy()

In [None]:
# Sanity check: features, labels and ids should all describe the same number of rows.
print(medical_data.shape, is_stroke.shape, len(sample_ids), sep='\n')

## Splitting Imbalanced Data 
Our data is heavily imbalanced, so the majority and minority classes need to be balanced before training. To do this, we randomly oversample the minority class.


In [None]:
# Smoke-test the split/oversample helper on the full dataset.
split_result = emc.train_test_split_oversample(
    medical_data,
    is_stroke,
    sample_ids=sample_ids,
    oversample_type='ros',
    split_ratio=0.7,
    oversample_size=0.4,
    seed=0,
)
# NOTE(review): unpacking order (test_ids before train_ids) is kept exactly as
# written; confirm it against the helper's return order.
train_x, train_y, test_x, test_y, test_ids, train_ids = split_result

# Report the per-label sample counts in each split.
for label_value in (1, 0):
    print(f"Number of {label_value} label samples in:")
    print("Train Set = ", sum(train_y == label_value))
    print("Test Set = ", sum(test_y == label_value))

In [None]:
# Build a small, class-balanced subset of the data for quick debugging runs.

# Number of samples drawn from EACH class (minor and major).
n_samples = 80

# Fixed seed so the debugging subset is identical on every run.
small_rng = np.random.default_rng(0)

# Identify the minor label.
minor_label = emc.get_minor_label(is_stroke)

# Randomly pick up to n_samples row positions from each class.
minor_label_idx_arr = small_rng.permutation(np.where(is_stroke == minor_label)[0])[:n_samples]
major_label_idx_arr = small_rng.permutation(np.where(is_stroke != minor_label)[0])[:n_samples]

# Combine both classes and shuffle once, then index every array with the same
# positions so data, labels and ids stay aligned.
shuffled_idx = small_rng.permutation(
    np.concatenate([minor_label_idx_arr, major_label_idx_arr])
)
small_data = medical_data[shuffled_idx]
small_label = is_stroke[shuffled_idx]
small_sample_ids = sample_ids[shuffled_idx]

print('small_data shape:',small_data.shape)
print('small_label shape:',small_label.shape)
print('small_sample_ids shape:',small_sample_ids.shape)

In [None]:
# Smoke-test the train/test split helper on the SMALL DEBUGGING DATASET.
# Note: this rebinds train_x / train_y / test_x / test_y from the full-data cell.
train_x, train_y, test_x, test_y = \
    emc.train_test_split_oversample(small_data, small_label, 
                                    oversample_type='ros', 
                                    split_ratio=0.8, 
                                    oversample_size=20,
                                    seed=0)

# Messages describe the stroke labels (1 / 0), matching the full-data split
# cell; the earlier "active/inactive compounds" wording did not describe this dataset.
print("Number of 1 label samples in:")
print("Train Set = ", sum(train_y == 1))
print("Test Set = ", sum(test_y == 1))

print("Number of 0 label samples in:")
print("Train Set = ", sum(train_y == 0))
print("Test Set = ", sum(test_y == 0))

### Model configuration

In [None]:
# Settings shared by the EMC and the random-sampling simulations.
config = dict(
    emc_dir_path='stroke_emc_res',
    rand_sampling_dir_path='stroke_rand_sampling_res',
    n_sim=10,
    initial_train_ratio=0.05,
    # 'ros' OR 'smote'
    oversample_type='smote',
    split_ratio=0.8,
    # ratio in range [0.0,1.0] OR number (positive int) of samples to oversample
    oversample_size=0.5,
    log_freq=50,
)

# Lighter variant for quick test runs: same keys as `config`, with only the
# values listed here overridden.
test_config = {
    **config,
    'n_sim': 3,
    'initial_train_ratio': 0.2,
    'oversample_size': 0.2,
    'log_freq': 4,
}

### Set up results directory and logger  

In [None]:
# Create fresh directories to hold the results of each simulation type.

def _make_results_dir(base_name):
    """Create a results directory derived from `base_name` and return its path.

    The actual path comes from emc.make_unique_file_name(base_name).
    """
    unique_path = emc.make_unique_file_name(base_name)
    os.mkdir(unique_path)
    return unique_path


# EMC sampling results directory
emc_dir_path = _make_results_dir(config['emc_dir_path'])

# Random sampling results directory
rand_sampling_dir_path = _make_results_dir(config['rand_sampling_dir_path'])

In [None]:
# Set up the file logger for the EMC sampling simulation.
# NOTE(review): logging.getLogger(__name__) returns the same logger object in
# the random-sampling logger cell too, so handlers attached in either cell
# receive every record. Confirm how the emc module looks up its logger before
# giving the two loggers distinct names.
logger_emc = logging.getLogger(__name__)
logger_emc.setLevel(logging.INFO)

# Re-running this cell used to stack another FileHandler on the logger, which
# duplicates every log line. Remove the handler a previous run attached first.
emc_handler_name = 'emc_file_handler'
for stale_handler in [h for h in logger_emc.handlers if h.get_name() == emc_handler_name]:
    logger_emc.removeHandler(stale_handler)
    stale_handler.close()

formatter = logging.Formatter('%(asctime)s %(message)s', datefmt='%Y-%m-%d %H:%M')
file_handler = logging.FileHandler(f'{emc_dir_path}/emc.log')
file_handler.set_name(emc_handler_name)
file_handler.setFormatter(formatter)

logger_emc.addHandler(file_handler)

In [None]:
# Set up the file logger for the random sampling simulation.
# NOTE(review): logging.getLogger(__name__) returns the same logger object in
# the EMC logger cell too, so handlers attached in either cell receive every
# record. Confirm how the emc module looks up its logger before giving the two
# loggers distinct names.
logger_rand_sampler = logging.getLogger(__name__)
logger_rand_sampler.setLevel(logging.INFO)

# Re-running this cell used to stack another FileHandler on the logger, which
# duplicates every log line. Remove the handler a previous run attached first.
rand_handler_name = 'rand_sampling_file_handler'
for stale_handler in [h for h in logger_rand_sampler.handlers if h.get_name() == rand_handler_name]:
    logger_rand_sampler.removeHandler(stale_handler)
    stale_handler.close()

formatter = logging.Formatter('%(asctime)s %(message)s', datefmt='%Y-%m-%d %H:%M')
file_handler = logging.FileHandler(f'{rand_sampling_dir_path}/rand_sampling.log')
file_handler.set_name(rand_handler_name)
file_handler.setFormatter(formatter)

logger_rand_sampler.addHandler(file_handler)

### Implementing Logistic Regression and Expected Model Change on the Stroke Dataset
Logistic regression will be our base learner for this project.

In [None]:
# EMC simulation on small test case data (uncomment for a quick debugging run)
# emc.run_l_simulations_emc(small_data, small_label, sample_ids=small_sample_ids, **test_config)

# EMC simulation on real data.
# Every key of `config` is forwarded to the emc function as a keyword argument.
emc.run_l_simulations_emc(medical_data, is_stroke, sample_ids=sample_ids, **config)

In [None]:
# Random Sampling simulation on small test case data (uncomment for a quick debugging run)
# emc.run_n_simulations_random_sampling(small_data, small_label, **test_config)

# Random Sampling simulation on real data.
# Every key of `config` is forwarded to the emc function as a keyword argument.
emc.run_n_simulations_random_sampling(medical_data, is_stroke, **config)

In [None]:
# Load the per-simulation EMC metric arrays: one list per metric, one array per simulation.
# NOTE(review): files are read from config['emc_dir_path'], not from the
# emc_dir_path created in the results-directory cell; confirm where emc writes.
n_sim_accuracy_ls, n_sim_precision_ls, n_sim_recall_ls = (
    [
        np.load(f"{config['emc_dir_path']}/{sim_idx}_sim_emc_{metric}.npy")
        for sim_idx in range(config['n_sim'])
    ]
    for metric in ('accuracy', 'precision', 'recall')
)

In [None]:
# Load the per-simulation random-sampling metric arrays: one list per metric,
# one array per simulation.
# NOTE(review): files are read from config['rand_sampling_dir_path'], not from
# the rand_sampling_dir_path created in the results-directory cell; confirm
# where emc writes.
n_sim_accuracy_random_ls, n_sim_precision_random_ls, n_sim_recall_random_ls = (
    [
        np.load(f"{config['rand_sampling_dir_path']}/{sim_idx}_sim_rand_sample_{metric}.npy")
        for sim_idx in range(config['n_sim'])
    ]
    for metric in ('accuracy', 'precision', 'recall')
)

In [None]:
# Length of the first EMC accuracy array; passed to emc.plot_metrics below.
n_updates = len(n_sim_accuracy_ls[0])

# Accuracy: EMC querying (red) vs. random querying (blue) on one figure.
plt.figure(figsize=(7, 5))
for sim_results, line_color, line_label in (
    (n_sim_accuracy_ls, 'red', 'emc query'),
    (n_sim_accuracy_random_ls, 'blue', 'random query'),
):
    emc.plot_metrics(sim_results, n_updates, plot_separate_sim=False,
                     color=line_color, label=line_label)
plt.title('Accuracy')
plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.2f}'))
plt.show()

In [None]:
# Precision: EMC querying (red) vs. random querying (blue) on one figure.
plt.figure(figsize=(7, 5))
for sim_results, line_color, line_label in (
    (n_sim_precision_ls, 'red', 'emc query'),
    (n_sim_precision_random_ls, 'blue', 'random query'),
):
    emc.plot_metrics(sim_results, n_updates, plot_separate_sim=False,
                     color=line_color, label=line_label)
plt.title('Precision')
plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.2f}'))
plt.show()

In [None]:
# Recall: EMC querying (red) vs. random querying (blue) on one figure.
plt.figure(figsize=(7, 5))
for sim_results, line_color, line_label in (
    (n_sim_recall_ls, 'red', 'emc query'),
    (n_sim_recall_random_ls, 'blue', 'random query'),
):
    emc.plot_metrics(sim_results, n_updates, plot_separate_sim=False,
                     color=line_color, label=line_label)
plt.title('Recall')
plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.2f}'))
plt.show()

In [None]:
# plot for github
from matplotlib.ticker import FormatStrFormatter

def metric_plotter(ax, x, emc_y, baseline_y, title):
    """Draw the EMC and random-query curves for one metric on `ax`.

    Parameters
    ----------
    ax : axes-like object to draw on.
    x : x-coordinates shared by both curves.
    emc_y : y-values of the EMC curve (red, labelled 'emc query').
    baseline_y : y-values of the baseline curve (blue, labelled 'random query').
    title : title set on the axes.

    Returns nothing; the axes are modified in place.
    """
    curves = (
        (emc_y, 'red', 'emc query'),
        (baseline_y, 'blue', 'random query'),
    )
    for y_values, line_color, line_label in curves:
        ax.plot(x, y_values, color=line_color, label=line_label)
    ax.legend()
    ax.set_title(title)
    # Show y tick labels with two decimals.
    ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

# x axis: 1 .. n_updates, one point per entry of the first EMC accuracy array.
n_updates = len(n_sim_accuracy_ls[0])
x = np.linspace(1, n_updates, n_updates)

# Average each metric across simulations (element-wise mean over the list of arrays).
emc_avg_acc_arr = np.mean(n_sim_accuracy_ls, axis=0)
emc_avg_prec_arr = np.mean(n_sim_precision_ls, axis=0)
emc_avg_recall_arr = np.mean(n_sim_recall_ls, axis=0)

rand_avg_acc_arr = np.mean(n_sim_accuracy_random_ls, axis=0)
rand_avg_prec_arr = np.mean(n_sim_precision_random_ls, axis=0)
rand_avg_recall_arr = np.mean(n_sim_recall_random_ls, axis=0)

# One stacked panel per metric, each comparing EMC against random querying.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(5, 13))
panels = (
    (ax1, emc_avg_acc_arr, rand_avg_acc_arr, "Accuracy"),
    (ax2, emc_avg_prec_arr, rand_avg_prec_arr, "Precision"),
    (ax3, emc_avg_recall_arr, rand_avg_recall_arr, "Recall"),
)
for panel_ax, emc_avg, rand_avg, panel_title in panels:
    metric_plotter(panel_ax, x, emc_avg, rand_avg, title=panel_title)