<br>  

## <span style='color:blue'>Section 1: Import</span>  

In [None]:
import pandas as pd
import numpy as np
import copy

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import StratifiedKFold

from sklearn.impute import KNNImputer

import statsmodels.api as sm

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

<br>  

## <span style='color:blue'>Section 2: Read, drop columns, form X_train and y_train</span>  

In [None]:
# ---------- read ----------

# Load the training table from the data folder next to the notebook folder.
df_1 = pd.read_csv('../data/data_train.csv')

# ---------- drop ----------

# Drop the listed columns by rebinding the name instead of mutating in place;
# the shape is printed before and after so the effect is visible in the output.
print('Before drop :', df_1.shape)
df_1 = df_1.drop(columns=['index',
                          'pco2', 'ph', 'basophils', 'lactic_acid', 'bmi',
                          'creatine_kinase', 'lymphocyte', 'neutrophils'])
print('After drop:', df_1.shape)
print('')

# ---------- form X ----------

# Every remaining column except the target becomes a feature.
X_train = df_1.drop(columns=['outcome'])
print('X_train :', X_train.shape)

# ---------- form y ----------

# The target column, followed by two views of its class counts.
y_train = df_1['outcome']
print('y_train :', y_train.shape)
print('y_train :', np.unique(y_train, return_counts=True))
print('y_train :', Counter(y_train))

<br>  

## <span style='color:blue'>Section 3: Function - Impute</span>  

In [None]:
def impute_fit_transform(to_impute, to_impute_fit_transform):
    """Fit a transformer on a DataFrame and return the transformed frame.

    Parameters
    ----------
    to_impute : pandas.DataFrame
        Data to fit on and transform. Its column labels are reused for the
        result.
    to_impute_fit_transform : object
        Transformer exposing ``fit_transform`` (the notebook passes a
        ``KNNImputer``).

    Returns
    -------
    tuple
        ``(transformed, transformer)``: a new DataFrame built from the
        transformer output under the original column labels, and the same
        transformer object that was passed in (now fitted). The row index of
        ``to_impute`` is not carried over when the transformer returns a
        plain array.
    """
    filled_values = to_impute_fit_transform.fit_transform(to_impute)
    # .copy() detaches the frame from the array returned by the transformer.
    imputed = pd.DataFrame(filled_values, columns=to_impute.columns).copy()
    return imputed, to_impute_fit_transform

def impute_transform(to_impute, to_impute_fit_transform):
    """Apply an already-fitted transformer to a DataFrame.

    Parameters
    ----------
    to_impute : pandas.DataFrame
        Data to transform. Its column labels are reused for the result.
    to_impute_fit_transform : object
        Fitted transformer exposing ``transform``.

    Returns
    -------
    pandas.DataFrame
        New frame built from the transformer output under the original column
        labels. The row index of ``to_impute`` is not carried over when the
        transformer returns a plain array.
    """
    filled_values = to_impute_fit_transform.transform(to_impute)
    # .copy() detaches the frame from the array returned by the transformer.
    return pd.DataFrame(filled_values, columns=to_impute.columns).copy()

<br>  

## <span style='color:blue'>Section 4: Function - Oversample and undersample ratios</span>  

In [None]:
def over_under_sample_ratios(y, search):
    """Convert a target class mix into over- and under-sampling ratios.

    The total number of rows, ``len(y)``, is split according to
    ``search = (majority_part, minority_part)`` to get a target size for each
    class. Both target sizes are rounded to whole rows.

    Parameters
    ----------
    y : iterable of hashable labels
        Class labels of the data that will be resampled.
    search : sequence of two numbers
        ``search[0]`` is the majority share and ``search[1]`` the minority
        share of the desired class mix, e.g. ``(2, 1)``.

    Returns
    -------
    tuple
        ``(minority_ratio, majority_ratio)`` where ``minority_ratio`` is the
        target minority size divided by the current size of the most common
        class, and ``majority_ratio`` is the target minority size divided by
        the target majority size.
    """
    largest_class_size = Counter(y).most_common(1)[0][1]
    majority_share = search[0]
    minority_share = search[1]
    share_sum = majority_share + minority_share
    n_rows = len(y)

    # Target row counts for each class, keeping the overall row count fixed.
    target_minority = np.round(minority_share * n_rows / share_sum, 0)
    target_majority = np.round(majority_share * n_rows / share_sum, 0)

    oversample_ratio = target_minority / largest_class_size
    undersample_ratio = target_minority / target_majority
    return oversample_ratio, undersample_ratio

<br>  

## <span style='color:blue'>Section 5: Function - Oversample and undersample</span>  

In [None]:
def over_sample(X_to_oversample, y_to_oversample, over_sample_sampling_strategy):
    """Randomly over-sample with a fixed seed.

    Parameters
    ----------
    X_to_oversample : array-like or pandas.DataFrame
        Feature rows, positionally aligned with ``y_to_oversample``.
    y_to_oversample : array-like
        Class labels (Series, ndarray or list); flattened to a 1-D array
        before resampling.
    over_sample_sampling_strategy : float, str, dict or callable
        Forwarded unchanged as ``sampling_strategy`` to ``RandomOverSampler``.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by ``fit_resample``.
    """
    # np.asarray(...).ravel() works for Series, ndarray and list alike, and
    # avoids Series.ravel(), which pandas deprecated in 2.2.
    y_flat = np.asarray(y_to_oversample).ravel()
    o_s = RandomOverSampler(random_state=42, sampling_strategy=over_sample_sampling_strategy)
    X_resampled, y_resampled = o_s.fit_resample(X_to_oversample, y_flat)
    return X_resampled, y_resampled

def under_sample(X_to_undersample, y_to_undersample, under_sample_sampling_strategy):
    """Randomly under-sample with a fixed seed.

    Parameters
    ----------
    X_to_undersample : array-like or pandas.DataFrame
        Feature rows, positionally aligned with ``y_to_undersample``.
    y_to_undersample : array-like
        Class labels (Series, ndarray or list); flattened to a 1-D array
        before resampling.
    under_sample_sampling_strategy : float, str, dict or callable
        Forwarded unchanged as ``sampling_strategy`` to ``RandomUnderSampler``.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by ``fit_resample``.
    """
    # np.asarray(...).ravel() works for Series, ndarray and list alike, and
    # avoids Series.ravel(), which pandas deprecated in 2.2.
    y_flat = np.asarray(y_to_undersample).ravel()
    u_s = RandomUnderSampler(random_state=42, sampling_strategy=under_sample_sampling_strategy)
    X_resampled, y_resampled = u_s.fit_resample(X_to_undersample, y_flat)
    return X_resampled, y_resampled

<br>  

## <span style='color:blue'>Section 6: Function - Scale</span>  

In [None]:
def scale_fit_transform(to_scale, to_scale_fit_transform):
    """Fit a scaler on a DataFrame and return the scaled frame.

    Parameters
    ----------
    to_scale : pandas.DataFrame
        Data to fit on and transform. Its column labels are reused for the
        result.
    to_scale_fit_transform : object
        Transformer exposing ``fit_transform`` (the notebook passes a
        ``StandardScaler``).

    Returns
    -------
    tuple
        ``(scaled, transformer)``: a new DataFrame built from the transformer
        output under the original column labels, and the same transformer
        object that was passed in (now fitted). The row index of ``to_scale``
        is not carried over when the transformer returns a plain array.
    """
    scaled_values = to_scale_fit_transform.fit_transform(to_scale)
    # .copy() detaches the frame from the array returned by the transformer.
    scaled = pd.DataFrame(scaled_values, columns=to_scale.columns).copy()
    return scaled, to_scale_fit_transform

def scale_transform(to_scale, to_scale_fit_transform):
    """Apply an already-fitted scaler to a DataFrame.

    Parameters
    ----------
    to_scale : pandas.DataFrame
        Data to transform. Its column labels are reused for the result.
    to_scale_fit_transform : object
        Fitted transformer exposing ``transform``.

    Returns
    -------
    pandas.DataFrame
        New frame built from the transformer output under the original column
        labels. The row index of ``to_scale`` is not carried over when the
        transformer returns a plain array.
    """
    scaled_values = to_scale_fit_transform.transform(to_scale)
    # .copy() detaches the frame from the array returned by the transformer.
    return pd.DataFrame(scaled_values, columns=to_scale.columns).copy()

<br>  

## <span style='color:blue'>Section 7: Manual search and cross validate</span>  

In [None]:
# ---------- set 15-fold cross validation ----------

# Stratified, shuffled folds with a fixed seed so the fold assignment is repeatable.
cross_validate = StratifiedKFold(n_splits=15, shuffle=True, random_state=42)

# ---------- set parameters and hyperparameters ----------

# majority_minority : (majority, minority) target class mix passed to over_under_sample_ratios
# decision_boundary : predicted probabilities at or above this value are labelled 1
search_parameters = {'majority_minority': [(1,1), (2,1)],
                     'decision_boundary': [0.50, 0.35, 0.20]}

# L1 penalty strengths: regular_alpha_count evenly spaced values from
# regular_alpha_from to regular_alpha_to (both included, see np.linspace below).
regular_alpha_from = 0
regular_alpha_to = 100
regular_alpha_count = 11

# ---------- initialise result storages ----------

# One row per (majority_minority, decision_boundary, alpha) combination.
combine_train_validate_accuracy = []
combine_train_validate_recall = []
combine_train_validate_precision = []

# Per-fold scores of the combination currently being evaluated;
# emptied again after each combination has been summarised.
train_accuracy = []
train_recall = []
train_precision = []

validate_accuracy = []
validate_recall = []
validate_precision = []

# ---------- loop for minority and majority ----------

for i in range(len(search_parameters['majority_minority'])):

    # ---------- loop for decision boundary ----------

    for j in range(len(search_parameters['decision_boundary'])):

        # ---------- loop for regularization strength alpha ----------

        for alpha_log_reg in np.linspace(regular_alpha_from, regular_alpha_to, regular_alpha_count):

            # ---------- loop for cross validation ----------

            for train_index, validate_index in cross_validate.split(X_train, y_train):

                # ---------- get train and validate indices ----------

                # split() yields positional indices, so both X and y are sliced with .iloc;
                # label-based y_train[train_index] is only correct for a default RangeIndex.
                X_train_train, X_train_validate = X_train.iloc[train_index, :], X_train.iloc[validate_index, :]
                y_train_train, y_train_validate = y_train.iloc[train_index], y_train.iloc[validate_index]
                print('Before y_train_train:', Counter(y_train_train))
                print('Before y_train_validate:', Counter(y_train_validate))
                print('Oversample :', search_parameters['majority_minority'][i])
                print('Decision boundary :', search_parameters['decision_boundary'][j])
                print('')

                # ---------- impute on train ----------

                # The imputer is fitted on the training fold only and reused on the validation fold below.
                knn_impute = KNNImputer(n_neighbors=5)
                X_train_train_impute, knn_impute_fit_transform = impute_fit_transform(X_train_train,
                                                                                      knn_impute)

                # ---------- get oversample and undersample ratios ----------

                over_strategy, under_strategy = over_under_sample_ratios(y_train_train,
                                                                         search_parameters['majority_minority'][i])
                print('Over :', over_strategy)
                print('Under :', under_strategy)

                # ---------- oversample on train ----------

                X_train_train_impute_over, y_train_train_over = over_sample(X_train_train_impute,
                                                                            y_train_train,
                                                                            over_strategy)
                print('Before_Over y_train_train:', Counter(y_train_train_over))
                print('')

                # ---------- undersample on train ----------

                X_train_train_impute_over_under, y_train_train_over_under = under_sample(X_train_train_impute_over,
                                                                                         y_train_train_over,
                                                                                         under_strategy)
                print('Before_Over_Under y_train_train:', Counter(y_train_train_over_under))
                print('')

                # ---------- scale on train ----------

                # The scaler is fitted on the resampled training fold and reused on the validation fold below.
                ss = StandardScaler()
                X_train_train_impute_over_under_scale, ss_fit_transform = scale_fit_transform(X_train_train_impute_over_under, ss)

                # ---------- instantiate and fit regularized on train ----------

                # has_constant='add' always inserts the intercept column. The statsmodels default
                # ('skip') omits it whenever an existing column is a non-zero constant, which would
                # leave the train and validate design matrices with different widths.
                X_train_train_impute_over_under_scale = sm.add_constant(X_train_train_impute_over_under_scale,
                                                                        has_constant='add')
                lr = sm.Logit(y_train_train_over_under, X_train_train_impute_over_under_scale)
                lr_result = lr.fit_regularized(maxiter=500, method='l1', alpha=alpha_log_reg)
                print('alpha =', alpha_log_reg)
                print('')

                # ---------- predict and evaluate on train ----------

                # Threshold the predicted probabilities at the current decision boundary.
                y_train_train_over_under_predicted = (lr_result.predict(X_train_train_impute_over_under_scale) >= search_parameters['decision_boundary'][j]).astype(int)

                train_accuracy.append(accuracy_score(y_train_train_over_under, y_train_train_over_under_predicted))
                train_recall.append(recall_score(y_train_train_over_under, y_train_train_over_under_predicted))
                train_precision.append(precision_score(y_train_train_over_under, y_train_train_over_under_predicted))

                # ---------- impute on validate ----------

                X_train_validate_impute = impute_transform(X_train_validate, knn_impute_fit_transform)

                # ---------- scale on validate ----------

                X_train_validate_impute_scale = scale_transform(X_train_validate_impute, ss_fit_transform)

                # ---------- predict and evaluate on validate ----------

                # A small validation fold can contain a column with a single (non-zero) value after
                # scaling; has_constant='add' keeps the intercept column in that case so the matrix
                # matches the fitted model.
                X_train_validate_impute_scale = sm.add_constant(X_train_validate_impute_scale,
                                                                has_constant='add')
                y_train_validate_predicted = (lr_result.predict(X_train_validate_impute_scale) >= search_parameters['decision_boundary'][j]).astype(int)

                validate_accuracy.append(accuracy_score(y_train_validate, y_train_validate_predicted))
                validate_recall.append(recall_score(y_train_validate, y_train_validate_predicted))
                validate_precision.append(precision_score(y_train_validate, y_train_validate_predicted))

            # ---------- summarise the folds of this combination ----------

            # Mean and standard deviation over the folds, for train and validate separately.
            combine_train_validate_accuracy.append([search_parameters['majority_minority'][i],
                                                    search_parameters['decision_boundary'][j],
                                                    alpha_log_reg,
                                                    np.mean(train_accuracy),
                                                    np.std(train_accuracy),
                                                    np.mean(validate_accuracy),
                                                    np.std(validate_accuracy)])
            combine_train_validate_recall.append([search_parameters['majority_minority'][i],
                                                  search_parameters['decision_boundary'][j],
                                                  alpha_log_reg,
                                                  np.mean(train_recall),
                                                  np.std(train_recall),
                                                  np.mean(validate_recall),
                                                  np.std(validate_recall)])
            combine_train_validate_precision.append([search_parameters['majority_minority'][i],
                                                     search_parameters['decision_boundary'][j],
                                                     alpha_log_reg,
                                                     np.mean(train_precision),
                                                     np.std(train_precision),
                                                     np.mean(validate_precision),
                                                     np.std(validate_precision)])

            # Empty the per-fold lists before the next combination starts.
            train_accuracy = []
            train_recall = []
            train_precision = []

            validate_accuracy = []
            validate_recall = []
            validate_precision = []

<br>  

## <span style='color:blue'>Section 8: Save results</span>  

In [None]:
# Turn the collected accuracy rows into a table and add 'overfit': the gap
# between validation and training mean accuracy, as a percentage of the
# training mean (negative when validation is lower than training).
combine_train_validate_accuracy = pd.DataFrame(
    combine_train_validate_accuracy,
    columns=['majority_minority',
             'decision_boundary',
             'alpha',
             'train_accuracy_mean',
             'train_accuracy_std_dev',
             'validate_accuracy_mean',
             'validate_accuracy_std_dev'],
).assign(
    overfit=lambda scores: (scores['validate_accuracy_mean'] - scores['train_accuracy_mean'])
    / scores['train_accuracy_mean'] * 100
)

# Persist the table; missing values are written as the text 'NaN'.
combine_train_validate_accuracy.to_csv('../data/tried_combine_train_validate_accuracy.csv',
                                       na_rep='NaN',
                                       index_label='index')

In [None]:
# Turn the collected recall rows into a table and add 'overfit': the gap
# between validation and training mean recall, as a percentage of the
# training mean (negative when validation is lower than training).
combine_train_validate_recall = pd.DataFrame(
    combine_train_validate_recall,
    columns=['majority_minority',
             'decision_boundary',
             'alpha',
             'train_recall_mean',
             'train_recall_std_dev',
             'validate_recall_mean',
             'validate_recall_std_dev'],
).assign(
    overfit=lambda scores: (scores['validate_recall_mean'] - scores['train_recall_mean'])
    / scores['train_recall_mean'] * 100
)

# Persist the table; missing values are written as the text 'NaN'.
combine_train_validate_recall.to_csv('../data/tried_combine_train_validate_recall.csv',
                                     na_rep='NaN',
                                     index_label='index')

In [None]:
# Turn the collected precision rows into a table and add 'overfit': the gap
# between validation and training mean precision, as a percentage of the
# training mean (negative when validation is lower than training).
combine_train_validate_precision = pd.DataFrame(
    combine_train_validate_precision,
    columns=['majority_minority',
             'decision_boundary',
             'alpha',
             'train_precision_mean',
             'train_precision_std_dev',
             'validate_precision_mean',
             'validate_precision_std_dev'],
).assign(
    overfit=lambda scores: (scores['validate_precision_mean'] - scores['train_precision_mean'])
    / scores['train_precision_mean'] * 100
)

# Persist the table; missing values are written as the text 'NaN'.
combine_train_validate_precision.to_csv('../data/tried_combine_train_validate_precision.csv',
                                        na_rep='NaN',
                                        index_label='index')