In [2]:
# native packages
import os, logging, warnings, json, functools, time

# some setup
# The filters are registered here, before the third-party imports below.
warnings.filterwarnings('ignore', module='tqdm.auto')  # deprecation warnings
warnings.filterwarnings('ignore', module='sklearn.neural_network._multilayer_perceptron')  # convergence warnings
logger = logging.getLogger(__name__)

# other packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm.auto import tqdm
from scipy.stats import pointbiserialr

# custom package
import sl_utils

# sklearn imports
from sklearn import svm
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score,
    mean_squared_error,
)
from sklearn.neighbors import ( 
    KNeighborsRegressor, 
    KNeighborsClassifier
)
from sklearn.neural_network import (
    MLPRegressor,
    MLPClassifier
)
from sklearn.feature_selection import (
    mutual_info_classif, 
    SelectKBest
)


In [3]:
# download data from kaggle and set up loader - global
DATA = sl_utils.SL_Report_Data()
DATA.set_up_data()

# experiment directory - global
EXPERIMENT_NAME = 'expr2'  # global var
# makedirs(exist_ok=True) also creates a missing parent directory (e.g. logs/)
# and does nothing when the directory is already there, so this cell can be
# re-run on a fresh checkout as well as on an existing one.
os.makedirs(DATA.root+f'logs/{EXPERIMENT_NAME}/', exist_ok=True)

Already have Cancer Dataset
Cancer Demo Saved
Already Have Bankruptcy Dataset
Bankruptcy Demo Saved


In [4]:
def clean_cancer_data(raw_cancer_df):
    """Build the model input matrix from the raw cancer table.

    Removes the patient identifier and the three outcome columns, one-hot
    encodes the categorical columns (dropping the first level of each), and
    casts the whole result to ``int``.

    Parameters
    ----------
    raw_cancer_df : pandas.DataFrame
        Raw table containing at least the identifier, outcome and
        categorical columns named below.

    Returns
    -------
    pandas.DataFrame
        Integer-typed feature frame.
    """
    id_columns = ['Patient_ID']
    outcome_columns = ['Treatment_Cost_USD', 'Survival_Years', 'Target_Severity_Score']
    categorical_columns = ['Gender', 'Country_Region', 'Cancer_Type', 'Cancer_Stage']

    # keep only the predictors: no identifiers, no outcomes
    predictors = raw_cancer_df.drop(columns=id_columns + outcome_columns)

    # one-hot encode, dropping the first level of every categorical
    encoded = pd.get_dummies(predictors, columns=categorical_columns, drop_first=True)

    # NOTE(review): this cast applies to every column, so any float-valued
    # predictor is truncated to an integer as well - confirm that is intended.
    return encoded.astype(int)

# cancer task: cleaned feature frame plus the severity-score target
cancer_data = clean_cancer_data(DATA.get_cancer_full())
tss = DATA.get_cancer_full()['Target_Severity_Score']

# shuffled 75/25 hold-out split with a fixed seed
(cancer_train, cancer_test,
 tss_train, tss_test) = train_test_split(
    cancer_data,
    tss,
    test_size=0.25,
    shuffle=True,
    random_state=7,
)

# bankruptcy task: every column except the label is a feature
financial_data = DATA.get_bankruptcy_full().drop('Bankrupt?', axis=1)
bankrupt = DATA.get_bankruptcy_full()['Bankrupt?']

# same split settings, additionally stratified on the bankruptcy label
(financial_train, financial_test,
 bankrupt_train, bankrupt_test) = train_test_split(
    financial_data,
    bankrupt,
    test_size=0.25,
    shuffle=True,
    random_state=7,
    stratify=bankrupt,
)

In [None]:
def get_best_params(log_filename, want_low, log_dir=None):
    """Return the parameters of the best-scoring record in a JSON results log.

    The log is read as a JSON list of records; each record is indexed with
    ``'test_score'`` (used for ranking) and ``'params'`` (the value returned).

    Parameters
    ----------
    log_filename : str
        File name of the log, e.g. ``'Cancer_SVR_RBF.json'``.
    want_low : bool
        If true, the record with the lowest ``'test_score'`` wins; otherwise
        the record with the highest one wins.
    log_dir : str, optional
        Directory that holds the log. When omitted, the experiment directory
        ``f'{DATA.root}logs/{EXPERIMENT_NAME}/'`` is used.

    Returns
    -------
    The ``'params'`` entry of the winning record.

    Raises
    ------
    FileNotFoundError
        If the log file does not exist.
    ValueError
        If the log contains no records.
    """
    if log_dir is None:
        log_dir = f'{DATA.root}logs/{EXPERIMENT_NAME}/'
    log_path = os.path.join(log_dir, log_filename)

    try:
        with open(log_path, 'r') as file:
            results = json.load(file)
    except FileNotFoundError:
        # Still an Exception subclass, but now names the missing path.
        raise FileNotFoundError(f'Results log does not exist: {log_path}') from None

    if not results:
        raise ValueError(f'Results log is empty: {log_path}')

    def score(cv_results):
        return cv_results['test_score']

    if want_low:
        # min() returns the first of several tied records, like sorted(...)[0].
        best_result = min(results, key=score)
    else:
        # sorted(...)[-1] picked the LAST of several tied records; scanning in
        # reverse makes max() pick that same record.
        best_result = max(reversed(results), key=score)
    return best_result['params']
    

def _fit_and_report(model, X_train, y_train, X_test, y_test, metric_name, metric_fn):
    """Fit ``model``, score it on the held-out data and print a short report.

    Prints, in order: the metric value, the number of training rows, the
    fit time and the predict time (seconds, rounded to 2 decimals).
    Only ``model.predict`` is inside the predict interval; computing the
    metric and printing are not.

    Returns the fitted model.
    """
    # perf_counter is a monotonic clock intended for measuring intervals
    fit_start = time.perf_counter()
    model.fit(X_train, y_train)
    fit_end = time.perf_counter()
    predictions = model.predict(X_test)
    predict_end = time.perf_counter()

    print(f'{metric_name}:', metric_fn(y_test, predictions))
    print('n training points:', X_train.shape[0])
    print('Train Time:', round(fit_end - fit_start, 2))
    print('Predict Time:', round(predict_end - fit_end, 2))
    return model


# cancer competitor
cancer_model = _fit_and_report(
    svm.SVR(**get_best_params('Cancer_SVR_RBF.json', want_low=True)),
    cancer_train, tss_train,
    cancer_test, tss_test,
    'MSE', mean_squared_error,
)

print()

# bankruptcy competitor
bankruptcy_model = _fit_and_report(
    MLPClassifier(**get_best_params('Bankruptcy_NN_Relu.json', want_low=False)),
    financial_train, bankrupt_train,
    financial_test, bankrupt_test,
    'F1', f1_score,
)


MSE: 0.32214010231209916
n training points: 37500
F1: 0.019417475728155338
n training points: 5114
