In [1]:
# Cell 1
"""
Ideas for a one-size-fits-all pipeline:

Check if target column is categorical or continuous
Apply from dict of classification or regression models based on above
Check if each column is categorical or continous
perform one-hot encoding for categorical columns
Apply appropriate data visualisations to each
Apply correlations Matrix
Apply scaling and PCA if too many features (>20?)

IMPORTANT: Modify pipeline such that imputation of NA values are done using median of train
set AFTER the split. Save this median for use in test set

Next steps: 
- Remove white columns (look at original excel sheet)  (Done)
- Group features by colors according to excel sheet (Done)
- Check for level of correlation (correlation matrix) (Done)
- Visualise correlation matrix (Done)
- Apply scaling (standard/minmax) (Done)
- Use PCA to reduce dimension (Skipped) https://www.datacamp.com/community/tutorials/principal-component-analysis-in-python
- Apply classification algos for each color (Done)
- Apply above for everything combined (Done)
- Implement hyperparameter tuning (Optional)
- Additional data visualisation for each parameter (Split into categorical and continous)(Optional)
- Apply unsupervised learning to check for clustering
- Cleanup
- Write clear comments for future reference
- Write and present data nicely
"""
import warnings
import pandas as pd
import numpy as np
import sklearn 
import seaborn as sn
import matplotlib.pyplot as plt
from pathlib import Path
import os
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.svm import SVR, SVC
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn.linear_model import LinearRegression,LogisticRegression 
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingRegressor,GradientBoostingClassifier, ExtraTreesClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from lightgbm import LGBMRegressor,LGBMClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedStratifiedKFold
from tpot import TPOTClassifier, TPOTRegressor
from sklearn.cluster import FeatureAgglomeration
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.feature_selection import SelectPercentile, VarianceThreshold, f_regression
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Normalizer, RobustScaler
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import FunctionTransformer
from copy import copy
from sklearn.preprocessing import PolynomialFeatures
from sklearn.kernel_approximation import RBFSampler
from sklearn.model_selection import RepeatedStratifiedKFold
from tpot import TPOTClassifier


data = pd.read_csv("kohsm1.csv",encoding='latin1')

#Removal of irrelevant white columns
data.drop('IDshort', axis=1, inplace=True)
data.drop('SERNO', axis=1, inplace=True)

#Full Data containing only each category as targets
data_full = data.drop('Cardiac_Category EE (0=good; 1=poor)', axis=1)
# dataEE_full = data.drop('Cardiac_Category (0=good; 1=poor)', axis=1)

#List of columns according to color category
#Blue
clinParamsCols=["IDshort","Age","Weightkg","Heightcm","Pulse","WaistCircumferencecm","SERNO","Gender","SBP","DBP","Hips_Circumference__cm","Hypertension__Yes_1_No_0","Dyslipidemia__Yes_1_No_0","Diabetes_mellitus__Yes_1_No_0","Smoking__Never_0_Current_1_Past","DiabetesmellitusDietcontrol","AlcoholNever0Current1Past"]

#Teal
exerciseCols=["PhysicalactivityfrequencyIna","IntensityTakeiteasy1Heavy","Duration15min116to30min","VO2Max"]

#Violet
echoCols=["BSA__m2","IVSD__cm","IVSS__cm","LVIDD_cm","LVIDS_cm","LVPWD_cm","LVPWS_cm","LVOT__cm","AO_cm","LA_cm","LVEF","LVFS","LVmass_echo","LVmass_index_echo","Left_atrial_volume","Left_atrial_volume_index","MV_E_peak__m_s","MV_A_peak__m_s","E_A_ratio","MV_DT__ms","PASP__mmHg","PVS__cm_s","PVD__cm_s","PVA__cm_s","PVADur","MVADur","septalS","Septal_E","Septal_A","Lateral_S","lateralE","lateralA","sinuscm","sinus_tubular_junctioncm","ave_Eprime","E_Eprime_ratio"]

#Grey
cmrCols=["LV_Mass_on_mri__g","LV_EDV","EDV067","LV_ESV","LV_SV","LVEF_A","Ss","Se","Sa","SRs","SRe","SRa","SRe_SRaratio","LVGLS","LVGCS","LVGRS","RVGLS","LAGlobalGLS","LAGlobalGCS","LAvolumeminml","LAvolumemaxml","LAEFTotal","LVMassg","LVEDVImlm2","LVESVImlm2","LVSVImlm2"]

#Beige
bloodBioCols=["MCP1pgmL","MMP9ngmL","BNP","HSTNI","GALECTIN3","sUPAR","qTL","extra_urea","extra_creatinine","extra_total_cholesterol","extra_triglycerides","extra_HDL","extra_LDL","extra_tc_HDL_ratio","extra_albumin","extra_hsCRP","HbA1c"]

#Green
physFuncParamsCols=["Gripmax","G1L","G2L","G1R","G2R","SMM","BFM","PBF","WHR","Fitness_score","BMR","Lean_LA","Lean_RA","Lean_LL","Lean_RL","Lean_T","ALM"]

#Orange
curMetaCols=["TC","FC","C2","C3","C4","C51","C5","C4OH","C6","C5OHC3DC","C4DCC6OH","C81","C8","C5DC","C81OHC61DC","C8OHC6DC","C103","C102","C101","C10","C7DC","C81DC","C8DC","C122","C121","C12","C122OHC102DC","C121OH","C12OHC10DC","C143","C142","C141","C14","C143OHC123DC","C142OH","C141OH","C14OHC12DC","C163","C162","C161","C16","C163OHC143DC","C162OH","C161OHC141DC","C16OH","C183","C182","C181","C18","C183OHC163DC","C182OHC162DC","C181OHC161DC","C18OHC16DC","C204","C203","C202","C201","C20","C203OHC183DC","C202OHC182DC","C201OHC181DC","C20OHC18DC","C225","C224","C223","C222","C221","C22","C24","C26","C28","Gly1","Ala1","Ser1","Pro1","Val1","Leu1","IleLeu1","Orn1","Met1","His1","Phe1","Arg1","Cit1","Tyr1","Asp1","Glu1","Trp1"]

#Yellow
histMetaCols=["HistoC2","HistoC3","HistoC4","HistoC51","HistoC5","HistoC4NDOH","HistoC6","HistoC5NDOHC3NDDC","HistoC4NDDCC6NDOH","HistoC81","HistoC8","HistoC5NDDC","HistoC81NDOHC61NDDC","HistoC8NDOHC6NDDC","HistoC103","HistoC102","HistoC101","HistoC10","HistoC7NDDC","HistoC81NDDC","HistoC8NDDC","HistoC122","HistoC121","HistoC12","HistoC122NDOHC102NDDC","HistoC121NDOH","HistoC12NDOHC10NDDC","HistoC143","HistoC142","HistoC141","HistoC14","HistoC143NDOHC123NDDC","HistoC142NDOH","HistoC141NDOH","HistoC14NDOHC12NDDC","HistoC163","HistoC162","HistoC161","HistoC16","HistoC163NDOHC143NDDC","HistoC162NDOH","HistoC161NDOHC141NDDC","HistoC16NDOH","HistoC183","HistoC182","HistoC181","HistoC18","HistoC183NDOHC163NDDC","HistoC182NDOHC162NDDC","HistoC181NDOHC161NDDC","HistoC18NDOHC16NDDC","HistoC204","HistoC203","HistoC202","HistoC201","HistoC20","HistoC203NDOHC183NDDC","HistoC202NDOHC182NDDC","HistoC201NDOHC181NDDC","HistoC20NDOHC18NDDC","HistoC225","HistoC224","HistoC223","HistoC222","HistoC221","HistoC22","HistoC24","HistoC26","HistoC28","HistoCode","HistoGly1_ConcµM","HistoAla1_ConcµM","HistoSer1_ConcµM","HistoPro1_ConcµM","HistoVal1_ConcµM","HistoLeu1_ConcµM","HistoIle1_ConcµM","HistoOrn1_ConcµM","HistoMet1_ConcµM","HistoHis1_ConcµM","HistoPhe1_ConcµM","HistoCit1_ConcµM","HistoTyr1_ConcµM","HistoAsp1_ConcµM","HistoGlu1_ConcµM","HistoTrp1_ConcµM"]

targets=["Cardiac_Category (0=good; 1=poor)","Cardiac_Category EE (0=good; 1=poor)"]

# List comprising of params and Cardiac Category
clinParamsColsCC = clinParamsCols + [targets[0]] 
exerciseColsCC = exerciseCols + [targets[0]]
echoColsCC = echoCols + [targets[0]]
cmrColsCC = cmrCols + [targets[0]]
bloodBioColsCC = bloodBioCols + [targets[0]]
physFuncParamsColsCC = physFuncParamsCols + [targets[0]]
curMetaColsCC = curMetaCols + [targets[0]]
histMetaColsCC = histMetaCols + [targets[0]]

# List comprising of params and Cardiac Category_EE
"""
clinParamsColsCCEE = clinParamsCols + [targets[1]] 
histMetaColsCCEE = histMetaCols + [targets[1]]
exerciseColsCCEE = exerciseCols + [targets[1]]
echoColsCCEE = echoCols + [targets[1]]
cmrColsCCEE = cmrCols + [targets[1]]
bloodBioColsCCEE = bloodBioCols + [targets[1]]
physFuncParamsColsCCEE = physFuncParamsCols + [targets[1]]
curMetaColsCCEE = curMetaCols + [targets[1]]
"""
# https://stackoverflow.com/questions/48198021/filter-pandas-dataframe-with-specific-column-names-in-python

#Dataframe containing parameters and Cardiac Category
clinParamsColsCC_df=data.loc[:, data.columns.isin(clinParamsColsCC)]
exerciseColsCC_df=data.loc[:, data.columns.isin(exerciseColsCC)]
echoColsCC_df=data.loc[:, data.columns.isin(echoColsCC)]
cmrColsCC_df=data.loc[:, data.columns.isin(cmrColsCC)]
bloodBioColsCC_df=data.loc[:, data.columns.isin(bloodBioColsCC)]
physFuncParamsColsCC_df=data.loc[:, data.columns.isin(physFuncParamsColsCC)]
curMetaColsCC_df=data.loc[:, data.columns.isin(curMetaColsCC)]
histMetaColsCC_df=data.loc[:, data.columns.isin(histMetaColsCC)]

#Dataframe containing parameters and Cardiac CategoryEE
""" 
clinParamsColsCCEE_df=data.loc[:, data.columns.isin(clinParamsColsCCEE)] 
exerciseColsCCEE_df=data.loc[:, data.columns.isin(exerciseColsCCEE)]
echoColsCCEE_df=data.loc[:, data.columns.isin(echoColsCCEE)]
cmrColsCCEE_df=data.loc[:, data.columns.isin(cmrColsCCEE)]
bloodBioColsCCEE_df=data.loc[:, data.columns.isin(bloodBioColsCCEE)]
physFuncParamsColsCCEE_df=data.loc[:, data.columns.isin(physFuncParamsColsCCEE)]
curMetaColsCCEE_df=data.loc[:, data.columns.isin(curMetaColsCCEE)]
histMetaColsCCEE_df=data.loc[:, data.columns.isin(histMetaColsCCEE)]
 """
#Dict of dataframes of each color cateogry for cardiac category
cc_dfDict = {
    "clinParams":clinParamsColsCC_df,
    "exercise":exerciseColsCC_df,
    "echo":echoColsCC_df,
    "cmr":cmrColsCC_df,
    "bloodBio":bloodBioColsCC_df,
    "physFuncParams":physFuncParamsColsCC_df,
    "curMeta":curMetaColsCC_df,
    "histMeta":histMetaColsCC_df
}

#Dict of dataframes of each color cateogry for cardiac category EE
""" 
ccee_dfDict = {
    "clinParams":clinParamsColsCCEE_df,
    "exercise":exerciseColsCCEE_df,
    "echo":echoColsCCEE_df,
    "cmr":cmrColsCCEE_df,
    "bloodBio":bloodBioColsCCEE_df,
    "physFuncParams":physFuncParamsColsCCEE_df,
    "curMeta":curMetaColsCCEE_df,
    "histMeta":histMetaColsCCEE_df
}
"""

for key in cc_dfDict:
    print(f"Shape of {key}: {cc_dfDict[key].shape}")

""" 
for key in ccee_dfDict:
    print(f"Shape of {key+'(EE)'}: {cc_dfDict[key].shape}") 
"""

Shape of clinParams: (86, 16)
Shape of exercise: (86, 5)
Shape of echo: (86, 37)
Shape of cmr: (86, 27)
Shape of bloodBio: (86, 18)
Shape of physFuncParams: (86, 18)
Shape of curMeta: (86, 89)
Shape of histMeta: (86, 87)


' \nfor key in ccee_dfDict:\n    print(f"Shape of {key+\'(EE)\'}: {cc_dfDict[key].shape}") \n'

In [2]:
# Cell 2
#https://stackoverflow.com/questions/45515031/how-to-remove-columns-with-too-many-missing-values-in-python
"""
def missing(dff):
    print (round((dff.isnull().sum() * 100/ len(dff)),2).sort_values(ascending=False))
"""

# Helper function
def rmissingvaluecol(df, threshold):
    # If threshold is 80 it means we are going to drop columns having more than 80% of missing values
    l = []
    l = list(df.drop(df.loc[:,list((100*(df.isnull().sum()/len(df.index)) >= threshold))].columns, 1).columns.values)
    print("Number of columns having more than %s percent missing values: "%threshold, (df.shape[1] - len(l)))
    print("These columns are:\n", list(set(list((df.columns.values))) - set(l)))
    # Returns columns that are missing less than threshold % of data
    return l
    
def preprocessDf(df,threshold=40,verbose=False):
    print("Preprocessing dataframe...")
    if verbose:
        print("Initial info:")
        print(df.info())
    originalShape = df.shape
    newDf = df.copy()
    
    for col in newDf:
        #Checks for categorical columns that only have 2 values, and converts them to binary
        uniqueVals = pd.unique(newDf[col])
        if len(uniqueVals) == 2:
            if not(np.int64(0) in uniqueVals and np.int64(1) in uniqueVals):
                print(f"{col} column contains only 2 values. Converting to binary")
                print(uniqueVals[0])
                newDf[col] = np.where(newDf[col] == uniqueVals[0],np.int64(0),newDf[col])
                newDf[col] = np.where(newDf[col] == uniqueVals[1],np.int64(1),newDf[col])
                #Renames column with labels for binary values
                newDf.rename(columns={col:col + f"_{uniqueVals[0]}_0_{uniqueVals[1]}_1"}, inplace=True)
    
    # Converting values that are non-numeric to NA
    newDf = newDf.apply(pd.to_numeric,errors='coerce')
    print("Converting values that are non-numeric AND non-binary to NaN...")
    newCols = rmissingvaluecol(newDf,threshold) #Removes columns with more than threshold of missing data
    print("Columns with missing values (%):")
    newDf_missing = newDf.isna().sum()
    newDf_missing = newDf_missing[newDf_missing > 0].dropna().sort_values(ascending=False)
    newDf_missing = newDf_missing.apply(lambda x: np.round((x/originalShape[0])*100,2))
    print(newDf_missing)
    newDf = newDf[newCols]

    print(f"Original Shape:{df.shape}")
    print(f"Final Shape:{newDf.shape}")
    if verbose:
        print("Final info:")
        print(newDf.info())
    print("______________________________")
    return newDf

#https://stackoverflow.com/questions/17778394/list-highest-correlation-pairs-from-a-large-correlation-matrix-in-pandas

# Helper function for get_top_abs_correlations. returns list of redundant pairs to be dropped
def get_redundant_pairs(df):
    '''Get diagonal and lower triangular pairs of correlation matrix'''
    pairs_to_drop = set()
    cols = df.columns
    for i in range(0, df.shape[1]):
        for j in range(0, i+1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

# Function that returns a table of correlation indexes in descending order
def get_top_abs_correlations(df, threshold=0.6):
    au_corr = df.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(df)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[au_corr>=threshold]


# Because of mixed data type (Binary and Non-binary), it would probably be 
# more correct to use Factor analysis of mixed data (FAMD) instead of simple PCA. 
# https://github.com/MaxHalford/prince#factor-analysis-of-mixed-data-famd

# Function currently not being used
def scaleAndPca(df,target,explainabilityThreshold = 0.85):
    #print(df)
    # Removes target column
    newDf = df.drop([target], axis=1)
    
    featuresLength = newDf.shape[1]
    
    """
    nonBinary = newDf.columns[~newDf.isin([0,1]).all()]
    binary = newDf.columns[newDf.isin([0,1]).all()]
    """
    scaler = StandardScaler()
    scaler.fit(newDf)
    
    newDf = scaler.transform(newDf)
    # https://towardsdatascience.com/one-hot-encoding-standardization-pca-data-preparation-steps-for-segmentation-in-python-24d07671cf0b
    # Loop Function to identify number of principal components that explain at least threshold% of the variance
    for comp in range(1, featuresLength):
        pca = PCA(n_components= comp, random_state=0)
        pca.fit(newDf)
        comp_check = pca.explained_variance_ratio_
        final_comp = comp
        if comp_check.sum() > explainabilityThreshold:
            break 
    Final_PCA = PCA(n_components= final_comp,random_state=0)
    Final_PCA.fit(newDf)
    cluster_newDf = Final_PCA.transform(newDf)
    cluster_newDf = pd.DataFrame(cluster_newDf) 
    combinedDf = pd.concat([cluster_newDf, df[target]], axis=1)
    num_comps = comp_check.shape[0]
    print(f"Reducing {featuresLength} features to {final_comp} components, we can explain {comp_check.sum()}% of the variability in the original data.")
    return combinedDf

#scaleAndPca(cc_dfDictNew["clinParams"],"Cardiac_Category (0=good; 1=poor)")

In [3]:
# Cell 3
# Display correlation data above given threshold
def outputCorrData(df,threshold=0.6,filename="Output",generateFiles=False):
    df = preprocessDf(df)
    df = df.fillna(df.median())
    # Creates correlation matrix
    corrDf = df.corr()
    # Creates corr matrix, showing correlations above given threshold
    filteredDf = corrDf[((corrDf >= threshold) | (corrDf <= -threshold)) & (corrDf !=1.000)]
    plt.figure(figsize=(20,17))
    sn.heatmap(filteredDf, annot=True, cmap="Reds")
    plt.title(f"{filename} Correlation Heatmap (Threshold: >= abs{threshold})",fontsize=20)
    # Returns a table of correlation indexes in descending order
    
    cmrCorrTable = get_top_abs_correlations(df, threshold).to_frame()
    cmrCorrTable.columns =['Abs Corr Idx']
    plt.show

    # Generates excel and png files of output in your system for future reference
    if generateFiles:
        if not os.path.exists('Outputs/Plots'):
            os.makedirs('Outputs/Plots')
        if not os.path.exists('Outputs/Tables'):
            os.makedirs('Outputs/Tables')
        plt.savefig(f'Outputs/Plots/corr_{filename}_heatmap.png')
        cmrCorrTable.to_excel(f'Outputs/Tables/corr_{filename}_table.xlsx') 
    return corrDf




def plot_feature_importance(importance,names,title):
    # Adapted from https://www.analyseup.com/learn-python-for-data-science/python-random-forest-feature-importance-plot.html
    #Create arrays from feature importance and feature names
    feature_importance = np.array(importance)
    feature_names = np.array(names)

    #Create a DataFrame using a Dictionary
    data={'feature_names':feature_names,'feature_importance':feature_importance}
    fi_df = pd.DataFrame(data)

    #Sort the DataFrame in order decreasing feature importance
    fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)

    #Define size of bar plot
    plt.figure(figsize=(10,8))
    #Plot Searborn bar chart
    sn.barplot(x=fi_df['feature_importance'][:5], y=fi_df['feature_names'][:5])
    #Add chart labels
    plt.title(title)
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
    print(fi_df[:5])
    plt.show()
    return fi_df # returns full feature importance dataframe

#https://analyticsindiamag.com/how-to-implement-ml-models-with-small-datasets/

# https://stats.stackexchange.com/questions/416553/can-k-fold-cross-validation-cause-overfitting#:~:text=2%20Answers&text=K%2Dfold%20cross%20validation%20is,fold%20cross%2Dvalidation%20removes%20overfitting.

# Adapted from https://www.kaggle.com/brsdincer/heart-attack-prediction-detailed-explanation

# k-fold CV is used to DETECT overfitting, since it trains the model from scratch each time 

def compareClasModel(df,target,test_size=0.25,title="Dataset",showImportance=False):
    print(f"Comparing models for {title}")
    y = df[target]
    df = df.drop(target, axis = 1)
    x = preprocessDf(df)
    
    print("Initialising models...")
    xTrain,xTest,yTrain,yTest = train_test_split(x,y,test_size=test_size,random_state=0)
    xTrain = xTrain.fillna(xTrain.median())
    xTest = xTest.fillna(xTrain.median())
    #https://stackoverflow.com/questions/18689823/pandas-dataframe-replace-nan-values-with-average-of-columns

    lr = LogisticRegression(solver="liblinear").fit(xTrain,yTrain)
    gnb = GaussianNB().fit(xTrain,yTrain)
    knnc = KNeighborsClassifier().fit(xTrain,yTrain)
    dtc = DecisionTreeClassifier(random_state=0).fit(xTrain,yTrain)
    rfc = RandomForestClassifier(random_state=0,verbose=False).fit(xTrain,yTrain)
    gbmc = GradientBoostingClassifier(verbose=False).fit(xTrain,yTrain)
    xgbc = XGBClassifier(use_label_encoder=False,eval_metric= "error").fit(xTrain,yTrain)
    lgbmc = LGBMClassifier().fit(xTrain,yTrain)
    sv = SVC().fit(xTrain,yTrain)

    models = [lr,gnb,knnc,dtc,rfc,gbmc,xgbc,lgbmc,sv]
    
    # initialise empty dataframe
    comparison = pd.DataFrame(columns=["Model","AccuracyScore","MeanR2CVScore","MeanCVError"])
    # Because dataset is small, we use k-fold cross validation

    for model in models:
        name = model.__class__.__name__
        predict = model.predict(xTest)
        accuracy = accuracy_score(yTest,predict)
        r2CV = cross_val_score(model,xTest,yTest,cv=5).mean()
        error = np.sqrt(-cross_val_score(model,xTest,yTest,cv=5,scoring="neg_mean_squared_error").mean())
        result = pd.DataFrame([[name,accuracy,r2CV,error]],columns=["Model","AccuracyScore","MeanR2CVScore","MeanCVError"])
        comparison = comparison.append(result)
        if showImportance:
            try:
                importance = model.feature_importances_
                plot_feature_importance(importance,xTest.columns.tolist(),name)
            except AttributeError:
                continue
    
    print(comparison)
    print("\n")
    print(f"Highest Accuracy Score: {comparison.loc[comparison['AccuracyScore'] == comparison['AccuracyScore'].max(), 'Model'].iloc[0]}. Score: {round(comparison['AccuracyScore'].max(),2)}")
    print(f"Highest CV Score: {comparison.loc[comparison['MeanR2CVScore'] == comparison['MeanR2CVScore'].max(), 'Model'].iloc[0]}. Score: {round(comparison['MeanR2CVScore'].max(),2)}")
    print(f"Lowest CV Error: {comparison.loc[comparison['MeanCVError'] == comparison['MeanCVError'].min(), 'Model'].iloc[0]}. Score: {round(comparison['MeanCVError'].min(),2)}")
    print("\n")
    comparisonMelted = pd.melt(comparison,id_vars=['Model'])

    #sn.set_theme(style="whitegrid")
    plt.ylabel('Returns')
    
    ax = sn.barplot(x="value", y="Model", hue="variable", data=comparisonMelted)
    plt.legend(title='Metrics',bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    ax.set(title=f"{title}",ylabel='Model',xlabel='Percentage Score')
    plt.xlim(0, 1)
    plt.show()
    #plt.xticks(rotation=90)
    print("***********************************************")
    
    return comparison
    
def compareRegModel(df,target,test_size=0.25,title="Dataset",showImportance=False):
    print(f"Comparing models for {title}")
    y = df[target]
    df = df.drop(target, axis = 1)
    x = preprocessDf(df)
    
    print("Predicting " + title)
    xTrain,xTest,yTrain,yTest = train_test_split(x,y,test_size=test_size,random_state=0)
    xTrain = xTrain.fillna(xTrain.median())
    xTest = xTest.fillna(xTrain.median())
    #https://stackoverflow.com/questions/18689823/pandas-dataframe-replace-nan-values-with-average-of-columns

    lr = LinearRegression().fit(xTrain,yTrain)
    pls = PLSRegression().fit(xTrain,yTrain)
    ridge = Ridge().fit(xTrain,yTrain)
    lasso = Lasso().fit(xTrain,yTrain)
    elasticnet = ElasticNet().fit(xTrain,yTrain)
    knnr = KNeighborsRegressor().fit(xTrain,yTrain)
    dtr = DecisionTreeRegressor(random_state=0).fit(xTrain,yTrain)
    baggr = BaggingRegressor(random_state=0,bootstrap_features=True,verbose=False).fit(xTrain,yTrain)
    rfr = RandomForestRegressor(random_state=0,verbose=False).fit(xTrain,yTrain)
    gbmr = GradientBoostingRegressor(verbose=False).fit(xTrain,yTrain)
    xgbr = XGBRegressor().fit(xTrain,yTrain)
    lgbmr = LGBMRegressor().fit(xTrain,yTrain)
    sv = SVR().fit(xTrain,yTrain)

    models = [lr,pls,ridge,lasso,elasticnet,knnr,dtr,baggr,rfr,gbmr,xgbr,lgbmr,sv]
    
    # initialise empty dataframe
    comparison = pd.DataFrame(columns=["Model","MeanR2CVScore","MeanR2CVError"])
    # Because dataset is small, we use k-fold cross validation

    for model in models:
        name = model.__class__.__name__
        r2CV = cross_val_score(model,xTest,yTest,cv=5,scoring="r2").mean()
        error = np.sqrt(-cross_val_score(model,xTest,yTest,cv=5,scoring="neg_mean_squared_error").mean())
        result = pd.DataFrame([[name,r2CV,error]],columns=["Model","MeanR2CVScore","MeanR2CVError"])
        comparison = comparison.append(result)
        if showImportance:
            try:
                importance = model.feature_importances_
                plot_feature_importance(importance,xTest.columns.tolist(),name)
            except AttributeError:
                continue
    
    print(comparison)
    print("\n")
    print(f"Highest R2CV Score: {comparison.loc[comparison['MeanR2CVScore'] == comparison['MeanR2CVScore'].max(), 'Model'].iloc[0]}. Score: {round(comparison['MeanR2CVScore'].max(),2)}")
    print(f"Lowest CV Error: {comparison.loc[comparison['MeanR2CVError'] == comparison['MeanR2CVError'].min(), 'Model'].iloc[0]}. Score: {round(comparison['MeanR2CVError'].min(),2)}")
    print("\n")
    comparison = comparison.drop(["MeanR2CVError"],axis = 1)
    comparisonMelted = pd.melt(comparison,id_vars=['Model'])

    #sn.set_theme(style="whitegrid")
    plt.ylabel('Returns')
    
    ax = sn.barplot(x="value", y="Model", hue="variable", data=comparisonMelted)
    plt.legend(title='Metrics',bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    ax.set(title=f"{title}",ylabel='Model',xlabel='Values')
    plt.xlim(-1,1)
    plt.show()
    #plt.xticks(rotation=90)
    print("***********************************************")
    
    return comparison
    

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from tpot import TPOTClassifier

target = "Cardiac_Category (0=good; 1=poor)"

noEAratio = data_full.drop("E_A_ratio",axis = 1)
# split into input and output elements
y = noEAratio[target]
noEAratio = noEAratio.drop(target, axis = 1)
x = preprocessDf(noEAratio)
x = x.fillna(x.median())
print(x.shape)
print(y.shape)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# define search
model = TPOTClassifier(generations=10, population_size=50, cv=cv, scoring='accuracy', verbosity=2, random_state=1, n_jobs=-1)
# perform the search
model.fit(x, y)
# export the best model
model.export('bestTPOTmodel3(Cardiac_Category).py')

In [None]:
from tpot import TPOTRegressor
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RepeatedKFold

target = "E_A_ratio"

noCC = data_full.drop("Cardiac_Category (0=good; 1=poor)",axis = 1)
# split into input and output elements
y = noCC[target]
noCC = noCC.drop(target, axis = 1)
x = preprocessDf(noCC)

x = x.fillna(x.median())
print(x.shape)
print(y.shape)
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search
model2 = TPOTRegressor(generations=10, population_size=50, scoring='neg_mean_absolute_error', cv=cv, verbosity=2, random_state=1, n_jobs=-1)
# perform the search
model2.fit(x, y)
# export the best model
model2.export('bestTPOTmodel(E_A_ratio).py')


In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import RobustScaler
from sklearn.tree import DecisionTreeRegressor
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

target = "E_A_ratio"


noCC = data_full.drop("Cardiac_Category (0=good; 1=poor)",axis = 1)
y = noCC[target]
noCC = noCC.drop(target, axis = 1)
x = preprocessDf(noCC)

xTrain,xTest,yTrain,yTest = train_test_split(x,y,test_size=0.25,random_state=1)
xTrain = xTrain.fillna(xTrain.median())
xTest = xTest.fillna(xTrain.median())

# Average CV score on the training set was: -0.04015196862655974
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=DecisionTreeRegressor(max_depth=1, min_samples_leaf=15, min_samples_split=18)),
    VarianceThreshold(threshold=0.0005),
    RobustScaler(),
    LassoLarsCV(normalize=True)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 1)

exported_pipeline.fit(xTrain, yTrain)

results = exported_pipeline.predict(xTest[0])


In [None]:
print(xTest)


In [None]:
results = exported_pipeline.predict(xTest)
print(results)

In [None]:
from tpot import TPOTRegressor
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RepeatedKFold

target = "E_A_ratio"

physFuncData = data_full[physFuncParamsCols]
y = data_full[target]
x = preprocessDf(physFuncData)
# split into input and output elements

xTrain,xTest,yTrain,yTest = train_test_split(x,y,test_size=0.25,random_state=1)
xTrain = xTrain.fillna(xTrain.median())
xTest = xTest.fillna(xTrain.median())


# Average CV score on the training set was: -0.1808613990329195
optimisedModel = GradientBoostingRegressor(alpha=0.9, learning_rate=0.001, loss="huber", max_depth=3, max_features=0.6000000000000001, min_samples_leaf=5, min_samples_split=15, n_estimators=100, subsample=0.3)
# Fix random state in exported estimator
if hasattr(optimisedModel, 'random_state'):
    setattr(optimisedModel, 'random_state', 1)

optimisedModel.fit(xTrain, yTrain)
results = optimisedModel.predict(xTest)
print(results)

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from tpot import TPOTClassifier

target = "Cardiac_Category (0=good; 1=poor)"

noEchoAndCmr = data_full.drop(echoCols + cmrCols,axis = 1)
# split into input and output elements


y = noEchoAndCmr[target]
noEchoAndCmr = noEchoAndCmr.drop(target, axis = 1)
x = preprocessDf(noEchoAndCmr)
x = x.fillna(x.median())
print(x.shape)
print(y.shape)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# define search
model = TPOTClassifier(generations=200, population_size=50, cv=cv, scoring='accuracy', verbosity=2, random_state=1, n_jobs=-1)
# perform the search
model.fit(x, y)
# export the best model
model.export('noEchoAndCmr(Cardiac_Category).py')


In [None]:
target = "Cardiac_Category (0=good; 1=poor)"

noEchoAndCmr = data_full.drop(echoCols + cmrCols,axis = 1)
# split into input and output elements


y = noEchoAndCmr[target]
noEchoAndCmr = noEchoAndCmr.drop(target, axis = 1)
x = preprocessDf(noEchoAndCmr)

xTrain,xTest,yTrain,yTest = train_test_split(x,y,test_size=0.25,random_state=1)
xTrain = xTrain.fillna(xTrain.median())
xTest = xTest.fillna(xTrain.median())

# Average CV score on the training set was: 0.7209150326797386
exported_pipeline = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    RBFSampler(gamma=0.35000000000000003),
    DecisionTreeClassifier(criterion="gini", max_depth=1, min_samples_leaf=14, min_samples_split=15)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(xTrain, yTrain)
results = exported_pipeline.predict(xTest)

r2CV = cross_val_score(exported_pipeline,xTest,yTest,cv=10).mean()
print(r2CV)

print("Predictions: ")
print(results)
print(f"Answers: ")
print(yTest.to_numpy())

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from tpot import TPOTRegressor
from sklearn.model_selection import RepeatedKFold

target = "E_A_ratio"

noCC = data_full.drop("Cardiac_Category (0=good; 1=poor)",axis = 1)
noEchoAndCmr = noCC.drop(cmrCols + echoCols,axis=1)
# split into input and output elements

y = data_full[target]
# noCC = noCC.drop(target, axis = 1)
x = preprocessDf(noEchoAndCmr)

x = x.fillna(x.median())
print(x.shape)
print(y.shape)
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)
# define search
model2 = TPOTRegressor(generations=200, population_size=50, scoring='neg_mean_absolute_error', cv=cv, verbosity=2, random_state=1, n_jobs=-1)
# perform the search
model2.fit(x, y)
# export the best model
model2.export('noCmrAndEcho(E_A_ratio)2.py')

In [None]:
target = "E_A_ratio"

noCC = data_full.drop("Cardiac_Category (0=good; 1=poor)",axis = 1)
noEchoAndCmr = noCC.drop(cmrCols + echoCols,axis=1)
# split into input and output elements

y = data_full[target]
# noCC = noCC.drop(target, axis = 1)
x = preprocessDf(noEchoAndCmr)

xTrain,xTest,yTrain,yTest = train_test_split(x,y,test_size=0.25,random_state=42)
xTrain = xTrain.fillna(xTrain.median())
xTest = xTest.fillna(xTrain.median())

# Average CV score on the training set was: -0.155515661171092
exported_pipeline = make_pipeline(
    make_union(
        make_union(
            make_pipeline(
                make_union(
                    FunctionTransformer(copy),
                    make_union(
                        RobustScaler(),
                        FunctionTransformer(copy)
                    )
                ),
                SelectPercentile(score_func=f_regression, percentile=16),
                StackingEstimator(estimator=LassoLarsCV(normalize=True)),
                VarianceThreshold(threshold=0.0001)
            ),
            StackingEstimator(estimator=make_pipeline(
                make_union(
                    FunctionTransformer(copy),
                    make_pipeline(
                        make_union(
                            FeatureAgglomeration(affinity="manhattan", linkage="complete"),
                            FunctionTransformer(copy)
                        ),
                        Normalizer(norm="l2")
                    )
                ),
                LassoLarsCV(normalize=True)
            ))
        ),
        FunctionTransformer(copy)
    ),
    GradientBoostingRegressor(alpha=0.99, learning_rate=0.01, loss="huber", max_depth=9, max_features=0.9500000000000001, min_samples_leaf=3, min_samples_split=15, n_estimators=100, subsample=0.9500000000000001)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 1)


exported_pipeline.fit(xTrain, yTrain)

results = exported_pipeline.predict(xTest)
r2CV = cross_val_score(exported_pipeline,xTest,yTest,cv=5,scoring="r2").mean()
print(r2CV)

In [None]:
print(results)
print(yTest.to_numpy())

In [None]:
x[target] = y
print(x)

In [None]:
target = "Cardiac_Category (0=good; 1=poor)"

noEchoAndCmr = data_full.drop(echoCols + cmrCols,axis = 1)
# split into input and output elements


y = noEchoAndCmr[target]
noEchoAndCmr = noEchoAndCmr.drop(target, axis = 1)
x = preprocessDf(noEchoAndCmr)

xTrain,xTest,yTrain,yTest = train_test_split(x,y,test_size=0.25,random_state=42)
xTrain = xTrain.fillna(xTrain.median())
xTest = xTest.fillna(xTrain.median())

# Average CV score on the training set was: 0.7681917211328977
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        RobustScaler()
    ),
    FeatureAgglomeration(affinity="manhattan", linkage="average"),
    StackingEstimator(estimator=SGDClassifier(alpha=0.001, eta0=0.1, fit_intercept=False, l1_ratio=0.5, learning_rate="invscaling", loss="hinge", penalty="elasticnet", power_t=0.5)),
    RBFSampler(gamma=0.15000000000000002),
    RandomForestClassifier(bootstrap=False, criterion="entropy", max_features=0.8, min_samples_leaf=7, min_samples_split=20, n_estimators=100)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(xTrain, yTrain)
results = exported_pipeline.predict(xTest)

r2CV = cross_val_score(exported_pipeline,xTest,yTest,cv=5).mean()
print(r2CV)

print("Predictions: ")
print(results)
print(f"Answers: ")
print(yTest.to_numpy())


In [15]:
from sklearn.model_selection import RepeatedStratifiedKFold
from tpot import TPOTClassifier

target = "Cardiac_Category (0=good; 1=poor)"

bloodBioData = data_full[bloodBioCols + [target]]
# split into input and output elements


y = bloodBioData[target]
bloodBioData = bloodBioData.drop(target, axis = 1)
x = preprocessDf(bloodBioData)
x = x.fillna(x.median())
print(x.shape)
print(y.shape)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# define search
model = TPOTClassifier(generations=200, population_size=50, max_time_mins=720,cv=cv, scoring='accuracy', verbosity=2, random_state=1, n_jobs=1)
# perform the search
model.fit(x, y)
# export the best model
model.export('bloodBioData(Cardiac_Category).py')



Preprocessing dataframe...
Converting values that are non-numeric AND non-binary to NaN...
Number of columns having more than 40 percent missing values:  0
These columns are:
 []
Columns with missing values (%):
BNP          5.81
GALECTIN3    5.81
HSTNI        4.65
MMP9ngmL     2.33
sUPAR        1.16
qTL          1.16
dtype: float64
Original Shape:(86, 17)
Final Shape:(86, 17)
______________________________
(86, 17)
(86,)

Generation 1 - Current best internal CV score: 0.6479302832244008

Generation 2 - Current best internal CV score: 0.6479302832244008

Generation 3 - Current best internal CV score: 0.6479302832244008

Generation 4 - Current best internal CV score: 0.6479302832244008

Generation 5 - Current best internal CV score: 0.6479302832244008

Generation 6 - Current best internal CV score: 0.6479302832244008

Generation 7 - Current best internal CV score: 0.6479302832244008

Generation 8 - Current best internal CV score: 0.6479302832244008

Generation 9 - Current best internal 

In [28]:
target = "Cardiac_Category (0=good; 1=poor)"

bloodBioData = data_full[bloodBioCols + [target]]
# split into input and output elements


y = bloodBioData[target]
bloodBioData = bloodBioData.drop(target, axis = 1)
x = preprocessDf(bloodBioData)
xTrain,xTest,yTrain,yTest = train_test_split(x,y,test_size=0.25,random_state=42)

xTrain = xTrain.fillna(xTrain.median())
xTest = xTest.fillna(xTrain.median())

# Average CV score on the training set was: 0.740958605664488
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=RandomForestClassifier(bootstrap=True, criterion="entropy", max_features=0.05, min_samples_leaf=20, min_samples_split=13, n_estimators=100)),
    Normalizer(norm="max"),
    StackingEstimator(estimator=XGBClassifier(learning_rate=0.5, max_depth=3, min_child_weight=4, n_estimators=100, n_jobs=1, subsample=0.8500000000000001, verbosity=0)),
    GradientBoostingClassifier(learning_rate=0.1, max_depth=3, max_features=0.6000000000000001, min_samples_leaf=13, min_samples_split=19, n_estimators=100, subsample=0.6500000000000001)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(xTrain, yTrain)
results = exported_pipeline.predict(xTest)

r2CV = cross_val_score(exported_pipeline,xTest,yTest,cv=3).mean()
print(cross_val_score(exported_pipeline,xTest,yTest,cv=3))
print(r2CV)

accuracy = accuracy_score(yTest,results)

print("Accuracy: ")
print(accuracy)

print("Predictions: ")
print(results)
print(f"Answers: ")
print(yTest.to_numpy())

Preprocessing dataframe...
Converting values that are non-numeric AND non-binary to NaN...
Number of columns having more than 40 percent missing values:  0
These columns are:
 []
Columns with missing values (%):
BNP          5.81
GALECTIN3    5.81
HSTNI        4.65
MMP9ngmL     2.33
sUPAR        1.16
qTL          1.16
dtype: float64
Original Shape:(86, 17)
Final Shape:(86, 17)
______________________________
[0.5        0.57142857 0.57142857]
0.5476190476190476
Accuracy: 
0.7727272727272727
Predictions: 
[1 1 1 1 0 0 1 1 1 0 1 0 1 0 0 1 0 0 0 1 1 1]
Answers: 
[0 1 1 0 0 0 1 1 1 0 1 0 0 0 1 1 0 0 1 1 1 1]


In [5]:
from sklearn.model_selection import RepeatedStratifiedKFold
from tpot import TPOTClassifier

target = "Cardiac_Category (0=good; 1=poor)"

selfMadeCols = ["WaistCircumferencecm","Hips_Circumference__cm","C28","Trp1","HistoC101","HistoC143NDOHC123NDDC","HistoC162","HistoC163","HistoC142","HistoC141","HistoC143","HistoHis1_ConcµM","HistoC225","HistoC203","HistoC182","HistoC181","Cardiac_Category (0=good; 1=poor)"]
curatedDf = data_full[selfMadeCols]



y = curatedDf[target]
curatedDf = curatedDf.drop(target, axis = 1)
x = preprocessDf(curatedDf)
x = x.fillna(x.median())
print(x.shape)
print(y.shape)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# define search
model = TPOTClassifier(generations=200, population_size=50, max_time_mins=720,cv=cv, scoring='accuracy', verbosity=2, random_state=1, n_jobs=1)
# perform the search
model.fit(x, y)
# export the best model
model.export('curatedDf(Cardiac_Category).py')


Preprocessing dataframe...
Converting values that are non-numeric AND non-binary to NaN...
Number of columns having more than 40 percent missing values:  0
These columns are:
 []
Columns with missing values (%):
C28    13.95
dtype: float64
Original Shape:(86, 16)
Final Shape:(86, 16)
______________________________
(86, 16)
(86,)

Generation 1 - Current best internal CV score: 0.7252723311546843

Generation 2 - Current best internal CV score: 0.7446623093681917

Generation 3 - Current best internal CV score: 0.7446623093681917

Generation 4 - Current best internal CV score: 0.7446623093681917

Generation 5 - Current best internal CV score: 0.7446623093681917

Generation 6 - Current best internal CV score: 0.7446623093681917

Generation 7 - Current best internal CV score: 0.7446623093681917

Generation 8 - Current best internal CV score: 0.7446623093681917

Generation 9 - Current best internal CV score: 0.7527233115468411

Generation 10 - Current best internal CV score: 0.752723311546841

In [4]:
from sklearn.model_selection import RepeatedStratifiedKFold
from tpot import TPOTClassifier

target = "Cardiac_Category (0=good; 1=poor)"

optimisedFeatureSet = ['C224', 'HistoC162', 'Hips_Circumference__cm', 'HistoC10','HistoC203NDOHC183NDDC', 'HistoC16', 'C222', 'C28', 'Tyr1', 'VO2Max','HistoC143NDOHC123NDDC', 'HistoC143', 'Pulse', 'HistoC121NDOH', 'WHR','C81', 'C101',"Cardiac_Category (0=good; 1=poor)"]
curatedDf = data_full[optimisedFeatureSet]



y = curatedDf[target]
curatedDf = curatedDf.drop(target, axis = 1)
x = preprocessDf(curatedDf)
x = x.fillna(x.median())
print(x.shape)
print(y.shape)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=1)
# define search
model = TPOTClassifier(generations=200, population_size=50,cv=cv, scoring='accuracy', verbosity=2, random_state=1, n_jobs=2)
# perform the search
model.fit(x, y)
# export the best model
model.export('optimisedFeatureSetDf6(Cardiac_Category).py')

Preprocessing dataframe...
Converting values that are non-numeric AND non-binary to NaN...
Number of columns having more than 40 percent missing values:  0
These columns are:
 []
Columns with missing values (%):
C28     13.95
WHR      5.81
C222     1.16
C101     1.16
dtype: float64
Original Shape:(86, 17)
Final Shape:(86, 17)
______________________________
(86, 17)
(86,)

Generation 1 - Current best internal CV score: 0.7673856209150327

Generation 2 - Current best internal CV score: 0.7673856209150327

Generation 3 - Current best internal CV score: 0.7703267973856209

Generation 4 - Current best internal CV score: 0.7703267973856209

Generation 5 - Current best internal CV score: 0.7703267973856209

Generation 6 - Current best internal CV score: 0.7713071895424836

Generation 7 - Current best internal CV score: 0.7741176470588234

Generation 8 - Current best internal CV score: 0.7741176470588234

Generation 9 - Current best internal CV score: 0.7741176470588234

Generation 10 - Curren

In [18]:
target = "Cardiac_Category (0=good; 1=poor)"

selfMadeCols = ["WaistCircumferencecm","Hips_Circumference__cm","C28","Trp1","HistoC101","HistoC143NDOHC123NDDC","HistoC162","HistoC163","HistoC142","HistoC141","HistoC143","HistoHis1_ConcµM","HistoC225","HistoC203","HistoC182","HistoC181","Cardiac_Category (0=good; 1=poor)"]
curatedDf = data_full[selfMadeCols]


#selfMadeCols = data_full[bloodBioCols + [target]]
# split into input and output elements


y = curatedDf[target]
curatedDf = curatedDf.drop(target, axis = 1)
x = preprocessDf(curatedDf)
xTrain,xTest,yTrain,yTest = train_test_split(x,y,test_size=0.25,random_state=42)

xTrain = xTrain.fillna(xTrain.median())
xTest = xTest.fillna(xTrain.median())



# Average CV score on the training set was: 0.8069716775599128
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=ExtraTreesClassifier(bootstrap=False, criterion="entropy", max_features=0.35000000000000003, min_samples_leaf=1, min_samples_split=4, n_estimators=100)),
    GradientBoostingClassifier(learning_rate=0.01, max_depth=8, max_features=0.55, min_samples_leaf=1, min_samples_split=6, n_estimators=100, subsample=0.6000000000000001)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

#exported_pipeline = GaussianNB().fit(xTrain,yTrain)

exported_pipeline.fit(xTrain, yTrain)
results = exported_pipeline.predict(xTest)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)
CVAccuracyArray = cross_val_score(exported_pipeline,xTest,yTest,cv=cv,scoring='accuracy')
MeanCVAccuracyScore = CVAccuracyArray.mean()
print(CVAccuracyArray)
print(MeanCVAccuracyScore)

accuracy = accuracy_score(yTest,results)

print("Accuracy: ")
print(accuracy)

print("Predictions: ")
print(results)
print(f"Answers: ")
print(yTest.to_numpy())

Preprocessing dataframe...
Converting values that are non-numeric AND non-binary to NaN...
Number of columns having more than 40 percent missing values:  0
These columns are:
 []
Columns with missing values (%):
C28    13.95
dtype: float64
Original Shape:(86, 16)
Final Shape:(86, 16)
______________________________
[0.8  0.2  0.5  0.5  0.75 0.4  0.6  0.75 0.75 0.75 0.6  0.6  0.25 0.75
 0.5  0.4  0.6  0.75 0.75 0.75 0.6  0.6  0.5  0.   0.25 0.8  0.8  0.25
 0.25 0.5  0.2  0.6  0.5  0.5  0.5  0.4  0.2  0.5  0.25 0.75 0.8  0.4
 0.5  0.5  0.75 0.6  0.4  0.25 0.25 0.25]
0.512
Accuracy: 
0.5909090909090909
Predictions: 
[0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 1 0 1 1]
Answers: 
[0 1 1 0 0 0 1 1 1 0 1 0 0 0 1 1 0 0 1 1 1 1]
