In [9]:
import numpy as np
import pandas as pd
from IPython.display import Markdown
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier

from medpredictor import Config, Encoder, Graph, Utils

### **Setting up the dataframe**

In [15]:
# Load the cleaned dataset from the path configured in Config.data_cleaned_path.
df = pd.read_csv(Config.data_cleaned_path)
# Print the column / dtype / non-null summary as a quick sanity check of the load.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_012          253680 non-null  object 
 1   HighBP                253680 non-null  object 
 2   HighChol              253680 non-null  object 
 3   CholCheck             253680 non-null  object 
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  object 
 6   Stroke                253680 non-null  object 
 7   HeartDiseaseorAttack  253680 non-null  object 
 8   PhysActivity          253680 non-null  object 
 9   Fruits                253680 non-null  object 
 10  Veggies               253680 non-null  object 
 11  HvyAlcoholConsump     253680 non-null  object 
 12  AnyHealthcare         253680 non-null  object 
 13  NoDocbcCost           253680 non-null  object 
 14  GenHlth               253680 non-null  object 
 15  

### **Utils**

In [16]:
def model_metrics(y_test, y_pred):
    """Compute hold-out classification scores, expressed as percentages.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    dict
        Keys ``'Accuracy'``, ``'Recall'``, ``'Precision'`` and ``'F1-score'``,
        each mapped to the corresponding ``sklearn.metrics`` score multiplied
        by 100 and rounded to two decimals. The scorers are called with their
        default arguments.
    """
    # Do not bind anything to the name `metrics` in this function: assigning
    # to it makes the name local for the whole body, which hides the imported
    # sklearn `metrics` module and raises UnboundLocalError on the first
    # score call.
    scorers = {
        'Accuracy': metrics.accuracy_score,
        'Recall': metrics.recall_score,
        'Precision': metrics.precision_score,
        'F1-score': metrics.f1_score,
    }
    return {
        label: round(scorer(y_test, y_pred) * 100, 2)
        for label, scorer in scorers.items()
    }

def cross_val_metrics(model, X, y):
    """Compute 5-fold cross-validated classification scores as percentages.

    All four metrics are evaluated in a single ``cross_validate`` pass, so the
    model is fitted once per fold rather than once per fold *per metric*.

    Parameters
    ----------
    model : estimator
        Scikit-learn compatible estimator to evaluate.
    X : array-like
        Feature matrix.
    y : array-like
        Target labels.

    Returns
    -------
    dict
        Keys ``'Accuracy'``, ``'Recall'``, ``'Precision'`` and ``'F1-score'``,
        each mapped to the mean fold score multiplied by 100 and rounded to
        two decimals.
    """
    # Display label -> scikit-learn scoring string.
    scoring = {
        'Accuracy': 'accuracy',
        'Recall': 'recall',
        'Precision': 'precision',
        'F1-score': 'f1',
    }
    # With a list of scorer names, cross_validate reports each metric's
    # per-fold scores under the key 'test_<scorer name>'.
    cv_results = cross_validate(model, X, y, scoring=list(scoring.values()), cv=5)
    return {
        label: round(np.mean(cv_results[f'test_{scorer}']) * 100, 2)
        for label, scorer in scoring.items()
    }

def show_metrics(metrics):
    """Display every metric as its own Markdown line with the value in bold.

    Parameters
    ----------
    metrics : dict
        Mapping of metric label to the value to show; each entry is rendered
        as ``<label>: **<value>**%``.
    """
    for label, score in metrics.items():
        line = Markdown(f"{label}: **{score}**%")
        display(line)


### **Feature engineering**

In [23]:
# Columns handed to the scaler; every other column goes through an encoder.
num_columns = ['BMI', 'MentHlth', 'PhysHlth']
df_copy = df.drop(columns=num_columns)

enc = Encoder()
df_numerics = enc.robust_scaler_method(df=df, columns_name=num_columns)

# One frame per source column; they are concatenated side by side below.
df_enc_dict = {'numerics': df_numerics}
df_dec_dict = {}

# Explicit category orders used for ordinal encoding, looked up by column name.
# NOTE(review): the column names visible in this notebook's df.info() output
# (e.g. 'Diabetes_012', 'GenHlth') do not match any of these lowercase keys,
# so orders.get(column) looks like it always returns None and every column
# falls through to label encoding -- confirm the intended keys.
orders = {
    'age': Utils.values_age_order,
    'health_status': Utils.values_status_order,
    'income': Utils.values_income_order,
    'education': Utils.values_education_order,
    'diabetes': Utils.values_diabetes_order,
}

for column in df_copy.columns:
    order = orders.get(column)
    if order is None:
        # No explicit order registered for this column: label-encode it.
        enc_, dec_ = enc.label_encoder_method(df=df_copy, column_name=column)
    else:
        enc_, dec_ = enc.ordinal_encoder_method(df=df_copy,
                                                column_name=column,
                                                order=order)
    df_enc_dict[column] = pd.DataFrame({column + '_enc': enc_})
    df_dec_dict[column] = pd.DataFrame({column + '_dec': dec_})

df_enc = pd.concat([*df_enc_dict.values()], axis=1)
df_dec = pd.concat([*df_dec_dict.values()], axis=1)

df_enc.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   BMI                       253680 non-null  float64
 1   MentHlth                  253680 non-null  float64
 2   PhysHlth                  253680 non-null  float64
 3   Diabetes_012_enc          253680 non-null  int64  
 4   HighBP_enc                253680 non-null  int64  
 5   HighChol_enc              253680 non-null  int64  
 6   CholCheck_enc             253680 non-null  int64  
 7   Smoker_enc                253680 non-null  int64  
 8   Stroke_enc                253680 non-null  int64  
 9   HeartDiseaseorAttack_enc  253680 non-null  int64  
 10  PhysActivity_enc          253680 non-null  int64  
 11  Fruits_enc                253680 non-null  int64  
 12  Veggies_enc               253680 non-null  int64  
 13  HvyAlcoholConsump_enc     253680 non-null  i

In [25]:
# Columns excluded from the feature matrix of every hypothesis.
_shared_excluded = ['MentHlth', 'PhysHlth',
                    'AnyHealthcare_enc', 'NoDocbcCost_enc',
                    'Education_enc', 'Income_enc']

# Hypothesis 1: target is Diabetes_012_enc; CholCheck_enc is also left out.
y_h1 = df_enc['Diabetes_012_enc']
X_h1 = df_enc.drop(columns=_shared_excluded + ['Diabetes_012_enc', 'CholCheck_enc'])

# Hypothesis 2: target is Stroke_enc.
y_h2 = df_enc['Stroke_enc']
X_h2 = df_enc.drop(columns=_shared_excluded + ['Stroke_enc'])

# Hypothesis 3: target is HeartDiseaseorAttack_enc.
y_h3 = df_enc['HeartDiseaseorAttack_enc']
X_h3 = df_enc.drop(columns=_shared_excluded + ['HeartDiseaseorAttack_enc'])