# Import libraries

In [284]:
import os
import pickle
from getpass import getpass

import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,f1_score,precision_score,recall_score,accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from  sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Import Data

In [285]:
# Read the survey dataset from disk and display its column names.
df = pd.read_csv(filepath_or_buffer="../Dataset/health_lifestyle_classification.csv")
df.columns

Index(['survey_code', 'age', 'gender', 'height', 'weight', 'bmi',
       'bmi_estimated', 'bmi_scaled', 'bmi_corrected', 'waist_size',
       'blood_pressure', 'heart_rate', 'cholesterol', 'glucose', 'insulin',
       'sleep_hours', 'sleep_quality', 'work_hours', 'physical_activity',
       'daily_steps', 'calorie_intake', 'sugar_intake', 'alcohol_consumption',
       'smoking_level', 'water_intake', 'screen_time', 'stress_level',
       'mental_health_score', 'mental_health_support', 'education_level',
       'job_type', 'occupation', 'income', 'diet_type', 'exercise_type',
       'device_usage', 'healthcare_access', 'insurance', 'sunlight_exposure',
       'meals_per_day', 'caffeine_intake', 'family_history', 'pet_owner',
       'electrolyte_level', 'gene_marker_flag', 'environmental_risk_score',
       'daily_supplement_dosage', 'target'],
      dtype='object')

# Train test split

In [286]:
# Separate the label column from the features, then hold out 25% of the rows
# for testing (fixed seed so the split is reproducible).
y = df['target']
X = df.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=11,
)

# Preprocessing

## Remove nulls / duplicates

In [287]:
def remove_null_duplicates(df):
    """Return a new frame without incomplete or repeated rows.

    Rows containing at least one missing value are dropped first; exact
    duplicate rows are then removed, keeping the first occurrence. The index
    labels of the surviving rows are preserved and the input frame is left
    unmodified.
    """
    return df.dropna().drop_duplicates()

# Clean each feature split independently.
X_train, X_test = (remove_null_duplicates(part) for part in (X_train, X_test))

# Keep only the labels whose feature rows survived cleaning (matched by index label).
train_index, test_index = X_train.index, X_test.index
y_train, y_test = y_train.loc[train_index], y_test.loc[test_index]


## 5 point summary

In [288]:
# Descriptive statistics of the full dataset, shown with one row per column.
summary = df.describe()
summary.transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
survey_code,100000.0,50000.5,28867.657797,1.0,25000.75,50000.5,75000.25,100000.0
age,100000.0,48.52599,17.886768,18.0,33.0,48.0,64.0,79.0
height,100000.0,170.023707,9.982798,140.0,163.306615,170.016778,176.72892,210.0
weight,100000.0,70.064862,14.693667,40.0,59.856938,69.924141,80.027418,139.250894
bmi,100000.0,24.493876,5.951069,9.988495,20.271405,24.156734,28.258696,59.234792
bmi_estimated,100000.0,24.493876,5.951069,9.988495,20.271405,24.156734,28.258696,59.234792
bmi_scaled,100000.0,73.481627,17.853206,29.965484,60.814215,72.470201,84.776088,177.704377
bmi_corrected,100000.0,24.49414,5.954184,9.893845,20.271059,24.151699,28.247648,59.142646
waist_size,100000.0,84.933043,12.040314,34.093185,76.795185,84.957139,93.018713,133.153631
blood_pressure,92331.0,119.980149,15.015503,59.128168,109.81206,119.951794,130.120621,184.439195


## Correlation matrix

In [289]:
# Pairwise Pearson correlations between the int/float training features.
num_data = X_train.select_dtypes(include=[int, float])
num_data.corr(method='pearson')

Unnamed: 0,survey_code,age,height,weight,bmi,bmi_estimated,bmi_scaled,bmi_corrected,waist_size,blood_pressure,...,water_intake,screen_time,stress_level,mental_health_score,income,meals_per_day,electrolyte_level,gene_marker_flag,environmental_risk_score,daily_supplement_dosage
survey_code,1.0,0.007951,-0.011659,-0.00439,0.001207,0.001207,0.001207,0.001253,-0.012145,0.016916,...,0.007046,0.0188,0.004465,-0.005946,-0.009655,-0.013951,,,,0.03231
age,0.007951,1.0,-0.012157,0.001663,0.005697,0.005697,0.005697,0.005546,0.005334,-0.008125,...,0.004892,-0.008155,0.012583,0.005557,0.001858,-0.008385,,,,-0.010159
height,-0.011659,-0.012157,1.0,0.01098,-0.479509,-0.479509,-0.479509,-0.479098,-2e-05,0.007407,...,-0.002107,-0.001482,-0.003479,-0.002995,0.012179,-0.005167,,,,-0.005716
weight,-0.00439,0.001663,0.01098,1.0,0.863839,0.863839,0.863839,0.86342,0.004689,0.001283,...,0.004591,0.017155,0.007664,-0.020338,-0.008517,-0.016276,,,,0.000392
bmi,0.001207,0.005697,-0.479509,0.863839,1.0,1.0,1.0,0.999436,0.00216,-0.003283,...,0.005545,0.015754,0.006082,-0.01823,-0.013101,-0.012922,,,,0.001483
bmi_estimated,0.001207,0.005697,-0.479509,0.863839,1.0,1.0,1.0,0.999436,0.00216,-0.003283,...,0.005545,0.015754,0.006082,-0.01823,-0.013101,-0.012922,,,,0.001483
bmi_scaled,0.001207,0.005697,-0.479509,0.863839,1.0,1.0,1.0,0.999436,0.00216,-0.003283,...,0.005545,0.015754,0.006082,-0.01823,-0.013101,-0.012922,,,,0.001483
bmi_corrected,0.001253,0.005546,-0.479098,0.86342,0.999436,0.999436,0.999436,1.0,0.001475,-0.003064,...,0.005945,0.015683,0.006214,-0.017675,-0.013328,-0.012103,,,,0.001764
waist_size,-0.012145,0.005334,-2e-05,0.004689,0.00216,0.00216,0.00216,0.001475,1.0,0.004232,...,-0.004483,0.000885,-0.003523,-0.009865,0.010369,0.001624,,,,-0.004159
blood_pressure,0.016916,-0.008125,0.007407,0.001283,-0.003283,-0.003283,-0.003283,-0.003064,0.004232,1.0,...,-0.016879,0.014496,0.010216,0.002732,-0.004268,-0.002411,,,,0.000864


## Remove unnecessary columns

In [290]:
def rem_columns(df):
    """Return ``df`` without the three columns listed below.

    Drops ``electrolyte_level``, ``gene_marker_flag`` and
    ``environmental_risk_score``. pandas raises ``KeyError`` if any of them is
    absent. The input frame itself is not modified.
    """
    unwanted = ['electrolyte_level', 'gene_marker_flag', 'environmental_risk_score']
    return df.drop(columns=unwanted)

# Apply the same column removal to both splits.
X_train, X_test = rem_columns(X_train), rem_columns(X_test)

## Encoding

In [291]:
encoding_mapper = {}

# Identify the categorical columns once, on the training split, so exactly the
# same columns (in the same order) are encoded in both splits.
cat_cols = X_train.select_dtypes(include='object').columns

# With the default handle_unknown='error', transform() raises as soon as the
# test split (or later inference data) contains a category that was not seen
# during fit. Unseen categories are encoded as -1 instead.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train[cat_cols] = encoder.fit_transform(X_train[cat_cols])
X_test[cat_cols] = encoder.transform(X_test[cat_cols])

# Persist the fitted encoder. The folder is created if missing so the cell also
# works on a fresh checkout. The file name is kept unchanged so existing
# consumers of the pickle keep working.
os.makedirs('../pickles', exist_ok=True)
with open('../pickles/enocoder.pkl', 'wb') as file:
    pickle.dump(encoder, file)


## Scaling

In [292]:
# Standardise every feature using statistics learned from the training split only.
scaler = StandardScaler()

# scaler.(fit_)transform returns a bare array; pass the original index back in
# so the scaled frames keep the same row labels as y_train / y_test. Without it
# the frames get a fresh RangeIndex and no longer align with the labels by index.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# DagsHub credentials

In [None]:
# Prompt for the DagsHub credentials without echoing them to the notebook.
# Surrounding whitespace (easy to pick up when pasting a token) is removed.
username = getpass("Enter your DagsHub username: ").strip()  # hides input
token = getpass("Enter your DagsHub token: ").strip()        # hides input

# Fail immediately with a clear message rather than letting the tracking
# server reject the request later with a 401.
if not username or not token:
    raise ValueError("DagsHub username and token must both be non-empty.")

# MLflow reads its HTTP basic-auth credentials from these environment variables.
os.environ["MLFLOW_TRACKING_USERNAME"] = username
os.environ["MLFLOW_TRACKING_PASSWORD"] = token

mlflow.set_tracking_uri('https://dagshub.com/hrishi331/MLops-Demo-1.mlflow')
mlflow.set_experiment('Health_lifestyle_classification')

MlflowException: API request to endpoint /api/2.0/mlflow/experiments/get-by-name failed with error code 401 != 200. Response body: '=============== ATTENTION! ===============

To use authentication, you must first: 
    Get your default access token from: https://dagshub.com/user/settings/tokens
    OR 
    Set a password: https://dagshub.com/user/settings/password 
=========================================='

# Modeling

## Decision Tree

In [None]:
model_name = 'Decision Tree Classifier' # change model name here

# The run is named after the model so it can be identified in the MLflow UI.
with mlflow.start_run(run_name=model_name):
    # Model
    model = DecisionTreeClassifier(random_state=11)
    # Training
    model.fit(X_train,y_train)
    # Prediction
    y_pred_dtc = model.predict(X_test)
    # Metrics: F1 / recall / precision per class, plus overall accuracy
    classes = ['diseased','healthy']
    eval_metric = {}
    for i in classes:
        eval_metric[f'{i}_f1_score'] = f1_score(y_test,y_pred_dtc,pos_label=i)
        eval_metric[f'{i}_recall_score'] = recall_score(y_test,y_pred_dtc,pos_label=i)
        eval_metric[f'{i}_precision_score'] = precision_score(y_test,y_pred_dtc,pos_label=i)
    eval_metric['accuracy_score'] = accuracy_score(y_test,y_pred_dtc)

    mlflow.log_param('model_name', model_name)
    mlflow.log_metrics(eval_metric)
    # The previous mlflow.sklearn.Model(...) call only built a metadata object
    # and discarded it, so no model was stored. log_model uploads the fitted
    # estimator as a run artifact.
    mlflow.sklearn.log_model(model, 'model')
    



🏃 View run salty-croc-751 at: https://dagshub.com/hrishi331/MLops-Demo-1.mlflow/#/experiments/0/runs/74255dcfe05b41e885b04177cc13169a
🧪 View experiment at: https://dagshub.com/hrishi331/MLops-Demo-1.mlflow/#/experiments/0


## Random Forest

In [None]:
model_name = 'Random Forest Classifier' # change model name here

# The run is named after the model so it can be identified in the MLflow UI.
with mlflow.start_run(run_name=model_name):
    # Model
    model = RandomForestClassifier(random_state=11)
    # Training
    model.fit(X_train,y_train)
    # Prediction (variable name kept from the original cell; it holds this
    # model's predictions)
    y_pred_dtc = model.predict(X_test)
    # Metrics: F1 / recall / precision per class, plus overall accuracy
    classes = ['diseased','healthy']
    eval_metric = {}
    for i in classes:
        eval_metric[f'{i}_f1_score'] = f1_score(y_test,y_pred_dtc,pos_label=i)
        eval_metric[f'{i}_recall_score'] = recall_score(y_test,y_pred_dtc,pos_label=i)
        eval_metric[f'{i}_precision_score'] = precision_score(y_test,y_pred_dtc,pos_label=i)
    eval_metric['accuracy_score'] = accuracy_score(y_test,y_pred_dtc)

    mlflow.log_param('model_name', model_name)
    mlflow.log_metrics(eval_metric)
    # The previous mlflow.sklearn.Model(...) call only built a metadata object
    # and discarded it, so no model was stored. log_model uploads the fitted
    # estimator as a run artifact.
    mlflow.sklearn.log_model(model, 'model')
    



🏃 View run handsome-loon-964 at: https://dagshub.com/hrishi331/MLops-Demo-1.mlflow/#/experiments/0/runs/9964c2cc59ac46d7a5317fbe8d316bf0
🧪 View experiment at: https://dagshub.com/hrishi331/MLops-Demo-1.mlflow/#/experiments/0


## KNN model

In [None]:
model_name = 'KNN Classifier ' # change model name here

# The run is named after the model so it can be identified in the MLflow UI.
with mlflow.start_run(run_name=model_name):
    # Model
    model = KNeighborsClassifier(n_neighbors=5)
    # Training
    model.fit(X_train,y_train)
    # Prediction (variable name kept from the original cell; it holds this
    # model's predictions)
    y_pred_dtc = model.predict(X_test)
    # Metrics: F1 / recall / precision per class, plus overall accuracy
    classes = ['diseased','healthy']
    eval_metric = {}
    for i in classes:
        eval_metric[f'{i}_f1_score'] = f1_score(y_test,y_pred_dtc,pos_label=i)
        eval_metric[f'{i}_recall_score'] = recall_score(y_test,y_pred_dtc,pos_label=i)
        eval_metric[f'{i}_precision_score'] = precision_score(y_test,y_pred_dtc,pos_label=i)
    eval_metric['accuracy_score'] = accuracy_score(y_test,y_pred_dtc)

    mlflow.log_param('model_name', model_name)
    mlflow.log_metrics(eval_metric)
    # The previous mlflow.sklearn.Model(...) call only built a metadata object
    # and discarded it, so no model was stored. log_model uploads the fitted
    # estimator as a run artifact.
    mlflow.sklearn.log_model(model, 'model')



🏃 View run thundering-sow-600 at: https://dagshub.com/hrishi331/MLops-Demo-1.mlflow/#/experiments/0/runs/c8996fe1510643b982211b8175f60200
🧪 View experiment at: https://dagshub.com/hrishi331/MLops-Demo-1.mlflow/#/experiments/0


## Xgboost model

In [None]:
# The string labels are mapped to integers before being given to XGBClassifier.
label_encoding = {'healthy':0,'diseased':1}
y_train_encoded = y_train.map(label_encoding)
y_test_encoded = y_test.map(label_encoding)

model_name = 'XG Boost Classifier' # change model name here

# The run is named after the model so it can be identified in the MLflow UI.
with mlflow.start_run(run_name=model_name):
    # Model
    model = XGBClassifier()
    # Training
    model.fit(X_train,y_train_encoded)
    # Prediction (variable name kept from the original cell; it holds this
    # model's predictions, as integer codes)
    y_pred_dtc = model.predict(X_test)
    # Metrics: computed on the integer codes, but stored under the class names
    # so the keys match the other runs ('diseased_f1_score', ...) and the
    # models can be compared side by side in MLflow.
    classes = [1,0]
    class_names = {code: label for label, code in label_encoding.items()}
    eval_metric = {}
    for i in classes:
        label = class_names[i]
        eval_metric[f'{label}_f1_score'] = f1_score(y_test_encoded,y_pred_dtc,pos_label=i)
        eval_metric[f'{label}_recall_score'] = recall_score(y_test_encoded,y_pred_dtc,pos_label=i)
        eval_metric[f'{label}_precision_score'] = precision_score(y_test_encoded,y_pred_dtc,pos_label=i)
    eval_metric['accuracy_score'] = accuracy_score(y_test_encoded,y_pred_dtc)

    mlflow.log_param('model_name', model_name)
    mlflow.log_metrics(eval_metric)
    # The previous mlflow.sklearn.Model(...) call only built a metadata object
    # and discarded it, so no model was stored. log_model uploads the fitted
    # estimator as a run artifact.
    mlflow.sklearn.log_model(model, 'model')



🏃 View run sassy-fish-37 at: https://dagshub.com/hrishi331/MLops-Demo-1.mlflow/#/experiments/0/runs/4a5cc82b377f41bba178460170b6f362
🧪 View experiment at: https://dagshub.com/hrishi331/MLops-Demo-1.mlflow/#/experiments/0


## Lightgbm model

In [None]:
model_name = 'Light GBM Classifier' # change model name here

# The run is named after the model so it can be identified in the MLflow UI.
with mlflow.start_run(run_name=model_name):
    # Model
    model = LGBMClassifier()
    # Training
    model.fit(X_train,y_train)
    # Prediction (variable name kept from the original cell; it holds this
    # model's predictions)
    y_pred_dtc = model.predict(X_test)
    # Metrics: F1 / recall / precision per class, plus overall accuracy
    classes = ['diseased','healthy']
    eval_metric = {}
    for i in classes:
        eval_metric[f'{i}_f1_score'] = f1_score(y_test,y_pred_dtc,pos_label=i)
        eval_metric[f'{i}_recall_score'] = recall_score(y_test,y_pred_dtc,pos_label=i)
        eval_metric[f'{i}_precision_score'] = precision_score(y_test,y_pred_dtc,pos_label=i)
    eval_metric['accuracy_score'] = accuracy_score(y_test,y_pred_dtc)

    mlflow.log_param('model_name', model_name)
    mlflow.log_metrics(eval_metric)
    # The previous mlflow.sklearn.Model(...) call only built a metadata object
    # and discarded it, so no model was stored. log_model uploads the fitted
    # estimator as a run artifact.
    mlflow.sklearn.log_model(model, 'model')

[LightGBM] [Info] Number of positive: 7653, number of negative: 3224
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000737 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6028
[LightGBM] [Info] Number of data points in the train set: 10877, number of used features: 44
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.703595 -> initscore=0.864475
[LightGBM] [Info] Start training from score 0.864475




🏃 View run agreeable-bear-958 at: https://dagshub.com/hrishi331/MLops-Demo-1.mlflow/#/experiments/0/runs/45405335b0a44d2c92b9c6070b831ac9
🧪 View experiment at: https://dagshub.com/hrishi331/MLops-Demo-1.mlflow/#/experiments/0
