## Training 

The main task on this dataset is to predict based on the given attributes of a patient that whether that particular person has heart disease or not and other is the experimental task to diagnose and find out various insights from this dataset which could help in understanding the problem more.

#### Libraries and modules to use

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import mlflow

#### Data reading

In [2]:
import pandas as pd

file_url = "https://mlrawdata123.blob.core.windows.net/rawdata/raw_data.csv"
df = pd.read_csv(file_url)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


#### Data preprocessing

Train-test split

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.drop("target", axis=1), df["target"], test_size=0.3
)

Define numerical and categorical features

In [4]:
num_features = X_train.select_dtypes('number').columns.tolist()
cat_features = X_train.select_dtypes('object').columns.tolist()

Define preprocessing pipeline

In [5]:
column_transformer = ColumnTransformer(
    [("OHE", OneHotEncoder(sparse_output=False), cat_features),
     ("scaler", MinMaxScaler(), num_features)
     ]
).set_output(transform="pandas")

### Modeling

Set experiment

In [6]:
mlflow.set_experiment(experiment_name="heart_disease_exp2")

<Experiment: artifact_location='', creation_time=1704222578148, experiment_id='bf65e0e6-f77a-49c0-a50f-d7826d5ebffe', last_update_time=None, lifecycle_stage='active', name='heart_disease_exp2', tags={}>

#### Logistic regression

In [41]:
with mlflow.start_run(run_name = 'log_reg_baseline'):
    model_name = LogisticRegression().__class__.__name__
    mlflow.set_tag('model_name',model_name)

    pipe = Pipeline(steps = [
        ("transformers", column_transformer),
        (model_name, LogisticRegression(random_state=1234, penalty='l2'))
    ])

    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    y_pred_prob = pipe.predict_proba(X_test)[:,1]

    accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
    recall = metrics.recall_score(y_true=y_test, y_pred=y_pred)
    precision = metrics.recall_score(y_true=y_test, y_pred=y_pred)
    auc_score = metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob)

    mlflow.log_metrics({"test_accuracy": accuracy, "test_recall": recall, "test_precision": precision,
                        "test_auc": auc_score})
    
    mlflow.log_param("penalty", "l2")

    mlflow.sklearn.log_model(pipe, f"heart_disease_{model_name}")



#### Decision Tree

In [42]:
with mlflow.start_run(run_name = 'decision_tree'):
    model_name = DecisionTreeClassifier().__class__.__name__
    mlflow.set_tag('model_name',model_name)

    pipe = Pipeline(steps = [
        ("transformers", column_transformer),
        (model_name, DecisionTreeClassifier(random_state=1234, max_depth=10))
    ])

    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    y_pred_prob = pipe.predict_proba(X_test)[:,1]

    accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
    recall = metrics.recall_score(y_true=y_test, y_pred=y_pred)
    precision = metrics.recall_score(y_true=y_test, y_pred=y_pred)
    auc_score = metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob)

    mlflow.log_metrics({"test_accuracy": accuracy, "test_recall": recall, "test_precision": precision,
                        "test_auc": auc_score})

    mlflow.log_param("max_depth", 10)

    mlflow.sklearn.log_model(pipe, f"heart_disease_{model_name}")

#### Random Forest

In [43]:
with mlflow.start_run(run_name = 'random_forest'):
    model_name = RandomForestClassifier().__class__.__name__
    mlflow.set_tag('model_name',model_name)

    pipe = Pipeline(steps = [
        ("transformers", column_transformer),
        (model_name, RandomForestClassifier(random_state=1234, n_estimators=50))
    ])

    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    y_pred_prob = pipe.predict_proba(X_test)[:,1]

    accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
    recall = metrics.recall_score(y_true=y_test, y_pred=y_pred)
    precision = metrics.recall_score(y_true=y_test, y_pred=y_pred)
    auc_score = metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob)

    mlflow.log_metrics({"test_accuracy": accuracy, "test_recall": recall, "test_precision": precision,
                        "test_auc": auc_score})

    mlflow.log_param("n_estimators", 50)

    mlflow.sklearn.log_model(pipe, f"heart_disease_{model_name}")

### Model Registry

Set MLflow client

In [7]:
from mlflow.tracking import MlflowClient

ML_FLOW_TRACKING_URI = mlflow.get_tracking_uri()
client = MlflowClient(tracking_uri=ML_FLOW_TRACKING_URI)
client.search_experiments()

[<Experiment: artifact_location='', creation_time=1702957804159, experiment_id='d8fd18d1-884d-4a8d-8f4a-8eddef71c4d2', last_update_time=None, lifecycle_stage='active', name='heart_disease_exp', tags={}>,
 <Experiment: artifact_location='', creation_time=1704222578148, experiment_id='bf65e0e6-f77a-49c0-a50f-d7826d5ebffe', last_update_time=None, lifecycle_stage='active', name='heart_disease_exp2', tags={}>]

In [8]:
experiment_id = mlflow.get_experiment_by_name("heart_disease_exp2").experiment_id

Search runs with accuracy higher than 0.79

In [9]:
from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids=experiment_id,
    filter_string='metrics.test_accuracy > 0.79',
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=3
)

runs

[<Run: data=<RunData: metrics={'test_accuracy': 0.8681318681318682,
  'test_auc': 0.9387878787878787,
  'test_precision': 0.72,
  'test_recall': 0.72}, params={'penalty': 'l2'}, tags={'mlflow.rootRunId': '95542723-d632-4256-b2de-1ee66a0ddb66',
  'mlflow.runName': 'log_reg_baseline',
  'mlflow.user': 'Jose Luis Alcocer Cáceres',
  'model_name': 'LogisticRegression'}>, info=<RunInfo: artifact_uri='', end_time=1704222633210, experiment_id='bf65e0e6-f77a-49c0-a50f-d7826d5ebffe', lifecycle_stage='active', run_id='95542723-d632-4256-b2de-1ee66a0ddb66', run_name='log_reg_baseline', run_uuid='95542723-d632-4256-b2de-1ee66a0ddb66', start_time=1704222627432, status='FINISHED', user_id='Jose Luis Alcocer Cáceres'>, inputs=<RunInputs: dataset_inputs=[]>>,
 <Run: data=<RunData: metrics={'test_accuracy': 0.8131868131868132,
  'test_auc': 0.7593939393939395,
  'test_precision': 0.64,
  'test_recall': 0.64}, params={'max_depth': '10'}, tags={'mlflow.rootRunId': 'df216edd-113b-4dd3-b028-76708473726b',


In [10]:
for run in runs:
    artifact = client.list_artifacts(run_id=run.info.run_id)[0].path
    print(f"run id: {run.info.run_id}, test_accuracy: {run.data.metrics['test_accuracy']:.4f},  artifact: {artifact}")

run id: 95542723-d632-4256-b2de-1ee66a0ddb66, test_accuracy: 0.8681,  artifact: heart_disease_LogisticRegression
run id: df216edd-113b-4dd3-b028-76708473726b, test_accuracy: 0.8132,  artifact: heart_disease_DecisionTreeClassifier
run id: 26edd519-5513-4bb2-8628-76232614028c, test_accuracy: 0.8571,  artifact: heart_disease_RandomForestClassifier


Registry Random Forest Classifier

In [11]:
run_id = "26edd519-5513-4bb2-8628-76232614028c"
artifact = "heart_disease_RandomForestClassifier"
model_uri = f'runs:/{run_id}/{artifact}'
mlflow.register_model(model_uri=model_uri, name = 'heart_disease_model')

Registered model 'heart_disease_model' already exists. Creating a new version of this model...
2024/01/02 19:16:00 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: heart_disease_model, version 3
Created version '3' of model 'heart_disease_model'.


<ModelVersion: aliases=[], creation_timestamp=1704222960774, current_stage='None', description='', last_updated_timestamp=1704222960774, name='heart_disease_model', run_id='26edd519-5513-4bb2-8628-76232614028c', run_link='', source='azureml://centralus.api.azureml.ms/mlflow/v2.0/subscriptions/fe465efa-7c36-4422-ab58-265e48560849/resourceGroups/machine-learning/providers/Microsoft.MachineLearningServices/workspaces/machine-learning-space/experiments/bf65e0e6-f77a-49c0-a50f-d7826d5ebffe/runs/26edd519-5513-4bb2-8628-76232614028c/artifacts/heart_disease_RandomForestClassifier', status='READY', status_message='', tags={}, user_id='', version='3'>

Change stage to production

In [15]:
model_name = 'heart_disease_model'
model_version = 3
new_stage = 'Production'
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

<ModelVersion: aliases=[], creation_timestamp=1704222960774, current_stage='Production', description='', last_updated_timestamp=1704223236195, name='heart_disease_model', run_id='26edd519-5513-4bb2-8628-76232614028c', run_link='', source='azureml://centralus.api.azureml.ms/mlflow/v2.0/subscriptions/fe465efa-7c36-4422-ab58-265e48560849/resourceGroups/machine-learning/providers/Microsoft.MachineLearningServices/workspaces/machine-learning-space/experiments/bf65e0e6-f77a-49c0-a50f-d7826d5ebffe/runs/26edd519-5513-4bb2-8628-76232614028c/artifacts/heart_disease_RandomForestClassifier', status='READY', status_message='', tags={}, user_id='', version='3'>

In [19]:
model_name = 'heart_disease_model'
version = 3
model_uri = f"models:/{model_name}/{version}"
model=mlflow.sklearn.load_model(model_uri)