# Model Training Pipeline

<img src="img/mlops_training_data_pipeline.png" alt="Model training pipeline" width="1000" />

Our training should run periodically or when triggered by the arrival of a new dataset. The main output of the training project is a new model, which is registered in the Model Registry together with its metadata.

The pipeline steps are:
- **Train**
- **Evaluate**
- **Register**
    
With these three distinct phases, we make the model training process reproducible and visible, with a clear separation between its steps.

In [None]:
import os

from data_utils import get_train_test_split_for_stock
from config import *

In [None]:
#! conda update -n base conda

In [None]:
# Show the notebook's current working directory
print(os.getcwd())

In [None]:
# Make sure the training folder exists before switching into it:
# os.chdir raises FileNotFoundError when the directory is missing.
os.makedirs(PATH_TO_TRAINING, exist_ok=True)
os.chdir(PATH_TO_TRAINING)
print(os.getcwd())

In [None]:
from pathlib import Path

# Create the training folder (and any missing parent folders) if it does not exist yet
Path(PATH_TO_TRAINING).mkdir(parents=True, exist_ok=True)

## MLflow Project

In [None]:
# File name of the MLflow project definition; the next cell writes it with %%writefile
_mlproject = "MLProject"

In [None]:
%%writefile {_mlproject}

# MLflow project definition for the model training pipeline.
name: model_training_pipeline

# Conda environment file used by the project.
conda_env: 
    conda.yaml

entry_points:

  # Runs main.py.
  main:
    command: "python main.py"

  # Runs train_model.py.
  train_model:
    command: "python train_model.py"

  # Runs evaluate_model.py.
  evaluate_model:
    command: "python evaluate_model.py"

  # Runs register_model.py, passing the model URI as its only argument.
  register_model:
    parameters:
      model_uri: string
    command: "python register_model.py {model_uri}"

In [None]:
# File name of the conda environment specification; the next cell writes it with %%writefile
_conda = "conda.yaml"

In [None]:
%%writefile {_conda}

# Conda environment referenced by the MLflow project file.
name: pystock-data-features
channels:
    - defaults
dependencies:
    - python=3.8
    - numpy
    - scipy
    - pandas
    - cloudpickle
    - pip
    - pip:
        - git+https://github.com/mlflow/mlflow
        # The PyPI distribution is "scikit-learn"; the "sklearn" alias is
        # deprecated and its installation fails.
        - scikit-learn
        - pandas_datareader
        - great-expectations
        - pandas-profiling
        - xgboost

In [None]:
# File name of the pipeline orchestration script; the next cell writes it with %%writefile
_main = "main.py"

In [None]:
%%writefile {_main}

import mlflow
import os


def _run(entrypoint,
         parameters=None,
         source_version=None,
         use_cache=True):
    """Launch one entry point of the MLflow project in the current directory.

    Args:
        entrypoint: Name of the project entry point to launch.
        parameters: Optional mapping of parameter names to values passed to
            the entry point. ``None`` (the default) means no parameters.
        source_version: Currently unused; kept so existing callers still work.
        use_cache: Currently unused; kept so existing callers still work.

    Returns:
        The run object fetched through ``MlflowClient.get_run`` for the
        submitted run.
    """
    # Build a fresh dict per call instead of sharing one mutable default
    # object between all calls.
    if parameters is None:
        parameters = {}

    print("---------------------")
    print(f"Launching new run for entrypoint={entrypoint} and parameters={parameters}")
    submitted_run = mlflow.run(".", entrypoint, parameters=parameters)
    return mlflow.tracking.MlflowClient().get_run(submitted_run.run_id)


def workflow():
    """Run the train, evaluate and register entry points in sequence.

    All three steps are launched from inside a single "model-training"
    MLflow run context. The model URI derived from the training run is
    handed to the registration step.
    """
    pipeline_name = "model-training"

    with mlflow.start_run(run_name=pipeline_name):
        mlflow.set_tag("mlflow.runName", pipeline_name)

        # Step 1: train, then derive the model location from the run's artifacts.
        trained = _run("train_model")
        model_uri = os.path.join(trained.info.artifact_uri, "model")
        print(f"SUCCESS: Model {model_uri} trained.")

        # Step 2: evaluate.
        evaluated = _run("evaluate_model")
        print(f"SUCCESS: Model {evaluated} validated.")

        # Step 3: register the trained model.
        registered = _run("register_model", parameters={'model_uri': model_uri})
        print(f"SUCCESS: Model {registered} registered.")
        
        
# Start the pipeline only when this file is executed as a script.
if __name__=="__main__":
    
    workflow()
    

## Build model
Load the training data to fit and produce a model. Test predictions will be produced and persisted in the environment so that other steps of the workflow can use the data to evaluate the model.

In [None]:
# File name of the training script; the next cell writes it with %%writefile
_train_model = "train_model.py"

In [None]:
%%writefile {_train_model}

import sys
from pathlib import Path
PATH_TO_CONFIG = "/home/ksatola/work/src"
sys.path.insert(1, PATH_TO_CONFIG)


import os
import pandas as pd
import xgboost as xgb
import mlflow
import mlflow.xgboost
from sklearn.model_selection import train_test_split
from config import *


def train_test_split_df(
    pandas_df,
    t_size=SPLIT_RATIO,
    r_tate=RANDOM_STATE):
    """
    Split a data frame into train/test features and targets.

    The last column of ``pandas_df`` is taken as the target; all other
    columns are taken as features.

    Args:
        pandas_df: Data frame whose last column holds the target.
        t_size: Value forwarded to ``train_test_split`` as ``test_size``.
        r_tate: Value forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features = pandas_df.iloc[:, :-1]
    target = pandas_df.iloc[:, -1]

    features_train, features_test, target_train, target_test = train_test_split(
        features,
        target,
        test_size=t_size,
        random_state=r_tate,
    )
    return features_train, features_test, target_train, target_test


if __name__ == "__main__":

    # Enable MLflow autologging for XGBoost before training starts.
    mlflow.xgboost.autolog()

    with mlflow.start_run(run_name="train_model") as run:

        mlflow.set_tag("mlflow.runName", "train_model")

        # Training data produced by the data pipeline.
        df = pd.read_csv(os.path.join(PATH_TO_DATA_PIPELINE, "training", "data.csv"))

        X_train, X_test, y_train, y_test = train_test_split_df(df)

        train_data = xgb.DMatrix(X_train, label=y_train)
        test_data = xgb.DMatrix(X_test)

        model = xgb.train(dtrain=train_data, params={})

        # Threshold the raw scores into class labels. Both labels are
        # integers so the y_pred column always has the same dtype,
        # whatever the mix of predictions.
        y_hats = model.predict(test_data)
        y_hats = [1 if y_hat > CLASS_THRESHOLD else 0 for y_hat in y_hats]

        test_prediction_results = pd.DataFrame(data={
            'y_pred': y_hats,
            'y_test': y_test
        })

        print(test_prediction_results)

        # Create the predictions folder if it does not exist yet.
        Path(PATH_TO_TRAINING, "predictions").mkdir(parents=True, exist_ok=True)

        # Persist the test predictions as a CSV file in the predictions folder.
        test_prediction_results.to_csv(os.path.join(PATH_TO_TRAINING, "predictions", "test_predictions.csv"))
        

### Produce model performance metrics

At this stage, our model is saved and persisted as an artifact in our MLflow installation. We will now collect evaluation metrics for the model and add them to its metadata.

In [None]:
# File name of the evaluation script; the next cell writes it with %%writefile
_evaluate_model = "evaluate_model.py"

In [None]:
%%writefile {_evaluate_model}

import sys
PATH_TO_CONFIG = "/home/ksatola/work/src"
sys.path.insert(1, PATH_TO_CONFIG)

import os
import pandas as pd
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    auc,
    average_precision_score,
    balanced_accuracy_score,
    f1_score,
    fbeta_score,
    hamming_loss,
    jaccard_score,
    log_loss,
    matthews_corrcoef,
    precision_score,
    recall_score,
    zero_one_loss,
)
from config import *


def classification_metrics(df: pd.DataFrame):
    """Compute classification metrics from a frame of test predictions.

    Args:
        df: Data frame with a ``y_test`` column (ground-truth labels) and a
            ``y_pred`` column (predicted labels).

    Returns:
        Dict mapping each metric name to its computed value.
    """
    # scikit-learn metric functions expect the ground truth first and the
    # predictions (or scores) second. The order matters for asymmetric
    # metrics such as average_precision_score and log_loss.
    y_true = df["y_test"]
    y_pred = df["y_pred"]

    metrics = {}
    metrics["accuracy_score"] = accuracy_score(y_true, y_pred)
    metrics["average_precision_score"] = average_precision_score(y_true, y_pred, average='micro')
    metrics["f1_score"] = f1_score(y_true, y_pred, average='micro')
    metrics["jaccard_score"] = jaccard_score(y_true, y_pred, average='micro')
    metrics["log_loss"] = log_loss(y_true, y_pred)
    metrics["matthews_corrcoef"] = matthews_corrcoef(y_true, y_pred)
    metrics["precision_score"] = precision_score(y_true, y_pred, average='micro')
    metrics["recall_score"] = recall_score(y_true, y_pred, average='micro')
    metrics["zero_one_loss"] = zero_one_loss(y_true, y_pred)
    return metrics
    

if __name__ == "__main__":

    with mlflow.start_run(run_name="evaluate_model"):
        mlflow.set_tag("mlflow.runName", "evaluate_model")

        # Load the test predictions CSV from the predictions folder.
        predictions_csv = os.path.join(PATH_TO_TRAINING, "predictions", "test_predictions.csv")
        predictions = pd.read_csv(predictions_csv)

        # Compute the metrics and log them to the active run.
        mlflow.log_metrics(classification_metrics(predictions))
        

### Register model

Register the model in the Model Registry. A model will be created if it doesn't already exist. If it's already in the registry, a new version will be added, allowing the deployment tools to look at the models and trace the training jobs and metrics. Keeping this step separate allows a decision to be made about whether or not to promote the model to production.

In [None]:
# File name of the registration script; the next cell writes it with %%writefile
_register_model = "register_model.py"

In [None]:
%%writefile {_register_model}

import mlflow
import sys

if __name__ == "__main__":

    # The model URI arrives as the first command-line argument.
    model_uri = sys.argv[1]
    registered_name = "SP_XGBoost"

    with mlflow.start_run(run_name="register_model"):
        mlflow.set_tag("mlflow.runName", "register_model")

        # Register the model under the fixed registry name.
        mlflow.register_model(model_uri, registered_name)
        

## Run training pipeline

In [None]:
# In the Jupyter terminal
# Change to the project folder, then run the MLflow project in the current
# directory under the "SP_Model_Training" experiment.
cd /home/ksatola/work/data/training_pipeline

mlflow run . --experiment-name="SP_Model_Training"