# 05a - Modelling with mlflow

__Goal__: Add `mlflow` to a simplified version of notebook `04a`.

### Import

In [1]:
%load_ext autoreload
%autoreload 2
import joblib
import pandas as pd
import warnings 
warnings.filterwarnings('ignore')
from pathlib import Path
from pprint import pprint

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier

from weather.transformers.skl_transformer_makers import (
    FeatureNames,
    TargetChoice,
    make_dataset_ingestion_transformer,
    make_target_creation_transformer,
    make_remove_horizonless_rows_transformer, 
    make_predictors_feature_engineering_transformer,
)
from weather.data.prep_datasets import (
    prepare_binary_classification_tabular_data, 
    transform_dataset_and_create_target,
)
from weather.helpers.utils import camel_to_snake
from weather.models.skl_train_models import (
    #score_evaluation,
    score_evaluation_dict, #TODO: remove  it
    #print_score_dict_results,
    #confusion_matrix_evaluation,
    #confusion_matrix_display,
)

from weather.models.skl_tracked_train_models import Experiment
from weather.models.skl_tracked_train_models import train_and_evaluate_with_tracking
from weather.models.skl_train_models import train_and_evaluate

### Set the directory paths

In [2]:
# Both folders are resolved one level above the current working directory.
project_root = Path.cwd().parent
data_dir = project_root / "data"
models_dir = project_root / "models"

# Create the models folder if it is missing; an existing folder is left as is.
models_dir.mkdir(exist_ok=True)

# Prepare the `dataset` for modeling

### Select the predictors and set the target

In [3]:
# Predictors: only numerical columns are selected here.
numerical_predictors = [
    "Temperature",
    "Humidity",
    "Wind_speed",
    "Wind_bearing",
    "Visibility",
    "Pressure",
]
categorical_predictors = []  # Add or remove "Weather", "Month" to the predictors

feature_names = FeatureNames(
    numerical=numerical_predictors,
    categorical=categorical_predictors,
)

# Target: "Weather" within 4 hours
target_name = "Weather"
horizon = 4
target_choice = TargetChoice(target_name, horizon)

### Set the dataset transformers

In [4]:
# Raw CSV column name -> shorter column name used for predictors and target.
oldnames_newnames_dict = {
    "Temperature_C": "Temperature",
    "Apparent_Temperature_C": "Apparent_temperature",
    "Wind_speed_kmph": "Wind_speed",
    "Wind_bearing_degrees": "Wind_bearing",
    "Visibility_km": "Visibility",
    "Pressure_millibars": "Pressure",
    "Weather_conditions": "Weather",
}

# Build the four transformers from the target choice (and, where needed,
# the rename map or the selected feature names).
dataset_ingestion_transformer = make_dataset_ingestion_transformer(
    target_choice,
    oldnames_newnames_dict,
)
remove_horizonless_rows_transformer = make_remove_horizonless_rows_transformer(
    target_choice,
)
target_creation_transformer = make_target_creation_transformer(
    target_choice,
)
predictors_feature_engineering_transformer = make_predictors_feature_engineering_transformer(
    feature_names,
    target_choice,
)

### Read the data

In [5]:
# Load the raw development dataset from `data_dir` and preview its first row.
raw_dataset_path = data_dir / 'weather_dataset_raw_development.csv'
df = pd.read_csv(raw_dataset_path)
df.head(1)

Unnamed: 0,S_No,Timestamp,Location,Temperature_C,Apparent_Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_conditions
0,2881,2006-01-01 00:00:00+00:00,"Port of Turku, Finland",1.161111,-3.238889,0.85,16.6152,139,9.9015,1016.15,rain


### Transform the dataset and split it

In [6]:
# Apply, in the order passed: "dataset_ingestion_transformer",
# "remove_horizonless_rows_transformer", "target_creation_transformer"
transformed_data, created_target = transform_dataset_and_create_target(
    df, dataset_ingestion_transformer, remove_horizonless_rows_transformer, target_creation_transformer
)

# Split the dataset
dataset = prepare_binary_classification_tabular_data(transformed_data, created_target)

### Define candidate models

In [7]:
# Single seed shared by every candidate estimator below.
random_state = 1234

# Candidate estimators keyed by a display name; each entry stores its configured
# instance under the "model" key. All estimators receive the shared `random_state`
# (previously SVC hard-coded 0 and LogisticRegression had no seed).
models = {
    "DecisionTree": {
        "model": DecisionTreeClassifier(max_depth=4, random_state=random_state),
    },
    "LinearSvc": {
        "model": LinearSVC(max_iter=10_000, random_state=random_state),
    },
    "LogisticRegression": {
        "model": LogisticRegression(random_state=random_state),
    },
    "RandomForest": {
        "model": RandomForestClassifier(max_depth=4, random_state=random_state),
        #"param_grid": {"model__n_estimators": [5, 10], "model__max_depth": [None, 5, 10]},
    },
    "SvcWithRbfKernel": {
        "model": SVC(kernel="rbf", gamma=0.7, random_state=random_state),
    },
}

# 1. Modeling without mlflow

In [8]:
# Untracked baseline, currently disabled: the three lines below are commented out,
# so this cell does nothing when executed. Uncomment them to call `train_and_evaluate`
# on the classifier classes listed here and pretty-print the value it returns.
# classifiers_list = [LogisticRegression, LinearSVC, DecisionTreeClassifier, RandomForestClassifier]
# results = train_and_evaluate(dataset, predictors_feature_engineering_transformer, classifiers_list, f1_score)
# pprint(results, sort_dicts=False)

# 2. Modeling with mlflow

__WARNING__: If the (currently commented-out) cell of section `1.` is run first, `train_and_evaluate()` will already have called `.fit()` on the pipeline `predictors_feature_engineering_transformer`. In that case `.fit()` is redundantly called a second time here, within `train_and_evaluate_with_tracking()`, but this does not affect the result.

In [9]:
# URI and experiment name handed to the project's `Experiment` helper.
MLFlow_URI = "http://127.0.0.1:5000"
experiment_name = "test_of_four_classifiers_bis"
experiment = Experiment(MLFlow_URI, experiment_name)

In [11]:
# Classifier classes (not instances) handed to the tracked training helper,
# together with `f1_score` and the `experiment` created above.
classifiers_list = [LogisticRegression, LinearSVC, DecisionTreeClassifier, RandomForestClassifier]
train_and_evaluate_with_tracking(dataset, predictors_feature_engineering_transformer, classifiers_list, f1_score, experiment)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

