# Modeling process A - Y1

## Root folder and read env variables

In [None]:
import os

# Move the working directory one level up so that the relative paths used to
# read inputs and save outputs resolve from that folder.
# NOTE(review): assumes the notebook is launched from a direct sub-folder of
# the project root — confirm.
actual_path = os.path.abspath(os.getcwd())
# os.path.dirname is separator-agnostic. Splitting on '\\' only works on
# Windows: on POSIX it produced an empty path and os.chdir('') failed.
root_path = os.path.dirname(actual_path)
# kept for backward compatibility: the components of the root path
list_root_path = root_path.split(os.sep)
os.chdir(root_path)
print('root path: ', root_path)

In [None]:
import os
from dotenv import find_dotenv, load_dotenv  # package used in jupyter notebook to read the variables in file .env

# Load the variables declared in the .env file located by find_dotenv()
load_dotenv(find_dotenv())

# Read the env variable and keep it as a python variable
# (falls back to an empty string when it is not defined)
PROJECT_GCP = os.getenv("PROJECT_GCP", "")

## RUN TRAINING

In [None]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
import gcsfs
import pickle

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

# transform
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer

# models
from sklearn.linear_model import LinearRegression # lr
from sklearn.linear_model import Ridge # ridge
from sklearn.linear_model import Lasso # lasso
from sklearn.tree import DecisionTreeRegressor # tree
from sklearn.ensemble import GradientBoostingRegressor #gb
from sklearn.ensemble import RandomForestRegressor #rf
#from xgboost import XGBRegressor # xgb
from  sklearn.neural_network import MLPRegressor # mlp

### 0. Define name process

In [None]:
# Name of the process being modeled; it is interpolated into the data, config and model paths
name_process = "process_a"

### 1. Read data

In [None]:
# --- paths of the pickled train/test splits of this process ---
path_X_train = f'artifacts/data_training/{name_process}/X_train.pkl'
path_y_train = f'artifacts/data_training/{name_process}/y_train.pkl'
path_X_test = f'artifacts/data_training/{name_process}/X_test.pkl'
path_y_test = f'artifacts/data_training/{name_process}/y_test.pkl'

# --- train split (features, then target) ---
X_train, y_train = (
    pd.read_pickle(path_split) for path_split in (path_X_train, path_y_train)
)

# --- test split (features, then target) ---
X_test, y_test = (
    pd.read_pickle(path_split) for path_split in (path_X_test, path_y_test)
)

In [None]:
# Report the shape of every split, grouped by train / test
print('shape data')
for split_title, split_items in (
    ('TRAIN', (('X_train', X_train), ('y_train', y_train))),
    ('TEST', (('X_test', X_test), ('y_test', y_test))),
):
    print(f'\n --- {split_title} ---')
    for item_label, item_data in split_items:
        print(f'{item_label}: ', item_data.shape)

### 2. Read master tags data for this process. Sort the features used to train according to this order

In [None]:
# Master table of this process (list of tags), read from the development config folder
path_maestro_tags_d0eop = f'config/config_ml_models_development/MasterTable_{name_process}.xlsx'
maestro_tags = pd.read_excel(io=path_maestro_tags_d0eop)
maestro_tags  # last expression of the cell: shows the table in the notebook output

### 2. Define target according to the master table
The master table lists the features and the target. It may contain many features, so different models can be trained with different subsets of them. The list defined below contains the target used in training, which is saved as an output of training.

In [None]:
# Target column(s) of the model
list_target = ["Y1"]
list_target

### 3. Define features and sort them according to the master table
The master table lists the features and the target. It may contain many features, so different models can be trained with different subsets of them. The list defined below contains the features used in training, which are saved as an output of training.

In [None]:
# manually set the list of features used in training
list_features = [
    "O1",
    "O2",
    "O3",
    "X1",
]

In [None]:
# Keep only the selected features, in the order in which their tags appear in the master table
list_features = maestro_tags.loc[maestro_tags['TAG'].isin(list_features), 'TAG'].tolist()
list_features

### 6. Train a model. Scaling + Linear Regression (the column transformation and Gradient Boosting are defined but currently disabled)

In [None]:
# instantiate the linear regression estimator (it is fitted later, as the last step of the pipeline)
lr_model = LinearRegression()

In [None]:
# --- hyperparameters ---
param_n_estimators = 5

# --- candidate steps (defined here, but not plugged into the pipeline below) ---
# log1p applied to column "O1"; the other columns are left unchanged
transformer_log = make_column_transformer(
    (FunctionTransformer(np.log1p), ["O1"]),
    remainder='passthrough',
)

# gradient boosting regressor (the `max_depth = 2` setting is disabled)
gb_simple_model = GradientBoostingRegressor(
    random_state=42,
    n_estimators=param_n_estimators,
    min_samples_split=0.2,
    min_samples_leaf=0.1,
)

# --- pipeline actually trained: standard scaling followed by the linear regression ---
model = Pipeline(
    steps=[
        # ('log feature_o1', transformer_log),
        ('scaler', StandardScaler()),  # minmax scaler its not supported by gurobi
        # ('gb_simple', gb_simple_model),
        ('lr', lr_model),
    ]
)

# NOTE(review): the fit uses every column of X_train; list_features is not applied
# here — confirm X_train already holds exactly those columns, in that order.
model.fit(X_train, y_train)

In [None]:
# r2 score of the fitted pipeline on each split, as returned by model.score
r2_train, r2_test = (
    model.score(split_features, split_target)
    for split_features, split_target in ((X_train, y_train), (X_test, y_test))
)

for score_label, score_value in (('r2_train', r2_train), ('r2_test', r2_test)):
    print(f'{score_label}: ', score_value)

## SAVE OUTPUTS TRAINING

Al terminar el entrenamiento, se deben generar los siguientes outputs:

----
#### Artefacto Analitico:
- **modelo entrenado** y guardado como pkl

----
#### Listado de features:
- **listado de features** (listado de todas las features que ve el modelo)

- **listado de features variables controlables** (listado de todas las features que ve el modelo y que son variables controlables y por lo tanto
variables de decisión en un modelo de optimización)

- **listado de target** (lista con el target del modelo)


----
#### Example Input:
- **X_train.head(1)**: se necesita conocer el orden de las features utilizadas y los nombres de las columnas; ambos deben respetarse. Aunque esto se puede deducir del listado de features, de todas formas se guarda un ejemplo de la instancia de entrenamiento X

### 1. Save artifact model

In [None]:
# save the fitted pipeline as a pickle artifact
path_model = f'artifacts/models/{name_process}/model.pkl'
# create the destination folder when it does not exist yet; open() would fail otherwise
os.makedirs(os.path.dirname(path_model), exist_ok=True)
# the with-statement closes the file on exit, so no explicit close() is needed
with open(path_model, "wb") as output:
    pickle.dump(model, output)

### 2. Save list of features
Save table master tag only with the tags used to train the model. 

IMPORTANT: remember that the list of features was sorted according to the master table, so this is the order used to train. This table also differentiates between non-controllable, controllable and target variables

In [None]:
# features followed by the target: every tag that must be kept in the master table
list_features_target = list_features + list_target

# keep only the master-table rows whose tag was used to train the ml model
maestro_tags = maestro_tags[maestro_tags['TAG'].isin(list_features_target)]
# renumber the rows from 0; drop=True discards the old index directly instead of
# turning it into an 'index' column and dropping that column afterwards (which
# breaks if the index is named or a column called 'index' already exists)
maestro_tags = maestro_tags.reset_index(drop=True)

In [None]:
# Write the master table into the config folder that will be used to create the optimization engine
path_list_features_target_to_optimization = (
    f'config/optimization_engine/ml_models/MasterTable_{name_process}.xlsx'
)
maestro_tags.to_excel(path_list_features_target_to_optimization, index=False)

### 3. Save example input

In [None]:
# first training row, kept as a sample of the column names and their order
example_input = X_train.iloc[:1]
example_input

In [None]:
# Persist the example input in the optimization-engine config folder
path_example_input_ml_model = (
    f'config/optimization_engine/ml_models/{name_process}-example-input-model.xlsx'
)
# NOTE(review): no index argument is passed, so the pandas default applies and the
# index is presumably written as an extra column — confirm that readers expect it.
example_input.to_excel(excel_writer=path_example_input_ml_model)