# Modeling process C - It is a custom model that needs to be written directly in Gurobi - no machine learning model is applied

## Root folder and read env variables

In [1]:
import os
# Fix the root path so outputs are saved relative to the project root.
# The root is taken to be the parent folder of the current working directory
# (presumably the notebook is launched from a sub-folder of the repo - confirm).
# NOTE: re-running this cell without restarting the kernel moves one more level up.
actual_path = os.path.abspath(os.getcwd())
# os.path.dirname uses the separator of the running OS; splitting on a literal
# '\\' only works on Windows and produces an empty path on Linux/macOS.
root_path = os.path.dirname(actual_path)
# components of the root path, kept for backward compatibility
list_root_path = root_path.split(os.sep)
os.chdir(root_path)
print('root path: ', root_path)

root path:  D:\github-mi-repo\Gurobi-ML-tips-modeling


In [2]:
import os
from dotenv import load_dotenv, find_dotenv  # reads the variables of the .env file from a notebook

# Load the variables of the .env file located by find_dotenv() into the environment
load_dotenv(find_dotenv())

# Copy the environment variables this notebook needs into Python variables
# (an unset PROJECT_GCP falls back to an empty string)
PROJECT_GCP = os.getenv("PROJECT_GCP", "")

## RUN TRAINING

In [3]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
import gcsfs
import pickle

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

# transform
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer

# models
from sklearn.linear_model import LinearRegression # lr
from sklearn.linear_model import Ridge # ridge
from sklearn.linear_model import Lasso # lasso
from sklearn.tree import DecisionTreeRegressor # tree
from sklearn.ensemble import GradientBoostingRegressor #gb
from sklearn.ensemble import RandomForestRegressor #rf
#from xgboost import XGBRegressor # xgb
from  sklearn.neural_network import MLPRegressor # mlp

### 0. Define name process

In [4]:
# Identifier of the process handled by this notebook; it is interpolated into
# the artifact and config paths built in the cells below.
name_process = 'process_c'

### 1. Read data

In [5]:
# Locations of the pickled train/test splits of this process
path_X_train = f'artifacts/data_training/{name_process}/X_train.pkl'
path_y_train = f'artifacts/data_training/{name_process}/y_train.pkl'
path_X_test = f'artifacts/data_training/{name_process}/X_test.pkl'
path_y_test = f'artifacts/data_training/{name_process}/y_test.pkl'

# NOTE: read_pickle unpickles the file content, so only load artifacts
# produced by this project.

# --- train split: features, then target
X_train = pd.read_pickle(path_X_train)
y_train = pd.read_pickle(path_y_train)

# --- test split: features, then target
X_test = pd.read_pickle(path_X_test)
y_test = pd.read_pickle(path_y_test)

In [6]:
# Report the dimensions of every split, grouped under a header per split
print('shape data')
shape_report = (
    ('\n --- TRAIN ---', (('X_train: ', X_train), ('y_train: ', y_train))),
    ('\n --- TEST ---', (('X_test: ', X_test), ('y_test: ', y_test))),
)
for section_header, labelled_frames in shape_report:
    print(section_header)
    for label, frame in labelled_frames:
        print(label, frame.shape)

shape data

 --- TRAIN ---
X_train:  (800, 2)
y_train:  (800, 1)

 --- TEST ---
X_test:  (200, 2)
y_test:  (200, 1)


### 2. Read master tags data for this process. Sort the features used to train according to this order

In [7]:
### read master table - list of tags (features and target) of this process
# NOTE(review): the `_d0eop` suffix of the path variable looks like a leftover from
# another process notebook - confirm before renaming.
path_maestro_tags_d0eop = f'config/config_ml_models_development/MasterTable_{name_process}.xlsx'
maestro_tags = pd.read_excel(path_maestro_tags_d0eop)
maestro_tags

Unnamed: 0,TAG,FEATURES_NAMES,DESCRIPCION,CLASIFICACION_NAME,CLASIFICACION,USE_ACTUAL_MODEL
0,X3,X3,Variable de entrada al proceso C. Aparece por ...,Primary,P,PR_C_Y2
1,O7,O7,Variable de entrada al proceso C. No es una va...,Observed,O,PR_C_Y2
2,Y2,Y2,Variable target del proceso B y proceso C (y v...,Target,T,PR_C_Y2


### 2. Define target according to the master table
The master table lists the features and the target. It may contain many features, and different models could be trained with different subsets of them. The list defined below contains the target used in the training, which will be saved as an output of the training.

In [8]:
# Tag of the target variable; it is matched against the TAG column of the
# master table when the table is filtered before being saved.
list_target = ['Y2']
list_target

['Y2']

### 3. Define features and sort them according to the master table
The master table lists the features and the target. It may contain many features, and different models could be trained with different subsets of them. The list defined below contains the features used in the training, which will be saved as an output of the training.

In [9]:
# manually set the list of features used in training
list_features = ['X3', 'O7']

In [10]:
### re-order the selected features so they follow the row order of the master table
# (tags of the master table that were not selected are skipped)
features_in_master_order = []
for tag in maestro_tags['TAG'].tolist():
    if tag in list_features:
        features_in_master_order.append(tag)
list_features = features_in_master_order
list_features

['X3', 'O7']

# Modeling process C - It is a custom model that needs to be written directly in Gurobi - no machine learning model is applied

In [11]:
# parameters alpha: fixed coefficients of the custom formula
# (alpha_feature_1 multiplies X3 and alpha_feature_2 multiplies O7 in the formula cell below)
alpha_feature_1 = 1/5
alpha_feature_2 = 15

In [12]:
# Evaluate the custom formula on the training features:
#   target = alpha_feature_1 * X3 + alpha_feature_2 * O7
contribution_x3 = X_train['X3'] * alpha_feature_1
contribution_o7 = X_train['O7'] * alpha_feature_2
target_predicted = contribution_x3 + contribution_o7
target_predicted

29     121.268603
535    142.448910
695    107.708068
557    126.989957
836    134.573703
          ...    
106    121.717527
270    131.115932
860    114.289161
435    149.389299
102    138.630305
Length: 800, dtype: float64

In [13]:
# Display the observed training target so it can be compared with the formula output
y_train

Unnamed: 0,Y2
29,121.268603
535,142.448910
695,107.708068
557,126.989957
836,134.573703
...,...
106,121.717527
270,131.115932
860,114.289161
435,149.389299


In [14]:
from sklearn.metrics import r2_score
# r2_score expects (y_true, y_pred): the observed target goes first and the
# values produced by the formula second. R2 is not symmetric, so the swapped
# order only returns the right number when the fit is perfect.
# r2 = 1 if the values generated using the formula are equal to the target. In this
# example r2 is always 1 because the data was generated to achieve that.
r2_score(y_train, target_predicted)

1.0

## SAVE OUTPUTS TRAINING
Save the map even though no machine learning model was trained; the features and target of the formula model are mapped as well.

### 1. Save artifact model. In this case there is no ML model. The factor of each feature is saved in an Excel file instead of a pkl model

In [15]:
#### generate the table (later saved as excel) with the formula of the custom model
# Pair every factor with its feature by NAME. `list_features` is re-ordered to follow
# the master table, so assigning the factors by position would silently swap them
# whenever the master table lists O7 before X3.
factor_by_feature = {'X3': alpha_feature_1, 'O7': alpha_feature_2}
artifact_model = pd.DataFrame({
    'feature_name': list_features,  # features, in master-table order
    'factor_model': [factor_by_feature[feature] for feature in list_features],  # factor of each feature
})
artifact_model

Unnamed: 0,feature_name,factor_model
0,X3,0.2
1,O7,15.0


In [16]:
# save the factor table as excel; index = False keeps the row index out of the file
path_model = f'artifacts/models/{name_process}/model.xlsx'
artifact_model.to_excel(path_model, index = False)

### 2. Save list of features
Save the master tag table with only the tags used to train the model.

IMPORTANT NOTE: remember that the list of features was sorted according to the master table, so this order was used to train. This table also differentiates between non-controllable, controllable and target variables

In [17]:
# generate a list of features + target
list_features_target = list_features + list_target

# keep only the rows of the master table whose TAG is a feature or the target of the model
is_used_tag = maestro_tags['TAG'].isin(list_features_target)
# reset_index(drop=True) renumbers the rows directly; going through reset_index() and
# dropping a column named 'index' breaks if the index is named or an 'index' column exists
maestro_tags = maestro_tags[is_used_tag].reset_index(drop=True)

In [18]:
# save the filtered master table in the config folder that will be used to create the optimization engine
# (index = False keeps the row index out of the file)
path_list_features_target_to_optimization = f'config/optimization_engine/ml_models/MasterTable_{name_process}.xlsx'
maestro_tags.to_excel(path_list_features_target_to_optimization, index = False)

### 3. Save example input

In [19]:
# example input: the first row of the training features, kept as a one-row DataFrame
example_input = X_train.head(1)
example_input

Unnamed: 0,X3,O7
29,2.939042,8.045386


In [20]:
# save example input
# NOTE(review): unlike the other to_excel calls in this notebook, index = False is not
# passed here, so the row label of the example is written as an extra first column -
# confirm that the consumer of this file expects it.

path_example_input_ml_model = f'config/optimization_engine/ml_models/{name_process}-example-input-model.xlsx'
example_input.to_excel(path_example_input_ml_model)