# Modeling process C - A custom model that needs to be written directly in Gurobi - no machine learning model is applied

## Root folder and read env variables

In [1]:
import os

# Make the parent of the current working directory the working directory, so
# that the relative paths used later ('artifacts/...', 'config/...') resolve.
#
# os.path.dirname is used instead of splitting the path on '\\': the manual
# split only works with Windows separators; on Linux/macOS it produced an empty
# string and os.chdir('') raised.
#
# NOTE: this cell is not idempotent - every re-run without a kernel restart
# moves the working directory one more level up.
actual_path = os.path.abspath(os.getcwd())
root_path = os.path.dirname(actual_path)
# Kept so that any later code relying on the list of path components still works.
list_root_path = root_path.split(os.sep)
os.chdir(root_path)
print('root path: ', root_path)

root path:  D:\github-mi-repo\Gurobi-ML-tips-modeling


In [2]:
import os
from dotenv import load_dotenv, find_dotenv  # package used in jupyter notebook to read the variables in file .env

# Locate the .env file and load its entries into the process environment.
dotenv_file = find_dotenv()
load_dotenv(dotenv_file)

# Read the environment variable into a Python variable (empty string when it is not set).
PROJECT_GCP = os.getenv("PROJECT_GCP", "")

## RUN TRAINING

In [3]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
import gcsfs
import pickle

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

# transform
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer

# models
from sklearn.linear_model import LinearRegression # lr
from sklearn.linear_model import Ridge # ridge
from sklearn.linear_model import Lasso # lasso
from sklearn.tree import DecisionTreeRegressor # tree
from sklearn.ensemble import GradientBoostingRegressor #gb
from sklearn.ensemble import RandomForestRegressor #rf
#from xgboost import XGBRegressor # xgb
from  sklearn.neural_network import MLPRegressor # mlp

### 0. Define name process

In [4]:
# Identifier of the process modelled in this notebook; it is interpolated into
# the artifact and config paths used by the cells below.
name_process = "process_c"

### 1. Read data

In [5]:
# Locations of the pickled train/test splits for the current process.
path_X_train = f'artifacts/data_training/{name_process}/X_train.pkl'
path_y_train = f'artifacts/data_training/{name_process}/y_train.pkl'
path_X_test = f'artifacts/data_training/{name_process}/X_test.pkl'
path_y_test = f'artifacts/data_training/{name_process}/y_test.pkl'

# Load the four artifacts in order: train features, train target,
# test features, test target.
X_train, y_train, X_test, y_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in (path_X_train, path_y_train, path_X_test, path_y_test)
)

In [6]:
# Report the dimensions of every loaded split, grouped by train / test.
print('shape data')
for header, entries in (
    ('\n --- TRAIN ---', (('X_train: ', X_train), ('y_train: ', y_train))),
    ('\n --- TEST ---', (('X_test: ', X_test), ('y_test: ', y_test))),
):
    print(header)
    for label, dataset in entries:
        print(label, dataset.shape)

shape data

 --- TRAIN ---
X_train:  (800, 2)
y_train:  (800,)

 --- TEST ---
X_test:  (200, 2)
y_test:  (200,)


### 2. Read the master tags table for this process. The features used in training are sorted according to its order

In [7]:
### read master table - list tags
# Path of the Excel master table for the current process (built from name_process).
# NOTE(review): the "_d0eop" suffix of this variable name looks carried over from
# another notebook - confirm before renaming.
path_maestro_tags_d0eop = f'config/config_ml_models_development/MasterTable_{name_process}.xlsx'
maestro_tags = pd.read_excel(path_maestro_tags_d0eop)
# Last expression of the cell: renders the master table.
maestro_tags

Unnamed: 0,TAG,FEATURES_NAMES,DESCRIPCION,CLASIFICACION_NAME,CLASIFICACION,USE_ACTUAL_MODEL
0,X3,X3,Variable de entrada al proceso C. Aparece por ...,Primary,P,MLC
1,O6,O6,Variable de entrada al proceso C. No es una va...,Observed,O,MLC
2,Y2,Y2,Variable target del proceso B y proceso C (y v...,Target,T,MLC


### 2. Define the target according to the master table
The master table lists all the features and targets. It may contain many variables, and different models could be trained with different subsets of them. The list defined below contains the target used in this training, which will be saved as an output of the training.

In [8]:
# Target column(s) of this process, kept as a list.
list_target = ["Y2"]
list_target

['Y2']

### 3. Define the features and sort them according to the master table
The master table lists all the features and targets. It may contain many features, and different models could be trained with different subsets of them. The list defined below contains the features used in this training, which will be saved as an output of the training.

In [9]:
# Manually chosen features used in training (re-ordered in the next cell).
list_features = ["X3", "O6"]

In [10]:
### Re-order the selected features so they follow the row order of the master table.
# Tags of the master table that were not selected are skipped; selected names
# that do not appear in the master table are dropped from the result.
master_tag_order = maestro_tags['TAG'].tolist()
selected_features = list_features

list_features = []
for tag in master_tag_order:
    if tag in selected_features:
        list_features.append(tag)
list_features

['X3', 'O6']

# Modeling process C - A custom model that needs to be written directly in Gurobi - no machine learning model is applied

In [11]:
# Alpha parameters: the coefficients multiplying each feature in the
# custom (non-ML) formula used for this process.
alpha_feature_1, alpha_feature_2 = 1 / 5, 15

In [12]:
# Custom model of this process: the target is a fixed linear combination of the
# two features, computed directly instead of being learned by an ML model.
# The second column key is 'O6' (capital letter O), as in list_features and the
# master table; the previous key '06' (digit zero) does not exist in X_train.
target_predicted = alpha_feature_1 * X_train['X3'] + alpha_feature_2 * X_train['O6']
target_predicted

29     54.601936
535    75.782243
695    41.041401
557    60.323291
836    67.907037
         ...    
106    55.050860
270    64.449265
860    47.622495
435    82.722632
102    71.963638
Length: 800, dtype: float64

In [13]:
# Display the training target so it can be compared with target_predicted.
y_train

29     54.601936
535    75.782243
695    41.041401
557    60.323291
836    67.907037
         ...    
106    55.050860
270    64.449265
860    47.622495
435    82.722632
102    71.963638
Name: Y2, Length: 800, dtype: float64

In [14]:
from sklearn.metrics import r2_score

# r2_score expects (y_true, y_pred). R^2 is not symmetric in its arguments, so
# the observed target must come first and the computed values second.
r2_score(y_train, target_predicted)

1.0