# Simulations
Use one of the model trained to simulations. The user define the inputs of the models and the model, and the app return the prediction of this model

**It is necesary have the codes of simulations in a cloud function**

In [1]:
# ---------------------------- read env variables used in the app ----------------------------
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())
PROJECT_GCP = os.environ.get("PROJECT_GCP", "")
REGION_GCP = os.environ.get("REGION_GCP", "")
BUCKET_GCP = os.environ.get("BUCKET_GCP", "")

## I) PACKAGES

In [2]:
import pandas as pd
import numpy as np
import gcsfs
import json

## II) LOAD ARTIFACTS MODEL

### 1. Load model name - SELECTED BY THE USER
The user needs to specify:
- EXPERIMENT (vertex experiment): the name of the dataset loaded
- RUN (vertex experiment): the name of the model to evaluate

In [3]:
# PARAMETERS BY THE USER
NAME_DATASET = 'develop-app-final-v2'
SELECTED_RUN = 'run-lr'

### 2. Load data train
In this case the datasets are loaded using the path that was defined when the artifacts are registry in vertex experiment. And not use the native methods of vertex to get the path of the artifacts

Dataset train are used to get an initial value

In [4]:
# example path
# f'gs://{bucket_gcs}/{experiment_name}/{run_name}/{path_artifact_locally}'

In [5]:
# X_train
path_X_train = f'gs://{BUCKET_GCP}/{NAME_DATASET}/{SELECTED_RUN}/X_train.pkl'
X_train = pd.read_pickle(path_X_train)

# y_train
path_y_train = f'gs://{BUCKET_GCP}/{NAME_DATASET}/{SELECTED_RUN}/y_train.pkl'
y_train = pd.read_pickle(path_y_train)

### 3. Load model trained

In [6]:
# load model
path_model = f'gs://{BUCKET_GCP}/{NAME_DATASET}/{SELECTED_RUN}/model.pkl'
model = pd.read_pickle(path_model)
model

### 4. Load list features

In [7]:
list_features = X_train.columns.tolist()

list_target = y_train.columns.tolist()

list_features_target = list_features + list_target

## III) SIMULATIONS

### 0. Read json config

In [8]:
def read_json_config(bucket_gcp, name_dataset):
    """
    Read json config
    """
    # connect to GCS as pythonic way
    fs = gcsfs.GCSFileSystem()
    
    # path json
    path_gcs_json = f'gs://{bucket_gcp}/{name_dataset}/data/parameters.json'
    
    # read json
    with fs.open(path_gcs_json, 'r') as file:
        dict_parameters_data = json.load(file)
    
    return dict_parameters_data

In [9]:
dict_parameters_data = read_json_config(BUCKET_GCP, NAME_DATASET)
# forecast
steps_forecast = dict_parameters_data['steps_forecast']

### 1. Get initial values

#### 1.1 Calculate min and max value of each feature

In [10]:
min_values_features = X_train.min()
min_values_features

CMPC.SN     1852.572550
CHILE.SN      56.518215
COPEC.SN    7707.690430
MSFT         208.021673
AAPL         117.101184
GOOG          86.740835
TSLA         198.043335
O             48.015911
BHP           53.130472
dtype: float64

In [11]:
max_values_features = X_train.max()
max_values_features

CMPC.SN     2232.611247
CHILE.SN      68.359014
COPEC.SN    9028.075195
MSFT         252.983510
AAPL         140.057912
GOOG         114.947001
TSLA         292.005564
O             57.970872
BHP           63.351933
dtype: float64

#### 1.2 Calculate min and max date to get the initial value

In [12]:
# min date
min_date = X_train.index.min()

# max date
max_date = X_train.index.max()

# median date
index_median = round(X_train.index.shape[0] / 2, 0)
median_date = X_train.iloc[[index_median], :].index[0]

print(min_date)
print(max_date)
print(median_date)

2021-01-06 00:00:00
2021-04-22 00:00:00
2021-03-02 00:00:00


In [13]:
stop

NameError: name 'stop' is not defined

#### 1.3 Given a certain date, get the initial values

In [15]:
#### SELECT A INITIAL DATE FROM CALENDER
INITIAL_DATE = '2021-02-27'

#### 1.4 Get Initial Instance to do Inference

In [16]:
#### GET DATA FROM DATE SELECTED

# transform string into datetime format pandas
initial_date_to_inference = pd.Timestamp(INITIAL_DATE)

# try get the row of the initial date - if the date doesn't exist, search the most close date
try:
    initial_instance = X_train.loc[[initial_date_to_inference]]
except KeyError:
    #closest_date = X_train.index[(X_train.index - initial_date_to_inference).argmin()] # this has negative values
    closest_date = X_train.index[np.abs((X_train.index - initial_date_to_inference)).argmin()]
    initial_instance = X_train.loc[[closest_date]]

In [17]:
initial_instance

Unnamed: 0_level_0,CMPC.SN,CHILE.SN,COPEC.SN,MSFT,AAPL,GOOG,TSLA,O,BHP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-02-26,2012.220459,60.380447,8218.581055,226.008748,120.43308,102.723167,233.304443,51.26775,61.722312


In [23]:
initial_instance['CMPC.SN'].values[0]

2012.2204589843748

### 2. Predict with the initial value

#### 2.1 Predict with the initial value

In [18]:
prediction = model.predict(initial_instance)
prediction

array([[336.54807089]])

#### 2.2 Show the true value of this instance

In [19]:
# try get the row of the initial date - if the date doesn't exist, search the most close date
try:
    y_true = y_train.loc[[initial_date_to_inference]].values
except KeyError:
    closest_date = y_train.index[np.abs((y_train.index - initial_date_to_inference)).argmin()]
    y_true = y_train.loc[[closest_date]].values

In [20]:
y_true

array([[332.95641073]])