## Engenharia de features

### Template - Predição climática

---

In [1]:
# !pip install awswrangler

In [2]:
import os 
import sys
import boto3
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
import awswrangler as wr
from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Put <parent of the current working directory>/src on the import path so the
# project-local modules imported right below can be found. This depends on the
# kernel's working directory, so it only resolves when the notebook is run
# from its own folder (presumably a sibling of src/ — verify the repo layout).
sys.path.append(os.path.dirname(os.getcwd())+'/src')
import get_data
import feature_engineering



Dados do projeto:

In [3]:
# Project identifier; the S3 bucket name is derived from it in the next cell.
project_name = 'ds-mlops'

In [4]:
# Bucket that holds the raw data, the processed datasets and the artifacts.
bucket = f'ons-{project_name}'

---

### Criação dos datasets de treino e validação

In [5]:
# Key of the raw weather CSV, passed together with `bucket` to the loader.
file_path = 'framework-overview/data/raw/weather.csv'
# get_data.read_csv is a project helper; presumably it reads
# s3://{bucket}/{file_path} into a pandas DataFrame — confirm in src/get_data.py.
df = get_data.read_csv(bucket, file_path)

In [6]:
# Quick look at the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


In [7]:
target = 'weather'
# Every column other than the label is treated as an input feature.
features = df.columns[df.columns != target].tolist()
print('target: ', target, ', features: ', features)

target:  weather , features:  ['date', 'precipitation', 'temp_max', 'temp_min', 'wind']


In [8]:
# training set: rows whose date compares lower than the 2015-01-01 cut-off
is_train = df['date'] < '2015-01-01'
df_train = df.loc[is_train].copy()
print(df_train.shape)

(1096, 6)


In [9]:
# validation set: rows whose date is at or after the 2015-01-01 cut-off
is_val = df['date'] >= '2015-01-01'
df_val = df.loc[is_val].copy()
print(df_val.shape)

(365, 6)


In [10]:
# Row counts of the source table and of each split.
n_total, n_train, n_val = len(df), len(df_train), len(df_val)

print(f'total de amostras na origem: {n_total}')
print(f'total de amostras usadas no treinamento: {n_train}')
print(f'total de amostras usadas na validação: {n_val}')

# Sanity check on counts only: the two splits together should have as many
# rows as the source table.
if n_train + n_val != n_total:
    print('Nem todas as amostras estão sendo usadas na modelagem!')
else:
    print('Todas as amostras estão sendo usadas na modelagem!')

total de amostras na origem: 1461
total de amostras usadas no treinamento: 1096
total de amostras usadas na validação: 365
Todas as amostras estão sendo usadas na modelagem!


---

### Aplicação do pipeline

In [11]:
# Single-step scikit-learn pipeline wrapping the project's feature-engineering
# transformer.
fe_step = ('engenharia_de_features', feature_engineering.feature_engineering_pipeline())
pipeline = Pipeline(steps=[fe_step])

In [12]:
# check which steps are registered in the pipeline
pipeline

Pipeline(memory=None,
         steps=[('engenharia_de_features', feature_engineering_pipeline())],
         verbose=False)

Aplicação no dataset de treino:

In [13]:
# Fit the pipeline on the training split and transform it in the same call.
# NOTE: df_train is rebound to the transformed frame, so this cell is not
# idempotent — re-running it on its own would feed already-transformed data
# back into the pipeline. Re-run from the split cell instead.
df_train = pipeline.fit_transform(df_train)
df_train.head()

Unnamed: 0_level_0,weather,precipitation,wind,temp_delta,temp_mean,cyclical_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-01-01,2,0.0,4.7,7.8,8.9,1.0
2012-01-02,1,10.9,4.5,7.8,6.7,0.999852
2012-01-03,1,0.8,2.3,4.5,9.45,0.999408
2012-01-04,1,20.3,4.7,6.6,8.9,0.998669
2012-01-05,1,1.3,6.1,6.1,5.85,0.997634


Aplicação no dataset de validação:

In [14]:
# Apply the pipeline to the validation split with transform only (no re-fit
# on validation data).
# NOTE: df_val is rebound to the transformed frame, so re-running this cell on
# its own would transform already-transformed data. Re-run from the split cell.
df_val = pipeline.transform(df_val)
df_val.head()

Unnamed: 0_level_0,weather,precipitation,wind,temp_delta,temp_mean,cyclical_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-01-01,4,0.0,1.2,8.8,1.2,0.999991
2015-01-02,1,1.5,2.3,5.6,2.8,0.999769
2015-01-03,3,0.0,1.7,3.3,3.35,0.999251
2015-01-04,1,10.2,4.5,7.3,6.95,0.998438
2015-01-05,1,8.1,6.4,2.8,10.8,0.997329


---

### Salvar dados para a etapa de modelagem

In [15]:
silver_path_train = 'framework-overview/data/processed/train/dataset.csv'
silver_path_val = 'framework-overview/data/processed/validation/dataset.csv'

# Both splits are written the same way: no header row and no index column,
# so only the data columns reach the CSV.
for split_df, silver_path in ((df_train, silver_path_train), (df_val, silver_path_val)):
    split_df.to_csv(f's3://{bucket}/{silver_path}', header=False, index=False)

---

### Salvar pipeline como artefato

In [16]:
# Persist the fitted pipeline under this key in `bucket`.
# save_pipeline is a project helper; presumably it serializes with joblib
# (the key ends in .joblib) — confirm in src/feature_engineering.py.
feature_engineering.save_pipeline(pipeline, bucket, 'framework-overview/artifacts/fe_pipeline/pipeline.joblib')

--- 
### Ler pipeline

In [17]:
# Read the artifact back from the same key it was saved to and rebind
# `pipeline` to the loaded object, as a round-trip check.
# NOTE(review): if read_pipeline deserializes with joblib/pickle, only load
# artifacts from trusted buckets — confirm in src/feature_engineering.py.
pipeline = feature_engineering.read_pipeline(bucket, 'framework-overview/artifacts/fe_pipeline/pipeline.joblib')

In [18]:
# Display the reloaded pipeline to compare its steps with the one built above.
pipeline

Pipeline(memory=None,
         steps=[('engenharia_de_features', feature_engineering_pipeline())],
         verbose=False)

In [19]:
# `gen_time` attribute of the pipeline's first (only) step; it displays as a
# datetime. Presumably the moment the step was created or fitted — TODO
# confirm in src/feature_engineering.py.
pipeline[0].gen_time

datetime.datetime(2022, 8, 7, 23, 6, 45, 631085)