In [1]:
from zipfile import ZipFile
import pandas as pd

# Read the CSV straight out of the zip archive. Both the archive and the member
# stream are opened as context managers so their file handles are released as
# soon as the frame is loaded (the archive was previously left open).
with ZipFile("/content/data/jena_climate_2009_2016.csv.zip") as zip_file:
    with zip_file.open("jena_climate_2009_2016.csv") as zf:
        df = pd.read_csv(zf)

# We work with hours: keep every 6th row, starting at row index 5.
df = df[5::6]

# Pull the target column out into its own array and drop it from the features.
y = df['T (degC)'].values
df = df.drop(columns=['T (degC)'])


In [2]:
#Transformar em numpy
import numpy as np

# Feature matrix: columns 1..13 of `df` (column 0 is skipped), as float64.
# A single positional slice replaces the per-cell Python loop, and avoids
# `cols[j+1]`, which relied on integer keys being treated as positions on a
# label-indexed Series (deprecated behaviour in recent pandas releases).
X = df.iloc[:, 1:14].to_numpy(dtype=float)

# The element-wise loop failed loudly when fewer than 14 columns were present;
# keep that guarantee instead of silently producing a narrower matrix.
assert X.shape == (len(df), 13), X.shape

In [3]:
# Quick sanity check: dimensions of the feature matrix and the target vector.
np.shape(X), np.shape(y)

((70091, 13), (70091,))

## MLFlow

- **Melhores**:
    - Macro (Metade treino e validação)
        - prever outra metade (RandomForestRegressor)
        - prever um ano (LinearRegression)
    - Micro
        - 30 horas para prever as próximas 10 horas (ElasticNet)
        - 100 horas para prever as próximas 10 horas (ElasticNet)


In [4]:
import os
import warnings
import sys

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn

import logging

# Only warnings and above reach the root logger's handler.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

# Silence every Python warning for the rest of the session.
warnings.simplefilter("ignore")

In [5]:
def eval_metrics(actual, pred):
    """Score predictions against ground truth.

    Args:
        actual: True target values.
        pred: Predicted values, aligned with ``actual``.

    Returns:
        A ``(rmse, mae, r2)`` tuple: root of the mean squared error, mean
        absolute error, and R^2 score, in that order.
    """
    mse = mean_squared_error(actual, pred)
    return np.sqrt(mse), mean_absolute_error(actual, pred), r2_score(actual, pred)

In [6]:
# Point MLflow at the tracking server. The MLFLOW_TRACKING_URI environment
# variable takes precedence when set, so the notebook can run against another
# server without editing this cell; otherwise the original address is used.
mlflow.set_tracking_uri(
    os.environ.get("MLFLOW_TRACKING_URI", "http://mlflow-server:5000")
)

## Macro

In [7]:
mlflow.set_experiment("Macro")

# Train on the first half of the rows and evaluate on the `forecast_len` rows
# that immediately follow it (here: another half).
half = len(X) // 2
forecast_len = half
train_x, train_y = X[:half], y[:half]
test_x, test_y = X[half:half + forecast_len], y[half:half + forecast_len]

with mlflow.start_run():
    mlflow.set_tag("mlflow.runName", "metade-para-prever-metade")

    # NOTE: `lr` holds a random forest here; the name is kept as-is.
    lr = RandomForestRegressor(random_state=42)
    lr.fit(train_x, train_y)
    predicted_qualities = lr.predict(test_x)

    rmse, mae, r2 = eval_metrics(test_y, predicted_qualities)

    print("RMSE: %s" % rmse)
    print("MAE: %s" % mae)
    print("R2: %s" % r2)

    for metric_name, score in (("rmse", rmse), ("r2", r2), ("mae", mae)):
        mlflow.log_metric(metric_name, score)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    # The Model Registry does not work with a file store, so registration is
    # only requested when the tracking URI uses a different scheme.
    # Other registry workflows are described at
    # https://mlflow.org/docs/latest/model-registry.html#api-workflow
    log_kwargs = {}
    if tracking_url_type_store != "file":
        log_kwargs["registered_model_name"] = "RandomForestRegressor-metade-prever-metade"
    mlflow.sklearn.log_model(lr, "model", **log_kwargs)

2022/11/27 17:12:03 INFO mlflow.tracking.fluent: Experiment with name 'Macro' does not exist. Creating a new experiment.


RMSE: 0.017477145690490678
MAE: 0.0035196946782763533
R2: 0.9999952417168407


Successfully registered model 'RandomForestRegressor-metade-prever-metade'.
2022/11/27 17:12:23 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: RandomForestRegressor-metade-prever-metade, version 1
Created version '1' of model 'RandomForestRegressor-metade-prever-metade'.


In [8]:
mlflow.set_experiment("Macro")

# Train on the first half of the rows and evaluate on the year that follows.
forecast_len = 8766  # 8766 hours = 1 year
half = len(X) // 2
test_end = half + forecast_len
train_x, train_y = X[:half], y[:half]
test_x, test_y = X[half:test_end], y[half:test_end]

with mlflow.start_run():
    mlflow.set_tag("mlflow.runName", "metade-para-prever-1-ano")

    lr = LinearRegression()
    lr.fit(train_x, train_y)
    predicted_qualities = lr.predict(test_x)

    rmse, mae, r2 = eval_metrics(test_y, predicted_qualities)

    print("RMSE: %s" % rmse)
    print("MAE: %s" % mae)
    print("R2: %s" % r2)

    for metric_name, score in (("rmse", rmse), ("r2", r2), ("mae", mae)):
        mlflow.log_metric(metric_name, score)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    # The Model Registry does not work with a file store, so registration is
    # only requested when the tracking URI uses a different scheme.
    # Other registry workflows are described at
    # https://mlflow.org/docs/latest/model-registry.html#api-workflow
    log_kwargs = {}
    if tracking_url_type_store != "file":
        log_kwargs["registered_model_name"] = "LinearRegression-metade-prever-um-ano"
    mlflow.sklearn.log_model(lr, "model", **log_kwargs)

RMSE: 0.009231508833922214
MAE: 0.006771716537976536
R2: 0.9999988489986125


Successfully registered model 'LinearRegression-metade-prever-um-ano'.
2022/11/27 17:12:24 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: LinearRegression-metade-prever-um-ano, version 1
Created version '1' of model 'LinearRegression-metade-prever-um-ano'.


## Micro

#### 30 horas para prever 10h

In [9]:
N = 30        # rows of X used as the input window
HORIZON = 10  # target values taken from the rows right after each window

# Cut the series into consecutive, non-overlapping chunks of N + HORIZON rows.
# In each chunk the first N rows of X are the inputs and the y values of the
# remaining HORIZON rows are the targets. A trailing partial chunk is dropped.
# (Replaces a hand-rolled counter loop that also carried an unused counter.)
stride = N + HORIZON
seg_X = []
seg_y = []
for k in range(X.shape[0] // stride):
    start = k * stride
    seg_X.append(X[start:start + N])
    seg_y.append(y[start + N:start + stride])

In [10]:
seg_X = np.asarray(seg_X)
seg_y = np.asarray(seg_y)

# Flatten every window into one feature row. `-1` lets NumPy infer the row
# width, so re-running this cell on the already-flattened 2-D array is a no-op
# rather than an IndexError on the missing third axis.
seg_X = seg_X.reshape(seg_X.shape[0], -1)

seg_X.shape, seg_y.shape

((1752, 390), (1752, 10))

In [11]:
import numpy as np
from sklearn.model_selection import train_test_split
# Randomly hold out 33% of the windows for evaluation (fixed seed).
split = train_test_split(seg_X, seg_y, test_size=0.33, random_state=42)
train_x, test_x, train_y, test_y = split

In [12]:
mlflow.set_experiment("Micro")

with mlflow.start_run():
    mlflow.set_tag("mlflow.runName", "30-horas-para-prever-10-horas")

    # ElasticNet with default hyper-parameters, fitted on the flattened windows.
    lr = ElasticNet()
    lr.fit(train_x, train_y)
    predicted_qualities = lr.predict(test_x)

    rmse, mae, r2 = eval_metrics(test_y, predicted_qualities)

    print("RMSE: %s" % rmse)
    print("MAE: %s" % mae)
    print("R2: %s" % r2)

    for metric_name, score in (("rmse", rmse), ("r2", r2), ("mae", mae)):
        mlflow.log_metric(metric_name, score)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    # The Model Registry does not work with a file store, so registration is
    # only requested when the tracking URI uses a different scheme.
    # Other registry workflows are described at
    # https://mlflow.org/docs/latest/model-registry.html#api-workflow
    log_kwargs = {}
    if tracking_url_type_store != "file":
        log_kwargs["registered_model_name"] = "ElasticNet-30-horas-prever-10-horas"
    mlflow.sklearn.log_model(lr, "model", **log_kwargs)

2022/11/27 17:12:24 INFO mlflow.tracking.fluent: Experiment with name 'Micro' does not exist. Creating a new experiment.


RMSE: 2.231146276331902
MAE: 1.6182535174756567
R2: 0.9334201830059466


Successfully registered model 'ElasticNet-30-horas-prever-10-horas'.
2022/11/27 17:12:28 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: ElasticNet-30-horas-prever-10-horas, version 1
Created version '1' of model 'ElasticNet-30-horas-prever-10-horas'.


####  100 horas para prever 10h

In [13]:
N = 100       # rows of X used as the input window
HORIZON = 10  # target values taken from the rows right after each window

# Cut the series into consecutive, non-overlapping chunks of N + HORIZON rows.
# In each chunk the first N rows of X are the inputs and the y values of the
# remaining HORIZON rows are the targets. A trailing partial chunk is dropped.
# (Replaces a hand-rolled counter loop that also carried an unused counter.)
stride = N + HORIZON
seg_X = []
seg_y = []
for k in range(X.shape[0] // stride):
    start = k * stride
    seg_X.append(X[start:start + N])
    seg_y.append(y[start + N:start + stride])

In [14]:
seg_X = np.asarray(seg_X)
seg_y = np.asarray(seg_y)

# Flatten every window into one feature row. `-1` lets NumPy infer the row
# width, so re-running this cell on the already-flattened 2-D array is a no-op
# rather than an IndexError on the missing third axis.
seg_X = seg_X.reshape(seg_X.shape[0], -1)

seg_X.shape, seg_y.shape

((637, 1300), (637, 10))

In [15]:
import numpy as np
from sklearn.model_selection import train_test_split
# Randomly hold out 33% of the windows for evaluation (fixed seed).
split = train_test_split(seg_X, seg_y, test_size=0.33, random_state=42)
train_x, test_x, train_y, test_y = split

In [16]:
# Select the experiment explicitly so this run is filed under "Micro" no
# matter which experiment was last activated in the kernel.
mlflow.set_experiment("Micro")

with mlflow.start_run():
    mlflow.set_tag("mlflow.runName", "100-horas-para-prever-10-horas")

    # ElasticNet with default hyper-parameters, fitted on the flattened windows.
    lr = ElasticNet()
    lr.fit(train_x, train_y)

    predicted_qualities = lr.predict(test_x)

    (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

    print("RMSE: %s" % rmse)
    print("MAE: %s" % mae)
    print("R2: %s" % r2)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    # The Model Registry does not work with a file store, so the model is only
    # registered when the tracking URI uses a different scheme. Other registry
    # workflows are described at
    # https://mlflow.org/docs/latest/model-registry.html#api-workflow
    if tracking_url_type_store != "file":
        mlflow.sklearn.log_model(lr, "model", registered_model_name="ElasticNet-100-horas-prever-10-horas")
    else:
        mlflow.sklearn.log_model(lr, "model")

RMSE: 2.4608776646906025
MAE: 1.8010746663016142
R2: 0.9213792386289871


Successfully registered model 'ElasticNet-100-horas-prever-10-horas'.
2022/11/27 17:12:31 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: ElasticNet-100-horas-prever-10-horas, version 1
Created version '1' of model 'ElasticNet-100-horas-prever-10-horas'.
