In [55]:
import pandas as pd
import numpy as np
import scripts.mytools as mytools

In [56]:
df = pd.read_csv('dados_A215_H_2008-06-13_2024-01-01.csv',sep= ';', header = 9)
df['Data Medicao'] = pd.to_datetime(df['Data Medicao'])
df['Hora Medicao'] = pd.to_datetime(df['Hora Medicao'])
df['ano'] = df['Data Medicao'].dt.year
df['mes'] = df['Data Medicao'].dt.month
df['dia'] = df['Data Medicao'].dt.day
# df['hora']= df['Hora Medicao'].str[-4:-2].astype(int)
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].str.replace(',', '.').astype(float)


df['Hora Medicao'] = df['Hora Medicao'].astype(str)
df['hora'] = df['Hora Medicao'].str[-4:-2].astype(int)
df = df.drop(['Hora Medicao'],axis=1)
df['hora'] = pd.to_timedelta(df['hora'], unit='h') - pd.Timedelta(hours=3)
df['Data_Hora'] = df['Data Medicao'] + df['hora']
df.set_index('Data_Hora', inplace=True)
df['Data_Hora'] = df['Data Medicao'] + df['hora']
df = df.drop('Unnamed: 22', axis =1)
df['estacao'] = df.apply(mytools.encontrar_estacao, axis=1)
df = df.sort_index()

In [57]:
df = df.dropna()

In [58]:
col_numericas = [
    'PRECIPITACAO TOTAL, HORARIO(mm)', 'PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA(mB)',
    'PRESSAO ATMOSFERICA REDUZIDA NIVEL DO MAR, AUT(mB)',
    'PRESSAO ATMOSFERICA MAX.NA HORA ANT. (AUT)(mB)',
    'PRESSAO ATMOSFERICA MIN. NA HORA ANT. (AUT)(mB)',
    'RADIACAO GLOBAL(Kj/m²)', 'TEMPERATURA DA CPU DA ESTACAO(°C)',
    'TEMPERATURA DO AR - BULBO SECO, HORARIA(°C)',
    'TEMPERATURA DO PONTO DE ORVALHO(°C)',
    'TEMPERATURA MAXIMA NA HORA ANT. (AUT)(°C)',
    'TEMPERATURA MINIMA NA HORA ANT. (AUT)(°C)',
    'TEMPERATURA ORVALHO MAX. NA HORA ANT. (AUT)(°C)',
    'TEMPERATURA ORVALHO MIN. NA HORA ANT. (AUT)(°C)',
    'TENSAO DA BATERIA DA ESTACAO(V)',
    'UMIDADE REL. MAX. NA HORA ANT. (AUT)(%)',
    'UMIDADE REL. MIN. NA HORA ANT. (AUT)(%)',
    'UMIDADE RELATIVA DO AR, HORARIA(%)', 'VENTO, RAJADA MAXIMA(m/s)',
    'VENTO, VELOCIDADE HORARIA(m/s)'
]

col_categoricas = ['estacao']

In [59]:
# features = ['TEMPERATURA DO PONTO DE ORVALHO(°C)','VENTO, VELOCIDADE HORARIA(m/s)', 'UMIDADE RELATIVA DO AR, HORARIA(%)', 'TEMPERATURA DO AR - BULBO SECO, HORARIA(°C)']
parametros = (col_numericas + col_categoricas)
variavel = ['RADIACAO GLOBAL(Kj/m²)']
features = df[parametros]
target = df[variavel]
# X = df[target].values
# y = df[features].values

In [60]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
# X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=0)

In [61]:
numeric_features = df[col_numericas]
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(missing_values= np.nan ,strategy="median")),
           ("scaler", MinMaxScaler())]
)

categorical_features = df[col_categoricas]
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ("selector", None),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, col_numericas),
        ("cat", categorical_transformer, col_categoricas),
    ]
)


In [62]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=0)

In [63]:
from sklearn.model_selection import TimeSeriesSplit
tss = TimeSeriesSplit(n_splits = 5)
for train_index, test_index in tss.split(df):
    X_train, X_test = features.iloc[train_index, :], features.iloc[test_index,:]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]


In [64]:
from sklearn.ensemble import RandomForestRegressor
Rf = RandomForestRegressor()
pipe = Pipeline(
    steps=[("preprocessor", preprocessor), 
           ("model", Rf)],
    verbose = True
)

pipe.fit(X_train, y_train)
print("model score: %.3f" % pipe.score(X_test, y_test))


[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.2s


  return fit_method(estimator, *args, **kwargs)


[Pipeline] ............. (step 2 of 2) Processing model, total= 1.2min
model score: 1.000


In [65]:
import joblib
filename = 'models/pipelinemodel.json'
joblib.dump(pipe, filename)

['models/pipelinemodel.json']

In [66]:
loaded_model = joblib.load(filename) 

In [67]:
loaded_model.predict(X_test)

array([278.97614,   9.2012 ,   1.5802 , ..., 101.59369,   1.80153,
        -2.86381])