In [20]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from transform_output_format import get_4D_output, get_2D_output
from sklearn.base import clone
from utils import load_data_input
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor, RegressorChain

In [2]:
# Load the training inputs and targets. load_data_input returns five values, unpacked
# here as GHI, CLS, SZA, SAA and dates.
# NOTE(review): both paths are absolute and machine-specific, so this cell only runs
# as-is on a host that has /home/jambe/solar-forecasting.
GHI,CLS,SZA,SAA,dates = load_data_input("/home/jambe/solar-forecasting/X_train_copernicus.npz")
y_train_csv = pd.read_csv('/home/jambe/solar-forecasting/y_train_zRvpCeO_nQsYtKN.csv')
# Convert the CSV targets into the 4-D array form that prepare_data later unpacks
# (presumably (samples, frames, height, width) — TODO confirm in transform_output_format).
y_train_4D = get_4D_output(y_train_csv)

In [17]:
# Stack the four input arrays along axis 1 so each sample carries all of them in one array.
X = np.concatenate([GHI,CLS,SZA,SAA], axis=1)
# Hold out 5% of the samples for evaluation; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y_train_4D, test_size=0.05, random_state=42)

In [4]:
def prepare_data(sequence):
    """Flatten a batch of image stacks into one row per pixel.

    Args:
        sequence (array(nb_examples,nb_img,size1,size2)): 4-D array whose
            axis 1 indexes the images belonging to one example.

    Returns:
        array(nb_examples*size1*size2,nb_img): row k holds the nb_img values
        found at a single pixel location of a single example. Rows are ordered
        by example, then by image row, then by image column.
    """
    n_examples, n_images, height, width = sequence.shape
    # Bring the image axis last: (n, img, h, w) -> (n, h, w, img).
    images_last = sequence.transpose(0, 2, 3, 1)
    n_rows = n_examples * height * width
    return images_last.reshape((n_rows, n_images))


In [45]:
def create_model_BT(n_estimators=50, lr=0.1, max_depth=4, subsample=0.5, min_samples_split = 0.05, max_features=0.45, n_jobs=-1, verbose=0, random_state=None):
    """Build an unfitted chain of gradient-boosted regressors.

    Args:
        n_estimators (int): number of boosting stages of the base regressor.
        lr (float): learning rate of the base regressor.
        max_depth (int): maximum depth of each individual tree.
        subsample (float): fraction of rows drawn for each boosting stage.
        min_samples_split (int | float): forwarded unchanged to the base regressor.
        max_features (int | float): forwarded unchanged to the base regressor.
        n_jobs (int): accepted for backward compatibility and ignored; it is
            not forwarded to either estimator.
        verbose (int): verbosity of the base regressor.
        random_state (int | None): seed forwarded to the base regressor. With
            subsample < 1 the fit is stochastic, so pass an int for
            reproducible models. None keeps the previous unseeded behaviour.

    Returns:
        RegressorChain: chain wrapping a GradientBoostingRegressor template.
    """
    # `loss` is deliberately left at its default. The default is squared error
    # on every scikit-learn release, whereas the explicit "ls" spelling was
    # deprecated in 1.0 and removed in 1.2.
    model_basis = GradientBoostingRegressor(
        learning_rate=lr,
        n_estimators=n_estimators,
        max_depth=max_depth,
        subsample=subsample,
        min_samples_split=min_samples_split,
        max_features=max_features,
        verbose=verbose,
        random_state=random_state,
    )
    return RegressorChain(model_basis)


In [46]:
# Rows/columns 15..65 of each input image, i.e. a 51x51 window. The inputs are
# cropped so they line up pixel-for-pixel with the targets (checked below).
CROP = slice(15, 66)

model = create_model_BT(n_estimators=60, max_depth=12, subsample=0.7, verbose=1)

# One row per pixel: inputs are cropped first, targets are used as they are.
X_train_reshape = prepare_data(X_train[:, :, CROP, CROP])
y_train_reshape = prepare_data(y_train)
X_test_reshape = prepare_data(X_test[:, :, CROP, CROP])
y_test_reshape = prepare_data(y_test)

# Fail here, not deep inside fit/score, if inputs and targets disagree on the number of pixel rows.
assert X_train_reshape.shape[0] == y_train_reshape.shape[0], "train inputs/targets row mismatch"
assert X_test_reshape.shape[0] == y_test_reshape.shape[0], "test inputs/targets row mismatch"

y_train_reshape.shape

(4556952, 4)

In [47]:
# Fit the regressor chain on the per-pixel rows.
# NOTE(review): the output recorded under this cell shows one complete 60-iteration
# boosting log followed by "MemoryError: could not allocate 36455616 bytes", so this
# run appears not to have finished — confirm the fit completes before relying on `model`.
model.fit(X_train_reshape,y_train_reshape)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1       54555.1510       12439.0886           23.66m
         2       44459.0277       10074.6002           21.97m
         3       36282.8427        8187.8436           20.76m
         4       29636.3197        6647.6111           19.85m
         5       24213.2322        5425.1642           19.09m
         6       19860.5029        4360.5399           18.58m
         7       16278.1157        3572.5116           18.22m
         8       13372.5631        2901.4661           17.73m
         9       11027.1473        2337.7452           17.48m
        10        9149.6166        1878.0228           17.16m
        20        1672.3000         249.1195           13.84m
        30         645.7247          35.0684           11.09m
        40         463.3287           9.6850            8.17m
        50         410.6481           3.0178            4.48m
        60         384.4958           2.2079            0.00s
      I

MemoryError: could not allocate 36455616 bytes

In [36]:
# Evaluate on the held-out pixel rows (scikit-learn regressors report R^2 from `score`).
# NOTE(review): this cell's execution count (36) is lower than the fit cell's (47), so the
# value shown below was presumably produced by an earlier model — re-run after fitting.
model.score(X_test_reshape, y_test_reshape)

0.5537894399956342

In [16]:
# Persist the model to disk. The context manager guarantees the file is flushed
# and closed even if pickling fails part-way through.
# NOTE(review): the file name mentions 100 trees / depth 15, while the visible
# create_model_BT call uses n_estimators=60, max_depth=12 — confirm which model this is.
filename = 'bt_100trees_15depth.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [2]:
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only load
# model files you produced yourself.
# NOTE(review): this reads 'boosted_trees_0.sav', not the 'bt_100trees_15depth.sav'
# written by the dump cell — confirm this is the intended model.
with open('boosted_trees_0.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [5]:
# Load the inputs to predict on. This load used to be commented out, which made the
# next line fail with NameError on a fresh kernel because GHI_test etc. were never defined.
GHI_test,CLS_test,SZA_test,SAA_test,dates_test = load_data_input("/home/jambe/solar-forecasting/X_test_copernicus.npz")
# NOTE: this rebinds X_test / X_test_reshape, replacing the held-out split created earlier.
X_test = np.concatenate([GHI_test,CLS_test,SZA_test,SAA_test], axis=1)
# Same 15:66 (51x51) crop as the one applied to the training inputs.
X_test_reshape = prepare_data(X_test[:,:,15:66,15:66])


In [6]:
# Sanity check: rows = samples * 51 * 51 pixels, columns = stacked input images per pixel.
X_test_reshape.shape

(4788441, 28)

In [9]:
# Predict for every pixel row; the result keeps one row per pixel.
y_predict = loaded_model.predict(X_test_reshape)
# Do not reshape straight to (1841,4,51,51): prepare_data lays rows out by sample, image row,
# image column, so the predicted-frame axis must come last in the reshape and be moved afterwards.

In [14]:
print(y_predict.shape)

# Undo prepare_data: rows are ordered (sample, image row, image column), so reshape to
# (samples, 51, 51, frames) and then move the frame axis to position 1.
# The result goes into a new name instead of overwriting y_predict. Re-running the old
# cell reshaped the already-4D array a second time and silently scrambled the predictions.
side = 51  # width of the 15:66 crop applied to the inputs
y_predict_4D = y_predict.reshape(-1, side, side, y_predict.shape[1])  # -1: sample count inferred, not hard-coded
y_predict_4D = y_predict_4D.transpose(0, 3, 1, 2)

y_preds_2D = get_2D_output(y_predict_4D)
y_preds_2D.to_csv('boosted_trees.csv', index=False)

(4788441, 4)
