This is a base jupyter notebook for other solutions. It goes through the dataset A, and it finds the highest and the lowest energy production. It generates the solution with random floats inbetween the highest and the lowest production.

In [317]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [318]:
%autoreload

# load libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

# load my custom function
from solutions.few_regression_types import data_preprocess


In [319]:
# read datasets
# for simplicity, I use X_train_estimated as test data for cross validation
y = pd.read_parquet("../../dataset/A/train_targets.parquet")
X_train = pd.read_parquet("../../dataset/A/X_train_observed.parquet")
X_test = pd.read_parquet("../../dataset/A/X_train_estimated.parquet")

In [320]:
# edit data
X_train, y_train = data_preprocess.preprocess_train_data(X_train, y, "everything")
X_test, y_test = data_preprocess.preprocess_train_data(X_test, y, "everything")

In [321]:
print(f"X_train.shape = {X_train.shape}")
print(f"X_test.shape = {X_test.shape}")
print(f"y_train.shape = {y_train.shape}")
print(f"y_test.shape = {y_test.shape}")

X_train.shape = (29667, 47)
X_test.shape = (4394, 47)
y_train.shape = (29667, 1)
y_test.shape = (4394, 1)


## Machine learning stuff

In [310]:
# decision tree
decision_tree = DecisionTreeRegressor()
decision_tree.fit(X_train, y_train)
y_pred_tree = decision_tree.predict(X_test)

mae_tree = np.mean(np.abs(np.array(y_test) - y_pred_tree))
mae_tree

621.0646291618334

In [311]:
# random forest
random_forest = RandomForestRegressor(n_estimators=100)  # You can adjust the number of trees (n_estimators) as needed
random_forest.fit(X_train, y_train)
y_pred_forest = random_forest.predict(X_test)

mae_forest = np.mean(np.abs(np.array(y_test) - y_pred_forest))
mae_forest

  random_forest.fit(X_train, y_train)


601.50148550096

In [312]:
# gradient boosting
gradient_boosting = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)  # You can adjust the number of trees (n_estimators) as needed
gradient_boosting.fit(X_train, y_train)
y_pred_grad = gradient_boosting.predict(X_test)

mae_grad = np.mean(np.abs(np.array(y_test) - y_pred_grad))
mae_grad

  y = column_or_1d(y, warn=True)


592.2928322998536

In [313]:
# elastic net
elastic_net = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42)
elastic_net.fit(X_train, y_train)
y_pred_elast_net = elastic_net.predict(X_test)

mae_elast_net = np.mean(np.abs(np.array(y_test) - y_pred_elast_net))
mae_elast_net

  model = cd_fast.enet_coordinate_descent(


599.946498024572

In [314]:
# support vector regression
svr_model = SVR(kernel='rbf', C=1)
svr_model.fit(X_train, y_train)
y_pred_svr = svr_model.predict(X_test)

mae_svr = np.mean(np.abs(np.array(y_test) - y_pred_svr))
mae_svr

  y = column_or_1d(y, warn=True)


473.76274242555763

In [334]:
# neural network regression
nn_model = MLPRegressor(hidden_layer_sizes=(512, 512), activation='relu', verbose=True, learning_rate="adaptive")
nn_model.fit(X_train, y_train)
y_pred_nn = nn_model.predict(X_test)
mae_nn = np.mean(np.abs(np.array(y_test) - y_pred_nn))
mae_nn

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 388104585.95711404
Iteration 2, loss = 212355.63657117
Iteration 3, loss = 196769.53189138
Iteration 4, loss = 203915.13007081
Iteration 5, loss = 206784.05595162
Iteration 6, loss = 194935.93921548
Iteration 7, loss = 201101.64384462
Iteration 8, loss = 208473.30727063
Iteration 9, loss = 519577.43169216
Iteration 10, loss = 224599.43467255
Iteration 11, loss = 256750.77851939
Iteration 12, loss = 261413.44894045
Iteration 13, loss = 237746.85662769
Iteration 14, loss = 292756.00359130
Iteration 15, loss = 317125.58913073
Iteration 16, loss = 1819653.70409033
Iteration 17, loss = 832021.81594735
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


1106.8348379014724

In [332]:
y_pred_nn[y_pred_nn < 0] = 0
mae_nn = np.mean(np.abs(np.array(y_test) - y_pred_nn))
mae_nn

654.6119942364744

# Prediction on real test data


SVR model came out with the lowest mean absolute error. So far we did only cross validation on the training data. SVR will be used on the real test data, on the datasets B and C and to generate the output csv file.

In [316]:
# first dataset
prediction = []

for dataset in ['A', 'B', 'C']:
    # read the data
    print(f"dataset {dataset}")
    X_train = pd.concat([
        pd.read_parquet("../../dataset/A/X_train_observed.parquet"),
        pd.read_parquet("../../dataset/A/X_train_estimated.parquet")
    ], ignore_index=True)
    y_train = pd.read_parquet("../../dataset/A/train_targets.parquet")
    X_test = pd.read_parquet("../../dataset/A/X_test_estimated.parquet")
    # preprocess the data
    X_train, y_train = data_preprocess.preprocess_train_data(X_train, y_train, "everything")
    X_test = data_preprocess.preprocess_test_data(X_test, "everything")
    # learn 
    model = SVR(kernel='rbf', C=1.0)
    model.fit(X_train, y_train)
    prediction = np.concatenate((prediction, model.predict(X_test)))
prediction[prediction < 0.] = 0.
df = pd.DataFrame({'prediction': prediction})
df.to_csv('svr.csv', index_label='id')

dataset A


  y = column_or_1d(y, warn=True)


KeyboardInterrupt: 