In [38]:
import torch
import random
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch.optim import AdamW
from xgboost import XGBClassifier

import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import pickle as pkl
import scipy
import os

from torch.nn import Linear, ReLU, Dropout
from torch.nn.functional import relu
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

In [39]:
def set_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = (
        False  # Force cuDNN to use a consistent convolution algorithm
    )
    torch.backends.cudnn.deterministic = (
        True  # Force cuDNN to use deterministic algorithms if available
    )
    torch.use_deterministic_algorithms(
        True
    )  # Force torch to use deterministic algorithms if available


In [40]:
set_seeds(42)

In [41]:
try:
    corlat_dataset = pkl.load(open("Data/corlat/processed_data/corlat_preprocessed.pickle", "rb"))
except:
    # move dir to /ibm/gpfs/home/yjin0055/Project/DayAheadForecast
    os.chdir("/ibm/gpfs/home/yjin0055/Project/DayAheadForecast")
    corlat_dataset = pkl.load(open("Data/corlat/processed_data/corlat_preprocessed.pickle", "rb"))

In [42]:
# Obtain the maximum size of N_constraints and N_variables across the dataset.

max_N_constraints = max(
    len(x["constraint_node_features"]) for x in corlat_dataset
)

max_N_variables = max(
    len(x["var_node_features"]) for x in corlat_dataset
)

min_N_constraints = min(
    len(x["constraint_node_features"]) for x in corlat_dataset
)

min_N_variables = min(
    len(x["var_node_features"]) for x in corlat_dataset
)

In [43]:
print("Maximum number of variables: ", max_N_variables)
print("Maximum number of constraints: ", max_N_constraints)
print("Minimum number of variables: ", min_N_variables)
print("Minimum number of constraints: ", min_N_constraints)

Maximum number of variables:  466
Maximum number of constraints:  551
Minimum number of variables:  466
Minimum number of constraints:  470


In [44]:
print("Number of variable node features: ", len(corlat_dataset[0]["var_node_features"].columns))
print("Number of constraint node features: ", len(corlat_dataset[0]["constraint_node_features"].columns))

Number of variable node features:  18
Number of constraint node features:  10


In [45]:
# for each variable node features, pad with 0.0s to make it the same length as the maximum number of variables

var_node_features = np.stack(
    [
        np.pad(
            x["var_node_features"].values,
            ((0, max_N_variables - len(x["var_node_features"])), (0, 0)),
            "constant",
            constant_values=0.0,
        )
        for x in corlat_dataset
    ]
)
            

In [46]:
constraint_node_features = np.stack(
    [
        np.pad(
            x["constraint_node_features"].values,
            ((0, max_N_constraints - len(x["constraint_node_features"])), (0, 0)),
            "constant",
            constant_values=0.0,
        )
        for x in corlat_dataset
    ]   
)

In [47]:
# for var_node_features and constraint_node_features, reshape to (N_samples, -1) to feed into the neural network
var_input = var_node_features.reshape(var_node_features.shape[0], -1)
constraint_input = constraint_node_features.reshape(constraint_node_features.shape[0], -1)

In [48]:
var_node_features.shape

(2001, 466, 18)

In [49]:
print("Shape of variable features input: ", var_input.shape)
print("Shape of constraint features input: ", constraint_input.shape)

Shape of variable features input:  (2001, 8388)
Shape of constraint features input:  (2001, 5510)


In [50]:
# get A matrix input by stacking the csr_matrix of each sample getting shape of N_samples x (A.shape[0] x A.shape[1])
A_input = np.vstack([x["A"] for x in corlat_dataset])

In [51]:
A_feature_list = []
for i in range(len(corlat_dataset)):
    n_cons = corlat_dataset[i]["A"].shape[0]

    # for row in range(n_vars):
    #     for col in range(n_cons):
    #         if input_dict_list[i]["A"][row, col] != 0:
    #             adj_matrix[row, n_vars + col] = input_dict_list[i]["A"][row, col]
    #             adj_matrix[n_vars + col, row] = input_dict_list[i]["A"][row, col]

    I, J, V = scipy.sparse.find(corlat_dataset[i]["A"])
    # adj_matrix[I, n_vars + J] = V
    # adj_matrix[n_vars + J, I] = V

    # # convert to COO format
    edge_index = torch.stack([torch.tensor(I), torch.tensor(n_cons + J)], dim=0)

    # expand V to 2D
    edge_attr = torch.tensor(V).unsqueeze(1)

    tmp_dict = {"edge_index": edge_index, "edge_attr": edge_attr}
    A_feature_list.append(tmp_dict)

In [52]:
# for each sample, pad the edge_index and edge_attr to make it the same length as the maximum length of edge_index and edge_attr
max_edge_index_len = max([len(x["edge_index"][0]) for x in A_feature_list])
max_edge_attr_len = max([len(x["edge_attr"]) for x in A_feature_list])

for i in range(len(A_feature_list)):
    edge_index = A_feature_list[i]["edge_index"]
    edge_attr = A_feature_list[i]["edge_attr"]

    # pad edge_index
    edge_index = torch.cat(
        [
            edge_index,
            torch.zeros(
                2, max_edge_index_len - len(edge_index[0]), dtype=torch.long
            ),
        ],
        dim=1,
    )

    # pad edge_attr
    edge_attr = torch.cat(
        [
            edge_attr,
            torch.zeros(
                max_edge_attr_len - len(edge_attr), 1, dtype=torch.float32
            ),
        ],
        dim=0,
    )

    A_feature_list[i]["edge_index"] = edge_index
    A_feature_list[i]["edge_attr"] = edge_attr

In [53]:
# check if the padding is correct by checking the shape of edge_index and edge_attr
for i in range(len(A_feature_list)):
    assert A_feature_list[i]["edge_index"].shape == (2, max_edge_index_len)
    assert A_feature_list[i]["edge_attr"].shape == (max_edge_attr_len, 1)

In [54]:
# for each solution convert the dictionary to a list of values
solutions = [
    list(corlat_dataset[i]["solution"].values())
    if type(corlat_dataset[i]["solution"]) == dict
    else corlat_dataset[i]["solution"]
    for i in range(len(corlat_dataset))    
    ]

In [55]:
# combine the variable features and constraint features into a single input
X = np.hstack([var_input, constraint_input])

In [56]:
n_features = X.shape[1]
out_channels = solutions[0].shape[1]

In [57]:
out_channels

100

In [58]:
# convert X and solutions to float32
X = X.astype(np.float32)
solutions = np.array([solution.astype(np.float32) for solution in solutions])

  solutions = np.array([solution.astype(np.float32) for solution in solutions])


In [59]:
# train test split to get indices for train and test
train_idx, test_idx = train_test_split(
    np.arange(len(solutions)), test_size=0.2, random_state=42
)

X_train = X[train_idx]
X_test = X[test_idx]
y_train = solutions[train_idx]
y_test = solutions[test_idx]

In [60]:
np.save("Data/corlat/train_test_data/X_train.npy", X_train)
np.save("Data/corlat/train_test_data/X_test.npy", X_test)
np.save("Data/corlat/train_test_data/y_train.npy", y_train)
np.save("Data/corlat/train_test_data/y_test.npy", y_test)
np.save("Data/corlat/train_test_data/train_idx.npy", train_idx)
np.save("Data/corlat/train_test_data/test_idx.npy", test_idx)