# Create presolved CORLAT dataset to be used for Neural Network Training

In this notebook, the preprocessed dataset from `preprocess_presolved_corlat.ipynb` will be loaded to be further processed into a numpy arrays suitable to be loaded in directly in a Neural Network training script.
This notebook will output the following numpy files for training and testing:

    1. `train_weights.npy`, which are the weights for the custom feasibility promoting weighted BCELoss.

    2. `X_train.npy`, the input training data.

    3. `X_test.npy`, the input testing data.

    4. `y_train.npy`, the output target solutions.

    5. `y_test.npy`, the output testing solutions.
    
    6. `train_idx.npy`, the indices for the corresponding gurobi model files of training.

    7. `test_idx.npy`, the indices for the corresponding gurobi model files of testing.

In [1]:
import torch
import random
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch.optim import AdamW
from xgboost import XGBClassifier

import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import pickle as pkl
import scipy
import os

from torch.nn import Linear, ReLU, Dropout
from torch.nn.functional import relu
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

In [2]:
def set_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = (
        False  # Force cuDNN to use a consistent convolution algorithm
    )
    torch.backends.cudnn.deterministic = (
        True  # Force cuDNN to use deterministic algorithms if available
    )
    torch.use_deterministic_algorithms(
        True
    )  # Force torch to use deterministic algorithms if available


In [3]:
set_seeds(42)

In [4]:
try:
    corlat_presolved_dataset = pkl.load(open("Data/corlat_presolved/processed_data/corlat_presolved_preprocessed.pickle", "rb"))
except:
    # move dir to /ibm/gpfs/home/yjin0055/Project/DayAheadForecast
    os.chdir("/ibm/gpfs/home/yjin0055/Project/DayAheadForecast")
    corlat_presolved_dataset = pkl.load(open("Data/corlat_presolved/processed_data/corlat_presolved_preprocessed.pickle", "rb"))

In [5]:
# Obtain the maximum size of N_constraints and N_variables across the dataset.

max_N_constraints = max(
    len(x["constraint_node_features"]) for x in corlat_presolved_dataset
)

max_N_variables = max(
    len(x["var_node_features"]) for x in corlat_presolved_dataset
)

min_N_constraints = min(
    len(x["constraint_node_features"]) for x in corlat_presolved_dataset
)

min_N_variables = min(
    len(x["var_node_features"]) for x in corlat_presolved_dataset
)

In [6]:
print("Maximum number of variables: ", max_N_variables)
print("Maximum number of constraints: ", max_N_constraints)
print("Minimum number of variables: ", min_N_variables)
print("Minimum number of constraints: ", min_N_constraints)

Maximum number of variables:  457
Maximum number of constraints:  455
Minimum number of variables:  9
Minimum number of constraints:  8


In [7]:
print("Number of variable node features: ", len(corlat_presolved_dataset[0]["var_node_features"].columns))
print("Number of constraint node features: ", len(corlat_presolved_dataset[0]["constraint_node_features"].columns))

Number of variable node features:  21
Number of constraint node features:  11


### Padding with 0s to match the maximum number of variables and constraints

Pad with 0s to match maximum number of constraints and variables. After that stack them together to form a `numpy.NDArray`

`var_node_features` will have shape of `(n_samples, max_vars, n_var_features)`

`constraint_node_features` will have shape of `(n_samples, max_constraints, n_constraint_features)`

In [8]:
# for each variable node features, pad with 0.0s to make it the same length as the maximum number of variables

var_node_features = np.stack(
    [
        np.pad(
            x["var_node_features"].values,
            ((0, max_N_variables - len(x["var_node_features"])), (0, 0)),
            "constant",
            constant_values=0.0,
        )
        for x in corlat_presolved_dataset
    ]
)
            

In [9]:
constraint_node_features = np.stack(
    [
        np.pad(
            x["constraint_node_features"].values,
            ((0, max_N_constraints - len(x["constraint_node_features"])), (0, 0)),
            "constant",
            constant_values=0.0,
        )
        for x in corlat_presolved_dataset
    ]   
)

In [10]:
# for var_node_features and constraint_node_features, reshape to (N_samples, -1) to feed into the neural network
var_input = var_node_features.reshape(var_node_features.shape[0], -1)
constraint_input = constraint_node_features.reshape(constraint_node_features.shape[0], -1)

In [11]:
var_node_features.shape

(1943, 457, 21)

In [12]:
print("Shape of variable features input: ", var_input.shape)
print("Shape of constraint features input: ", constraint_input.shape)

Shape of variable features input:  (1943, 9597)
Shape of constraint features input:  (1943, 5005)


In [13]:
var_input[0]

array([8.0, 6.0, 1.0, ..., 0.0, 0.0, 0.0], dtype=object)

### Commented code used for generating edge index and edge attribute in COO format.

$\color{lightblue}\text{This section of code is not used in the current paradigm of training a standard neural network.}$

This section of commented code might be useful for Graph Neural Network training, where the edge index and edge attributes are needed.

In [14]:
# # get A matrix input by stacking the csr_matrix of each sample getting shape of N_samples x (A.shape[0] x A.shape[1])
# A_input = np.vstack([x["A"] for x in corlat_presolved_dataset])

# A_feature_list = []
# for i in range(len(corlat_presolved_dataset)):
#     n_cons = corlat_presolved_dataset[i]["A"].shape[0]

#     # for row in range(n_vars):
#     #     for col in range(n_cons):
#     #         if input_dict_list[i]["A"][row, col] != 0:
#     #             adj_matrix[row, n_vars + col] = input_dict_list[i]["A"][row, col]
#     #             adj_matrix[n_vars + col, row] = input_dict_list[i]["A"][row, col]

#     I, J, V = scipy.sparse.find(corlat_presolved_dataset[i]["A"])
#     # adj_matrix[I, n_vars + J] = V
#     # adj_matrix[n_vars + J, I] = V

#     # # convert to COO format
#     edge_index = torch.stack([torch.tensor(I), torch.tensor(n_cons + J)], dim=0)

#     # expand V to 2D
#     edge_attr = torch.tensor(V).unsqueeze(1)

#     tmp_dict = {"edge_index": edge_index, "edge_attr": edge_attr}
#     A_feature_list.append(tmp_dict)
    
#     # for each sample, pad the edge_index and edge_attr to make it the same length as the maximum length of edge_index and edge_attr
# max_edge_index_len = max([len(x["edge_index"][0]) for x in A_feature_list])
# max_edge_attr_len = max([len(x["edge_attr"]) for x in A_feature_list])

# for i in range(len(A_feature_list)):
#     edge_index = A_feature_list[i]["edge_index"]
#     edge_attr = A_feature_list[i]["edge_attr"]

#     # pad edge_index
#     edge_index = torch.cat(
#         [
#             edge_index,
#             torch.zeros(
#                 2, max_edge_index_len - len(edge_index[0]), dtype=torch.long
#             ),
#         ],
#         dim=1,
#     )

#     # pad edge_attr
#     edge_attr = torch.cat(
#         [
#             edge_attr,
#             torch.zeros(
#                 max_edge_attr_len - len(edge_attr), 1, dtype=torch.float32
#             ),
#         ],
#         dim=0,
#     )

#     A_feature_list[i]["edge_index"] = edge_index
#     A_feature_list[i]["edge_attr"] = edge_attr
    
#     # check if the padding is correct by checking the shape of edge_index and edge_attr
# for i in range(len(A_feature_list)):
#     assert A_feature_list[i]["edge_index"].shape == (2, max_edge_index_len)
#     assert A_feature_list[i]["edge_attr"].shape == (max_edge_attr_len, 1)

### Create the dataset for input and targets

Create the input and targets, specifically we create input dataset `X` and output targets `solutions`.

In [15]:
# for each solution convert the dictionary to a list of values
solutions = [
    list(corlat_presolved_dataset[i]["solution"].values())
    if type(corlat_presolved_dataset[i]["solution"]) == dict
    else corlat_presolved_dataset[i]["solution"]
    for i in range(len(corlat_presolved_dataset))    
    ]

In [16]:
# combine the variable features and constraint features into a single input
X = np.hstack([var_input, constraint_input])

In [17]:
n_features = X.shape[1]
out_channels = solutions[0].shape[1]

In [18]:
out_channels

100

In [19]:
# convert X and solutions to float32
X = X.astype(np.float32)
solutions = np.array([solution.astype(np.float32) for solution in solutions])

  solutions = np.array([solution.astype(np.float32) for solution in solutions])


### Train test split to create training and testing dataset

In [20]:
# train test split to get indices for train and test
train_idx, test_idx = train_test_split(
    np.arange(len(solutions)), test_size=0.2, random_state=42
)

X_train = X[train_idx]
X_test = X[test_idx]
y_train = solutions[train_idx]
y_test = solutions[test_idx]

### Create weights for feasibility promoting weighted BCELoss during training. 

The weights for feasibility promoting weighted BCELoss during training and testing is created. 

The test weights are not utilized, $\color{lightblue}\text{we are not utilizing the test loss as a performance metric during this experiment}$. The test loss is not calculated. Only the number of violated constraints and the performance of optimization is taken as a performance metric for now.



In [22]:
# create train weights
train_weights = []
for i in range(len(y_train)):
    train_weights.append(corlat_presolved_dataset[train_idx[i]]["current_instance_weight"])
train_weights = np.array(train_weights)

# create test weights
test_weights = []
for i in range(len(y_test)):
    test_weights.append(corlat_presolved_dataset[test_idx[i]]["current_instance_weight"])
test_weights = np.array(test_weights)

  train_weights = np.array(train_weights)
  test_weights = np.array(test_weights)


In [23]:
# copy pickle_filenames from processed_data folder to the train_test_data folder
os.system("cp Data/corlat_presolved/processed_data/pickle_filenames.pkl Data/corlat_presolved/train_test_data/")

0

In [24]:
np.save("Data/corlat_presolved/train_test_data/train_weights.npy", train_weights)
np.save("Data/corlat_presolved/train_test_data/test_weights.npy", test_weights)
np.save("Data/corlat_presolved/train_test_data/X_train.npy", X_train)
np.save("Data/corlat_presolved/train_test_data/X_test.npy", X_test)
np.save("Data/corlat_presolved/train_test_data/y_train.npy", y_train)
np.save("Data/corlat_presolved/train_test_data/y_test.npy", y_test)
np.save("Data/corlat_presolved/train_test_data/train_idx.npy", train_idx)
np.save("Data/corlat_presolved/train_test_data/test_idx.npy", test_idx)