# Preprocessing CORLAT dataset generated from `corlat.py`

This notebook intends to preprocess the dataset generated from `corlat_presolved.py`.

Preprocessing steps involved:
1. Combine individual .pkl dataset (each represents a sample, or data from one model instance) into a large dataset.
2. Convert data type of each feature to the correct dtype.
3. One-hot encoding for categorical features.
4. Check for duplicates in binary solution.
5. Save the dataset to "Data/corlat_presolved/processed_data/corlat_presolved_preprocessed.pickle"

The resulting dataset is a dictionary, where the arrays of `var_node_features` and `constraint_node_features` are replaced with a dataframe with correct dtypes and one-hot encoded categorical features.

In [2]:
import torch
import pandas as pd
import numpy as np
import pickle as pkl
import scipy
import os

In [3]:
!nvidia-smi

Fri Jun  9 15:39:08 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.43.04    Driver Version: 515.43.04    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100 80G...  On   | 00000000:17:00.0 Off |                   On |
| N/A   33C    P0    41W / 300W |     24MiB / 81920MiB |     N/A      Default |
|                               |                      |              Enabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80G...  On   | 00000000:65:00.0 Off |                   On |
| N/A   32C    P0    42W / 300W |     24MiB / 81920MiB |     N/A      Default |
|       

In [4]:
try:
    # test if "Data/corlat/pickle_raw_data/" exists
    os.listdir("Data/corlat_presolved/pickle_raw_data/")
except:
    # move directory to project folder
    os.chdir("/ibm/gpfs/home/yjin0055/Project/DayAheadForecast")

In [7]:
corlat_presolved_dataset = []
# load every pickle file and append to a list
pickle_filenames = os.listdir("Data/corlat_presolved/pickle_raw_data/")

In [8]:
# save pickle_files to pkl
with open("Data/corlat_presolved/processed_data/pickle_filenames.pkl", "wb") as f:
    pkl.dump(pickle_filenames, f)


In [45]:
for file in pickle_filenames:
    with open("Data/corlat_presolved/pickle_raw_data/"+file, "rb") as f:
        corlat_presolved_dataset.append(pkl.load(f))

In [46]:
# save corlat_presolved_dataset.pkl
with open("Data/corlat_presolved/processed_data/corlat_presolved_dataset.pkl", "wb") as f:
    pkl.dump(corlat_presolved_dataset, f)

In [47]:
del corlat_presolved_dataset

In [48]:
with open("Data/corlat_presolved/processed_data/corlat_presolved_dataset.pkl", "rb") as f:
    corlat_presolved_dataset = pkl.load(f)

In [None]:
# try:
#     corlat_presolved_dataset = pkl.load(open("Data/corlat_presolved/corlat_presolved.pickle", "rb"))
# except:
#     # move dir to /ibm/gpfs/home/yjin0055/Project/DayAheadForecast
#     os.chdir("/ibm/gpfs/home/yjin0055/Project/DayAheadForecast")
#     corlat_presolved_dataset = pkl.load(open("Data/corlat_presolved/corlat_presolved.pickle", "rb"))

In [49]:
# print the keys of the dataset
print("keys: ", corlat_presolved_dataset[0].keys())


keys:  dict_keys(['var_node_features', 'constraint_node_features', 'solution', 'indices', 'A', 'current_instance_weight'])


In [50]:
print("Var node features shape: ", corlat_presolved_dataset[0]["var_node_features"].shape)
print("Constraint node features shape: ", corlat_presolved_dataset[0]["constraint_node_features"].shape)

Var node features shape:  (411, 17)
Constraint node features shape:  (407, 9)


## Make dataframe for `var_node_features` and converting dtypes of each feature

In [51]:
# names of the variable features
# 1. Variable objective coefficient
# 2. Variable type
# 3. Number of non-zero coefficients in the constraint
# 4. LP relaxation value at root node
# 5. Is LP relaxation value fractional
# 6. LP solution value equals lower bound
# 7. LP solution value equals upper bound
# 8. Has lower bound
# 9. Has upper bound
# 10. Mean degree of the constraint nodes connected to the variable
# 11. Std. deviation of the degree of the constraint nodes connected to the variable
# 12. Min. degree of the constraint nodes connected to the variable
# 13. Max. degree of the constraint nodes connected to the variable
# 14. Mean coefficient of the constraint nodes connected to the variable
# 15. Std. deviation of the coefficient of the constraint nodes connected to the variable
# 16. Min. coefficient of the constraint nodes connected to the variable
# 17. Max. coefficient of the constraint nodes connected to the variable

In [52]:
# print feature type for variable node
for i in range(corlat_presolved_dataset[0]["var_node_features"].shape[1]):
    print("feature", i, ":", type(corlat_presolved_dataset[0]["var_node_features"][0][i]))

feature 0 : <class 'numpy.str_'>
feature 1 : <class 'numpy.str_'>
feature 2 : <class 'numpy.str_'>
feature 3 : <class 'numpy.str_'>
feature 4 : <class 'numpy.str_'>
feature 5 : <class 'numpy.str_'>
feature 6 : <class 'numpy.str_'>
feature 7 : <class 'numpy.str_'>
feature 8 : <class 'numpy.str_'>
feature 9 : <class 'numpy.str_'>
feature 10 : <class 'numpy.str_'>
feature 11 : <class 'numpy.str_'>
feature 12 : <class 'numpy.str_'>
feature 13 : <class 'numpy.str_'>
feature 14 : <class 'numpy.str_'>
feature 15 : <class 'numpy.str_'>
feature 16 : <class 'numpy.str_'>


In [53]:
# print one row of the variable node features
print(corlat_presolved_dataset[0]["var_node_features"][0])

['8.0' 'B' '6' '1.0' '0.0' '0.0' '1.0' '1.0' '1.0' '32.5'
 '41.419600835031396' '2.0' '93.0' '-49.833333333333336'
 '50.17109614996357' '-100.0' '1.0']


In [54]:
# make dataframe for variable node features
for i in range(len(corlat_presolved_dataset)):
    corlat_presolved_dataset[i]["var_node_features"] = pd.DataFrame(
        corlat_presolved_dataset[i]["var_node_features"]
    )
    corlat_presolved_dataset[i]["var_node_features"].columns = [
        "var_obj_coef",
        "var_type",
        "num_nonzero_coef",
        "lp_relax_val",
        "is_lp_relax_val_frac",
        "lp_sol_val_eq_lb",
        "lp_sol_val_eq_ub",
        "has_lb",
        "has_ub",
        "mean_degree",
        "std_degree",
        "min_degree",
        "max_degree",
        "mean_coef",
        "std_coef",
        "min_coef",
        "max_coef",
    ]

In [55]:
corlat_presolved_dataset[0]["var_node_features"]["var_type"].value_counts()

C    322
B     89
Name: var_type, dtype: int64

In [56]:
# print head of the variable node features
corlat_presolved_dataset[0]["var_node_features"].head()

Unnamed: 0,var_obj_coef,var_type,num_nonzero_coef,lp_relax_val,is_lp_relax_val_frac,lp_sol_val_eq_lb,lp_sol_val_eq_ub,has_lb,has_ub,mean_degree,std_degree,min_degree,max_degree,mean_coef,std_coef,min_coef,max_coef
0,8.0,B,6,1.0,0.0,0.0,1.0,1.0,1.0,32.5,41.4196008350314,2.0,93.0,-49.833333333333336,50.17109614996357,-100.0,1.0
1,8.0,B,6,-0.0,0.0,1.0,0.0,1.0,1.0,32.5,41.4196008350314,2.0,93.0,-49.0,51.04246598013593,-100.0,6.0
2,8.0,B,6,1.0,0.0,0.0,1.0,1.0,1.0,32.5,41.4196008350314,2.0,93.0,-49.5,50.51319959508934,-100.0,3.0
3,3.0,B,6,-0.0,0.0,1.0,0.0,1.0,1.0,32.5,41.4196008350314,2.0,93.0,-48.333333333333336,51.777300903860265,-100.0,10.0
4,7.0,B,6,-0.0,0.0,1.0,0.0,1.0,1.0,32.5,41.4196008350314,2.0,93.0,-48.66666666666666,51.40579301553041,-100.0,8.0


In [57]:
# convert column types
# dtype for variable node features
# 1. float
# 2. str
# 3. float
# 4. float
# 5. bool
# 6. bool
# 7. bool
# 8. bool
# 9. bool
# 10. float
# 11. float
# 12. float
# 13. float
# 14. float
# 15. float
# 16. float
# 17. float
var_column_types = {
    "var_obj_coef": float,
    "var_type": str,
    "num_nonzero_coef": float,
    "lp_relax_val": float,
    "is_lp_relax_val_frac": bool,
    "lp_sol_val_eq_lb": bool,
    "lp_sol_val_eq_ub": bool,
    "has_lb": bool,
    "has_ub": bool,
    "mean_degree": float,
    "std_degree": float,
    "min_degree": float,
    "max_degree": float,
    "mean_coef": float,
    "std_coef": float,
    "min_coef": float,
    "max_coef": float,
}
for i in range(len(corlat_presolved_dataset)):
    corlat_presolved_dataset[i]["var_node_features"] = corlat_presolved_dataset[i]["var_node_features"].astype(var_column_types)

In [58]:
# head of the variable node features
corlat_presolved_dataset[0]["var_node_features"].head()

Unnamed: 0,var_obj_coef,var_type,num_nonzero_coef,lp_relax_val,is_lp_relax_val_frac,lp_sol_val_eq_lb,lp_sol_val_eq_ub,has_lb,has_ub,mean_degree,std_degree,min_degree,max_degree,mean_coef,std_coef,min_coef,max_coef
0,8.0,B,6.0,1.0,True,True,True,True,True,32.5,41.419601,2.0,93.0,-49.833333,50.171096,-100.0,1.0
1,8.0,B,6.0,-0.0,True,True,True,True,True,32.5,41.419601,2.0,93.0,-49.0,51.042466,-100.0,6.0
2,8.0,B,6.0,1.0,True,True,True,True,True,32.5,41.419601,2.0,93.0,-49.5,50.5132,-100.0,3.0
3,3.0,B,6.0,-0.0,True,True,True,True,True,32.5,41.419601,2.0,93.0,-48.333333,51.777301,-100.0,10.0
4,7.0,B,6.0,-0.0,True,True,True,True,True,32.5,41.419601,2.0,93.0,-48.666667,51.405793,-100.0,8.0


In [59]:
corlat_presolved_dataset[7]["var_node_features"].var_type.value_counts()

C    118
B     37
I      1
Name: var_type, dtype: int64

## One-hot encoding of categorical `var_node_features`

In [60]:
# now one hot encode var_type column
# ('C' for continuous, 'B' for binary, 'I' for integer, 'S' for semi-continuous, or 'N' for semi-integer).
# Therefore we result in a total of 5 columns for var_type after one hot encoding
# Specify all possible categories
categories = ['C', 'B', 'I', 'S', 'N']

# Ensure every category is always accounted for
for i in range(len(corlat_presolved_dataset)):
    corlat_presolved_dataset[i]["var_node_features"]["var_type"] = pd.Categorical(corlat_presolved_dataset[i]["var_node_features"]["var_type"], categories=categories)
    dummies = pd.get_dummies(corlat_presolved_dataset[i]["var_node_features"]["var_type"], prefix="var_type", prefix_sep="_")
    
    # simple unit test to make sure that the one hot encoding converts the var_type column to 5 columns
    print("dummies shape: ", dummies.shape)
    # assert dummies.shape[1] == 2
    
    # add the one hot encoded columns to the dataframe
    corlat_presolved_dataset[i]["var_node_features"] = pd.concat(
        [corlat_presolved_dataset[i]["var_node_features"], dummies], axis=1
    )
    
    # drop the original var_type column
    corlat_presolved_dataset[i]["var_node_features"] = corlat_presolved_dataset[i][
        "var_node_features"
    ].drop("var_type", axis=1) 

dummies shape:  (411, 5)
dummies shape:  (457, 5)
dummies shape:  (457, 5)
dummies shape:  (146, 5)
dummies shape:  (457, 5)
dummies shape:  (442, 5)
dummies shape:  (457, 5)
dummies shape:  (156, 5)
dummies shape:  (402, 5)
dummies shape:  (457, 5)
dummies shape:  (457, 5)
dummies shape:  (213, 5)
dummies shape:  (457, 5)
dummies shape:  (453, 5)
dummies shape:  (453, 5)
dummies shape:  (398, 5)
dummies shape:  (457, 5)
dummies shape:  (457, 5)
dummies shape:  (102, 5)
dummies shape:  (457, 5)
dummies shape:  (429, 5)
dummies shape:  (457, 5)
dummies shape:  (389, 5)
dummies shape:  (457, 5)
dummies shape:  (457, 5)
dummies shape:  (457, 5)
dummies shape:  (457, 5)
dummies shape:  (457, 5)
dummies shape:  (100, 5)
dummies shape:  (385, 5)
dummies shape:  (457, 5)
dummies shape:  (457, 5)
dummies shape:  (457, 5)
dummies shape:  (412, 5)
dummies shape:  (457, 5)
dummies shape:  (174, 5)
dummies shape:  (457, 5)
dummies shape:  (453, 5)
dummies shape:  (453, 5)
dummies shape:  (185, 5)


In [61]:
corlat_presolved_dataset[0]["var_node_features"].head()

Unnamed: 0,var_obj_coef,num_nonzero_coef,lp_relax_val,is_lp_relax_val_frac,lp_sol_val_eq_lb,lp_sol_val_eq_ub,has_lb,has_ub,mean_degree,std_degree,...,max_degree,mean_coef,std_coef,min_coef,max_coef,var_type_C,var_type_B,var_type_I,var_type_S,var_type_N
0,8.0,6.0,1.0,True,True,True,True,True,32.5,41.419601,...,93.0,-49.833333,50.171096,-100.0,1.0,0,1,0,0,0
1,8.0,6.0,-0.0,True,True,True,True,True,32.5,41.419601,...,93.0,-49.0,51.042466,-100.0,6.0,0,1,0,0,0
2,8.0,6.0,1.0,True,True,True,True,True,32.5,41.419601,...,93.0,-49.5,50.5132,-100.0,3.0,0,1,0,0,0
3,3.0,6.0,-0.0,True,True,True,True,True,32.5,41.419601,...,93.0,-48.333333,51.777301,-100.0,10.0,0,1,0,0,0
4,7.0,6.0,-0.0,True,True,True,True,True,32.5,41.419601,...,93.0,-48.666667,51.405793,-100.0,8.0,0,1,0,0,0


## Make dataframe for `constraint_node_features` and converting dtypes of each feature

In [62]:
# get feature type for constraint_node_features
for i in range(corlat_presolved_dataset[0]["constraint_node_features"].shape[1]):
    print("feature", i, ":", type(corlat_presolved_dataset[0]["constraint_node_features"][0][i]))

feature 0 : <class 'numpy.str_'>
feature 1 : <class 'numpy.str_'>
feature 2 : <class 'numpy.str_'>
feature 3 : <class 'numpy.str_'>
feature 4 : <class 'numpy.str_'>
feature 5 : <class 'numpy.str_'>
feature 6 : <class 'numpy.str_'>
feature 7 : <class 'numpy.str_'>
feature 8 : <class 'numpy.str_'>


In [63]:
# print one row of the variable constraint features
print(corlat_presolved_dataset[0]["constraint_node_features"][0])

['<' '72.0' '89' '0.7481996337087321' '4.966292134831461'
 '2.927780300447926' '1.0' '10.0' '442.0']


In [64]:
# names of the constraint node features
# 1. Constraint type
# 2. RHS value
# 3. Number of non-zero coefficients in the constraint
# 4. Cosine similarity with the objective function
# 5. Mean of coefficients of the variables connected to the constraint
# 6. Std. deviation of coefficients of the variables connected to the constraint
# 7. Min. coefficient of the variables connected to the constraint
# 8. Max. coefficient of the variables connected to the constraint
# 9. Sum of norm of absolute values of coefficients of the variable nodes a constraint node is connected to

In [65]:
# make dataframe for constraint node features
for i in range(len(corlat_presolved_dataset)):
    corlat_presolved_dataset[i]["constraint_node_features"] = pd.DataFrame(
        corlat_presolved_dataset[i]["constraint_node_features"]
    )
    corlat_presolved_dataset[i]["constraint_node_features"].columns = [
        "constraint_type",
        "rhs",
        "num_nonzero_coef",
        "cos_sim_obj_func",
        "mean_coef",
        "std_coef",
        "min_coef",
        "max_coef",
        "sum_norm_abs_coef",
    ]

In [66]:
# convert column types
# dtype for variable node features
# 1. str
# 2. float
# 3. float
# 4. float
# 5. float
# 6. float
# 7. float
# 8. float
# 9. float
constraint_column_types = {
    "constraint_type": str,
    "rhs": float,
    "num_nonzero_coef": float,
    "cos_sim_obj_func": float,
    "mean_coef": float,
    "std_coef": float,
    "min_coef": float,
    "max_coef": float,
    "sum_norm_abs_coef": float,
}

for i in range(len(corlat_presolved_dataset)):
    corlat_presolved_dataset[i]["constraint_node_features"] = corlat_presolved_dataset[i]["constraint_node_features"].astype(constraint_column_types)

## One-hot encoding of categorical `constraint_node_features`

In [67]:
# get number of unique constraint types throughout the dataset
constraint_types = set()
for i in range(len(corlat_presolved_dataset)):
    constraint_types.update(corlat_presolved_dataset[i]["constraint_node_features"]["constraint_type"].unique())

# print the unique constraint types
print(constraint_types)

{'=', '<'}


In [68]:
# now one hot encode constraint_type column
constraint_categories = ['<', '>', '=']
for i in range(len(corlat_presolved_dataset)):
    corlat_presolved_dataset[i]["constraint_node_features"]["constraint_type"] = pd.Categorical(corlat_presolved_dataset[i]["constraint_node_features"]["constraint_type"], categories=constraint_categories)
    dummies = pd.get_dummies(corlat_presolved_dataset[i]["constraint_node_features"]["constraint_type"], prefix="constraint_type", prefix_sep="_")
        
    # add the one hot encoded columns to the dataframe
    corlat_presolved_dataset[i]["constraint_node_features"] = pd.concat(
        [corlat_presolved_dataset[i]["constraint_node_features"], dummies], axis=1
    )
    
    # drop the original constraint_type column
    corlat_presolved_dataset[i]["constraint_node_features"] = corlat_presolved_dataset[i][
        "constraint_node_features"
    ].drop("constraint_type", axis=1)


In [69]:
# get head of constraint node features
corlat_presolved_dataset[0]["constraint_node_features"].head()

Unnamed: 0,rhs,num_nonzero_coef,cos_sim_obj_func,mean_coef,std_coef,min_coef,max_coef,sum_norm_abs_coef,constraint_type_<,constraint_type_>,constraint_type_=
0,72.0,89.0,0.7482,4.966292,2.92778,1.0,10.0,442.0,1,0,0
1,0.0,2.0,-0.135919,-49.5,50.5,-100.0,1.0,101.0,1,0,0
2,0.0,2.0,-0.135919,-49.5,50.5,-100.0,1.0,101.0,1,0,0
3,0.0,2.0,-0.135919,-49.5,50.5,-100.0,1.0,101.0,1,0,0
4,0.0,2.0,-0.135919,-49.5,50.5,-100.0,1.0,101.0,1,0,0


# Check for duplicates and drop them, for output solutions

In [70]:
def check_duplicates(arr, indices=None, drop=True):
    """
    This function takes in a list of lists and returns True if there are any duplicates, False otherwise.
    If drop=True, it also returns a new list of lists with duplicates removed.
    """
    
    # if arr is list of lists, convert to numpy array
    if isinstance(arr, list):
        arr = np.array(arr)
    
    if indices is not None:
        indexed_arr = arr[:, indices]
    
    else:
        indexed_arr = arr
    
    pairwise_comp = np.all(indexed_arr[:, np.newaxis, :] == indexed_arr[np.newaxis, :, :], axis=-1)
    duplicates = np.where(np.triu(pairwise_comp, k=1))
    if duplicates[0].size > 0:
        if drop:
            arr_unique = np.delete(arr, duplicates[0], axis=0)
            return True, arr_unique.tolist()
        else:
            return True
    else:
        if drop:
            return False, arr.tolist()
        else:
            return False

In [71]:
# for each solution, check if there are any duplicate solutions
# if there are, drop the duplicates
# first get the indices for binary variables

for i in range(len(corlat_presolved_dataset)):
    # get indices for binary variables
    binary_indices = corlat_presolved_dataset[i]["indices"]["indices"]
    
    # convert dictionary of solutions to array of arrays
    if isinstance(corlat_presolved_dataset[i]["solution"], dict):
        corlat_presolved_dataset[i]["solution"] = np.array(list(corlat_presolved_dataset[i]["solution"].values()))
    
    # check for duplicates
    has_duplicates, unique_solutions = check_duplicates(corlat_presolved_dataset[i]["solution"], indices=binary_indices)
    
    # if there are duplicates, drop them
    if has_duplicates:
        corlat_presolved_dataset[i]["solution"] = np.array(unique_solutions)[:, binary_indices]
    else:
        corlat_presolved_dataset[i]["solution"] = corlat_presolved_dataset[i]["solution"][:, binary_indices]

In [72]:
corlat_presolved_dataset[0]["solution"][:, binary_indices].shape

(100, 100)

In [73]:
# save the dataset as corlat_presolved_preprocessed.pickle
with open("Data/corlat_presolved/processed_data/corlat_presolved_preprocessed.pickle", "wb") as f:
    pkl.dump(corlat_presolved_dataset, f)

In [74]:
# load the preprocessed dataset
with open("Data/corlat_presolved/processed_data/corlat_presolved_preprocessed.pickle", "rb") as f:
    preprocessed_corlat_dataset = pkl.load(f)


In [75]:
preprocessed_corlat_dataset[1050]["var_node_features"].values.ravel()[::18]

array([4.0, 0, 5.0, -49.166666666666664, 45.18634257776962, True, True,
       1.0, 0, 8.0, -38.6, 45.18634257776962, True, True, 10.0, 0, 8.0,
       -56.142857142857146, 43.26567192760259, True, True, 8.0, 0, 4.0,
       -56.285714285714285, 43.26567192760259, True, True, 4.0, 0, 6.0,
       -55.857142857142854, 43.26567192760259, True, True, 4.0, 0, 8.0,
       -55.857142857142854, 43.26567192760259, True, True, 5.0, 0, 3.0,
       -49.333333333333336, 43.26567192760259, True, True, 2.0, 0, 9.0,
       -56.285714285714285, 43.26567192760259, True, True, 7.0, 0, 8.0,
       -56.42857142857143, 43.26567192760259, True, True, 1.0, 0, 9.0,
       -55.714285714285715, 45.18634257776962, True, True, 3.0, 0, 4.0,
       -56.857142857142854, 43.26567192760259, True, True, 10.0, 0, 3.0,
       -49.0, 43.26567192760259, True, True, 5.0, 0, 6.0,
       -56.285714285714285, 43.26567192760259, True, True, 8.0, 0, 4.0,
       -56.142857142857146, 43.26567192760259, True, True, 9.0, 0, 4.0,
      

In [76]:
# for all solutions print shape
for i in range(len(preprocessed_corlat_dataset)):
    print(preprocessed_corlat_dataset[i]["solution"].shape)

(100, 100)
(100, 100)
(100, 100)
(44, 100)
(100, 100)
(100, 100)
(100, 100)
(23, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(11, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(14, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(9, 100)
(100, 100)
(100, 100)
(88, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(22, 100)
(100, 100)
(100, 100)
(2, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(3, 100)
(100, 100)
(100, 100)
(100, 100)
(23, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(100, 100)
(33, 100)
(100, 100)
(100, 100)
(100, 100)
(3, 100)
(2, 100)
(100, 100)
(100, 100)
(100, 

In [77]:
# for all solutions print length of dictionary
for i in range(len(corlat_presolved_dataset)):
    print(len(corlat_presolved_dataset[i]["solution"]))

100
100
100
44
100
100
100
23
100
100
100
100
100
100
100
100
100
100
11
100
100
100
100
100
100
100
100
100
14
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
9
100
100
88
100
100
100
100
100
100
100
100
100
22
100
100
2
100
100
100
100
100
100
3
100
100
100
23
100
100
100
100
100
100
100
100
100
100
100
33
100
100
100
3
2
100
100
100
100
100
100
100
100
2
100
100
28
39
100
100
100
100
100
100
100
100
24
100
100
100
100
100
100
100
100
3
100
100
2
100
100
100
100
100
100
100
2
100
100
100
100
100
100
10
100
100
28
100
100
100
100
100
100
100
100
100
12
100
100
100
100
16
100
100
100
100
3
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
23
100
100
100
100
35
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
2
100
100
100
100
100
100
2
100
9
100
100
100
100
100
100
2
100
100
100
100
100
100
100
100
100
100
4
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
3
100
2
100
100
100
100
100
100
100
100