In [1]:
## Load modules

from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import VarianceThreshold

import pandas as pd
import h5py
import numpy as np
import time

from tools import add_interactions  # custom tools defined in this directory
from tools import save_model
from math import isclose

In [13]:
data = pd.read_csv("data/WS_data/preprocessing_helpers/WS_data.csv")
np.random.seed(10)
data= data.iloc[np.random.permutation(len(data))]

In [21]:
X = data[data.keys()[[1,2,3,6]].union(data.keys()[10:])].drop(columns=["node_cat", "proper leaf"])
y = data["snbs"]

In [22]:
isclose(len(X) * 0.85 % 1, 0, abs_tol=10e-12)

False

In [23]:
train_fraction = 0.7
X_test = X.iloc[int(len(X) * 0.85):]
y_test = y.iloc[int(len(y) * 0.85):]
X_train = X.iloc[:int(len(X) * train_fraction)]
y_train = y[:int(len(y) * train_fraction)]

  y_train = y[:int(len(y) * train_fraction)]


In [24]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import OrthogonalMatchingPursuit

In [61]:
model = make_pipeline(StandardScaler(), 
    VarianceThreshold(),  
    HistGradientBoostingRegressor(
        max_iter=1750,
        loss="poisson",
        early_stopping=False,
        learning_rate=0.09,
        random_state=1))


In [26]:
model = make_pipeline(StandardScaler(), 
    VarianceThreshold(),  
    LinearRegression())

In [62]:
model.fit(X_train, y_train)

In [63]:
train_r2 = r2_score(y_train, model.predict(X_train))
train_r2

0.9152524609875418

In [64]:
r2_score(y_test, model.predict(X_test))

0.7069339580142777

# Dataset v2


In [None]:
with h5py.File("data/n128/data/ml_input_grid_data_hetero.h5", "r") as f:
    f.create_dataset("X_train", data=X_train)
    f.create_dataset("y_train", data=y_train)
    f.create_dataset("X_test", data=X_test)
    f.create_dataset("y_test", data=y_test)
data = pd.read_csv("data/n128/data/ml_input_grid_data_hetero.h5")

In [68]:
f = h5py.File("data/n128/data/ml_input_grid_data_hetero.h5", "r")

In [77]:
np.array(f.get("grids/1/node_features_group/normalForm/features"))

array([[ 2.21018933e+00, -8.40978776e-01,  2.35205888e+00,
         1.23934760e+00,  3.19985139e+00, -2.11736794e+00,
        -1.40965597e+00,  3.45405602e+00, -3.05196313e+00,
         1.95911899e+00, -2.82282378e+00, -2.92695262e+00,
         2.88221929e+00, -6.75261449e-01, -1.98016408e+00,
        -2.11494787e+00, -2.25238370e+00,  4.33767253e+00,
        -1.13043468e+00, -1.82434102e+00, -2.88317742e+00,
         2.95524435e+00, -2.04597868e+00,  3.50119006e+00,
         1.91684462e+00, -1.75330066e+00,  2.81422772e+00,
         2.16254966e+00, -2.08359987e+00, -5.75702591e-01,
         3.38674423e+00, -1.86382492e+00,  2.23594585e+00,
        -1.40705547e+00,  1.01948939e+00,  3.75866860e+00,
        -8.57343957e-01, -2.95656293e+00, -2.55887695e+00,
        -4.22866156e+00,  2.78897724e+00,  2.46290237e+00,
         3.08847101e-01,  1.29355776e+00, -5.63702824e-01,
         9.87247732e-01, -3.13813245e-01, -3.42613683e+00,
         1.48714893e+00, -2.19836767e+00, -1.55152744e+0

In [146]:
f.get("grids/1/node_features_group/load/features")#.keys()

In [144]:
np.array(f["grids/1/line_features_group/1/edge_attr"]).shape

(5, 192)

In [141]:
np.array(f["grids/1/line_features_group/2/edge_index"])

array([[  1,   2],
       [  2,   1],
       [  1,   3],
       [  3,   1],
       [  3,   4],
       [  4,   3],
       [  3,   5],
       [  5,   3],
       [  3,  12],
       [ 12,   3],
       [  4,   6],
       [  6,   4],
       [  4,  13],
       [ 13,   4],
       [  4,  14],
       [ 14,   4],
       [  4,  18],
       [ 18,   4],
       [  4,  61],
       [ 61,   4],
       [  4, 110],
       [110,   4],
       [  5,  10],
       [ 10,   5],
       [  6,  76],
       [ 76,   6],
       [  8,   9],
       [  9,   8],
       [  8,  11],
       [ 11,   8],
       [  8,  28],
       [ 28,   8],
       [  9,  11],
       [ 11,   9],
       [  9, 127],
       [127,   9],
       [ 10,  52],
       [ 52,  10],
       [ 12,  40],
       [ 40,  12],
       [ 13,  14],
       [ 14,  13],
       [ 13,  41],
       [ 41,  13],
       [ 14,  46],
       [ 46,  14],
       [ 15,  16],
       [ 16,  15],
       [ 15,  51],
       [ 51,  15],
       [ 16,  70],
       [ 70,  16],
       [ 17,

In [85]:
f["grids/1/node_features_group"].keys()

<KeysViewHDF5 ['SlackAlgebraic', 'load', 'normalForm']>

In [96]:
f = h5py.File("data/n128/data/grids/grid_00001_vertexpos.h5", "r")

In [97]:
f.keys()

<KeysViewHDF5 ['vertexpos']>

In [99]:
np.array(f.get("vertexpos"))

array([[-0.35645227, -0.33880049, -0.61809404,  0.74169797, -0.75704659,
         0.73837364, -0.36629057,  0.75320539,  0.61952139,  0.25554202,
        -0.80102946, -0.39729654,  0.16468708,  0.67410781,  0.83691853,
        -0.0425468 ,  0.29083475,  0.84526295, -0.22301171, -0.0682922 ,
         0.63005799,  0.90089005, -0.54119894, -0.90145306, -0.31153162,
        -0.67671327, -0.95949586, -0.84530899,  0.6511806 ,  0.1140642 ,
         0.74891452,  0.20114837, -0.01564218, -0.44902919, -0.30510266,
        -0.63785062, -0.22846382,  0.98866145,  0.23397264, -0.95701104,
        -0.30650703, -0.88240746, -0.7200148 , -0.34887757,  0.41991327,
         0.72232208,  0.98969828, -0.45106687,  0.43884802, -0.56529771,
         0.67107037,  0.07988417, -0.92203264,  0.80420674, -0.10535536,
        -0.09847178,  0.2269991 , -0.62117109, -0.14534255, -0.14378347,
         0.74388277, -0.12008027, -0.10266699,  0.66126893, -0.52615077,
        -0.35171842,  0.67069426, -0.272305  , -0.9