In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import subprocess
import os
import sys
sys.path.append('/usr/workspace/vanover1/approx-llvm/approx')
from approx_modules import approx

from sklearn.ensemble import ExtraTreesRegressor

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from ._gradient_boosting import predict_stages
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from ._gradient_boosting import predict_stages


In [2]:
# need to use /usr/tce/bin/git binary because the default git version grabbed by this subprocess is too early for the --show-superproject-working-tree flag
REPO_ROOT = subprocess.check_output("/usr/tce/bin/git rev-parse --show-superproject-working-tree --show-toplevel | head -1", shell=True).strip().decode()
N_KERNELS = 10
KERNEL_NAMES = [
        'HPCCG::Ap_0',
        'HPCCG::alpha_0',
        'HPCCG::alpha_1',
        'HPCCG::beta_0',
        'HPCCG::normr_0',
        'HPCCG::normr_1',
        'HPCCG::r_0',
        'HPCCG::x_0',
        'main::x_0',
        'main::b_0'
    ]

In [3]:
def run_HPCCG(nx, ny, nz):
    
    # compile and run HPCCG instrumented with HPAC directives
    subprocess.check_call(f"rm -f test.h5 && source {REPO_ROOT}/scripts/activate_env.sh && make && ./test_HPCCG {nx} {nx} {nz}", shell=True)

    # open database
    approxDataProfile = approx.approxApplication("./test.h5")
    
    # get output
    Y = np.mean(approxDataProfile.getApplicationOutput()['solution'])
    
    # get aggregated kernel outputs
    kernel_outs = []
    for kernel_name in KERNEL_NAMES:
        src_name = kernel_name.split("::")[0]
        var_name = kernel_name.split("::")[-1]

        for region_name in approxDataProfile.getRegionNames():
            if region_name.startswith(src_name) and var_name in [region_name.split("::")[-1].split(",")][0]:

                if region_name == "HPCCG::beta_0":
                    kernel_outs.append(approxDataProfile[region_name].X().mean())
                elif region_name == "main::x_0,b_0":
                    splitpoint = approxDataProfile[region_name].Y().shape[1]//2
                    if var_name == 'x_0':
                        kernel_outs.append(approxDataProfile[region_name].Y()[:,:splitpoint].mean())
                    elif var_name == 'b_0':
                        kernel_outs.append(approxDataProfile[region_name].Y()[:,splitpoint:].mean())
                else:
                    kernel_outs.append(approxDataProfile[region_name].Y().mean())
                    
    return kernel_outs, Y

In [4]:
n_sample = 2**10

In [5]:
nx = 20
ny = 30
nz = 160

try:
    with open(f"XX_{n_sample}n.npy", "rb") as f:
        kernel_outs = np.load(f)
    with open(f"YY_{n_sample}n.npy", "rb") as f:
        YY = np.load(f)
        
except FileNotFoundError:

    # run HPCCG
    kernel_outs = []
    YY = []
    for n in range(n_sample):

        temp = run_HPCCG(nx, ny, nz)
        kernel_outs.append(temp[0])
        YY.append(temp[1])

    kernel_outs = np.array(kernel_outs)
    YY = np.array(YY)
    
    with open(f"XX_{n_sample}n.npy", "wb") as f:
        np.save(f, kernel_outs)
    with open(f"YY_{n_sample}n.npy", "wb") as f:
        np.save(f, YY)

In [6]:
# train/test split
split_proportion = 0.8
split_point = int(n_sample * split_proportion)
XX_train, XX_test = kernel_outs[:split_point], kernel_outs[split_point:]
YY_train, YY_test = YY[:split_point], YY[split_point:]

print(f"Training with {split_point} samples.")

# analyze kernel output sensitivity
et = ExtraTreesRegressor(n_estimators=100,
                                    criterion="mse",
                                    # max_features=int(round(XX.shape[1] / 3)),
                                    max_depth=8,
                                    min_samples_split=2,
                                    min_samples_leaf=max(1, int(round(np.sqrt(XX_train.shape[0]) / np.sqrt(1000)))),
                                    min_weight_fraction_leaf=0,
                                    max_leaf_nodes=None,
                                    #bootstrap=True,
                                    #oob_score=True,
                                    random_state=1)

et.fit(XX_train, YY_train)
print(f"Score: {et.score(XX_test, YY_test)}")

Si = [(KERNEL_NAMES[k_no], et.feature_importances_[k_no]) for k_no in range(N_KERNELS)]
Si.sort(key = lambda x : x[1])
print(f"Sensitivities: {Si}")

Training with 819 samples.
Score: 0.9685078068257224
Sensitivities: [('HPCCG::alpha_0', 0.0), ('HPCCG::alpha_1', 0.0), ('HPCCG::beta_0', 0.0), ('HPCCG::normr_0', 0.0), ('HPCCG::r_0', 0.004583377093853409), ('HPCCG::Ap_0', 0.01683102053491955), ('main::b_0', 0.1087015570675255), ('HPCCG::normr_1', 0.18200707381872472), ('HPCCG::x_0', 0.31393760050084096), ('main::x_0', 0.3739393709841358)]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int)


## Compare to:
```
=== BEGIN ADAPT REPORT ===
28704396 total independent/intermediate variables
1 dependent variables
Mixed-precision recommendation:
  Replace variable normr:HPCCG.cpp:125      max error introduced: 0.00000e+00  count: 99          
  Replace variable normr:HPCCG.cpp:105      max error introduced: 0.00000e+00  count: 1           
  Replace variable x:main.cpp:181           max error introduced: 0.00000e+00  count: 96000       
  Replace variable b:main.cpp:182           max error introduced: 0.00000e+00  count: 96000       
  DO NOT replace   beta:HPCCG.cpp:120       max error introduced: 6.35086e-21  count: 98          
  DO NOT replace   alpha:HPCCG.cpp:138      max error introduced: 3.59334e-20  count: 99          
  DO NOT replace   alpha:HPCCG.cpp:137      max error introduced: 5.61582e-20  count: 99          
  DO NOT replace   r:HPCCG.cpp:142          max error introduced: 2.05151e-08  count: 9504000     
  DO NOT replace   Ap:HPCCG.cpp:135         max error introduced: 4.20565e-08  count: 9504000     
  DO NOT replace   x:HPCCG.cpp:140          max error introduced: 1.85488e-07  count: 9504000     
=== END ADAPT REPORT ===
```

In [7]:
# prepare data for shaff analysis
rows = []
for n in range(n_sample):
    row = {}
    for i, kernel_name in enumerate(KERNEL_NAMES):
        row[kernel_name] = kernel_outs[n][i]
    row["Y"] = YY[n]
    rows.append(row)
df = pd.DataFrame.from_dict(rows)
df = df[["Y"] + KERNEL_NAMES]
df.to_csv(f"df_{split_point}n.csv", index=False)

### From 5 SHAFF analysis runs with 204 samples:
```
 [1] "HPCCG..Ap_0"    "HPCCG..alpha_0" "HPCCG..alpha_1" "HPCCG..beta_0" 
 [5] "HPCCG..normr_0" "HPCCG..normr_1" "HPCCG..r_0"     "HPCCG..x_0"    
 [9] "main..x_0"      "main..b_0"     
 
 [1] 0.007311581 0.057866433 0.000000000 0.000000000 0.019603875 0.121547643
 [7] 0.000000000 0.315991220 0.301300586 0.163664125
 
 [1] 0.041131600 0.020491647 0.000000000 0.001783308 0.000000000 0.145431151
 [7] 0.034138667 0.301747721 0.326754565 0.115214073
 
 [1] 0.01231547 0.07257845 0.00000000 0.00000000 0.04976910 0.12349858
 [7] 0.01560563 0.31341909 0.24297095 0.15681582
 
 [1] 0.04108724 0.01305161 0.00000000 0.03102680 0.00745509 0.17842195
 [7] 0.03612008 0.23356233 0.29577123 0.15106809
 
 [1] 0.0313003424 0.0227614063 0.0338591616 0.0008722084 0.0584686394
 [6] 0.1197180796 0.0309788151 0.2998767624 0.2731233639 0.1160754408
```

Toss up between `HPCCG::x_0` and `main::x_0` followed by `main::b_0` and `HPCCG::normr_1`