In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import IPython
import os
import time
import re
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from pearlsim.ml_utilities import *

# Extract data from Serpent detector files
We can use the read_det_file function from ml_utilities to parse all of the detector files we have in the training data directory to create one pair of unified features/target dataframes.

In [7]:
# Parse every Serpent detector file in the training-data directory and merge
# the results into one features frame and one targets frame.
#
# os.listdir() returns entries in arbitrary, platform-dependent order, so the
# names are sorted: the row order of the merged frames (and therefore any
# seeded, position-based train/test split) is then reproducible between runs.
file_names = sorted(os.listdir("training_data/"))
feature_frames = []
target_frames = []
for name in file_names:
    # Detector outputs are selected by a plain substring match on the name.
    if "det" in name:
        # read_det_file comes from pearlsim.ml_utilities; presumably it returns
        # two DataFrames plus a scalar uncertainty -- TODO confirm.
        features, targets, avg_uncertainty = read_det_file("training_data/"+name)
        print(f"File {name} has an average {round(avg_uncertainty*100,2)}% uncertainty.")
        feature_frames.append(features)
        target_frames.append(targets)
# Concatenate once at the end instead of growing a DataFrame inside the loop,
# which re-copies all accumulated rows on every iteration. Fall back to an
# empty frame when no detector file was found (pd.concat rejects an empty list).
all_features = pd.concat(feature_frames) if feature_frames else pd.DataFrame([])
all_targets = pd.concat(target_frames) if target_frames else pd.DataFrame([])
all_features.to_csv("training_data/current_data.csv")
all_targets.to_csv("training_data/current_target.csv")


File gFHR_core_1.serpent_det1.m has an average 44.83% uncertainty.


Our features include the radius and height of the pebble-based detector. They also include every bin of the core flux map. The gFHR model is divided into 12 energy groups and 4 radial divisions, each of which is divided into 10 separate axial zones, giving 12 × 4 × 10 = 480 flux bins. Normally you would need to volume-weight these flux values and deal with the somewhat complicated indexing scheme to sort out which bin is which. We don't need to bother here, since every column is going to be standardized and the model does not care where each bin is located.

In [69]:
# Preview the first ten rows of the combined feature table.
all_features.head(10)

Unnamed: 0,radius,height,bin1,bin2,bin3,bin4,bin5,bin6,bin7,bin8,...,bin471,bin472,bin473,bin474,bin475,bin476,bin477,bin478,bin479,bin480
0,81.0486,178.114,0.0,0.0,0.0,0.0,4.84288e+17,1.32137e+18,1.82217e+18,1.95679e+18,...,4.67409e+18,4.88491e+18,5.61325e+17,1.50833e+18,1.99423e+18,1.85061e+18,3740820000000000.0,1.84576e+16,2.8168e+16,1.05469e+16
1,57.7707,151.196,0.0,0.0,0.0,0.0,4.84288e+17,1.32137e+18,1.82217e+18,1.95679e+18,...,4.67409e+18,4.88491e+18,5.61325e+17,1.50833e+18,1.99423e+18,1.85061e+18,3740820000000000.0,1.84576e+16,2.8168e+16,1.05469e+16
2,91.4887,326.0362,0.0,0.0,0.0,0.0,4.84288e+17,1.32137e+18,1.82217e+18,1.95679e+18,...,4.67409e+18,4.88491e+18,5.61325e+17,1.50833e+18,1.99423e+18,1.85061e+18,3740820000000000.0,1.84576e+16,2.8168e+16,1.05469e+16
3,82.7382,290.7679,0.0,0.0,0.0,0.0,4.84288e+17,1.32137e+18,1.82217e+18,1.95679e+18,...,4.67409e+18,4.88491e+18,5.61325e+17,1.50833e+18,1.99423e+18,1.85061e+18,3740820000000000.0,1.84576e+16,2.8168e+16,1.05469e+16
4,93.6223,73.545,0.0,0.0,0.0,0.0,4.84288e+17,1.32137e+18,1.82217e+18,1.95679e+18,...,4.67409e+18,4.88491e+18,5.61325e+17,1.50833e+18,1.99423e+18,1.85061e+18,3740820000000000.0,1.84576e+16,2.8168e+16,1.05469e+16
5,52.165,84.252,0.0,0.0,0.0,0.0,4.84288e+17,1.32137e+18,1.82217e+18,1.95679e+18,...,4.67409e+18,4.88491e+18,5.61325e+17,1.50833e+18,1.99423e+18,1.85061e+18,3740820000000000.0,1.84576e+16,2.8168e+16,1.05469e+16
6,68.1323,246.025,0.0,0.0,0.0,0.0,4.84288e+17,1.32137e+18,1.82217e+18,1.95679e+18,...,4.67409e+18,4.88491e+18,5.61325e+17,1.50833e+18,1.99423e+18,1.85061e+18,3740820000000000.0,1.84576e+16,2.8168e+16,1.05469e+16
7,93.1488,149.517,0.0,0.0,0.0,0.0,4.84288e+17,1.32137e+18,1.82217e+18,1.95679e+18,...,4.67409e+18,4.88491e+18,5.61325e+17,1.50833e+18,1.99423e+18,1.85061e+18,3740820000000000.0,1.84576e+16,2.8168e+16,1.05469e+16
8,84.4764,244.562,0.0,0.0,0.0,0.0,4.84288e+17,1.32137e+18,1.82217e+18,1.95679e+18,...,4.67409e+18,4.88491e+18,5.61325e+17,1.50833e+18,1.99423e+18,1.85061e+18,3740820000000000.0,1.84576e+16,2.8168e+16,1.05469e+16
9,45.3588,364.642,0.0,0.0,0.0,0.0,4.84288e+17,1.32137e+18,1.82217e+18,1.95679e+18,...,4.67409e+18,4.88491e+18,5.61325e+17,1.50833e+18,1.99423e+18,1.85061e+18,3740820000000000.0,1.84576e+16,2.8168e+16,1.05469e+16


Meanwhile, the target variables represent the neutron current going into the pebbles above, divided into the same 12 energy bins. There's a tradeoff between how fine our energy grid is, and how long it takes for the Serpent model to run. Every time you add a new bin, you need to simulate more particles to get sufficient statistics. So, our goal is to maximize accuracy using as few bins as possible.

In [70]:
# Preview the first ten rows of the combined target table.
all_targets.head(10)

Unnamed: 0,0.000000e+00,3.000000e-08,5.800000e-08,1.400000e-07,2.800000e-07,3.500000e-07,6.250000e-07,4.000000e-06,4.805200e-05,5.530000e-03,8.210000e-01,2.231000e+00
0,29906100000000.0,150940000000000.0,571231000000000.0,1109120000000000.0,210011000000000.0,180285000000000.0,543449000000000.0,720698000000000.0,1171560000000000.0,1533240000000000.0,241264000000000.0,119378000000000.0
1,117583000000000.0,150358000000000.0,601133000000000.0,871153000000000.0,151060000000000.0,450532000000000.0,483848000000000.0,537995000000000.0,1292430000000000.0,1258730000000000.0,179597000000000.0,179788000000000.0
2,150056000000000.0,151690000000000.0,663282000000000.0,659986000000000.0,60402500000000.0,301564000000000.0,238050000000000.0,570893000000000.0,867357000000000.0,964053000000000.0,60157000000000.0,178781000000000.0
3,60742200000000.0,241867000000000.0,479775000000000.0,786503000000000.0,211481000000000.0,329725000000000.0,630434000000000.0,753935000000000.0,1206990000000000.0,956315000000000.0,150424000000000.0,90694300000000.0
4,89474200000000.0,60042300000000.0,209548000000000.0,240472000000000.0,119459000000000.0,59456900000000.0,150832000000000.0,182663000000000.0,240105000000000.0,331077000000000.0,60192300000000.0,29927700000000.0
5,29504500000000.0,210540000000000.0,448440000000000.0,333105000000000.0,59974800000000.0,150733000000000.0,150354000000000.0,477598000000000.0,841778000000000.0,571261000000000.0,242134000000000.0,88824900000000.0
6,91950500000000.0,209964000000000.0,662563000000000.0,1025070000000000.0,181291000000000.0,391721000000000.0,631417000000000.0,987566000000000.0,1318640000000000.0,1346790000000000.0,361362000000000.0,60651300000000.0
7,209009000000000.0,60500000000000.0,540711000000000.0,899784000000000.0,149613000000000.0,209072000000000.0,362134000000000.0,268881000000000.0,1024560000000000.0,1108750000000000.0,391022000000000.0,150361000000000.0
8,60105700000000.0,90469600000000.0,510457000000000.0,542183000000000.0,120468000000000.0,153166000000000.0,388419000000000.0,631145000000000.0,1748390000000000.0,1051830000000000.0,391371000000000.0,89789800000000.0
9,60875100000000.0,119597000000000.0,211299000000000.0,693410000000000.0,150221000000000.0,90582300000000.0,120806000000000.0,209966000000000.0,449910000000000.0,723285000000000.0,209965000000000.0,60017700000000.0


# Data Standardization
Simple standardization is performed here along each column. You might want to consider looking into log standardization, but it didn't seem to help much for me.

In [114]:
# Seed NumPy's global RNG so the random train/test partition is repeatable.
np.random.seed(42)
# Fraction of the rows assigned to the training set; the rest is held out.
train_split = 0.8

def standardize(raw_data, mean=None, std=None, axis=0):
    """Shift and scale data to zero mean and unit standard deviation.

    Parameters
    ----------
    raw_data : array-like or pandas object
        Values to standardize.
    mean : optional
        Pre-computed mean to subtract. When ``None`` it is computed from
        ``raw_data`` along ``axis``.
    std : optional
        Pre-computed standard deviation to divide by. When ``None`` it is
        computed from ``raw_data`` along ``axis`` and every zero entry is
        replaced by 0.1 so constant columns do not cause division by zero.
        A caller-supplied ``std`` is used exactly as given.
    axis : int or None, default 0
        Axis along which the statistics are computed.

    Returns
    -------
    tuple
        ``(result, mean, std)`` where ``result = (raw_data - mean) / std`` and
        ``mean`` / ``std`` are the statistics actually used, so they can be
        reused on other data or passed to ``unstandardize``.
    """
    if mean is None:
        mean = np.mean(raw_data, axis=axis)
    if std is None:
        std = np.std(raw_data, axis=axis)
        if np.ndim(std) == 0:
            # 1-D input (or axis=None) reduces to a scalar, which does not
            # support item assignment; handle it without indexing.
            if std == 0:
                std = 0.1
        else:
            # Freshly computed array/Series: safe to patch zeros in place.
            std[std == 0] = 0.1
    result = (raw_data - mean) / std
    return result, mean, std

def unstandardize(standardized_data, mean, std):
    """Undo ``standardize``: scale by ``std`` and then add ``mean`` back."""
    return standardized_data * std + mean

# Randomly partition the rows, by position, into training and testing subsets.
num_data = len(all_features)
training_size = int(num_data*train_split)
testing_size = num_data - training_size
data_indices = np.arange(num_data)
# Draw without replacement so no row is picked twice; this uses NumPy's global
# RNG, so the result depends on the seed set beforehand.
training_indices = np.random.choice(num_data, training_size, replace=False)
# Every position not drawn for training goes to the test set.
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
testing_indices = data_indices[np.isin(data_indices, training_indices, invert=True)]
assert len(testing_indices) == testing_size, "train/test partition does not cover all rows"

# Compute the mean/std on the training rows only and reuse those statistics for
# the test rows, so no information from the held-out data enters the scaling.
training_data, data_mean, data_std = standardize(all_features.iloc[training_indices])
training_target, target_mean, target_std = standardize(all_targets.iloc[training_indices])
testing_data, _, _  = standardize(all_features.iloc[testing_indices], mean=data_mean, std=data_std)
testing_target, _, _  = standardize(all_targets.iloc[testing_indices], mean=target_mean, std=target_std)

print(np.shape(training_data))
print(np.shape(training_target))
print(np.shape(testing_data))
print(np.shape(testing_target))

(800, 482)
(800, 12)
(200, 482)
(200, 12)


# Model Training
I threw together a quick random forest regressor (RFR) model and got some results. You're free to change to any other type of model, as long as it's something I can save and load into other modules. Things to try:
- Properly using cross validation
- Tuning the hyper parameters
- Trying a different model, probably a neural net

In [115]:
# Hyper-parameters for the random forest; n_jobs sets how many parallel
# workers scikit-learn uses for fitting and prediction.
best_params = {
    'max_depth': 10,
    'n_estimators': 1000,
    'n_jobs': 14,
}
# Fixed random_state keeps the fitted forest repeatable from run to run.
rfr_model = RandomForestRegressor(random_state=0, **best_params)
rfr_model.fit(training_data, training_target)
# For scikit-learn regressors, score() reports the R^2 on the supplied data.
rfr_model_test_score = rfr_model.score(testing_data, testing_target)
print(f"RFR score: {rfr_model_test_score}")

RFR score: -0.545159239552131
