In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import IPython
import os
import time
import re
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from pearlsim.ml_utilities import *

# Extract data from Serpent detector files
We can use the read_det_file function from ml_utilities to parse all of the detector files we have in the training data directory to create one pair of unified features/target dataframes.

Sometimes the file(s) provided will have really high uncertainty. This will absolutely limit your model's accuracy, so don't be disheartened by poor scores in the meantime; I will get you more accurate data, but it can take a while to run.

In [8]:
# Read the feature and target tables; the first CSV column is used as the row index.
features, targets = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("training_data/burnup_features.csv",
                     "training_data/burnup_target.csv")
)


Our features include the radius and height of the pebble-based detector. They also include every bin of the core flux map. The gFHR model is divided into 12 energy groups and has 4 radial divisions, each divided into 10 separate axial zones. Normally you would need to volume-weight these flux values and deal with the somewhat complicated indexing scheme to sort out which is which. We don't need to bother here, since they're all going to be standardized anyway and the model will make its own inferences about the spatial distribution.

In [18]:
# Show the first ten rows of the '92235<lib>' feature column.
features['92235<lib>'].head(10)

0    0.004355
1    0.004353
2    0.003573
3    0.004670
4    0.004559
5    0.004135
6    0.003781
7    0.003771
8    0.004052
9    0.003465
Name: 92235<lib>, dtype: float64

In [19]:
# Show the first ten rows of the '92235<lib>' target column.
targets['92235<lib>'].head(10)

0    0.004355
1    0.004353
2    0.003573
3    0.004669
4    0.004559
5    0.004134
6    0.003781
7    0.003770
8    0.004052
9    0.003465
Name: 92235<lib>, dtype: float64

# Data Standardization
Simple standardization is performed here along each column. You might want to consider looking into log standardization, but it didn't seem to help much for me.

In [12]:
# Seed NumPy's legacy global RNG so that random draws made after this point are repeatable.
np.random.seed(42)
# Fraction of the samples assigned to the training set.
train_split = 0.8

def standardize(raw_data, mean=None, std=None, axis=0):
    """Standardize data as ``(raw_data - mean) / std``.

    Parameters
    ----------
    raw_data : array-like or pandas object
        Values to standardize.
    mean : optional
        Mean to subtract. When None it is computed from ``raw_data`` with
        ``np.mean`` along ``axis``.
    std : optional
        Standard deviation to divide by. When None it is computed from
        ``raw_data`` with ``np.std`` along ``axis``, and any zero entry is
        replaced by 0.1 to avoid division by zero. A caller-supplied ``std``
        is used as given.
    axis : int, default 0
        Axis along which the statistics are computed.

    Returns
    -------
    tuple
        ``(result, mean, std)`` so the same statistics can be reused on
        other data or passed to ``unstandardize``.
    """
    if mean is None:
        mean = np.mean(raw_data, axis=axis)
    if std is None:
        std = np.std(raw_data, axis=axis)
        if np.ndim(std) == 0:
            # A scalar std (1-D input or axis=None) does not support masked
            # item assignment, so handle the zero case explicitly.
            if std == 0:
                std = 0.1
        else:
            std[std == 0] = 0.1
    result = (raw_data - mean) / std
    return result, mean, std

def unstandardize(standardized_data, mean, std):
    """Map standardized values back to raw units: multiply by ``std``, then add ``mean``."""
    return standardized_data * std + mean


# Randomly partition the row positions into disjoint training and testing sets.
num_data = len(features)
training_size = int(num_data*train_split)
testing_size = num_data - training_size
data_indices = np.arange(num_data)
training_indices = np.random.choice(num_data, training_size, replace=False)
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0; it selects the same rows.
testing_indices = data_indices[np.isin(data_indices, training_indices, invert=True)]
assert len(training_indices) + len(testing_indices) == num_data, "train/test split does not cover all rows"

# Compute mean/std on the training rows only and reuse them for the testing rows,
# so no statistics from the test set enter the standardization.
training_data, data_mean, data_std = standardize(features.iloc[training_indices])
training_target, target_mean, target_std = standardize(targets.iloc[training_indices])
testing_data, _, _  = standardize(features.iloc[testing_indices], mean=data_mean, std=data_std)
testing_target, _, _  = standardize(targets.iloc[testing_indices], mean=target_mean, std=target_std)

print(np.shape(training_data))
print(np.shape(training_target))
print(np.shape(testing_data))
print(np.shape(testing_target))

(80, 433)
(80, 589)
(20, 433)
(20, 589)


# Model Training
I threw together a quick RFR model and got some results. You're free to change to any other type of model, as long as it's something I can save and load into other modules. Things to try:
- Properly using cross validation
- Tuning the hyper parameters
- Trying a different model, probably a neural net

In [13]:
# Random-forest hyperparameters. n_jobs=-1 tells scikit-learn to use all
# available processors, so the cell does not need editing per machine.
best_params = {'max_depth': 10,
               'n_estimators': 1000,
               'n_jobs': -1,}
rfr_model = RandomForestRegressor(random_state=0)
rfr_model.set_params(**best_params)
rfr_model.fit(training_data, training_target)
# Evaluate on the held-out rows; for regressors, score() reports R^2.
rfr_model_test_score = rfr_model.score(testing_data, testing_target)
print(f"RFR score: {rfr_model_test_score}")

RFR score: -0.05276871866241244


