In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from scipy.optimize import minimize
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Seed passed as `random_state` to train_test_split so the train/test split is repeatable.
RANDOM_STATE = 42

In [3]:
# Read every JSON file in the target directory and collect each sample's
# 'dvar' and 'obj' entries into two row-aligned dataframes (x_df, y_df).
target_directory = 'data/full_opt_15KeV'

x_dictlist, y_dictlist = [], []

# os.listdir() returns entries in arbitrary order; sorting pins the row order
# so the seeded train/test split selects the same rows on every machine.
for json_file in sorted(os.listdir(target_directory)):
    # Match the real extension: a bare `[-4:] != 'json'` check also accepts
    # names that merely end in "json" without a dot.
    if not json_file.endswith('.json'):
        continue

    with open(os.path.join(target_directory, json_file)) as f:
        samples = json.load(f)['samples']

    # Both lists walk the same dict in the same order, so the i-th dvar
    # stays paired with the i-th obj.
    dvars = [sample['dvar'] for sample in samples.values()]
    objs = [sample['obj'] for sample in samples.values()]

    x_dictlist.extend(dvars)
    y_dictlist.extend(objs)

# One row per sample; coerce every column to a numeric dtype.
x_df = pd.DataFrame(x_dictlist).apply(pd.to_numeric)
y_df = pd.DataFrame(y_dictlist).apply(pd.to_numeric)

In [6]:
# Keep the original column names of x_df around for later cells.
# The names must not contain ' ' or '^' -- presumably because
# PolynomialFeatures.get_feature_names_out() uses those characters when it
# builds product and power names (TODO confirm against the later usage).
FEATURES = x_df.columns
assert all(forbidden not in ''.join(FEATURES) for forbidden in (' ', '^'))

## Fitting a polynomial regression

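The input features are expanded into all polynomial terms up to degree 3, and a linear regression is then fit on the expanded features.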

In [12]:
# Expand the columns of x_df into every polynomial term up to degree 3
# (no constant/bias column) and wrap the result in a dataframe whose column
# names are the generated term names.
poly = PolynomialFeatures(degree=3, include_bias=False)
x_poly_values = poly.fit_transform(x_df)
x_poly_df = pd.DataFrame(
    data=x_poly_values,
    columns=poly.get_feature_names_out(),
)

In [13]:
# (number of rows, number of columns) of the polynomial-expanded feature frame
x_poly_df.shape

(217293, 3059)

In [14]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split
# repeatable for a given row order.
x_train, x_test, y_train, y_test = train_test_split(
    x_poly_df,
    y_df,
    test_size=0.15,
    random_state=RANDOM_STATE,
)

In [15]:
# Fit a linear regression on the polynomial features, then report the
# score (R^2) on the training rows and on the held-out rows.
reg = LinearRegression()
reg.fit(x_train, y_train)

train_r2 = reg.score(x_train, y_train)
test_r2 = reg.score(x_test, y_test)

print('train set R^2: ', train_r2)
print('test set R^2: ', test_r2)

train set R^2:  0.8919795935858758
test set R^2:  0.8679511409608528
