# Emulator: Gaussian Process (`george`)

#### Index<a name="index"></a>
1. [Import packages](#imports)
2. [Load data](#loadData)
    1. [Load train data](#loadTrainData)
    2. [Load test data](#loadTestData)
3. [Emulator method](#emulator)
    1. [Scale data](#scaleData)
    2. [Train emulator](#trainEmu)

## 1. Import packages<a name="imports"></a>

In [None]:
import george
import matplotlib.pyplot as plt
import numpy as np
import pickle
import scipy.optimize as op
import seaborn as sns

from sklearn.preprocessing import StandardScaler

#### Aesthetic settings

In [None]:
# IPython magic: show matplotlib figures inline in the notebook output.
%matplotlib inline

# Seaborn theme for every plot below: "ticks" style with a 1.3x font scale.
sns.set(font_scale=1.3, style="ticks")

## 2. Load data<a name="loadData"></a>

Read the training and test data from `.pickle` files:

### 2.1. Load train data<a name="loadTrainData"></a>

In [None]:
# Path of the pickled training set, relative to this notebook.
path_train = '../data/cosmology_train_1d.pickle'
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open(path_train, mode='rb') as input_file:
    data_train = pickle.load(input_file)

In [None]:
# Pull the input and output tables out of the loaded training dictionary.
input_train, output_train = data_train['input_data'], data_train['output_data']
# The number of rows of the input table is the number of training datapoints.
number_train = len(input_train)
print("Number of datapoints:", number_train)

In [None]:
# Additional inputs stored next to the training tables.
extra_train = data_train['extra_input']
# Values used as the x-axis (labelled r) of the plots at the end of the notebook.
r_vals = extra_train['r_vals']

In [None]:
# Training inputs and outputs as NumPy arrays, without the 'object_id' column.
# NOTE: the following cell rebinds xs_train / ys_train to DataFrames, so these
# arrays are not the objects the rest of the notebook works with.
xs_train, ys_train = (
    np.array(table.drop(columns=['object_id']))
    for table in (input_train, output_train)
)

In [None]:
# Training inputs and outputs kept as DataFrames (later cells index them with
# .iloc), with only the 'object_id' column removed.
xs_train, ys_train = (
    table.drop(columns=['object_id']) for table in (input_train, output_train)
)

### 2.2. Load test data<a name="loadTestData"></a>

In [None]:
# Path of the pickled test set, relative to this notebook.
path_test = '../data/cosmology_test_1d.pickle'
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open(path_test, mode='rb') as input_file:
    data_test = pickle.load(input_file)

In [None]:
# Pull the input and output tables out of the loaded test dictionary.
input_test, output_test = data_test['input_data'], data_test['output_data']
# The number of rows of the input table is the number of test datapoints.
number_test = len(input_test)
print("Number of datapoints:", number_test)

In [None]:
# Test inputs and outputs as NumPy arrays, without the 'object_id' column.
# NOTE: the following cell rebinds xs_test / ys_test to DataFrames, so these
# arrays are not the objects the rest of the notebook works with.
xs_test, ys_test = (
    np.array(table.drop(columns=['object_id']))
    for table in (input_test, output_test)
)

In [None]:
# Test inputs and outputs kept as DataFrames (later cells index them with
# .iloc), with only the 'object_id' column removed.
xs_test, ys_test = (
    table.drop(columns=['object_id']) for table in (input_test, output_test)
)

## 3. Emulator method<a name="emulator"></a>

### 3.1. Scale data<a name="scaleData"></a>

Let's first scale our input parameters, to make training easier:

In [None]:
# Learn the per-column mean and standard deviation from the training inputs
# only; the same fitted scaler is applied to the test inputs further down.
scaler = StandardScaler()
scaler.fit(X=xs_train)

In [None]:
import pandas as pd

In [None]:
# Standardise the inputs with the scaler fitted on the training set.
# `scaler.transform` returns a 2-D array with one column per input feature, so
# the DataFrames are rebuilt from it rather than writing that array into
# column 0 only (which leaves any further columns unscaled and depends on
# pandas squeezing an (n, 1) array into a single column). Index and column
# labels are carried over, so later `.iloc` indexing is unaffected.
xs_train = pd.DataFrame(scaler.transform(xs_train),
                        index=xs_train.index, columns=xs_train.columns)
xs_test = pd.DataFrame(scaler.transform(xs_test),
                       index=xs_test.index, columns=xs_test.columns)

In [None]:
# Mean of every output column over the training set.
y_mean = np.mean(ys_train, axis=0)
# Express both the training and the test outputs relative to that training
# mean; the multiplication by y_mean after prediction undoes this.
ys_train, ys_test = ys_train / y_mean, ys_test / y_mean

### 3.2. Train emulator<a name="trainEmu"></a>

In [None]:
def fit_gp(kernel, xs, ys, xs_new):
    """Fit a `george` Gaussian process to (xs, ys) and predict at `xs_new`.

    The kernel hyperparameters are tuned by minimising the negative
    log-likelihood of `ys` with L-BFGS-B, starting from the parameter values
    the kernel has when it is passed in.

    Parameters
    ----------
    kernel : george kernel
        Kernel handed to `george.GP`.
    xs : array-like
        Training inputs, passed to `gp.compute`.
    ys : array-like
        Training targets the likelihood is evaluated on.
    xs_new : array-like
        Inputs at which the predictive mean is evaluated.

    Returns
    -------
    The GP predictive mean at `xs_new`, using the optimised hyperparameters.

    Warns
    -----
    RuntimeWarning
        If the optimiser reports that it did not converge. The prediction is
        still made with the last parameter vector it returned.
    """
    import warnings

    gp = george.GP(kernel)
    gp.compute(xs)

    def neg_log_like(p):
        """Objective function: negative log-likelihood at parameters `p`."""
        gp.set_parameter_vector(p)
        loglike = gp.log_likelihood(ys, quiet=True)
        # Large finite penalty instead of inf/nan, so the optimiser is given
        # a usable value where the likelihood is not finite.
        return -loglike if np.isfinite(loglike) else 1e25

    def grad_neg_log_like(p):
        """Gradient of the objective function at parameters `p`."""
        gp.set_parameter_vector(p)
        return -gp.grad_log_likelihood(ys, quiet=True)

    results = op.minimize(neg_log_like, gp.get_parameter_vector(),
                          jac=grad_neg_log_like, method="L-BFGS-B", tol=1e-6)
    if not results.success:
        warnings.warn(
            "GP hyperparameter optimisation did not converge: {}".format(
                results.message),
            RuntimeWarning,
        )

    gp.set_parameter_vector(results.x)
    # Only the mean is returned, so skip building the predictive covariance.
    return gp.predict(ys, xs_new, return_cov=False)

In [None]:
# Emulate every output column with its own GP and store the test predictions
# in a frame with the same layout as ys_test.
number_outputs = ys_test.shape[1]
ys_test_preds = ys_test.copy()
ys_train_0 = ys_train.iloc[:, 0]
# NOTE(review): the initial kernel amplitude is the variance of output column
# 0 for every output, not of the column being fitted -- confirm this is
# intended (the hyperparameters are optimised again inside fit_gp).
initial_amplitude = np.var(ys_train_0)
for i in range(number_outputs):
    ys_train_i = ys_train.iloc[:, i]
    # Build a new kernel object for every output so each fit starts from the
    # same initial hyperparameters.
    kernel = initial_amplitude * george.kernels.ExpSquaredKernel(0.5)
    ys_pred = fit_gp(kernel=kernel, xs=xs_train, ys=ys_train_i,
                     xs_new=xs_test)
    ys_test_preds.iloc[:, i] = ys_pred

In [None]:
# Undo the division by the training mean so that both the true and the
# predicted test outputs are back in their original units.
ys_test, ys_test_preds = ys_test * y_mean, ys_test_preds * y_mean

In [None]:
# Plot a random 20% of the test datapoints.
n_plot = int(0.2*number_test)
# Sample without replacement: np.random.choice draws with replacement by
# default, which can select (and plot) the same datapoint more than once.
idxs = np.random.choice(number_test, n_plot, replace=False)
# One rainbow colour per plotted datapoint, evenly spread over the colormap.
color_idx = np.linspace(0, 1, n_plot)
colors = np.array([plt.cm.rainbow(c) for c in color_idx])

In [None]:
# True test outputs (open circles) against the emulator predictions (lines)
# for the selected test datapoints, one colour per datapoint.
plt.figure(figsize=(8,6))
for i, (idx, color) in enumerate(zip(idxs, colors)):
    ys_test_i = ys_test.iloc[idx, :]
    ys_pred_i = ys_test_preds.iloc[idx, :]
    # Label only the first pair of curves so the legend has a single entry
    # for each of truth and prediction.
    label_test = 'truth' if i == 0 else None
    label_pred = 'emu_prediction' if i == 0 else None
    plt.plot(r_vals, ys_test_i, alpha=0.8, label=label_test,
             marker='o', markerfacecolor='None', ls='None', color=color)
    plt.plot(r_vals, ys_pred_i, alpha=0.8, label=label_pred, color=color)
plt.xlabel('$r$')
plt.ylabel(r'$\xi(r)$')
plt.legend()

In [None]:
# Fractional error (prediction - truth) / truth of the emulator for the same
# selected test datapoints and colours as the previous figure.
plt.figure(figsize=(8,6))
for idx, color in zip(idxs, colors):
    ys_test_i = ys_test.iloc[idx, :]
    ys_pred_i = ys_test_preds.iloc[idx, :]
    frac_err = (ys_pred_i - ys_test_i) / ys_test_i
    plt.plot(r_vals, frac_err, alpha=0.8, color=color)
# Reference line at zero error.
plt.axhline(0.0, color='k')
plt.xlabel('$r$')
plt.ylabel(r'fractional error')