In [None]:
# Copyright 2021, Battelle Energy Alliance, LLC

import pandas as pd 
import numpy as np
import json
import os
import re

# Install the pygam package via a shell command. Nothing in the visible cells
# imports it; presumably it is needed by the model built in create_model — TODO confirm.
!pip install pygam

# Change working directory if not the project directory.
# When the notebook is started from inside the `model` folder, step up one
# level so that relative paths used later resolve from the parent directory.
current_dir = os.getcwd()
# Split on the platform separator: the previous pattern '\/' was an invalid
# string escape and only ever matched POSIX-style paths.
folders = current_dir.split(os.sep)
if folders[-1] == 'model':
    os.chdir(os.path.abspath('..'))

# Load environment variables from .env file
# python-dotenv is installed first; the `dotenv` extension loaded below is presumably provided by it
!pip install python-dotenv
%load_ext dotenv
%dotenv
# Project-local module, imported only after the .env values are loaded —
# presumably it reads them at import time (TODO confirm)
import settings
# Display the working directory the remaining cells will run from
%pwd

In [None]:
# Read the adapter configuration: the JSON file path is taken from the
# ML_ADAPTER_OBJECT_LOCATION environment variable and the parsed content is
# kept in the module-level `data` object used by the save/export functions.
config_path = os.getenv("ML_ADAPTER_OBJECT_LOCATION")
with open(config_path, 'r') as config_file:
    data = json.load(config_file)

In [None]:
def standardize_mean_normalization(X_train, X_test, y_train, y_test):
    """
    Apply z-score standardization to the training and testing splits.

    z = (x - μ) / σ

    Note: μ and σ are always taken from the training split (via `.mean()` and
    `.std()`), and are applied to both the training and the testing data, so
    no statistic of the test set leaks into the transformation.

    Args
        X_train (DataFrame or Series): features used for training
        X_test (DataFrame or Series): features used for testing
        y_train (DataFrame or Series): response used for training
        y_test (DataFrame or Series): response used for testing
    Return
        X_train_standardize: standardized training features
        X_test_standardize: standardized testing features
        y_train_standardize: standardized training response
        y_test_standardize: standardized testing response
        standardize (dictionary): the training statistics, for un-standardizing
            results and standardizing incoming data, laid out as
            {"mean": {"X_train": ..., "y_train": ...}, "std": {"X_train": ..., "y_train": ...}}.
            For a DataFrame X_train the feature statistics are stored as plain
            lists (one entry per column); the response statistics are stored
            exactly as returned by `y_train.mean()` / `y_train.std()`.
    """
    def _z_score(values, center, spread):
        # One place for the z-score formula, reused for every split
        return (values - center) / spread

    # Feature statistics from the training split only
    feature_center = X_train.mean()
    feature_spread = X_train.std()
    if isinstance(X_train, pd.DataFrame):
        # Per-column statistics are kept as lists
        feature_center = list(feature_center)
        feature_spread = list(feature_spread)

    # Response statistics from the training split only
    response_center = y_train.mean()
    response_spread = y_train.std()

    # Statistics needed later to undo the transformation or transform new data
    standardize = {
        "mean": {"X_train": feature_center, "y_train": response_center},
        "std": {"X_train": feature_spread, "y_train": response_spread},
    }

    return (
        _z_score(X_train, feature_center, feature_spread),
        _z_score(X_test, feature_center, feature_spread),
        _z_score(y_train, response_center, response_spread),
        _z_score(y_test, response_center, response_spread),
        standardize,
    )

In [None]:
def create_model(X_train, y_train):
    """
    Placeholder for model construction.

    No model is built yet: the arguments are ignored and None is returned.

    Args
        X_train: training features (currently unused)
        y_train: training response (currently unused)
    Return
        None
    """
    return None

In [None]:
def get_fitted_residuals_RMSE_values(model, X_train, X_test, y_train, y_test):
    """
    Determine yhat, residuals, and RMSE of the training and testing datasets.

    The RMSE is computed directly with numpy, so the result does not depend on
    the `squared=False` keyword of scikit-learn's `mean_squared_error`, which
    newer scikit-learn releases no longer accept.

    Args
        model: fitted estimator exposing `predict(X)`
        X_train: features used for training
        X_test: features used for testing
        y_train (DataFrame or Series): actual response of the training set
        y_test (DataFrame or Series): actual response of the testing set
    Return
        yhat_train: `model.predict(X_train)`, returned unchanged
        yhat_test: `model.predict(X_test)`, returned unchanged
        y_train_residuals (ndarray, shape (n, 1)): y_train - yhat_train
        y_test_residuals (ndarray, shape (m, 1)): y_test - yhat_test
        rmse_train (float): root mean squared error on the training set
        rmse_test (float): root mean squared error on the testing set
    Raises
        ValueError: if a prediction does not contain one value per response row
    """
    def _residuals_and_rmse(y_true, y_pred):
        # Bring the prediction to the (n, 1) layout of y_true first: subtracting
        # a 1-D prediction from a column vector would otherwise broadcast into
        # an (n, n) matrix instead of element-wise residuals.
        aligned_pred = np.asarray(y_pred).reshape(y_true.shape)
        residuals = y_true - aligned_pred
        rmse = float(np.sqrt(np.mean(residuals ** 2)))
        return residuals, rmse

    # Get yhat through prediction
    yhat_train = model.predict(X_train)
    yhat_test = model.predict(X_test)

    # Reshape the actual responses to column vectors
    y_train = y_train.values.reshape(-1, 1)
    y_test = y_test.values.reshape(-1, 1)

    # Get residuals and RMSE
    y_train_residuals, rmse_train = _residuals_and_rmse(y_train, yhat_train)
    y_test_residuals, rmse_test = _residuals_and_rmse(y_test, yhat_test)

    return yhat_train, yhat_test, y_train_residuals, y_test_residuals, rmse_train, rmse_test

In [None]:
def unstandardize_mean_normalization(y_train, y_test, yhat_train, yhat_test, standardize):
    """
    Unstandardizes the data by inverting the z-score formula

    x = (z * σ) + μ

    Note: Only the training mean and standard deviation are used, mirroring the
    standardization step, so there is no contamination from the test data set.

    Args
        y_train (DataFrame or Series): standardized response used for training
        y_test (DataFrame or Series): standardized response used for testing
        yhat_train (array-like): standardized estimation of the response for the training set
        yhat_test (array-like): standardized estimation of the response for the testing set
        standardize (dictionary): the mean and standard deviation of the training data,
            e.g. {mean: {X_train: "", y_train: ""}, std: {X_train: "", y_train: ""}}.
            The y_train entries may be pandas Series (DataFrame response) or
            plain scalars (Series response).
    Return
        y_train (ndarray, 1-D): unstandardized response used for training
        y_test (ndarray, 1-D): unstandardized response used for testing
        yhat_train: unstandardized estimation of the response for the training set
        yhat_test: unstandardized estimation of the response for the testing set
        y_train_residuals (ndarray, 1-D): actual minus predicted training response (y_train - yhat_train)
        y_test_residuals (ndarray, 1-D): actual minus predicted testing response (y_test - yhat_test)

    """
    # Training statistics of the response
    y_train_mean = standardize["mean"]["y_train"]
    y_train_std = standardize["std"]["y_train"]

    # Plain-array views of the statistics for use with the predictions.
    # np.asarray accepts both a pandas Series and a scalar, whereas `.values`
    # only exists on pandas objects.
    mean_values = np.asarray(y_train_mean)
    std_values = np.asarray(y_train_std)

    # Unstandardize y and yhat for the training and testing datasets
    y_train = (y_train * y_train_std) + y_train_mean
    yhat_train = (yhat_train * std_values) + mean_values
    y_test = (y_test * y_train_std) + y_train_mean
    yhat_test = (yhat_test * std_values) + mean_values

    # Flatten the actual responses to 1-D arrays
    y_train = y_train.values.reshape(-1)
    y_test = y_test.values.reshape(-1)

    # Unstandardized residuals. The predictions are flattened for the
    # subtraction only, so a column-vector prediction does not broadcast
    # against the 1-D response into an (n, n) matrix.
    y_train_residuals = y_train - np.asarray(yhat_train).reshape(-1)
    y_test_residuals = y_test - np.asarray(yhat_test).reshape(-1)

    return y_train, y_test, yhat_train, yhat_test, y_train_residuals, y_test_residuals

In [None]:
def create_JSON_file(X_train, y_train, y_test, yhat_train, yhat_test, y_train_residuals, y_test_residuals, rmse_train, rmse_test, rmse_train_repeat, rmse_test_repeat, independent_variables, dependent_variables, standardize, tolerance=2):
    """
    Create a .json file of the machine learning results.

    The output path is read from the module-level configuration object
    (`data["MODEL"]["output_file"]`).

    Args
        X_train, y_train, y_test: accepted for call compatibility; not used
        yhat_train (ndarray): fitted values of the training set
        yhat_test (ndarray): fitted values of the testing set
        y_train_residuals (ndarray): residuals of the training set
        y_test_residuals (ndarray): residuals of the testing set
        rmse_train (float): RMSE of the training set
        rmse_test (float): RMSE of the testing set
        rmse_train_repeat, rmse_test_repeat: positional placeholders. The
            signature previously listed rmse_train/rmse_test twice (a syntax
            error); the slots are kept so existing positional calls that pass
            the RMSE values twice still bind correctly. The values are ignored.
        independent_variables (list): names of the feature columns
        dependent_variables (list): names of the response columns
        standardize (dictionary): training mean / standard deviation as produced
            by standardize_mean_normalization
        tolerance (int): number of decimals kept for fitted values and residuals
    Return
        None. The results are written to the configured output file.
    """
    # Collect the results in their own dictionary. It must not be named `data`:
    # that would shadow the module-level configuration read further down.
    results = {
        "Independent Variables": independent_variables,
        "Dependent Variables": dependent_variables,
        "RMSE": {
            "train": [float(rmse_train)],
            "test": [float(rmse_test)],
        },
        "Mean": {
            "X_train": standardize["mean"]["X_train"],
            # atleast_1d: the response statistic may be a Series or a scalar
            "y_train": np.atleast_1d(standardize["mean"]["y_train"]).tolist(),
        },
        "Standard Deviation": {
            "X_train": standardize["std"]["X_train"],
            "y_train": np.atleast_1d(standardize["std"]["y_train"]).tolist(),
        },
        "Fitted": {
            "train": yhat_train.round(tolerance).tolist(),
            "test": yhat_test.round(tolerance).tolist(),
        },
        "Residuals": {
            "train": y_train_residuals.round(tolerance).tolist(),
            "test": y_test_residuals.round(tolerance).tolist(),
        },
    }

    # Write the results to the JSON file named in the configuration;
    # the with-block closes the file.
    location = data["MODEL"]["output_file"]
    with open(location, "w") as f:
        json.dump(results, f)

In [None]:
def save_model(model):
    """
    Save the model to disk.

    The model is pickled to the path given by the module-level configuration
    (`data["MODEL"]["model_serialization_file"]`).

    Args
        model: the object to serialize
    Return
        None
    """
    import pickle
    filename = data["MODEL"]["model_serialization_file"]
    # The with-block closes the file handle, so the pickle is fully flushed
    # to disk before the function returns.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

In [None]:
def save_standardization(standardize):
    """
    Save the standardization information.

    The dictionary is written as JSON to the path given by the module-level
    configuration (`data["MODEL"]["standardization_file"]`).

    Args
        standardize (dictionary): training mean / standard deviation, e.g.
            {mean: {X_train: "", y_train: ""}, std: {X_train: "", y_train: ""}}.
            Entries may be pandas or numpy objects; they are stored as lists/numbers.
    Return
        None
    Raises
        TypeError: if an entry is neither JSON-serializable nor convertible via `.tolist()`
    """
    def _to_jsonable(value):
        # json cannot encode pandas Series or numpy arrays/scalars, which is
        # what the response statistics are when y is a DataFrame; all of them
        # expose `.tolist()`.
        if hasattr(value, "tolist"):
            return value.tolist()
        raise TypeError("Object of type %s is not JSON serializable" % type(value).__name__)

    # Write the data to a JSON File; the with-block closes the file.
    location = data["MODEL"]["standardization_file"]
    with open(location, "w") as f:
        json.dump(standardize, f, default=_to_jsonable)

In [None]:
def build_model():
    """
    Run the whole pipeline: load the pre-split CSV files, standardize them,
    create the model, evaluate it, convert the results back to the original
    scale, and persist the model, the standardization statistics and the
    results file.

    Return
        None
    """
    # Load the pre-split datasets (first CSV column is used as the index)
    features_train_raw = pd.read_csv('data/X_train.csv', index_col=0)
    features_test_raw = pd.read_csv('data/X_test.csv', index_col=0)
    response_train_raw = pd.read_csv('data/y_train.csv', index_col=0)
    response_test_raw = pd.read_csv('data/y_test.csv', index_col=0)

    # Column names are recorded before any transformation
    independent_variables = list(features_train_raw.columns)
    dependent_variables = list(response_train_raw.columns)

    # Standardize every split with the training statistics
    standardized = standardize_mean_normalization(
        features_train_raw, features_test_raw, response_train_raw, response_test_raw
    )
    features_train, features_test, response_train, response_test, standardize = standardized

    # Create model on the standardized training data
    model = create_model(features_train, response_train)

    # Fitted values and RMSE on the standardized scale; the residuals returned
    # here are superseded by the unstandardized ones computed below
    evaluation = get_fitted_residuals_RMSE_values(
        model, features_train, features_test, response_train, response_test
    )
    fitted_train, fitted_test, _, _, rmse_train, rmse_test = evaluation

    # Bring responses, fitted values and residuals back to the original scale
    restored = unstandardize_mean_normalization(
        response_train, response_test, fitted_train, fitted_test, standardize
    )
    actual_train, actual_test, fitted_train, fitted_test, residuals_train, residuals_test = restored

    # Persist the model and the standardization statistics
    save_model(model)
    save_standardization(standardize)

    # Generate JSON file of results (the RMSE pair is passed twice, matching
    # the positional layout of create_JSON_file)
    create_JSON_file(
        features_train, actual_train, actual_test,
        fitted_train, fitted_test,
        residuals_train, residuals_test,
        rmse_train, rmse_test, rmse_train, rmse_test,
        independent_variables, dependent_variables,
        standardize, tolerance=2,
    )

In [None]:
#build_model()