# Baseline models training

The main purpose of this notebook is to explore baseline models, which is done in the second part of the notebook. In the first part, standard normalization of all temperature- and molality-related features is performed. 

In [None]:
# general dependencies

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Silence every warning for the rest of this notebook session.
# Optional: this cell can be commented out to see the warnings again.
import warnings
warnings.filterwarnings('ignore')

## Load datasets

In [None]:
# directory the dataset CSV files are loaded from; file names are appended
# by plain string concatenation, so the trailing slash is required
path = r"../datasets/"

In [None]:
# read the enhanced electrolyte dataset; the first CSV column is the index
dataset_file = path + "dataset.csv"
dataframe = pd.read_csv(dataset_file, index_col=0)

# show (rows, columns) as the cell output
dataframe.shape

In [None]:
# read the enhanced RbI dataset; it only exists after the
# features_addition notebook has been run
try:
    outtest = pd.read_csv(path + "outtest_dataset.csv", index_col=0)
except FileNotFoundError:
    print("The CSV file must be first created in features_addition notebook.")
else:
    # report (rows, columns) only when loading succeeded
    print(outtest.shape)


## Train / test data splitting
The data from the dataframe are shuffled (to distribute the media randomly) and then split into three sets in the following proportions:
 - training data: 70 %
 - validation data: 20 %
 - test data: 10 %

In [None]:
# shuffle data to distribute electrolytes randomly
# (frac=1 returns every row exactly once, in random order)
# NOTE(review): no random_state is passed, so the row order differs on each run
dataframe = dataframe.sample(frac=1)

In [None]:
# feature names: every column except the last two
# (the last two columns are expected to be "medium" and "sound")
X_cols = list(dataframe.columns)[:-2]

# model inputs and prediction target (sound speed values)
X = dataframe[X_cols]
y = dataframe["sound"]

# bookkeeping columns kept aside for later use
media = dataframe["medium"]     # medium of every sample
T = dataframe["T"].copy()       # untouched copy of the T feature for model_evaluation plots

In [None]:
# dependencies for data splitting

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import train_test_split

In [None]:
# train / validation / test splitting
# target proportions: 70 % train, 20 % validation, 10 % test

# first hold out 30 % of the samples for validation + test
X_train, X_tv, y_train, y_tv = train_test_split(X, y, test_size=0.3)

# then move one third of the held-out samples to the test set (10 % of the
# total), leaving two thirds (20 % of the total) for validation;
# a fraction of 0.3 here would give a 21 % / 9 % split instead
# NOTE(review): no random_state is passed, so the split differs on each run
X_val, X_test, y_val, y_test = train_test_split(X_tv, y_tv, test_size=1 / 3)

In [None]:
# report the absolute and relative size of the train, validation and test sets
n_total = dataframe.shape[0]

size_reports = (
    ("Number of samples in train set: {0}, {1:.0f} % of total", y_train),
    ("Number of samples in validation set: {0}, {1:.0f} % of total", y_val),
    ("Number of samples in test set: {0}, {1:.0f} % of total", y_test),
)

for report_line, targets in size_reports:
    n_samples = len(targets)
    print(report_line.format(n_samples, 100 * n_samples / n_total))

## Standard normalization of temperature and molality related features

In [None]:
# import the standard normalization object
from sklearn.preprocessing import StandardScaler
# one scaler instance; the cells below fit it once and reuse it for every transform
norm = StandardScaler()

In [None]:
# features that have not been scaled yet and get standard-normalized below
to_norm = [
    "T",
    "c",
    "Kw",
    "cm",
    "is",
]

In [None]:
# the scaler is fitted on the training data only, so no statistics of the
# validation or test samples enter the normalization
norm.fit(X_train[to_norm])

# apply the training-derived scaling to every split, overwriting the columns
for split_features in (X_train, X_val, X_test):
    split_features[to_norm] = norm.transform(split_features[to_norm])

In [None]:
# sanity check of the normalization: on the training set every scaled
# feature should show mean = 0 and std = 1 (one feature per row)
X_train[to_norm].describe().T

## Save preprocessed dataset, ready for modeling

In [None]:
# save preprocessed dataset, must normalize it first
# NOTE: the scaled values overwrite the columns of `dataframe` in place, so
# running this cell a second time would apply the scaling twice
dataframe[to_norm] = norm.transform(dataframe[to_norm])
# add original temperature values for plotting in model_evaluation notebook
dataframe["T_orig"] = T
# write into the shared dataset directory instead of a second hard-coded path
dataframe.to_csv(path + "preprocessed.csv")

In [None]:
# save preprocessed outside matrix RbI test dataset

try:
    # scale with the normalization that was fitted on the training data
    outtest[to_norm] = norm.transform(outtest[to_norm])
    outtest.to_csv(path + "o_preprocessed.csv")
except NameError:
    # `outtest` is undefined when its CSV could not be loaded earlier; any
    # other failure (missing column, write error) is deliberately not caught
    # so that it is not misreported as a missing dataframe
    print("The outtest dataframe does not exist.")
    print("Do not forget to enhance the RbI data in features_addition notebook.")

## Baseline models
The following three models were proposed and tested as baseline models:
 - Decision tree: maximum depth 10
 - Elastic net: on original feature set
 - Elastic net: on 2nd order polynomial transformation of feature set

__Evaluation metric function__

In [None]:
# r2 and rmse and AARD returning function
from sklearn.metrics import r2_score, mean_squared_error

def result_stats(actual, predicted):
    """
    Returns r_2, rmse and AARD value for two arrays of equal length

    Parameters
    ----------
    actual : array-like
        Reference values.
    predicted : array-like
        Predicted values, in the same order as ``actual``.

    Returns
    -------
    tuple
        ``(r2, rmse, aard)`` where ``aard`` is the average absolute relative
        deviation in percent.

    Notes
    -----
    Both inputs are converted to NumPy arrays first, so all three metrics
    compare the values by position. Without the conversion the AARD term
    would fail for plain lists and would align two pandas Series by index
    label rather than by position.
    A zero in ``actual`` leads to a division by zero in the AARD term.
    """
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    r2 = r2_score(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    # relative deviation of every sample, averaged and expressed in percent
    aard = (100 / len(actual)) * np.sum(np.abs((actual - predicted) / actual))

    return r2, rmse, aard

__Baseline models import__

In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor

# decision tree with the depth limited to 10 levels
dt = DecisionTreeRegressor(max_depth=10)

# elastic net with every hyperparameter left at its default
en = ElasticNet()

# 2nd order polynomial expansion: squares as well as pairwise products
pr = PolynomialFeatures(degree=2, interaction_only=False)

### Testing the baseline models using 5 fold cross validation on training data

In [None]:
def cross_validate_baseline(model, c, X, y):
    """
    Five fold cross validation run on model using feature data X with target value y.
    Prints model RMSE.

    Parameters
    ----------
    model : estimator
        Model passed to ``cross_val_predict``.
    c : str
        Not used by this function; kept so existing calls keep working.
    X : array-like
        Feature data.
    y : array-like
        Target values.

    Returns
    -------
    None
        The result is only printed.
    """
    # out-of-fold predictions for every sample, scored in a single pass
    pred = cross_val_predict(model, X, y, cv=5)
    _, rmse, _ = result_stats(y, pred)

    # take the label from the class itself rather than parsing str(model)
    print('{1} CV Avg.: {0:.1f}'.format(rmse, type(model).__name__))

In [None]:
# decision tree ("b") and linear elastic net ("g") on the original feature set
for baseline_model, colour_code in ((dt, "b"), (en, "g")):
    cross_validate_baseline(baseline_model, colour_code, X_train, y_train)

# elastic net ("r") on the 2nd order polynomial transformation of the feature set
X_pr = pr.fit_transform(X_train)
cross_validate_baseline(en, "r", X_pr, y_train)

__Results of baseline models on validation data__

In [None]:
# train the decision tree on the training split, then score it on the validation split
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_val)

# result_stats returns (r2, rmse, aard); only the RMSE is reported
_, dt_rmse, _ = result_stats(y_val, dt_pred)
print("Validation results for Decision tree: RMSE {0:.1f}".format(dt_rmse))

In [None]:
# train the elastic net on the original features, then score it on the validation split
en.fit(X_train, y_train)
en_pred = en.predict(X_val)

# result_stats returns (r2, rmse, aard); only the RMSE is reported
_, en_rmse, _ = result_stats(y_val, en_pred)
print("Validation results for Elastic net with linear features: RMSE {0:.1f}".format(en_rmse))

In [None]:
# 2nd order polynomial expansion: fitted on the training features,
# then applied unchanged to the validation features
X_train_pr = pr.fit_transform(X_train)
X_val_pr = pr.transform(X_val)

# refit the elastic net on the expanded training data and predict the
# expanded validation set
en.fit(X_train_pr, y_train)
pr_pred = en.predict(X_val_pr)

# result_stats returns (r2, rmse, aard); only the RMSE is reported
_, pr_rmse, _ = result_stats(y_val, pr_pred)
print("Validation results for Elastic net with polynomial features: RMSE {0:.1f}"
      .format(pr_rmse))

In [None]:
# residual plots (prediction minus validation target) for all baseline models
# both panels show the same points; the second one has its y-axis limited

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

point_ids = range(len(y_val))

# (predictions, colour, legend label) in drawing order
baseline_series = (
    (dt_pred, "b", "Decision tree"),       # decision tree - blue
    (en_pred, "g", "Elastic net (1)"),     # linear elastic net - green
    (pr_pred, "y", "Elastic net (2)"),     # polynomial elastic net - yellow
)

for panel in ax:
    for predictions, colour, legend_label in baseline_series:
        panel.scatter(x=point_ids, y=predictions - y_val,
                      alpha=0.2, c=colour, label=legend_label)

    panel.set_xlabel("Data point ID")
    panel.set_ylabel("Residual $u$ [m/s]")

# the legend is drawn on the first panel only
ax[0].legend()
ax[1].set_ylim(-50, 50)
fig.suptitle("Baseline model residual plots on validation data")
plt.show()

## Save the train, validation and test data for use in other notebooks

In [None]:
# keep the (normalized) training features and targets for use in other notebooks
%store X_train
%store y_train

In [None]:
# keep the (normalized) validation features and targets for use in other notebooks
%store X_val
%store y_val

In [None]:
# keep the (normalized) test features and targets for use in other notebooks
%store X_test
%store y_test