# Baselines

This notebook contains baseline models for hardness modelling problem.

### Setup

In [1]:
import sys

sys.path.append(r"..")

In [2]:
import joblib
import numpy as np
import pandas as pd
import wandb
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
import tensorflow as tf
from wandb.keras import WandbCallback

from src.plotting import plot_hardness
from src.metrics import regression_report, regression_score

### Data

In [3]:
train_df = pd.read_csv(r"../data/train.csv")
validation_df = pd.read_csv(r"../data/validation.csv")
test_df = pd.read_csv(r"../data/test.csv")

feature_preprocessor = joblib.load(r"../models/feature_preprocessor.joblib")
target_preprocessor = joblib.load(r"../models/target_preprocessor.joblib")

In [4]:
feature_columns = ["Distance", "Electric Current", "Speed", "Beads", "Bead Type Feature"]
target_columns = ["Hardness"]

### Baselines

Use baseline models to evalute against. <br>
Results might be slightly different than those reported in the paper.

### Linear Regression

In [5]:
linear_regression_model = LinearRegression()
linear_regression_model.fit(train_df[feature_columns].values, train_df[target_columns].values)

LinearRegression()

In [6]:
predictions = linear_regression_model.predict(validation_df[feature_columns].values)
predicted_hardness = target_preprocessor.inverse_transform(predictions)

In [7]:
print(regression_report(y_true=validation_df[target_columns].values.flatten(), y_pred=predicted_hardness.flatten()))

                                Absolute                        Normalized                      

Mean Squared Error:             10941.1330                      0.3457                          
Root Mean Squared Error:        104.5999                        0.4725                          
Mean Absolute Error:            88.4826                         0.4725                          
Median Absolute Error:          89.7381                         0.3682                          
Max Error:                      236.6059                        0.3768                          
R2                                                              0.5629                          


                                True                            Predicted                       

Mean:                           239.8083                        260.3124                        
std:                            158.2168                        124.7511                        



In [8]:
joblib.dump(linear_regression_model, "../models/linear_regression.model")

['../models/linear_regression.model']

### SVR

In [9]:
grid = GridSearchCV(
    estimator=SVR(),
    param_grid={
        "kernel": ["linear", "rbf", "poly"],
        "C": np.linspace(0.1, 10, 100),
        "degree": [2, 3, 4, 5, 6, 7, 8, 9, 10]
    }
)

grid.fit(train_df[feature_columns].values, train_df[target_columns].values.ravel())

GridSearchCV(estimator=SVR(),
             param_grid={'C': array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ,  1.1,
        1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9,  2. ,  2.1,  2.2,
        2.3,  2.4,  2.5,  2.6,  2.7,  2.8,  2.9,  3. ,  3.1,  3.2,  3.3,
        3.4,  3.5,  3.6,  3.7,  3.8,  3.9,  4. ,  4.1,  4.2,  4.3,  4.4,
        4.5,  4.6,  4.7,  4.8,  4.9,  5. ,  5.1,  5.2,  5.3,  5.4,  5.5,
        5.6,  5.7,  5.8,  5.9,  6. ,  6.1,  6.2,  6.3,  6.4,  6.5,  6.6,
        6.7,  6.8,  6.9,  7. ,  7.1,  7.2,  7.3,  7.4,  7.5,  7.6,  7.7,
        7.8,  7.9,  8. ,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  8.7,  8.8,
        8.9,  9. ,  9.1,  9.2,  9.3,  9.4,  9.5,  9.6,  9.7,  9.8,  9.9,
       10. ]),
                         'degree': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'kernel': ['linear', 'rbf', 'poly']})

In [10]:
pd.DataFrame(grid.cv_results_)[["param_C", "param_degree", "param_kernel", "mean_test_score", "std_test_score", "rank_test_score"]].sort_values(by="rank_test_score").head(10)

Unnamed: 0,param_C,param_degree,param_kernel,mean_test_score,std_test_score,rank_test_score
2071,7.7,8,rbf,0.807318,0.058776,1
2068,7.7,7,rbf,0.807318,0.058776,1
2065,7.7,6,rbf,0.807318,0.058776,1
2077,7.7,10,rbf,0.807318,0.058776,1
2062,7.7,5,rbf,0.807318,0.058776,1
2059,7.7,4,rbf,0.807318,0.058776,1
2056,7.7,3,rbf,0.807318,0.058776,1
2053,7.7,2,rbf,0.807318,0.058776,1
2074,7.7,9,rbf,0.807318,0.058776,1
2002,7.5,3,rbf,0.807303,0.05955,10


In [11]:
predictions = grid.best_estimator_.predict(validation_df[feature_columns].values)
predicted_hardness = target_preprocessor.inverse_transform(np.expand_dims(predictions, axis=-1))

In [12]:
print(regression_report(y_true=validation_df["Hardness"].values, y_pred=predicted_hardness))

                                Absolute                        Normalized                      

Mean Squared Error:             6764.8094                       1.2103                          
Root Mean Squared Error:        82.2485                         0.7431                          
Mean Absolute Error:            58.3689                         0.7431                          
Median Absolute Error:          33.2123                         0.4779                          
Max Error:                      231.5113                        0.3686                          
R2                                                              0.7298                          


                                True                            Predicted                       

Mean:                           239.8083                        245.8289                        
std:                            158.2168                        137.4158                        



In [13]:
joblib.dump(grid.best_estimator_, "../models/svr.model")

['../models/svr.model']

# MLP

Test different MLP models using `wandb` library and system. <br>
Access to `wandb` is not a part of this repository, only code for execution is provided.

Accounts for `wandb` can be create [here](https://wandb.ai/site). 

In [14]:
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkzajac[0m (use `wandb login --relogin` to force relogin)


True

In [15]:
sweep_config = {
    "name" : "mlp-baseline-sweep",
    "method" : "grid",
    "metric": {
        "name": "MSE",
        "goal": "minimize",
    },
    "parameters" : {
        "optimizer": {
            "values": ["rmsprop"]
        },
        "n_layers": {
            "values": [1, 2, 3, 4, 5],
        },
        "fc_layer_size": {
            "values": [8, 10, 12],
        },
        "epochs": {
            "values": [1000]
        },
        "activation": {
            "values": ["tanh", "relu", "sigmoid"],
        },
        "patience": {
            "values": [10],
        }
    }
}

sweep_id = wandb.sweep(sweep_config)

Create sweep with ID: meolpjgy
Sweep URL: https://wandb.ai/kzajac/uncategorized/sweeps/meolpjgy


In [16]:
def model_from_parameters(parameters):
    mlp_model = tf.keras.Sequential()
    mlp_model.add(tf.keras.layers.InputLayer(input_shape=(5,)))  # number of inputs is fixed

    for _ in range(parameters["n_layers"]):
        mlp_model.add(tf.keras.layers.Dense(parameters["fc_layer_size"], activation=parameters["activation"]))

    mlp_model.add(tf.keras.layers.Dense(1, activation="linear"))  # one output
    
    return mlp_model

In [19]:
def train(config, features, targets): 
    model = model_from_parameters(config)  # creates an instance of model
        
    model.compile(optimizer=config["optimizer"], loss=tf.losses.MeanSquaredError())
    model.fit(
        features.squeeze(),
        targets.squeeze(),
        epochs=config["epochs"],
        verbose=0,
        validation_split=0.1,
        callbacks=[tf.keras.callbacks.EarlyStopping(patience=config["patience"], monitor="val_loss"), WandbCallback()]
    )
        
    return model
        
def test(model, features, targets):
    predictions = model.predict(features.squeeze())
    
    # compute metrics
    scores = regression_score(y_true=targets.flatten(), y_pred=predictions.flatten())
    wandb.log(scores.to_dict())
    
def run_experiment():
    with wandb.init() as run:
        config = wandb.config
        model = train(config, train_df[feature_columns].values, train_df[target_columns].values.ravel())
        test(model, validation_df[feature_columns].values, validation_df[target_columns].values.ravel())

In [1]:
# Runs takes around 10-20 minutes
# wandb.agent(sweep_id, function=run_experiment)