In [1]:
import numpy as np 
import pandas as pd 
import plotly.express as px
import os

# Data

In [4]:
data_path = os.path.join("..","..","data","data2_280722.csv")
df = pd.read_csv(data_path)
# Remove NaN columns
df.drop(["Unnamed: 4", "Unnamed: 5"], axis = 1, inplace = True)
# Remove any missing values
df.dropna(inplace=True)
df.head()

Unnamed: 0,xk,xpl,xmw,xTe,yts
0,0.4,0.6,1.4,229.7,0.0
1,0.4,0.6,1.342,240.7,0.5
2,0.4,0.6,1.284,251.5,1.0
3,0.4,0.6,1.228,262.1,1.5
4,0.4,0.6,1.172,272.5,2.0


In [5]:
print(f"df size : {df.shape}")

df size : (351, 5)


In [7]:
p = px.scatter_matrix(df, color = "yts", color_continuous_scale="Sunsetdark")
p.update_layout(template="plotly_white")

In [14]:
X = df.drop(["yts"],axis = 1).values
y = df[["yts"]].values

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)
print(f"X_train : {X_train.shape}, y_train : {y_train.shape}")
print(f"X_test : {X_test.shape}, y_test : {y_test.shape}")

X_train : (280, 4), y_train : (280, 1)
X_test : (71, 4), y_test : (71, 1)


In [8]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, cross_validate, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from scipy import stats

# Functions

## Train

In [9]:
def train_model(param_grid, estimator,estimator_name,train_set):
    # MultiOutputRegressor(estimator = SVR())
    # Get X_train, y_train
    X_train, y_train = train_set

    # Construct pipeline
    pipe = Pipeline([
        ("scaler", MinMaxScaler()),
        (estimator_name, estimator)
    ])

    # Randomized Search
    search = RandomizedSearchCV(
        pipe,
        param_grid,
        scoring = ("neg_mean_squared_error"),
        cv = KFold(n_splits = 5, shuffle = True),
        n_jobs = -1,
        refit = True,
        n_iter = 60,
        verbose = 0,
        return_train_score = True
    )
    
    # Fit model
    search.fit(X_train, y_train)

    return search

## CV Results

In [10]:
def get_cv_result(model):
    results = pd.DataFrame(model.cv_results_)
    results.sort_values(by = "mean_test_score", ascending=False, inplace=True)
    
    p = px.scatter(
        x = np.arange(0,results.shape[0]), 
        y = results["mean_test_score"],
        error_y=results["std_test_score"]
    )
    p.update_layout(
        template = "plotly_white", 
        xaxis_title = "hyperparam_idx", 
        yaxis_title = "MSE"
    )

    return results, p

## Prediction and Scores

In [11]:
def get_predictions_and_scores(model,model_type,predictions, metrics):
    yhat = model.predict(X)
    predictions[f"yts_hat_{model_type}"] = yhat

    rmse = np.sqrt(mean_squared_error(y, yhat))
    mae = mean_absolute_error(y, yhat)
    r2 = r2_score(y, yhat)
    var = np.var(y - yhat)

    model_metrics = pd.DataFrame({
        "model" : [model_type],
        "rmse" : [rmse],
        "mae" : [mae],
        "r2" : [r2],
        "var" : [np.var(y - yhat)]
    })
    metrics = pd.concat([metrics, model_metrics],axis = 0)

    return predictions, metrics

## Keep track of predictions and scores

In [15]:
# Store Predictions
predictions = pd.DataFrame(columns = ["yts"])
predictions[["yts"]] = y

# Store Metrics
metrics = pd.DataFrame(columns = ["model","rmse", "mae", "r2", "var"])

# Models

## SVM

In [17]:
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor

# Define parameters space
param_grid = [
    {
        "svm__estimator__C" : stats.uniform(0.1,100), 
        "svm__estimator__kernel" : ["linear"]
    },
    {
        "svm__estimator__C" : stats.loguniform(0.1, 100), 
        "svm__estimator__gamma" : stats.loguniform(0.0001, 10), 
        "svm__estimator__kernel" : ["poly","rbf"]
    }
]

search = train_model(
    param_grid=param_grid, 
    estimator = MultiOutputRegressor(estimator = SVR()),
    estimator_name = "svm",
    train_set=(X_train, y_train)
)

print(f"Best Params : {search.best_params_}")
print(f"Best train score : {search.best_score_}")

cv_results, p = get_cv_result(search)
#print(results.head())
p.show()

Best Params : {'svm__estimator__C': 7.754821813494335, 'svm__estimator__gamma': 6.867486622150736, 'svm__estimator__kernel': 'poly'}
Best train score : -0.003518366716673581


In [18]:
predictions, metrics = get_predictions_and_scores(search.best_estimator_, "svm", predictions, metrics)
print(predictions.head())
print(metrics.head())

   yts  yts_hat_svm
0  0.0     0.054192
1  0.5     0.586739
2  1.0     1.099865
3  1.5     1.581323
4  2.0     2.053951
  model      rmse       mae        r2       var
0   svm  0.055863  0.047105  0.999108  0.003089


## Random Forest

In [19]:
from sklearn.ensemble import RandomForestRegressor

param_grid = {
"rf__max_depth" : np.arange(10,120,10),
"rf__n_estimators" : np.arange(100,1000, 100),
"rf__min_samples_split" : stats.randint(1,10),
"rf__min_samples_leaf" : stats.randint(1,5)
}

search = train_model(
    param_grid=param_grid, 
    estimator = RandomForestRegressor(),
    estimator_name = "rf",
    train_set=(X_train, y_train)
)

print(f"Best Params : {search.best_params_}")
print(f"Best train score : {search.best_score_}")

# Cv results
results, p = get_cv_result(search)
results = results[["param_rf__max_depth","param_rf__n_estimators","param_rf__min_samples_split","param_rf__min_samples_leaf","mean_train_score","std_train_score"]]
#print(results.head())
p.show()

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_ste

Best Params : {'rf__max_depth': 100, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 400}
Best train score : -0.02359775669642857


In [20]:
predictions, metrics = get_predictions_and_scores(search.best_estimator_, "rf", predictions, metrics)
print(predictions.head())
print(metrics.head())

   yts  yts_hat_svm  yts_hat_rf
0  0.0     0.054192     0.00000
1  0.5     0.586739     0.50000
2  1.0     1.099865     1.00375
3  1.5     1.581323     1.52625
4  2.0     2.053951     2.07625
  model      rmse       mae        r2       var
0   svm  0.055863  0.047105  0.999108  0.003089
0    rf   0.07281   0.04209  0.998485  6.976246


## KNN

In [21]:
from sklearn.neighbors import KNeighborsRegressor

param_grid = {
    "knn__n_neighbors" : [3,5,7,9,11,13,15],
    "knn__weights" : ["uniform", "distance"],
    "knn__metric" : ["minkowski","euclidean","manhattan"]
}

search = train_model(
    param_grid = param_grid, 
    estimator = KNeighborsRegressor(),
    estimator_name = "knn",
    train_set=(X_train, y_train)
)

print(f"Best Params : {search.best_params_}")
print(f"Best train score : {search.best_score_}")

# Cv results
results, p = get_cv_result(search)
results = results[["param_knn__n_neighbors","param_knn__weights","param_knn__metric","mean_train_score","std_train_score"]]
#print(results.head())
p.show()

Best Params : {'knn__weights': 'distance', 'knn__n_neighbors': 3, 'knn__metric': 'manhattan'}
Best train score : -0.047611124296547326



The total space of parameters 42 is smaller than n_iter=60. Running 42 iterations. For exhaustive searches, use GridSearchCV.



In [22]:
predictions, metrics = get_predictions_and_scores(search.best_estimator_, "knn", predictions, metrics)
print(predictions.head())
print(metrics.head())

   yts  yts_hat_svm  yts_hat_rf  yts_hat_knn
0  0.0     0.054192     0.00000          0.0
1  0.5     0.586739     0.50000          0.5
2  1.0     1.099865     1.00375          1.0
3  1.5     1.581323     1.52625          1.5
4  2.0     2.053951     2.07625          2.0
  model      rmse       mae        r2       var
0   svm  0.055863  0.047105  0.999108  0.003089
0    rf   0.07281   0.04209  0.998485  6.976246
0   knn  0.083804  0.025377  0.997993  0.007023


## Elastic Net

In [23]:
from sklearn.linear_model import ElasticNet

param_grid = {
    "glm__alpha" : stats.uniform(1e-5,100),
    "glm__l1_ratio" : np.arange(0,1,0.01),
}

search = train_model(
    param_grid = param_grid, 
    estimator = ElasticNet(),
    estimator_name = "glm",
    train_set=(X_train, y_train)
)

print(f"Best Params : {search.best_params_}")
print(f"Best train score : {search.best_score_}")

# Cv results
results, p = get_cv_result(search)
#results = results[["param_knn__n_neighbors","param_knn__weights","param_knn__metric","mean_train_score","std_train_score"]]
#print(results.head())
p.show()

Best Params : {'glm__alpha': 48.203724312136195, 'glm__l1_ratio': 0.0}
Best train score : -3.509599021709255


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(

Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 4.916e+02, tolerance: 9.859e-02 Linear regression models with null weight for the l1 regularization term are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.



In [24]:
predictions, metrics = get_predictions_and_scores(search.best_estimator_, "glm", predictions, metrics)
print(predictions.head())
print(metrics.head())

   yts  yts_hat_svm  yts_hat_rf  yts_hat_knn  yts_hat_glm
0  0.0     0.054192     0.00000          0.0     2.969580
1  0.5     0.586739     0.50000          0.5     2.971277
2  1.0     1.099865     1.00375          1.0     2.972958
3  1.5     1.581323     1.52625          1.5     2.974594
4  2.0     2.053951     2.07625          2.0     2.976215
  model      rmse       mae        r2       var
0   svm  0.055863  0.047105  0.999108  0.003089
0    rf   0.07281   0.04209  0.998485  6.976246
0   knn  0.083804  0.025377  0.997993  0.007023
0   glm  1.865901  1.612674  0.005261  3.500027


## ANN

In [26]:
import os 
import sys
import torch 
import torch.nn as nn 
import torch.optim as optim 
import torch.nn.functional as F

from copy import deepcopy

class Simple_MLP(nn.Module):
    def  __init__(self, l1, l2):
        super(Simple_MLP, self).__init__()
        self.fc1 = nn.Linear(in_features= 4, out_features=l1)
        self.fc2 = nn.Linear(in_features = l1, out_features = l2)
        self.fc3 = nn.Linear(in_features=l2, out_features = 1) 

    def forward(self, X):
        out = F.tanh(self.fc1(X))
        out = F.tanh(self.fc2(out))
        out = self.fc3(out)
        return out

def train_MLP(model,criterion, optimizer,data,epochs=100):
    training_loss = []
    validation_loss = []

    X_train, X_val, y_train, y_val = data

    min_val_loss = sys.maxsize

    for e in range(epochs):
        # forward pass
        model.train()
        yhat_train = model.forward(X_train)
        loss = criterion(yhat_train, y_train)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            model.eval()
            yhat_val = model.forward(X_val)   
            val_loss = criterion(yhat_val, y_val)
            training_loss.append(loss.item())
            validation_loss.append(val_loss.item())

        if val_loss.item() < min_val_loss:
            min_val_loss = val_loss.item()
            best_model = deepcopy(model)

        if e < 10 or (e > 10 and e%100 == 0):
            print(f"epochs : {e}, train_loss : {loss.item()}, val_loss : {val_loss.item()}")

    return best_model

In [28]:
X_train_t = torch.Tensor(X_train)
#X_val_t = torch.Tensor(X_val)
X_test_t = torch.Tensor(X_test)
y_train_t = torch.Tensor(y_train)
#y_val_t = torch.Tensor(y_val)
y_test_t = torch.tensor(y_test)

print(f"X_train : {X_train_t.shape}, y_train : {y_train_t.shape}")
#print(f"X_val : {X_val_t.shape}, y_train : {y_val_t.shape}")
print(f"X_test : {X_test_t.shape}, y_test : {y_test_t.shape}")

X_train : torch.Size([280, 4]), y_train : torch.Size([280, 1])
X_test : torch.Size([71, 4]), y_test : torch.Size([71, 1])


In [31]:
model = Simple_MLP(l1 = 256, l2 = 128)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())
data = (X_train_t, X_test_t, y_train_t, y_test_t)

best_model = train_MLP(model, criterion, optimizer, data, 10000)

epochs : 0, train_loss : 10.930057525634766, val_loss : 6.191565435419587
epochs : 1, train_loss : 5.960840702056885, val_loss : 3.8340891443378085
epochs : 2, train_loss : 3.817307472229004, val_loss : 3.4146390050177167
epochs : 3, train_loss : 3.564136266708374, val_loss : 3.8207485081672177
epochs : 4, train_loss : 4.081179141998291, val_loss : 4.261231472883874
epochs : 5, train_loss : 4.580813884735107, val_loss : 4.465123133535685
epochs : 6, train_loss : 4.806750297546387, val_loss : 4.445779645995713
epochs : 7, train_loss : 4.7854084968566895, val_loss : 4.2834648482405395
epochs : 8, train_loss : 4.605572700500488, val_loss : 4.055902693781383
epochs : 9, train_loss : 4.3505144119262695, val_loss : 3.8227925158208365
epochs : 100, train_loss : 3.115703582763672, val_loss : 3.0307203156327356
epochs : 200, train_loss : 0.2583937644958496, val_loss : 0.3287555647257851
epochs : 300, train_loss : 0.20288588106632233, val_loss : 0.24227746036308898
epochs : 400, train_loss : 0.1

In [34]:
def get_ann_predictions_and_scores(model,model_type,predictions, metrics):
    yhat = model.forward(torch.Tensor(X)).detach().numpy()
    predictions[f"yts_hat_{model_type}"] = yhat

    rmse = np.sqrt(mean_squared_error(y, yhat))
    mae = mean_absolute_error(y, yhat)
    r2 = r2_score(y, yhat)
    var = np.var(y - yhat)

    model_metrics = pd.DataFrame({
        "model" : [model_type],
        "rmse" : [rmse],
        "mae" : [mae],
        "r2" : [r2],
        "var" : [np.var(y - yhat)]
    })
    metrics = pd.concat([metrics, model_metrics],axis = 0)

    return predictions, metrics

In [35]:
predictions, metrics = get_ann_predictions_and_scores(best_model, "mlp", predictions, metrics)

In [36]:
predictions

Unnamed: 0,yts,yts_hat_svm,yts_hat_rf,yts_hat_knn,yts_hat_glm,yts_hat_mlp
0,0.0,0.054192,0.00000,0.000000,2.969580,-0.015550
1,0.5,0.586739,0.50000,0.500000,2.971277,0.540808
2,1.0,1.099865,1.00375,1.000000,2.972958,1.055843
3,1.5,1.581323,1.52625,1.500000,2.974594,1.541129
4,2.0,2.053951,2.07625,2.000000,2.976215,2.023056
...,...,...,...,...,...,...
346,4.0,3.944944,4.28000,4.000000,2.980555,4.019737
347,4.5,4.414654,4.54500,4.500000,2.981705,4.516200
348,5.0,4.899863,5.07000,5.000000,2.982887,5.011570
349,5.5,5.396785,5.59500,5.129357,2.984092,5.493153


In [37]:
metrics

Unnamed: 0,model,rmse,mae,r2,var
0,svm,0.055863,0.047105,0.999108,0.003089
0,rf,0.07281,0.04209,0.998485,6.976246
0,knn,0.083804,0.025377,0.997993,0.007023
0,glm,1.865901,1.612674,0.005261,3.500027
0,mlp,0.032429,0.026635,0.9997,0.001047


# Save DataFrame

In [38]:
predictions.to_csv("results/prediction2.csv")
metrics.to_csv("results/metrics2.csv")