In [None]:
import pandas as pd
import numpy as np
from scipy import stats


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Lasso, ElasticNet


from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

import warnings
from joblib import dump, load
warnings.filterwarnings("ignore")

In [None]:
# Input CSVs, one per glue station.
dataset_station = [
    "dataset_station1.csv",
    "dataset_station2.csv",
    "dataset_station3.csv",
    "dataset_station4.csv",
]

# Glue-type labels are recoded to the *string* codes '1'/'2'/'3'
# (the replacement values are strings, not integers).
skill_codes = {
    'Glue Type A': '1',
    'Glue Type B': '2',
    'Glue Type C': '3',
}

# Loaded frames keyed by their file name.
data = {}

for file in dataset_station:
    df = pd.read_csv(file)
    df['skill'] = df['skill'].replace(skill_codes)
    data[file] = df

# Convenience handles, in the same order as `dataset_station`.
df1, df2, df3, df4 = (data[name] for name in dataset_station)

In [None]:
# Display the station-1 frame (loaded from dataset_station1.csv) as the cell output.
df1

Unnamed: 0,velocity,skill,energy_consumption
0,75,2,7.763969
1,75,2,8.907676
2,60,2,9.694551
3,75,2,6.992823
4,50,1,11.282151
...,...,...,...
795,40,1,13.457407
796,60,2,11.688908
797,40,2,14.188629
798,40,2,14.021519


# GlueStation 1


In [None]:
# Outlier removal (station 1): drop every row in which at least one numeric
# column has an absolute z-score above `threshold`.
# NOTE(review): all numeric columns are z-scored, which presumably includes the
# target `energy_consumption` — confirm that filtering on the target is intended.
# NOTE(review): in the cells shown, only station 1 is filtered; stations 2-4 are
# modelled on unfiltered data — confirm this asymmetry is deliberate.
threshold = 2

df1_numeric = df1.select_dtypes(include=np.number)
numeric_columns = df1_numeric.columns

# Column-wise z-scores, folded to magnitudes so both tails are treated alike.
z_scores = np.abs(stats.zscore(df1_numeric))

# (row_positions, column_positions) of every cell that exceeds the threshold.
outlier_indices = np.where(z_scores > threshold)

# A row can be flagged once per offending column; de-duplicate the positions
# before translating them to index labels and dropping.
outlier_row_labels = df1.index[np.unique(outlier_indices[0])]
df1_no_outliers = df1.drop(outlier_row_labels)

In [None]:
# Station 1: benchmark a set of regressors on the outlier-filtered data.
# One seed drives both the split and every estimator that uses randomness at its
# default settings, so Restart & Run All reproduces the same leaderboard.
SEED = 42

# 80/20 hold-out split; the target is `energy_consumption`, everything else is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df1_no_outliers.drop("energy_consumption", axis=1),
    df1_no_outliers["energy_consumption"],
    test_size=0.2,
    random_state=SEED,
)

# (display name, estimator) pairs; estimators are fitted in place by the loop below.
models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('Decision Tree', DecisionTreeRegressor(random_state=SEED)),
    ('Random Forest', RandomForestRegressor(random_state=SEED)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=SEED)),
    ('Support Vector Regression', SVR()),
    ('K-Nearest Neighbors', KNeighborsRegressor()),
    ('Multi-layer Perceptron', MLPRegressor(random_state=SEED)),
    ('AdaBoost', AdaBoostRegressor(random_state=SEED))
]

# One record per model; the frame is built once at the end so the metric
# columns are numeric (float), which idxmin/idxmax rely on.
results = []

for model_name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate metrics on the test set
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    results.append({'Model': model_name, 'MSE': mse, 'R2': r2, 'MAE': mae})

results_df1 = pd.DataFrame(results, columns=['Model', 'MSE', 'R2', 'MAE'])

results_df1

Unnamed: 0,Model,MSE,R2,MAE
0,Linear Regression,1.16769,0.782474,0.877803
1,Ridge Regression,1.166655,0.782667,0.877188
2,Lasso Regression,1.819436,0.661062,1.102823
3,ElasticNet,1.809639,0.662887,1.099446
4,Decision Tree,0.948615,0.823285,0.797428
5,Random Forest,0.947685,0.823458,0.797373
6,Gradient Boosting,0.948613,0.823286,0.797431
7,Support Vector Regression,1.523117,0.716263,1.026733
8,K-Nearest Neighbors,1.004004,0.812967,0.823918
9,Multi-layer Perceptron,2.448007,0.543967,1.213024


In [None]:
# Locate the winning row of the station-1 leaderboard under each metric:
# lowest MSE, highest R2, lowest MAE.
mse_winner_label = results_df1['MSE'].idxmin()
r2_winner_label = results_df1['R2'].idxmax()
mae_winner_label = results_df1['MAE'].idxmin()

best_model_mse = results_df1.loc[mse_winner_label]
best_model_r2 = results_df1.loc[r2_winner_label]
best_model_mae = results_df1.loc[mae_winner_label]

# Print the best models based on each evaluation metric
for headline, winner in (
    ("Best Model based on MSE:", best_model_mse),
    ("Best Model based on R2:", best_model_r2),
    ("Best Model based on MAE:", best_model_mae),
):
    print(headline, winner)

Best Model based on MSE: Model    AdaBoost
MSE      0.944159
R2       0.824115
MAE      0.800032
Name: 10, dtype: object
Best Model based on R2: Model    AdaBoost
MSE      0.944159
R2       0.824115
MAE      0.800032
Name: 10, dtype: object
Best Model based on MAE: Model    Random Forest
MSE           0.947685
R2            0.823458
MAE           0.797373
Name: 5, dtype: object


In [None]:
# Save the best model for station 1.
# AdaBoost is stochastic; without a fixed seed every run persists a different
# fit, so the artifact on disk could not be reproduced. One seed is used for
# both the split and the estimator.
SEED = 42

BEST_MODEL = AdaBoostRegressor(random_state=SEED)

# Same 80/20 split parameters as the benchmark cell (target: energy_consumption).
X_train, X_test, y_train, y_test = train_test_split(
    df1_no_outliers.drop("energy_consumption", axis=1),
    df1_no_outliers["energy_consumption"],
    test_size=0.2,
    random_state=SEED,
)

model = BEST_MODEL
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Persist the fitted estimator; joblib.dump returns the list of written file names.
dump(model, 'GS1_joblib.joblib')

['GS1_joblib.joblib']

# GlueStation 2

In [None]:
# Station 2: benchmark a set of regressors on df2.
# One seed drives both the split and every estimator that uses randomness at its
# default settings, so Restart & Run All reproduces the same leaderboard.
SEED = 42

# 80/20 hold-out split; the target is `energy_consumption`, everything else is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df2.drop("energy_consumption", axis=1),
    df2["energy_consumption"],
    test_size=0.2,
    random_state=SEED,
)

# (display name, estimator) pairs; estimators are fitted in place by the loop below.
models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('Decision Tree', DecisionTreeRegressor(random_state=SEED)),
    ('Random Forest', RandomForestRegressor(random_state=SEED)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=SEED)),
    ('Support Vector Regression', SVR()),
    ('K-Nearest Neighbors', KNeighborsRegressor()),
    ('Multi-layer Perceptron', MLPRegressor(random_state=SEED)),
    ('AdaBoost', AdaBoostRegressor(random_state=SEED))
]

# One record per model; the frame is built once at the end so the metric
# columns are numeric (float), which idxmin/idxmax rely on.
results = []

for model_name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate metrics on the test set
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    results.append({'Model': model_name, 'MSE': mse, 'R2': r2, 'MAE': mae})

results_df2 = pd.DataFrame(results, columns=['Model', 'MSE', 'R2', 'MAE'])

results_df2

Unnamed: 0,Model,MSE,R2,MAE
0,Linear Regression,0.439321,0.648306,0.537838
1,Ridge Regression,0.439333,0.648296,0.537888
2,Lasso Regression,0.552441,0.557748,0.616263
3,ElasticNet,0.550417,0.559368,0.61639
4,Decision Tree,0.231756,0.81447,0.355795
5,Random Forest,0.231856,0.81439,0.355546
6,Gradient Boosting,0.231754,0.814471,0.3558
7,Support Vector Regression,0.256092,0.794987,0.383147
8,K-Nearest Neighbors,0.258103,0.793378,0.377418
9,Multi-layer Perceptron,0.609737,0.51188,0.551539


In [None]:
# Locate the winning row of the station-2 leaderboard under each metric:
# lowest MSE, highest R2, lowest MAE.
mse_winner_label = results_df2['MSE'].idxmin()
r2_winner_label = results_df2['R2'].idxmax()
mae_winner_label = results_df2['MAE'].idxmin()

best_model_mse = results_df2.loc[mse_winner_label]
best_model_r2 = results_df2.loc[r2_winner_label]
best_model_mae = results_df2.loc[mae_winner_label]

# Print the best models based on each evaluation metric
for headline, winner in (
    ("Best Model based on MSE:", best_model_mse),
    ("Best Model based on R2:", best_model_r2),
    ("Best Model based on MAE:", best_model_mae),
):
    print(headline, winner)

Best Model based on MSE: Model    Gradient Boosting
MSE               0.231754
R2                0.814471
MAE                 0.3558
Name: 6, dtype: object
Best Model based on R2: Model    Gradient Boosting
MSE               0.231754
R2                0.814471
MAE                 0.3558
Name: 6, dtype: object
Best Model based on MAE: Model    Random Forest
MSE           0.231856
R2             0.81439
MAE           0.355546
Name: 5, dtype: object


In [None]:
# Save the best model for station 2.
# Gradient boosting is seeded so the persisted fit is the same on every run;
# one seed is used for both the split and the estimator.
SEED = 42

BEST_MODEL = GradientBoostingRegressor(random_state=SEED)

# Same 80/20 split parameters as the benchmark cell (target: energy_consumption).
X_train, X_test, y_train, y_test = train_test_split(
    df2.drop("energy_consumption", axis=1),
    df2["energy_consumption"],
    test_size=0.2,
    random_state=SEED,
)

model = BEST_MODEL
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Persist the fitted estimator; joblib.dump returns the list of written file names.
dump(model, 'GS2_joblib.joblib')

['GS2_joblib.joblib']

# GlueStation 3

In [None]:
# Station 3: benchmark a set of regressors on df3.
# One seed drives both the split and every estimator that uses randomness at its
# default settings, so Restart & Run All reproduces the same leaderboard.
SEED = 42

# 80/20 hold-out split; the target is `energy_consumption`, everything else is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df3.drop("energy_consumption", axis=1),
    df3["energy_consumption"],
    test_size=0.2,
    random_state=SEED,
)

# (display name, estimator) pairs; estimators are fitted in place by the loop below.
models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('Decision Tree', DecisionTreeRegressor(random_state=SEED)),
    ('Random Forest', RandomForestRegressor(random_state=SEED)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=SEED)),
    ('Support Vector Regression', SVR()),
    ('K-Nearest Neighbors', KNeighborsRegressor()),
    ('Multi-layer Perceptron', MLPRegressor(random_state=SEED)),
    ('AdaBoost', AdaBoostRegressor(random_state=SEED))
]

# One record per model; the frame is built once at the end so the metric
# columns are numeric (float), which idxmin/idxmax rely on.
results = []

for model_name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate metrics on the test set
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    results.append({'Model': model_name, 'MSE': mse, 'R2': r2, 'MAE': mae})

results_df3 = pd.DataFrame(results, columns=['Model', 'MSE', 'R2', 'MAE'])

results_df3

Unnamed: 0,Model,MSE,R2,MAE
0,Linear Regression,0.382363,0.844038,0.522013
1,Ridge Regression,0.382537,0.843967,0.522123
2,Lasso Regression,1.969361,0.196719,1.262967
3,ElasticNet,1.766252,0.279565,1.181863
4,Decision Tree,0.153185,0.937518,0.309086
5,Random Forest,0.153308,0.937467,0.309243
6,Gradient Boosting,0.153183,0.937518,0.309082
7,Support Vector Regression,1.504622,0.386281,1.127992
8,K-Nearest Neighbors,0.213368,0.91297,0.366978
9,Multi-layer Perceptron,3.733901,-0.523017,1.618882


In [None]:
# Locate the winning row of the station-3 leaderboard under each metric:
# lowest MSE, highest R2, lowest MAE.
mse_winner_label = results_df3['MSE'].idxmin()
r2_winner_label = results_df3['R2'].idxmax()
mae_winner_label = results_df3['MAE'].idxmin()

best_model_mse = results_df3.loc[mse_winner_label]
best_model_r2 = results_df3.loc[r2_winner_label]
best_model_mae = results_df3.loc[mae_winner_label]

# Print the best models based on each evaluation metric
for headline, winner in (
    ("Best Model based on MSE:", best_model_mse),
    ("Best Model based on R2:", best_model_r2),
    ("Best Model based on MAE:", best_model_mae),
):
    print(headline, winner)

Best Model based on MSE: Model    AdaBoost
MSE      0.152815
R2       0.937668
MAE      0.309135
Name: 10, dtype: object
Best Model based on R2: Model    AdaBoost
MSE      0.152815
R2       0.937668
MAE      0.309135
Name: 10, dtype: object
Best Model based on MAE: Model    Gradient Boosting
MSE               0.153183
R2                0.937518
MAE               0.309082
Name: 6, dtype: object


In [None]:
# Save the best model for station 3.
# AdaBoost is stochastic; without a fixed seed every run persists a different
# fit, so the artifact on disk could not be reproduced. One seed is used for
# both the split and the estimator.
SEED = 42

BEST_MODEL = AdaBoostRegressor(random_state=SEED)

# Same 80/20 split parameters as the benchmark cell (target: energy_consumption).
X_train, X_test, y_train, y_test = train_test_split(
    df3.drop("energy_consumption", axis=1),
    df3["energy_consumption"],
    test_size=0.2,
    random_state=SEED,
)

model = BEST_MODEL
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Persist the fitted estimator; joblib.dump returns the list of written file names.
dump(model, 'GS3_joblib.joblib')

['GS3_joblib.joblib']

# GlueStation 4

In [None]:
# Station 4: benchmark a set of regressors on df4.
# One seed drives both the split and every estimator that uses randomness at its
# default settings, so Restart & Run All reproduces the same leaderboard.
SEED = 42

# 80/20 hold-out split; the target is `energy_consumption`, everything else is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df4.drop("energy_consumption", axis=1),
    df4["energy_consumption"],
    test_size=0.2,
    random_state=SEED,
)

# (display name, estimator) pairs; estimators are fitted in place by the loop below.
models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('Decision Tree', DecisionTreeRegressor(random_state=SEED)),
    ('Random Forest', RandomForestRegressor(random_state=SEED)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=SEED)),
    ('Support Vector Regression', SVR()),
    ('K-Nearest Neighbors', KNeighborsRegressor()),
    ('Multi-layer Perceptron', MLPRegressor(random_state=SEED)),
    ('AdaBoost', AdaBoostRegressor(random_state=SEED))
]

# One record per model; the frame is built once at the end so the metric
# columns are numeric (float), which idxmin/idxmax rely on.
results = []

for model_name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate metrics on the test set
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    results.append({'Model': model_name, 'MSE': mse, 'R2': r2, 'MAE': mae})

results_df4 = pd.DataFrame(results, columns=['Model', 'MSE', 'R2', 'MAE'])

results_df4

Unnamed: 0,Model,MSE,R2,MAE
0,Linear Regression,1.507016,0.704277,1.050555
1,Ridge Regression,1.507039,0.704273,1.050582
2,Lasso Regression,1.593077,0.687389,1.092395
3,ElasticNet,1.582126,0.689538,1.088648
4,Decision Tree,0.179848,0.964708,0.326348
5,Random Forest,0.179464,0.964784,0.325919
6,Gradient Boosting,0.179727,0.964732,0.326542
7,Support Vector Regression,1.588665,0.688255,0.978565
8,K-Nearest Neighbors,0.232489,0.954379,0.369531
9,Multi-layer Perceptron,1.772587,0.652164,1.14057


In [None]:
# Locate the winning row of the station-4 leaderboard under each metric:
# lowest MSE, highest R2, lowest MAE.
mse_winner_label = results_df4['MSE'].idxmin()
r2_winner_label = results_df4['R2'].idxmax()
mae_winner_label = results_df4['MAE'].idxmin()

best_model_mse = results_df4.loc[mse_winner_label]
best_model_r2 = results_df4.loc[r2_winner_label]
best_model_mae = results_df4.loc[mae_winner_label]

# Print the best models based on each evaluation metric
for headline, winner in (
    ("Best Model based on MSE:", best_model_mse),
    ("Best Model based on R2:", best_model_r2),
    ("Best Model based on MAE:", best_model_mae),
):
    print(headline, winner)


Best Model based on MSE: Model    Random Forest
MSE           0.179464
R2            0.964784
MAE           0.325919
Name: 5, dtype: object
Best Model based on R2: Model    Random Forest
MSE           0.179464
R2            0.964784
MAE           0.325919
Name: 5, dtype: object
Best Model based on MAE: Model    Random Forest
MSE           0.179464
R2            0.964784
MAE           0.325919
Name: 5, dtype: object


In [None]:
# Save the best model for station 4.
# The recorded leaderboard output for this station ranks Random Forest first on
# MSE, R2 and MAE, so that is the estimator persisted here (the cell previously
# instantiated GradientBoostingRegressor, which did not match that output).
# NOTE(review): the top tree-based models are separated only in the fourth
# decimal place — re-check the ranking after a fresh run before shipping.
# The estimator is seeded so the persisted fit is the same on every run.
SEED = 42

BEST_MODEL = RandomForestRegressor(random_state=SEED)

# Same 80/20 split parameters as the benchmark cell (target: energy_consumption).
X_train, X_test, y_train, y_test = train_test_split(
    df4.drop("energy_consumption", axis=1),
    df4["energy_consumption"],
    test_size=0.2,
    random_state=SEED,
)

model = BEST_MODEL
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Persist the fitted estimator; joblib.dump returns the list of written file names.
dump(model, 'GS4_joblib.joblib')

['GS4_joblib.joblib']