# Imports

In [1]:
import sys
sys.path.append("../src/")

from utils import mean_std_cross_val_scores, mape

import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm.sklearn import LGBMClassifier, LGBMRegressor
from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LogisticRegression, Ridge
from sklearn.metrics import make_scorer, mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, FunctionTransformer
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils import shuffle
from tqdm import tqdm
from xgboost import XGBClassifier, XGBRegressor

# Seed both global RNGs used by this stack. scikit-learn falls back to NumPy's
# global generator whenever an estimator or splitter is given no explicit
# `random_state`, so seeding only the standard-library `random` module would
# leave such steps non-repeatable.
random.seed(0)
np.random.seed(0)

# Reading In the Dataset

In [2]:
# Load the CSV from the sibling ../data directory and display the frame
# as the cell output.
df = pd.read_csv(filepath_or_buffer="../data/dataset_generated_by_extrucal.csv")
df

Unnamed: 0,extruder_size,metering_depth,polymer_density,rpm,screw_pitch,flight_width,number_flight,throughput
0,160,8.0,900,90.0,192.0,19.2,2,1236.227039
1,80,3.2,800,90.0,64.0,12.8,2,55.664271
2,250,15.0,800,20.0,350.0,50.0,1,1380.830874
3,90,7.2,1100,30.0,162.0,12.6,1,218.844659
4,80,6.4,1400,10.0,64.0,4.8,2,28.502353
...,...,...,...,...,...,...,...,...
1505275,180,10.8,1000,50.0,252.0,28.8,1,1612.587040
1505276,190,13.3,1000,90.0,304.0,11.4,2,4451.222743
1505277,150,12.0,1000,60.0,240.0,15.0,1,1703.915027
1505278,240,9.6,1100,90.0,288.0,43.2,1,4452.281793


In [3]:
# Store the natural log of `throughput` in a new `log_throughput` column;
# this log-scaled column is what the models below are trained on.
df["log_throughput"] = np.log(df["throughput"].to_numpy())

# Train/Test Split

In [4]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# repeatable. Then preview the first rows of the training portion.
train_df, test_df = train_test_split(
    df,
    random_state=123,
    test_size=0.2,
)
train_df.head(n=5)

Unnamed: 0,extruder_size,metering_depth,polymer_density,rpm,screw_pitch,flight_width,number_flight,throughput,log_throughput
740665,50,2.0,1400,10.0,70.0,9.0,2,5.13878,1.636816
164740,160,11.2,1200,70.0,192.0,12.8,1,2061.690137,7.631281
250675,30,2.1,800,40.0,48.0,5.4,1,6.274011,1.836416
1371460,220,8.8,1100,60.0,308.0,22.0,2,2400.035929,7.783239
1356849,60,3.0,1300,90.0,36.0,10.8,1,42.653946,3.75312


In [5]:
# Print the training frame's column dtypes, non-null counts and memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1204224 entries, 740665 to 773630
Data columns (total 9 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   extruder_size    1204224 non-null  int64  
 1   metering_depth   1204224 non-null  float64
 2   polymer_density  1204224 non-null  int64  
 3   rpm              1204224 non-null  float64
 4   screw_pitch      1204224 non-null  float64
 5   flight_width     1204224 non-null  float64
 6   number_flight    1204224 non-null  int64  
 7   throughput       1204224 non-null  float64
 8   log_throughput   1204224 non-null  float64
dtypes: float64(6), int64(3)
memory usage: 91.9 MB


In [6]:
# Separate predictors from the target. The raw `throughput` column is dropped
# together with `log_throughput`, so neither form of the target remains among
# the features. The target itself is the log-scaled column.
X_train = train_df.drop(["throughput", "log_throughput"], axis="columns")
y_train = train_df.loc[:, "log_throughput"]

X_test = test_df.drop(["throughput", "log_throughput"], axis="columns")
y_test = test_df.loc[:, "log_throughput"]

# Preparation of Preprocessors

In [7]:
# Columns that are only standardized.
no_log_features = [
    "extruder_size",
    "polymer_density",
    "rpm",
    "number_flight",
]
# Columns that are log-transformed before being standardized.
log_features = [
    "metering_depth",
    "screw_pitch",
    "flight_width",
]

In [8]:
# Single-step pipeline that standardizes its input columns. The step name is
# the one `make_pipeline` would generate (lower-cased class name).
no_log_transformer = Pipeline(
    steps=[("standardscaler", StandardScaler())],
)

In [9]:
# Two-step pipeline: take the natural log of the input columns, then
# standardize them. Step names are the ones `make_pipeline` would generate.
log_transformer = Pipeline(
    steps=[
        ("functiontransformer", FunctionTransformer(np.log)),
        ("standardscaler", StandardScaler()),
    ],
)

In [10]:
# Route each feature group to its transformer. The transformer names are the
# ones `make_column_transformer` would generate for two Pipeline objects.
# Columns not listed in either group are dropped (the default `remainder`).
preprocessor = ColumnTransformer(
    transformers=[
        ("pipeline-1", no_log_transformer, no_log_features),
        ("pipeline-2", log_transformer, log_features),
    ],
)

# Cross-Validation of ML Models

In [11]:
# Accumulates one cross-validation summary per model, keyed by model name.
results = {}

In [12]:
# Wrap the project's `mape` helper as a scikit-learn scorer. It is marked as
# an error metric (lower is better), so scikit-learn negates its value.
# NOTE(review): presumably `mape` is mean absolute percentage error - confirm in utils.
# NOTE(review): this scorer is not used by the cross-validation cells shown here,
# which score with "neg_mean_squared_error".
mape_scorer = make_scorer(score_func=mape, greater_is_better=False)

### 1. Baseline Model

In [13]:
# Baseline: same preprocessing as the real models, followed by a regressor
# that always predicts the training-set mean of the target ("mean" is the
# DummyRegressor default, spelled out here for clarity).
pipe_dummy = make_pipeline(
    preprocessor,
    DummyRegressor(strategy="mean"),
)

In [14]:
# Cross-validate the baseline on the training data. The metric is negated MSE
# on the log-scaled target, so values closer to zero are better.
# NOTE(review): `mean_std_cross_val_scores` comes from utils; judging by the
# rendered table it summarizes each metric as "mean (+/- std)" - confirm.
results["Dummy"] = mean_std_cross_val_scores(
    pipe_dummy,
    X_train,
    y_train,
    return_train_score=True,
    scoring="neg_mean_squared_error",
)

In [15]:
# Show the summaries collected so far, one column per model.
pd.DataFrame(data=results)

Unnamed: 0,Dummy
fit_time,0.223 (+/- 0.007)
score_time,0.032 (+/- 0.006)
test_score,-37.642 (+/- 0.137)
train_score,-37.642 (+/- 0.034)


### 2. ML Models

In [16]:
# Seed shared by every estimator below so that repeated runs are comparable.
random_state = 123


def _with_preprocessing(regressor):
    """Return a pipeline that applies the shared `preprocessor`, then `regressor`."""
    return make_pipeline(preprocessor, regressor)


# Linear models.
pipe_ridge = _with_preprocessing(Ridge(max_iter=2000, random_state=random_state))
pipe_lasso = _with_preprocessing(Lasso(max_iter=2000, random_state=random_state))

# Tree ensembles; verbosity flags silence per-iteration training logs.
pipe_rf = _with_preprocessing(RandomForestRegressor(random_state=random_state))
pipe_xgb = _with_preprocessing(XGBRegressor(random_state=random_state, verbosity=0))
pipe_lgbm = _with_preprocessing(LGBMRegressor(random_state=random_state))
pipe_catb = _with_preprocessing(
    CatBoostRegressor(random_state=random_state, verbose=0, loss_function="RMSE")
)

In [17]:
# Display name -> pipeline, listed in the order the models are cross-validated.
# NOTE(review): "lass" looks like a typo for "lasso". The key is kept unchanged
# because results are stored and displayed under it; rename it together with
# any code that looks the key up, then re-run the notebook.
models = dict(
    [
        ("ridge", pipe_ridge),
        ("lass", pipe_lasso),
        ("random_forest", pipe_rf),
        ("XGBoost", pipe_xgb),
        ("LightGBM", pipe_lgbm),
        ("CatBoost", pipe_catb),
    ]
)

In [18]:
%%time

# Cross-validate every candidate pipeline with the same metric as the
# baseline (negated MSE on the log-scaled target) and file each summary under
# the model's display name. tqdm shows progress across models.
for name, model in tqdm(models.items()):
    results[name] = mean_std_cross_val_scores(
        model,
        X_train,
        y_train,
        return_train_score=True,
        scoring="neg_mean_squared_error",
    )

100%|███████████████████████████████████████████████████████████████████████████████████| 6/6 [35:04<00:00, 350.73s/it]

CPU times: total: 1h 36min 40s
Wall time: 35min 4s





In [19]:
# Side-by-side comparison of the baseline and all candidate models.
pd.DataFrame(data=results)

Unnamed: 0,Dummy,ridge,lass,random_forest,XGBoost,LightGBM,CatBoost
fit_time,0.223 (+/- 0.007),0.301 (+/- 0.010),0.468 (+/- 0.014),273.358 (+/- 9.724),23.708 (+/- 1.568),3.063 (+/- 0.127),59.912 (+/- 1.465)
score_time,0.032 (+/- 0.006),0.041 (+/- 0.008),0.040 (+/- 0.005),10.906 (+/- 1.367),0.156 (+/- 0.003),0.318 (+/- 0.010),0.101 (+/- 0.010)
test_score,-37.642 (+/- 0.137),-21.503 (+/- 0.086),-23.723 (+/- 0.099),-0.002 (+/- 0.000),-0.005 (+/- 0.000),-0.007 (+/- 0.000),-0.001 (+/- 0.000)
train_score,-37.642 (+/- 0.034),-21.503 (+/- 0.022),-23.723 (+/- 0.022),-0.000 (+/- 0.000),-0.005 (+/- 0.000),-0.006 (+/- 0.000),-0.001 (+/- 0.000)
