# Imports

In [1]:
import sys
sys.path.append("../src/")

from utils import safe_log, mean_std_cross_val_scores, mape

import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm.sklearn import LGBMClassifier, LGBMRegressor
from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LogisticRegression, Ridge
from sklearn.metrics import make_scorer, mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, FunctionTransformer
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils import shuffle
from tqdm import tqdm
from xgboost import XGBClassifier, XGBRegressor

# Seed every global RNG this notebook can touch so a Restart & Run All is
# reproducible. Python's `random` alone is not enough: scikit-learn falls back
# to NumPy's global RNG whenever an estimator/splitter has no `random_state`.
random.seed(0)
np.random.seed(0)

# Load the Dataset

In [2]:
# Load the generated extruder dataset from the project's data directory
# (path is relative to the notebook's working directory).
df = pd.read_csv("../data/dataset_generated_by_extrucal.csv")
# Bare expression: let the notebook render the (truncated) frame.
df

Unnamed: 0,extruder_size,metering_depth,polymer_density,rpm,screw_pitch,flight_width,number_flight,throughput
0,90,2.7,900,90,162.0,14.4,2,164.332721
1,200,18.0,900,60,280.0,20.0,2,3119.815029
2,60,3.6,900,30,36.0,4.8,2,11.535960
3,150,4.5,900,80,180.0,24.0,2,512.994770
4,190,7.6,1400,50,304.0,38.0,2,1561.370337
...,...,...,...,...,...,...,...,...
1505275,240,12.0,1100,50,144.0,14.4,2,1395.113084
1505276,30,2.1,1100,90,48.0,3.6,1,20.119844
1505277,190,7.6,1200,30,190.0,38.0,1,670.173099
1505278,40,2.4,800,50,56.0,6.4,1,14.236114


In [3]:
# Apply log-transformation to target; this adds a new column to `df` in place.
# NOTE(review): the values displayed by train_df.head() (0.0 -> 0.0,
# 45.917649 -> 3.848394) are consistent with log(1 + x) — confirm against
# utils.safe_log.
df["log_throughput"] = safe_log(df["throughput"])

# Train/Test Split

In [4]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split identical across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=123)
# Peek at the first few training rows.
train_df.head()

Unnamed: 0,extruder_size,metering_depth,polymer_density,rpm,screw_pitch,flight_width,number_flight,throughput,log_throughput
740665,60,1.2,1000,0,48.0,3.6,1,0.0,0.0
164740,60,5.4,1100,20,108.0,7.2,1,45.917649,3.848394
250675,100,8.0,1300,10,180.0,8.0,1,124.383991,4.831381
1371460,60,1.8,800,10,72.0,3.6,1,4.498032,1.70439
1356849,200,14.0,1400,50,280.0,32.0,2,2973.071136,7.997687


In [5]:
# Summarise column dtypes, non-null counts and memory use of the training split.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1204224 entries, 740665 to 773630
Data columns (total 9 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   extruder_size    1204224 non-null  int64  
 1   metering_depth   1204224 non-null  float64
 2   polymer_density  1204224 non-null  int64  
 3   rpm              1204224 non-null  int64  
 4   screw_pitch      1204224 non-null  float64
 5   flight_width     1204224 non-null  float64
 6   number_flight    1204224 non-null  int64  
 7   throughput       1204224 non-null  float64
 8   log_throughput   1204224 non-null  float64
dtypes: float64(5), int64(4)
memory usage: 91.9 MB


In [6]:
# Features are every column except the raw and log-transformed targets;
# the model target is the log-transformed throughput.
X_train, y_train = (
    train_df.drop(columns=["throughput", "log_throughput"]),
    train_df["log_throughput"],
)

# Same feature/target separation for the held-out split.
X_test, y_test = (
    test_df.drop(columns=["throughput", "log_throughput"]),
    test_df["log_throughput"],
)

# Preparation of Preprocessors

In [7]:
# Columns routed to the scaling-only transformer.
no_log_features = ["extruder_size", "polymer_density", "rpm", "number_flight"]
# Columns routed to the safe_log + scaling transformer.
log_features = ["metering_depth", "screw_pitch", "flight_width"]

In [8]:
# Scaling-only branch of the preprocessor (single-step pipeline).
no_log_transformer = make_pipeline(StandardScaler())

In [9]:
# Log branch of the preprocessor: apply safe_log first, then standardise.
log_transformer = make_pipeline(FunctionTransformer(safe_log), StandardScaler())

In [10]:
# Route each feature group through its own transformer; the two lists
# together name all seven feature columns of X_train.
preprocessor = make_column_transformer(
    (no_log_transformer, no_log_features), (log_transformer, log_features)
)

# Cross-Validation of ML Models

In [11]:
# Collects one entry of cross-validation results per model name; rendered as a
# table with pd.DataFrame(results).
results = {}

In [12]:
# Wrap the project's `mape` metric as a scikit-learn scorer. With
# greater_is_better=False the scorer sign-flips the metric, so larger scorer
# values still mean a better model.
# NOTE(review): `mape` is presumably mean absolute percentage error — confirm
# in utils. `mape_scorer` is not referenced by any cell visible in this
# section; the cross-validation calls here use "neg_mean_squared_error".
mape_scorer = make_scorer(mape, greater_is_better=False)

### 1. Baseline Model

In [13]:
# Baseline: a default DummyRegressor behind the same preprocessor as the real
# models, so its scores are directly comparable.
pipe_dummy = make_pipeline(preprocessor, DummyRegressor())

In [14]:
# Cross-validate the baseline on the training split and record its scores
# (negated MSE on the log-throughput target, train and validation).
results["Dummy"] = mean_std_cross_val_scores(
    pipe_dummy,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    return_train_score=True,
)

In [15]:
# Tabulate the results collected so far: one column per model.
pd.DataFrame(results)

Unnamed: 0,Dummy
fit_time,0.266 (+/- 0.008)
score_time,0.035 (+/- 0.003)
test_score,-6.717 (+/- 0.009)
train_score,-6.717 (+/- 0.002)


### 2. ML Models

In [16]:
# One shared seed for every estimator below that accepts a random_state.
random_state = 123

# Each pipeline reuses the same `preprocessor` in front of a different regressor.

# Linear models.
pipe_ridge = make_pipeline(preprocessor, Ridge(random_state=random_state, max_iter=2000))
pipe_lasso = make_pipeline(preprocessor, Lasso(random_state=random_state, max_iter=2000))

# Random forest.
pipe_rf = make_pipeline(preprocessor, RandomForestRegressor(random_state=random_state))

# Gradient-boosting libraries, with their logging silenced where an option is set.
pipe_xgb = make_pipeline(preprocessor, XGBRegressor(verbosity=0, random_state=random_state))
pipe_lgbm = make_pipeline(preprocessor, LGBMRegressor(random_state=random_state))
pipe_catb = make_pipeline(
    preprocessor,
    CatBoostRegressor(loss_function="RMSE", verbose=0, random_state=random_state),
)

In [17]:
# Candidate pipelines to cross-validate. Each key becomes the model's label
# (column name) in the results table, so it must be spelled correctly.
models = {
    "ridge": pipe_ridge,
    "lasso": pipe_lasso,
    # NOTE(review): random forest is deliberately left out of this run —
    # presumably because of its fit time on this dataset; confirm before
    # re-enabling.
    # "random_forest": pipe_rf,
    "XGBoost": pipe_xgb,
    "LightGBM": pipe_lgbm,
    "CatBoost": pipe_catb,
}

In [18]:
%%time

# Cross-validate every candidate with the same settings as the baseline and
# store the scores under the model's label; tqdm shows per-model progress.
for name, model in tqdm(models.items()):
    results[name] = mean_std_cross_val_scores(
        model,
        X_train,
        y_train,
        scoring="neg_mean_squared_error",
        return_train_score=True,
    )

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [08:10<00:00, 98.14s/it]

CPU times: total: 1h 9min 37s
Wall time: 8min 10s





In [19]:
# Side-by-side comparison of the baseline and all cross-validated models.
pd.DataFrame(results)

Unnamed: 0,Dummy,ridge,lass,XGBoost,LightGBM,CatBoost
fit_time,0.266 (+/- 0.008),0.346 (+/- 0.021),0.592 (+/- 0.039),25.166 (+/- 0.357),3.404 (+/- 0.078),65.200 (+/- 1.806)
score_time,0.035 (+/- 0.003),0.041 (+/- 0.002),0.048 (+/- 0.005),0.170 (+/- 0.007),0.371 (+/- 0.032),0.111 (+/- 0.006)
test_score,-6.717 (+/- 0.009),-1.609 (+/- 0.012),-3.831 (+/- 0.012),-0.005 (+/- 0.000),-0.006 (+/- 0.000),-0.001 (+/- 0.000)
train_score,-6.717 (+/- 0.002),-1.609 (+/- 0.003),-3.831 (+/- 0.004),-0.005 (+/- 0.000),-0.006 (+/- 0.000),-0.001 (+/- 0.000)
