# Imports

In [1]:
import sys
sys.path.append("../src/")

from utils import mean_std_cross_val_scores, mape

import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm.sklearn import LGBMClassifier, LGBMRegressor
from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LogisticRegression, Ridge
from sklearn.metrics import make_scorer, mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, FunctionTransformer
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils import shuffle
from tqdm import tqdm
from xgboost import XGBClassifier, XGBRegressor

# Seed Python's built-in `random` module so that any use of it is repeatable.
# This does not seed NumPy's global random number generator.
random.seed(0)

# Dataset Read In

In [2]:
# Load the dataset. The path is relative, so it is resolved against the kernel's
# working directory.
df = pd.read_csv("../data/dataset_generated_by_extrucal.csv")
# Bare last expression: displays the frame as the cell output.
df

Unnamed: 0,extruder_size,metering_depth,polymer_density,rpm,screw_pitch,flight_width,number_flight,throughput
0,40,3.2,1100,70,64.0,7.2,1,39.648956
1,250,12.5,1400,80,400.0,35.0,2,7710.474531
2,60,1.2,900,40,72.0,10.8,2,9.871525
3,100,6.0,1100,50,180.0,18.0,2,308.711894
4,40,2.4,1200,20,48.0,3.2,2,6.880714
...,...,...,...,...,...,...,...,...
1505275,250,17.5,1400,20,350.0,30.0,1,2776.196206
1505276,230,13.8,1100,0,138.0,13.8,2,0.000000
1505277,220,8.8,1300,20,308.0,39.6,2,863.676713
1505278,100,5.0,1400,40,60.0,14.0,2,68.228337


In [3]:
# Add a log-scale copy of the target as a new column. log1p computes log(1 + x),
# so a throughput of exactly 0 maps to 0 instead of -inf.
log_target = np.log1p(df["throughput"])
df["log_throughput"] = log_target

# Train/Test Split

In [4]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=123)
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,extruder_size,metering_depth,polymer_density,rpm,screw_pitch,flight_width,number_flight,throughput,log_throughput
740665,210,6.3,1300,60,252.0,29.4,2,1504.262985,7.316723
164740,180,16.2,1000,60,180.0,10.8,2,2094.131085,7.647371
250675,180,10.8,1200,50,216.0,28.8,1,1737.913645,7.461016
1371460,220,11.0,1100,60,396.0,44.0,1,3733.419435,8.225348
1356849,170,15.3,1100,40,306.0,27.2,1,2122.786335,7.660956


In [5]:
# Summarise the training split: column dtypes, non-null counts and memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1204224 entries, 740665 to 773630
Data columns (total 9 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   extruder_size    1204224 non-null  int64  
 1   metering_depth   1204224 non-null  float64
 2   polymer_density  1204224 non-null  int64  
 3   rpm              1204224 non-null  int64  
 4   screw_pitch      1204224 non-null  float64
 5   flight_width     1204224 non-null  float64
 6   number_flight    1204224 non-null  int64  
 7   throughput       1204224 non-null  float64
 8   log_throughput   1204224 non-null  float64
dtypes: float64(5), int64(4)
memory usage: 91.9 MB


In [6]:
# Separate the features from the target. Both the raw target and its log-transformed
# copy are removed from the feature matrices; the log-transformed column is the target.
TARGET = "log_throughput"
NON_FEATURE_COLUMNS = ["throughput", TARGET]

X_train, y_train = train_df.drop(columns=NON_FEATURE_COLUMNS), train_df[TARGET]
X_test, y_test = test_df.drop(columns=NON_FEATURE_COLUMNS), test_df[TARGET]

# Preparation of Preprocessors

In [7]:
# Column groups consumed by the preprocessing pipelines: the first group is only
# standardised, the second is log1p-transformed before being standardised.
no_log_features = [
    "extruder_size",
    "polymer_density",
    "rpm",
    "number_flight",
]
log_features = [
    "metering_depth",
    "screw_pitch",
    "flight_width",
]

In [8]:
# Single-step pipeline: the columns routed here are only standardised.
no_log_transformer = make_pipeline(StandardScaler())

In [9]:
# Two-step pipeline: apply log1p element-wise, then standardise the result.
log_steps = [FunctionTransformer(np.log1p), StandardScaler()]
log_transformer = make_pipeline(*log_steps)

In [10]:
# Route each column group to its transformer. Columns listed in neither group would
# be dropped (the ColumnTransformer default is remainder="drop").
scaled_only = (no_log_transformer, no_log_features)
logged_then_scaled = (log_transformer, log_features)
preprocessor = make_column_transformer(scaled_only, logged_then_scaled)

# Cross-Validation of ML Models

In [11]:
# Collects one cross-validation summary per model, keyed by the model's display name.
results = dict()

In [12]:
# Wrap the project's `mape` helper (imported from utils) as a scikit-learn scorer.
# greater_is_better=False makes the scorer negate the metric, so higher is still better.
# NOTE(review): `mape_scorer` is not used by any cell visible in this section.
mape_scorer = make_scorer(mape, greater_is_better=False)

### 1. Baseline Model

In [13]:
# Baseline: a DummyRegressor behind the shared preprocessor. With its default
# settings it predicts the mean of the training target and ignores the features.
pipe_dummy = make_pipeline(preprocessor, DummyRegressor())

In [14]:
# Cross-validate the baseline with the project's helper from utils and store its summary.
# The scoring is negated MSE on the log target, so values closer to 0 are better.
dummy_cv_summary = mean_std_cross_val_scores(
    pipe_dummy,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    return_train_score=True,
)
results["Dummy"] = dummy_cv_summary

In [15]:
# Tabulate the summaries collected so far: one column per entry in `results`.
pd.DataFrame(results)

Unnamed: 0,Dummy
fit_time,0.217 (+/- 0.010)
score_time,0.030 (+/- 0.004)
test_score,-6.717 (+/- 0.008)
train_score,-6.717 (+/- 0.002)


### 2. ML Models

In [16]:
# One seed shared by every estimator so the comparison is repeatable.
random_state = 123


def _with_preprocessor(estimator):
    """Return a pipeline that applies the shared `preprocessor`, then `estimator`.

    The same `preprocessor` instance is reused by every pipeline built here.
    """
    return make_pipeline(preprocessor, estimator)


# Linear models (regularisation strength left at the library default).
pipe_ridge = _with_preprocessor(Ridge(max_iter=2000, random_state=random_state))
pipe_lasso = _with_preprocessor(Lasso(max_iter=2000, random_state=random_state))

# Tree ensembles.
pipe_rf = _with_preprocessor(RandomForestRegressor(random_state=random_state))
pipe_xgb = _with_preprocessor(XGBRegressor(random_state=random_state, verbosity=0))
pipe_lgbm = _with_preprocessor(LGBMRegressor(random_state=random_state))
pipe_catb = _with_preprocessor(
    CatBoostRegressor(random_state=random_state, verbose=0, loss_function="RMSE")
)

In [17]:
# Display name -> pipeline. The keys become the column labels of the results table,
# and the insertion order is the order in which the models are cross-validated.
# NOTE(review): "lass" looks like a typo for "lasso"; it is left unchanged here because
# the key is a runtime label that the recorded output and other cells may rely on.
models = {
    "ridge": pipe_ridge,
    "lass": pipe_lasso,
    "random_forest": pipe_rf,
    "XGBoost": pipe_xgb,
    "LightGBM": pipe_lgbm,
    "CatBoost": pipe_catb,
}

In [18]:
%%time

# Cross-validate every candidate with the same settings as the baseline and store
# each summary under the model's display name. This is the expensive cell of the
# notebook (the recorded run took about 33 minutes of wall time).
# Note: `name` and `model` stay bound to the last entry after the loop finishes.
for name, model in tqdm(models.items()):
    results[name] = mean_std_cross_val_scores(
        model, X_train, y_train, return_train_score=True, scoring="neg_mean_squared_error"
    )

100%|███████████████████████████████████████████████████████████████████████████████████| 6/6 [32:57<00:00, 329.55s/it]

CPU times: total: 1h 34min 6s
Wall time: 32min 57s





In [19]:
# Compare all models side by side. Scores are negated MSE on the log-transformed
# target, so values closer to 0 are better.
pd.DataFrame(results)

Unnamed: 0,Dummy,ridge,lass,random_forest,XGBoost,LightGBM,CatBoost
fit_time,0.217 (+/- 0.010),0.304 (+/- 0.020),0.510 (+/- 0.010),255.008 (+/- 1.440),23.026 (+/- 0.740),3.256 (+/- 0.195),59.759 (+/- 0.618)
score_time,0.030 (+/- 0.004),0.037 (+/- 0.003),0.037 (+/- 0.003),9.883 (+/- 0.262),0.163 (+/- 0.009),0.353 (+/- 0.024),0.106 (+/- 0.009)
test_score,-6.717 (+/- 0.008),-1.604 (+/- 0.013),-3.824 (+/- 0.013),-0.002 (+/- 0.000),-0.005 (+/- 0.000),-0.006 (+/- 0.000),-0.001 (+/- 0.000)
train_score,-6.717 (+/- 0.002),-1.604 (+/- 0.003),-3.824 (+/- 0.003),-0.000 (+/- 0.000),-0.005 (+/- 0.000),-0.006 (+/- 0.000),-0.001 (+/- 0.000)


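#### `CatBoost` seems to perform best

It has the lowest cross-validated MSE on the log-transformed target (0.001, versus 0.002 for the random forest, the next best), and its training and validation scores agree to three decimal places.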

# Evaluation of `CatBoost` Model