# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import random

from extrucal.extrusion import throughput_cal

from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import warnings

from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm.sklearn import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor

from sklearn.utils import shuffle

from sklearn.naive_bayes import MultinomialNB

from tqdm import tqdm

# Seed both global random number generators: Python's `random` module and
# NumPy's legacy global RNG. scikit-learn falls back to the NumPy global RNG
# whenever an estimator or splitter is given random_state=None, so seeding only
# `random` would leave such calls non-reproducible.
random.seed(0)
np.random.seed(0)

# Load the Dataset

In [2]:
# Read the dataset from a relative CSV path into a DataFrame.
# The bare `df` on the last line makes the notebook render a preview of it.
df = pd.read_csv("../data/extrucal_dataset_improved.csv")
df

Unnamed: 0,extruder_size,metering_depth,polymer_density,rpm,screw_pitch,flight_width,number_flight,throughput
0,20,0.4,800,0,12.0,1.2,1,0.00
1,20,0.4,800,10,12.0,1.2,1,0.06
2,20,0.4,800,20,12.0,1.2,1,0.12
3,20,0.4,800,30,12.0,1.2,1,0.18
4,20,0.4,800,40,12.0,1.2,1,0.24
...,...,...,...,...,...,...,...,...
1505275,250,22.5,1400,50,450.0,50.0,2,8207.87
1505276,250,22.5,1400,60,450.0,50.0,2,9849.44
1505277,250,22.5,1400,70,450.0,50.0,2,11491.02
1505278,250,22.5,1400,80,450.0,50.0,2,13132.59


# Useful Functions

In [3]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Cross-validate a model and summarise every metric as "mean (+/- std)".

    Parameters
    ----------
    model :
        scikit-learn estimator or pipeline
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        forwarded unchanged to ``cross_validate``
        (e.g. ``scoring``, ``return_train_score``, ``cv``)

    Returns
    ----------
        pandas Series of strings, indexed by the keys returned by
        ``cross_validate`` (fit_time, score_time, test_score, ...), each
        formatted as ``"<mean> (+/- <std>)"`` with three decimals
    """
    # Build the per-fold table once and derive both statistics from it.
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))

    # numeric_only=True skips non-numeric columns (e.g. the fitted estimators
    # returned when return_estimator=True) instead of failing on them.
    mean_scores = scores.mean(numeric_only=True)
    std_scores = scores.std(numeric_only=True)

    # Pair the values positionally with zip rather than `series[i]`: integer
    # keys on a string-labelled Series are deprecated as positional lookups.
    out_col = [
        f"{mean:0.3f} (+/- {std:0.3f})"
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [4]:
def mape(true, pred, eps=0.1):
    """
    Mean absolute percentage error with a stabilised denominator.

    Parameters
    ----------
    true : array-like
        observed target values
    pred : array-like
        predicted target values, same length as ``true`` (compared positionally)
    eps : float, default 0.1
        constant added to ``|true|`` so that targets equal to zero do not
        cause a division by zero

    Returns
    ----------
        float, ``100 * mean(|pred - true| / (|true| + eps))``
    """
    # Coerce to float arrays so plain lists and pandas objects are accepted alike.
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)

    # |true| + eps is strictly positive for eps > 0, whereas true + eps would
    # vanish at true == -eps. For non-negative targets both forms are identical.
    return 100.0 * np.mean(np.abs(pred - true) / (np.abs(true) + eps))

# Wrap mape as a scikit-learn scorer so it can be passed as `scoring=` to
# cross-validation. Lower MAPE is better, so greater_is_better=False makes the
# scorer report the negated value (scores shown in the results are <= 0).
mape_scorer = make_scorer(score_func=mape, greater_is_better=False)

# Train/Test Split

In [5]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffled split reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=123,
    shuffle=True,
)
train_df.head()

Unnamed: 0,extruder_size,metering_depth,polymer_density,rpm,screw_pitch,flight_width,number_flight,throughput
740665,130,10.4,1100,50,130.0,10.4,1,710.85
164740,40,2.8,800,0,24.0,6.4,1,0.0
250675,50,4.5,1400,50,80.0,8.0,2,64.39
1371460,230,18.4,1400,0,276.0,36.8,1,0.0
1356849,230,16.1,800,90,276.0,23.0,1,5226.69


In [6]:
# Print column dtypes and non-null counts for the training split.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1204224 entries, 740665 to 773630
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   extruder_size    1204224 non-null  int64  
 1   metering_depth   1204224 non-null  float64
 2   polymer_density  1204224 non-null  int64  
 3   rpm              1204224 non-null  int64  
 4   screw_pitch      1204224 non-null  float64
 5   flight_width     1204224 non-null  float64
 6   number_flight    1204224 non-null  int64  
 7   throughput       1204224 non-null  float64
dtypes: float64(4), int64(4)
memory usage: 82.7 MB


In [7]:
# Separate the predictors from the target column ("throughput") in each split.
X_train, y_train = train_df.drop(columns=["throughput"]), train_df["throughput"]
X_test, y_test = test_df.drop(columns=["throughput"]), test_df["throughput"]

# Evaluation of ML Models

In [8]:
# Maps a model label to its cross-validation summary Series; filled in by the
# cells below and rendered as a comparison table with pd.DataFrame(results).
results = {}

## 1. Baseline Model

In [9]:
# Baseline pipeline: a DummyRegressor with default settings (predicts the mean
# of the training targets). The scaler cannot change its predictions;
# presumably it is kept so the baseline has the same shape as the real models.
pipe_dummy = make_pipeline(StandardScaler(), DummyRegressor())

In [10]:
# Cross-validate the baseline with the MAPE scorer, keeping training scores as
# well so train and validation error can be compared.
results["Dummy"] = mean_std_cross_val_scores(
    pipe_dummy,
    X_train,
    y_train,
    scoring=mape_scorer,
    return_train_score=True,
)

In [11]:
# Render the results collected so far as a table (one column per model).
# Scores are negated MAPE values because the scorer was built with
# greater_is_better=False, so values closer to zero are better.
pd.DataFrame(results)

Unnamed: 0,Dummy
fit_time,0.187 (+/- 0.019)
score_time,0.019 (+/- 0.002)
test_score,-92599.090 (+/- 429.413)
train_score,-92599.146 (+/- 127.713)


## 2. ML Models

In [12]:
# One seed shared by every estimator below.
random_state = 123

# Regularised linear models.
pipe_ridge = make_pipeline(
    StandardScaler(),
    Ridge(max_iter=2000, random_state=random_state),
)
pipe_lasso = make_pipeline(
    StandardScaler(),
    Lasso(max_iter=2000, random_state=random_state),
)

# Tree ensembles: bagging (random forest) and three gradient-boosting libraries.
pipe_rf = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(random_state=random_state),
)
pipe_xgb = make_pipeline(
    StandardScaler(),
    XGBRegressor(random_state=random_state, verbosity=0),
)
pipe_lgbm = make_pipeline(
    StandardScaler(),
    LGBMRegressor(random_state=random_state),
)
pipe_catb = make_pipeline(
    StandardScaler(),
    CatBoostRegressor(random_state=random_state, verbose=0, loss_function='RMSE'),
)

In [13]:
# Label -> pipeline for every candidate model. The labels become the keys of
# `results` (and thus the column names of the comparison table), and the
# insertion order here is the order in which the models are cross-validated.
# NOTE(review): "lass" looks like a typo for "lasso"; renaming it changes the
# results column label, so check for uses of the key in later cells first.
models = {
    "ridge": pipe_ridge,
    "lass": pipe_lasso,
    "random_forest": pipe_rf,
    "XGBoost": pipe_xgb,
    "LightGBM": pipe_lgbm,
    "CatBoost": pipe_catb
}

In [14]:
%%time

for (name, model) in tqdm(models.items()):
    results[name] = mean_std_cross_val_scores(
        model, X_train, y_train, return_train_score=True, scoring=mape_scorer
    )

100%|███████████████████████████████████████████████████████████████████████████████████| 6/6 [35:53<00:00, 358.97s/it]

CPU times: total: 1h 34min 54s
Wall time: 35min 53s





In [15]:
# Render the comparison table for the baseline and all candidate models (one
# column per model). Scores are negated MAPE values because the scorer was
# built with greater_is_better=False, so values closer to zero are better.
pd.DataFrame(results)

Unnamed: 0,Dummy,ridge,lass,random_forest,XGBoost,LightGBM,CatBoost
fit_time,0.187 (+/- 0.019),0.272 (+/- 0.022),0.660 (+/- 0.075),287.830 (+/- 8.176),22.865 (+/- 0.451),2.664 (+/- 0.266),57.382 (+/- 0.415)
score_time,0.019 (+/- 0.002),0.027 (+/- 0.003),0.025 (+/- 0.003),11.415 (+/- 0.951),0.135 (+/- 0.004),0.245 (+/- 0.012),0.084 (+/- 0.005)
test_score,-92599.090 (+/- 429.413),-93591.789 (+/- 310.975),-93462.085 (+/- 307.575),-2.531 (+/- 0.012),-2178.953 (+/- 44.157),-2610.402 (+/- 46.168),-1128.844 (+/- 22.571)
train_score,-92599.146 (+/- 127.713),-93591.520 (+/- 75.241),-93461.868 (+/- 74.492),-0.970 (+/- 0.003),-2169.274 (+/- 26.794),-2607.849 (+/- 59.106),-1121.717 (+/- 22.031)
