# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import random

from extrucal.extrusion import throughput_cal

from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import warnings

from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm.sklearn import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor

from sklearn.utils import shuffle

from sklearn.naive_bayes import MultinomialNB

from tqdm import tqdm

# Seed both global random number generators so any unseeded stochastic step
# is repeatable. The stdlib seed alone does not affect NumPy's generator.
random.seed(0)
np.random.seed(0)

# Dataset Read In

In [2]:
# Read the extrusion dataset (the "w_noise" variant, per its file name) from
# the sibling data directory, then display the resulting frame.
df = pd.read_csv(filepath_or_buffer="../data/extrucal_dataset_w_noise.csv")
df

Unnamed: 0,extruder_size,metering_depth,polymer_density,rpm,screw_pitch,flight_width,number_flight,throughput
0,80,7.2,1200,95,48.0,12.0,2,90.381112
1,220,17.6,1300,75,154.0,13.2,1,4321.460806
2,190,7.6,1400,95,323.0,36.1,2,3029.120568
3,190,7.6,800,55,228.0,26.6,1,1005.047866
4,60,1.8,800,10,48.0,10.8,1,2.794134
...,...,...,...,...,...,...,...,...
1935355,40,3.2,1000,30,32.0,4.0,1,9.063403
1935356,180,3.6,800,85,306.0,27.0,2,774.583875
1935357,30,1.8,1100,60,30.0,2.7,1,7.609954
1935358,70,6.3,1000,40,35.0,10.5,2,11.832348


# Useful Functions

In [3]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Runs ``cross_validate`` and summarises every numeric metric it reports
    (fit/score times and scores) across the folds as a single string.

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        Forwarded unchanged to ``cross_validate`` (for example ``scoring``
        or ``return_train_score``)

    Returns
    ----------
        pandas Series of strings formatted as "mean (+/- std)" with three
        decimals, indexed by the metric names returned by ``cross_validate``
    """

    # Build the per-fold table once; each column is one metric.
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))

    # numeric_only skips object columns (e.g. fitted estimators) that cannot
    # be averaged; with purely numeric output the result is unchanged.
    mean_scores = scores.mean(numeric_only=True)
    std_scores = scores.std(numeric_only=True)

    # Pair the values by position with zip instead of ``series[i]``: integer
    # keys on a string-labelled Series are deprecated positional lookups.
    out_col = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [4]:
def mape(true, pred, eps=0.1):
    """
    Mean absolute percentage error with an offset denominator.

    Parameters
    ----------
    true : array-like
        Observed target values
    pred : array-like
        Predicted target values, same shape as ``true``
    eps : float, default 0.1
        Constant added to ``true`` in the denominator to prevent division by
        zero when a target is exactly 0. It does not protect against a target
        equal to ``-eps``.

    Returns
    ----------
        float, ``100 * mean(|pred - true| / |true + eps|)``
    """
    # Convert to plain arrays so lists are accepted and two pandas objects
    # are compared element by element rather than aligned on their indexes.
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    return 100.0 * np.mean(np.abs((pred - true) / (true + eps)))

# Make a scorer object from mape that we can pass into cross-validation via
# the `scoring` argument. greater_is_better=False declares MAPE as a loss
# (lower is better); scikit-learn then reports it sign-flipped, which is why
# the scores in the results tables are negative.
mape_scorer = make_scorer(mape, greater_is_better=False)

# Train/Test Split

In [5]:
# Hold out 20% of the rows as the test set. The fixed random_state keeps the
# split identical between runs.
train_df, test_df = train_test_split(
    df,
    random_state=123,
    test_size=0.2,
)
train_df.head()

Unnamed: 0,extruder_size,metering_depth,polymer_density,rpm,screw_pitch,flight_width,number_flight,throughput
665324,120,9.6,1400,50,72.0,24.0,2,95.663153
775241,230,9.2,1200,45,161.0,43.7,2,642.441095
867365,220,11.0,800,90,242.0,13.2,1,3218.34383
1458314,250,10.0,1150,0,300.0,50.0,2,0.0
1097383,70,5.6,800,45,77.0,7.0,2,67.112046


In [6]:
# Print the column dtypes, non-null counts and memory use of the training split.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1548288 entries, 665324 to 773630
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   extruder_size    1548288 non-null  int64  
 1   metering_depth   1548288 non-null  float64
 2   polymer_density  1548288 non-null  int64  
 3   rpm              1548288 non-null  int64  
 4   screw_pitch      1548288 non-null  float64
 5   flight_width     1548288 non-null  float64
 6   number_flight    1548288 non-null  int64  
 7   throughput       1548288 non-null  float64
dtypes: float64(4), int64(4)
memory usage: 106.3 MB


In [7]:
# Separate the predictors from the "throughput" target for both splits.
X_train, y_train = train_df.drop(columns="throughput"), train_df["throughput"]
X_test, y_test = test_df.drop(columns="throughput"), test_df["throughput"]

# Evaluation of ML Models

In [8]:
# Collects one cross-validation summary per model, keyed by model name.
results = dict()

### 1. Baseline Model

In [9]:
# Baseline pipeline: standard scaling followed by a DummyRegressor with its
# default settings. It is the reference point for the real models below.
pipe_dummy = make_pipeline(StandardScaler(), DummyRegressor())

In [10]:
# Cross-validate the baseline on the training split, scored with MAPE, and
# keep the train-fold scores so they can be compared with the test-fold ones.
results["Dummy"] = mean_std_cross_val_scores(
    pipe_dummy,
    X_train,
    y_train,
    scoring=mape_scorer,
    return_train_score=True,
)

In [11]:
# Show the summaries collected so far: one column per model, one row per metric.
pd.DataFrame(results)

Unnamed: 0,Dummy
fit_time,0.256 (+/- 0.035)
score_time,0.023 (+/- 0.004)
test_score,-54720.861 (+/- 255.055)
train_score,-54720.837 (+/- 63.223)


### 2. ML Models

In [12]:
# Shared seed so every stochastic estimator below is reproducible.
random_state = 123

# Every candidate uses the same layout: standard scaling, then the regressor.
pipe_ridge = make_pipeline(
    StandardScaler(),
    Ridge(random_state=random_state, max_iter=2000),
)
pipe_lasso = make_pipeline(
    StandardScaler(),
    Lasso(random_state=random_state, max_iter=2000),
)
pipe_rf = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(random_state=random_state),
)
pipe_xgb = make_pipeline(
    StandardScaler(),
    XGBRegressor(verbosity=0, random_state=random_state),
)
pipe_lgbm = make_pipeline(
    StandardScaler(),
    LGBMRegressor(random_state=random_state),
)
# CatBoost is the only model given an explicit loss_function here; the other
# estimators are left on their default objective.
pipe_catb = make_pipeline(
    StandardScaler(),
    CatBoostRegressor(loss_function='MAPE', verbose=0, random_state=random_state),
)

In [13]:
# Display name -> pipeline, in the order the models are cross-validated.
# The names become the column headers of the results table.
models = dict(
    [
        ("ridge", pipe_ridge),
        # NOTE(review): "lass" looks like a typo for "lasso"; kept as-is so it
        # matches the recorded results table and anything that looks it up.
        ("lass", pipe_lasso),
        ("random_forest", pipe_rf),
        ("XGBoost", pipe_xgb),
        ("LightGBM", pipe_lgbm),
        ("CatBoost", pipe_catb),
    ]
)

In [14]:
%%time

# Cross-validate every candidate pipeline with the same MAPE scorer and store
# its summary under the model's name. tqdm shows progress across the models.
for name, model in tqdm(models.items()):
    results[name] = mean_std_cross_val_scores(
        model,
        X_train,
        y_train,
        scoring=mape_scorer,
        return_train_score=True,
    )

100%|█████████████████████████████████████████████████████████████████████████████████| 6/6 [1:19:23<00:00, 793.88s/it]

CPU times: total: 2h 28min 43s
Wall time: 1h 19min 23s





In [15]:
# Side-by-side comparison of the baseline and all models: one column per
# model, one row per metric. Scores come from mape_scorer, which was built
# with greater_is_better=False.
pd.DataFrame(results)

Unnamed: 0,Dummy,ridge,lass,random_forest,XGBoost,LightGBM,CatBoost
fit_time,0.256 (+/- 0.035),0.353 (+/- 0.029),1.043 (+/- 0.119),494.703 (+/- 5.663),70.208 (+/- 19.804),6.692 (+/- 0.286),175.711 (+/- 3.227)
score_time,0.023 (+/- 0.004),0.032 (+/- 0.004),0.040 (+/- 0.004),62.623 (+/- 3.751),0.260 (+/- 0.161),0.401 (+/- 0.043),0.160 (+/- 0.016)
test_score,-54720.861 (+/- 255.055),-56178.105 (+/- 274.920),-56109.616 (+/- 274.783),-4.704 (+/- 0.011),-1431.296 (+/- 3.945),-1743.544 (+/- 67.992),-51.264 (+/- 0.514)
train_score,-54720.837 (+/- 63.223),-56178.401 (+/- 57.282),-56109.933 (+/- 57.086),-1.803 (+/- 0.002),-1425.132 (+/- 20.001),-1745.111 (+/- 63.465),-51.252 (+/- 0.593)
