# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import random

from extrucal.extrusion import throughput_cal

from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import warnings

from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm.sklearn import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor

from sklearn.utils import shuffle

from sklearn.naive_bayes import MultinomialNB

from tqdm import tqdm

random.seed(0)

# Dataset Read In

In [2]:
df = pd.read_csv("../data/extrucal_dataset.csv")
df

Unnamed: 0,extruder_size,metering_depth,polymer_density,rpm,screw_pitch,flight_width,number_flight,throughput
0,80,7.2,1200,95,48.0,12.0,2,92.55
1,220,17.6,1300,75,154.0,13.2,1,4397.68
2,190,7.6,1400,95,323.0,36.1,2,3187.76
3,190,7.6,800,55,228.0,26.6,1,1008.12
4,60,1.8,800,10,48.0,10.8,1,2.72
...,...,...,...,...,...,...,...,...
1935355,40,3.2,1000,30,32.0,4.0,1,8.78
1935356,180,3.6,800,85,306.0,27.0,2,762.58
1935357,30,1.8,1100,60,30.0,2.7,1,7.89
1935358,70,6.3,1000,40,35.0,10.5,2,11.59


In [3]:
df["log_throughput"] = np.log(df["throughput"])
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

  result = getattr(ufunc, method)(*inputs, **kwargs)


# Useful Functions

In [4]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

In [5]:
def mape(true, pred):
    return 100.0 * np.mean(np.abs((pred - true) / (true+0.1)))  # 0.1 was added to prevent division by zero

# make a scorer function that we can pass into cross-validation
mape_scorer = make_scorer(mape, greater_is_better=False)

# Train/Test Split

In [6]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=123)
train_df.head()

Unnamed: 0,extruder_size,metering_depth,polymer_density,rpm,screw_pitch,flight_width,number_flight,throughput,log_throughput
1072703,20,1.0,1450,30,18.0,2.8,2,0.83,-0.18633
537273,80,7.2,1100,15,56.0,8.0,1,37.06,3.612538
1155872,110,7.7,1150,25,121.0,19.8,2,141.03,4.948973
906411,100,2.0,1400,90,50.0,20.0,1,66.11,4.19132
104125,20,1.2,1300,10,22.0,4.0,2,0.31,-1.171183


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1470873 entries, 1072703 to 814572
Data columns (total 9 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   extruder_size    1470873 non-null  int64  
 1   metering_depth   1470873 non-null  float64
 2   polymer_density  1470873 non-null  int64  
 3   rpm              1470873 non-null  int64  
 4   screw_pitch      1470873 non-null  float64
 5   flight_width     1470873 non-null  float64
 6   number_flight    1470873 non-null  int64  
 7   throughput       1470873 non-null  float64
 8   log_throughput   1470873 non-null  float64
dtypes: float64(5), int64(4)
memory usage: 112.2 MB


In [8]:
X_train = train_df.drop(columns=["throughput", "log_throughput"])
y_train = train_df["log_throughput"]

X_test = test_df.drop(columns=["throughput", "log_throughput"])
y_test = test_df["log_throughput"]

# Evaluation of ML Models

In [9]:
results = {}

### 1. Baseline Model

In [10]:
pipe_dummy = make_pipeline(StandardScaler(), DummyRegressor())

In [11]:
results["Dummy"] = mean_std_cross_val_scores(
    pipe_dummy, X_train, y_train, return_train_score=True, scoring=mape_scorer
)

In [12]:
pd.DataFrame(results)

Unnamed: 0,Dummy
fit_time,0.229 (+/- 0.007)
score_time,0.022 (+/- 0.002)
test_score,-196.396 (+/- 5.376)
train_score,-196.396 (+/- 1.355)


### 2. ML Models

In [13]:
random_state = 123

pipe_ridge = make_pipeline(StandardScaler(), Ridge(max_iter=2000, random_state=random_state))
pipe_lasso = make_pipeline(StandardScaler(), Lasso(max_iter=2000, random_state=random_state))
pipe_rf = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=random_state))
pipe_xgb = make_pipeline(StandardScaler(), XGBRegressor(random_state=random_state, verbosity=0))
pipe_lgbm = make_pipeline(StandardScaler(), LGBMRegressor(random_state=random_state))
pipe_catb = make_pipeline(StandardScaler(), CatBoostRegressor(random_state=random_state, verbose=0, loss_function='MAPE'))

In [14]:
models = {
    "ridge": pipe_ridge,
    "lass": pipe_lasso,
    "random_forest": pipe_rf,
    "XGBoost": pipe_xgb,
    "LightGBM": pipe_lgbm,
    "CatBoost": pipe_catb
}

In [15]:
%%time

for (name, model) in tqdm(models.items()):
    results[name] = mean_std_cross_val_scores(
        model, X_train, y_train, return_train_score=True, scoring=mape_scorer
    )

100%|███████████████████████████████████████████████████████████████████████████████████| 6/6 [50:32<00:00, 505.34s/it]

CPU times: total: 2h 5min 23s
Wall time: 50min 32s





In [16]:
pd.DataFrame(results)

Unnamed: 0,Dummy,ridge,lass,random_forest,XGBoost,LightGBM,CatBoost
fit_time,0.229 (+/- 0.007),0.317 (+/- 0.008),0.467 (+/- 0.021),389.773 (+/- 8.480),29.703 (+/- 0.758),3.400 (+/- 0.077),103.782 (+/- 2.030)
score_time,0.022 (+/- 0.002),0.027 (+/- 0.002),0.031 (+/- 0.004),16.627 (+/- 5.538),0.152 (+/- 0.013),0.351 (+/- 0.017),0.096 (+/- 0.010)
test_score,-196.396 (+/- 5.376),-64.979 (+/- 1.874),-139.762 (+/- 3.843),-2.400 (+/- 0.065),-3.682 (+/- 0.028),-4.080 (+/- 0.038),-2.017 (+/- 0.110)
train_score,-196.396 (+/- 1.355),-64.978 (+/- 0.449),-139.762 (+/- 0.929),-0.915 (+/- 0.004),-3.667 (+/- 0.072),-4.040 (+/- 0.094),-2.012 (+/- 0.034)
