# Imports

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from pycaret.regression import *

# Data Read-In

In [2]:
data = pd.read_csv("../../../extrucal_machine-learning/data/extrucal_dataset.csv")
data

Unnamed: 0,extruder_size,metering_depth,polymer_density,rpm,screw_pitch,flight_width,number_flight,throughput
0,80,7.2,1200,95,48.0,12.0,2,92.55
1,220,17.6,1300,75,154.0,13.2,1,4397.68
2,190,7.6,1400,95,323.0,36.1,2,3187.76
3,190,7.6,800,55,228.0,26.6,1,1008.12
4,60,1.8,800,10,48.0,10.8,1,2.72
...,...,...,...,...,...,...,...,...
1935355,40,3.2,1000,30,32.0,4.0,1,8.78
1935356,180,3.6,800,85,306.0,27.0,2,762.58
1935357,30,1.8,1100,60,30.0,2.7,1,7.89
1935358,70,6.3,1000,40,35.0,10.5,2,11.59


In [3]:
# To make these variable as "numeric"

data['polymer_density'] = data['polymer_density'].astype('float')
data['rpm'] = data['rpm'].astype('float')

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1935360 entries, 0 to 1935359
Data columns (total 8 columns):
 #   Column           Dtype  
---  ------           -----  
 0   extruder_size    int64  
 1   metering_depth   float64
 2   polymer_density  float64
 3   rpm              float64
 4   screw_pitch      float64
 5   flight_width     float64
 6   number_flight    int64  
 7   throughput       float64
dtypes: float64(6), int64(2)
memory usage: 118.1 MB


# Setup of a Simple Experiment without Logging

In [8]:
experiment = setup(data, target='throughput')

Unnamed: 0,Description,Value
0,session_id,1041
1,Target,throughput
2,Original Data,"(1935360, 8)"
3,Missing Values,False
4,Numeric Features,6
5,Categorical Features,1
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(1354751, 7)"


In [9]:
# Model Selection

model = compare_models(n_select = 1)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,55.022,7858.8966,88.6468,0.9971,0.8903,1.0102,3.166
xgboost,Extreme Gradient Boosting,51.8862,8167.7084,90.3639,0.997,0.7579,0.4543,51.022
dt,Decision Tree Regressor,61.0578,21176.267,145.5184,0.9921,0.0981,0.0794,1.798
gbr,Gradient Boosting Regressor,170.4764,80113.5848,283.0369,0.9701,1.5966,10.4491,59.748
knn,K Neighbors Regressor,166.3901,126164.4078,355.1889,0.9529,0.7127,0.3096,31.736
br,Bayesian Ridge,663.0803,978606.794,989.2381,0.6349,2.5923,127.1739,0.353
lasso,Lasso Regression,663.0005,978610.85,989.2401,0.6349,2.5921,127.1209,13.814
lr,Linear Regression,663.0817,978606.8,989.2381,0.6349,2.5923,127.1748,0.868
lar,Least Angle Regression,663.0818,978606.7967,989.2381,0.6349,2.5923,127.1748,0.084
ridge,Ridge Regression,663.0816,978606.775,989.2381,0.6349,2.5923,127.1748,0.075


In [11]:
# Model Optimization

tuned_model = tune_model(model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,31.6032,2722.8059,52.1805,0.999,0.7148,0.3971
1,31.8887,2789.6314,52.817,0.999,0.7051,0.3639
2,31.115,2616.4548,51.1513,0.999,0.7129,0.3453
3,31.295,2640.0281,51.3812,0.999,0.7023,0.3865
4,31.7684,2776.0404,52.6881,0.999,0.7164,0.3318
5,31.4921,2726.709,52.2179,0.999,0.7031,0.3397
6,31.5157,2769.9297,52.6301,0.999,0.6873,0.3178
7,31.563,2727.9783,52.2301,0.999,0.7035,0.3655
8,31.4758,2672.55,51.6967,0.999,0.7079,0.4195
9,32.1064,2817.4912,53.08,0.999,0.6916,0.3568


In [12]:
# Finalize model
finalized_model = finalize_model(tuned_model)

# save model to disk
save_model(finalized_model, 'lgbm_simple')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[],
                                       target='throughput', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strat...
                                colsample_bytree=1.0, feature_fraction=0.8,
                                importance_type='split', learning_rate=0.2,
                                max_depth=-1, min_child_samples=11,
                                min_child_weight=0.001, min_split_gain=0.8,
                 

# Setup of an Experiment with Logging

In [4]:
experiment = setup(
    data, 
    target='throughput',
    log_experiment=True,
    log_plots=True,
    experiment_name='extrucal'  # Need this to make this appear in MLflow
)

Unnamed: 0,Description,Value
0,session_id,8687
1,Target,throughput
2,Original Data,"(1935360, 8)"
3,Missing Values,False
4,Numeric Features,6
5,Categorical Features,1
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(1354751, 7)"


In [5]:
# Model Selection

model = compare_models(n_select = 1)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,20.6829,1068.1383,32.6807,0.9996,0.6111,0.2844,65.021
lightgbm,Light Gradient Boosting Machine,54.3032,7640.8682,87.4065,0.9971,0.8894,0.9833,3.131
xgboost,Extreme Gradient Boosting,51.9117,8234.2512,90.7405,0.9969,0.7619,0.462,49.877
dt,Decision Tree Regressor,61.1817,21286.3588,145.8968,0.9921,0.0982,0.0794,1.686
gbr,Gradient Boosting Regressor,170.1051,80201.0392,283.1755,0.9701,1.5847,9.7559,59.714
knn,K Neighbors Regressor,166.2766,125960.5406,354.8958,0.953,0.7142,0.3099,44.352
ridge,Ridge Regression,663.0215,979238.8312,989.5456,0.6346,2.5921,126.1927,0.077
br,Bayesian Ridge,663.0201,979238.8025,989.5456,0.6346,2.5921,126.1919,0.352
lar,Least Angle Regression,663.0215,979238.7993,989.5456,0.6346,2.5921,126.1929,0.089
lasso,Lasso Regression,662.9406,979242.9312,989.5477,0.6346,2.5919,126.1411,15.393


In [6]:
# Model Optimization

tuned_model = tune_model(model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,14.4155,514.9488,22.6925,0.9998,0.5308,0.2064
1,14.7068,526.0423,22.9356,0.9998,0.5429,0.225
2,14.5987,514.0007,22.6716,0.9998,0.5373,0.2131
3,14.5838,528.4814,22.9887,0.9998,0.5407,0.2133
4,14.5117,509.2754,22.5671,0.9998,0.5393,0.2079
5,14.5945,519.7258,22.7975,0.9998,0.5366,0.1877
6,14.3799,503.2729,22.4337,0.9998,0.5403,0.2307
7,14.4522,510.5334,22.595,0.9998,0.5331,0.2034
8,14.5876,513.0717,22.6511,0.9998,0.5521,0.2216
9,14.7249,523.5501,22.8812,0.9998,0.5434,0.2076


In [7]:
# Finalize model
finalized_model = finalize_model(tuned_model)

# save model to disk
save_model(finalized_model, 'catb_logging')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[],
                                       target='throughput', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strat...
                 ('cluster_all', 'passthrough'),
                 ('dummy', Dummify(target='throughput')),
                 ('fix_perfect', Remove_100(target='throughput')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
  