# Imports

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from pycaret.regression import *

# Data Read-In

In [2]:
# Read the dataset CSV; the path is relative to the notebook's working directory.
data = pd.read_csv(
    filepath_or_buffer="../../../extrucal_machine-learning/data/extrucal_dataset.csv",
)
# Bare last expression so the frame is rendered as the cell output.
data

Unnamed: 0,extruder_size,metering_depth,polymer_density,rpm,screw_pitch,flight_width,number_flight,throughput
0,80,7.2,1200,95,48.0,12.0,2,92.55
1,220,17.6,1300,75,154.0,13.2,1,4397.68
2,190,7.6,1400,95,323.0,36.1,2,3187.76
3,190,7.6,800,55,228.0,26.6,1,1008.12
4,60,1.8,800,10,48.0,10.8,1,2.72
...,...,...,...,...,...,...,...,...
1935355,40,3.2,1000,30,32.0,4.0,1,8.78
1935356,180,3.6,800,85,306.0,27.0,2,762.58
1935357,30,1.8,1100,60,30.0,2.7,1,7.89
1935358,70,6.3,1000,40,35.0,10.5,2,11.59


In [3]:
# Cast the columns below to float (in place, one column at a time).
# NOTE(review): presumably this keeps PyCaret's type inference from treating
# these columns as categorical -- confirm.
for column in ('polymer_density', 'rpm'):
    data[column] = data[column].astype('float')

In [4]:
# Print the column dtypes and memory usage of the frame after the float casts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1935360 entries, 0 to 1935359
Data columns (total 8 columns):
 #   Column           Dtype  
---  ------           -----  
 0   extruder_size    int64  
 1   metering_depth   float64
 2   polymer_density  float64
 3   rpm              float64
 4   screw_pitch      float64
 5   flight_width     float64
 6   number_flight    int64  
 7   throughput       float64
dtypes: float64(6), int64(2)
memory usage: 118.1 MB


# Setup of an Experiment only with Normalization

In [5]:
# Initialise the PyCaret regression experiment with 'throughput' as the target.
#   normalize=True -> feature normalization is part of the preprocessing pipeline
#   use_gpu=True   -> request GPU training for the estimators that support it
#   session_id     -> fixed seed so the train/test split and CV folds are the same
#                     on every Restart & Run All; 1431 is the seed reported in the
#                     recorded output of this cell
experiment = setup(
    data,
    target='throughput',
    normalize=True,
    use_gpu=True,
    session_id=1431,
)

Unnamed: 0,Description,Value
0,session_id,1431
1,Target,throughput
2,Original Data,"(1935360, 8)"
3,Missing Values,False
4,Numeric Features,6
5,Categorical Features,1
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(1354751, 7)"


In [6]:
# Model Selection
# Score the candidate regressors with cross-validation (table shown as the cell
# output) and keep only the single top-ranked model.
model = compare_models(
    n_select=1,
)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,29.6892,4981.2547,70.5734,0.9981,0.618,0.3273,19.981
rf,Random Forest Regressor,30.7215,5552.9958,74.5152,0.9979,0.0519,0.0419,188.979
lightgbm,Light Gradient Boosting Machine,54.7437,7809.9404,88.37,0.9971,0.8885,0.9926,7.42
xgboost,Extreme Gradient Boosting,51.9658,8229.728,90.7104,0.9969,0.7622,0.4647,1.72
knn,K Neighbors Regressor,54.4137,13688.4555,116.9958,0.9949,0.6622,0.1081,14.237
dt,Decision Tree Regressor,60.996,21254.771,145.7822,0.9921,0.0982,0.0794,5.915
gbr,Gradient Boosting Regressor,169.4124,79795.2312,282.4724,0.9702,1.5822,10.0162,117.374
et,Extra Trees Regressor,16.0378,2300.928,40.131,0.6991,0.0283,0.022,274.516
ridge,Ridge Regression,662.6587,977535.1938,988.6803,0.6346,2.5915,127.9593,0.106
lr,Linear Regression,662.6591,977535.2125,988.6803,0.6346,2.5915,127.9592,0.163


In [7]:
# Model Optimization
# Tune the hyperparameters of the selected model.
# choose_better=True makes tune_model hand back the original model whenever the
# tuned candidate does not improve the optimisation metric, so a tuning run that
# scores worse in cross-validation can never degrade the model that is finalized
# and saved afterwards.
tuned_model = tune_model(model, choose_better=True)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,40.1908,7215.9836,84.9469,0.9973,0.6866,0.6612
1,40.418,7267.9642,85.2524,0.9973,0.7008,0.654
2,40.1455,7108.8715,84.3141,0.9973,0.6873,0.5157
3,39.7573,7040.6035,83.9083,0.9974,0.6715,0.4925
4,40.0922,7040.0311,83.9049,0.9973,0.6889,0.5256
5,40.3391,7306.2727,85.4767,0.9973,0.6919,0.559
6,40.2601,7181.9572,84.7464,0.9973,0.6918,0.5929
7,40.8076,7318.6692,85.5492,0.9973,0.6951,0.5391
8,39.6651,7017.811,83.7724,0.9974,0.6893,0.5789
9,40.4399,7077.9585,84.1306,0.9973,0.693,0.5503


In [8]:
# Finalize the tuned model so it is ready to be persisted.
finalized_model = finalize_model(estimator=tuned_model)

# Write the preprocessing pipeline together with the model to disk under the
# name 'catb_normalized'; the call is left as the last expression so the saved
# pipeline is shown as the cell output.
save_model(model=finalized_model, model_name='catb_normalized')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[],
                                       target='throughput', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strat...
                 ('rem_outliers', 'passthrough'), ('cluster_all', 'passthrough'),
                 ('dummy', Dummify(target='throughput')),
                 ('fix_perfect', Remove_100(target='throughput')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'),

# Setup of an Experiment with Normalization and Target-Transformation

In [5]:
# To prevent an error with transform_target, nudge zero throughput values to a
# small positive number.
# NOTE(review): presumably the default target transformation requires strictly
# positive targets -- confirm.
# Only the target column is touched: replacing across the whole frame would also
# rewrite any feature value that happens to be exactly 0.
data['throughput'] = data['throughput'].replace(0, 0.001)

In [6]:
# Initialise a second PyCaret regression experiment, this time with the target
# transformed as well as the features normalized, and with MLflow logging on.
#   session_id -> fixed seed so the train/test split and CV folds are the same
#                 on every Restart & Run All; 3957 is the seed reported in the
#                 recorded output of this cell
experiment = setup(
    data,
    target='throughput',
    normalize=True,
    transform_target=True,
    log_experiment=True,
    log_plots=True,
    use_gpu=True,
    experiment_name='extrucal_2nd',  # Need this to make this appear in MLflow
    session_id=3957,
)

Unnamed: 0,Description,Value
0,session_id,3957
1,Target,throughput
2,Original Data,"(1935360, 8)"
3,Missing Values,False
4,Numeric Features,6
5,Categorical Features,1
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(1354751, 7)"


In [7]:
# Model Selection
# Random Forest and Extra Trees are left out because of memory problems;
# AdaBoost is excluded from the comparison as well.
model = compare_models(
    exclude=[
        'rf',   # Random Forest Regressor
        'et',   # Extra Trees Regressor
        'ada',  # AdaBoost Regressor
    ],
    n_select=1,
)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,19.874,4302.2346,65.5842,0.9984,0.0509,0.0476,22.638
xgboost,Extreme Gradient Boosting,41.4651,7879.5457,88.7004,0.997,0.0811,0.0844,3.096
lightgbm,Light Gradient Boosting Machine,43.9112,10496.7433,102.422,0.9961,0.0895,0.0907,10.056
knn,K Neighbors Regressor,55.3676,14518.6871,120.4917,0.9946,0.4475,328.5099,16.663
dt,Decision Tree Regressor,61.0529,21234.2043,145.7171,0.992,0.0975,0.075,7.313
lr,Linear Regression,225.3665,170162.4438,412.5049,0.9363,1.1181,7857.3732,1.467
ridge,Ridge Regression,225.3666,170162.2062,412.5047,0.9363,1.1181,7857.3689,1.404
lar,Least Angle Regression,225.3665,170162.2487,412.5047,0.9363,1.1181,7857.3726,1.479
br,Bayesian Ridge,225.3666,170161.5964,412.5039,0.9363,1.1181,7857.378,1.684
huber,Huber Regressor,215.1331,221732.5227,470.8757,0.9169,1.1511,15648.1889,5.784


In [10]:
# Model Optimization

# Tuning is skipped in this experiment: the call below raised an error when run,
# so it is kept commented out and no tuned_model is produced in this section.
# tuned_model = tune_model(model)   <- Gives Error

In [11]:
# Finalize model
# Tuning was not possible in this experiment, so the untuned model returned by
# compare_models is finalized instead of a tuned one.
finalized_model = finalize_model(estimator=model)

# Write the preprocessing pipeline together with the model to disk; the call is
# left as the last expression so the saved pipeline is shown as the cell output.
save_model(model=finalized_model, model_name='catb_normalized_target-transformed')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[],
                                       target='throughput', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strat...
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
                  PowerTransformedTargetRegressor(border_count=32,
                                                  loss_function=