In [1]:
import pandas as pd
from pycaret.classification import *

# Load Data

In [2]:
df = pd.read_csv('adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


# Default Modeling

In [3]:
setup_default = ClassificationExperiment()
setup_default.setup(df, target='y', experiment_name='adult', log_experiment=True)

Unnamed: 0,Description,Value
0,Session id,277
1,Target,y
2,Target type,Binary
3,Original data shape,"(32561, 15)"
4,Transformed data shape,"(32561, 65)"
5,Transformed train set shape,"(22792, 65)"
6,Transformed test set shape,"(9769, 65)"
7,Ordinal features,1
8,Numeric features,6
9,Categorical features,8


2023/03/19 17:57:17 INFO mlflow.tracking.fluent: Experiment with name 'adult' does not exist. Creating a new experiment.


<pycaret.classification.oop.ClassificationExperiment at 0x2573adfc2e0>

In [4]:
best_model_default = setup_default.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.8408,0.894,0.562,0.7163,0.6297,0.5302,0.5366,0.887
ridge,Ridge Classifier,0.8385,0.0,0.5006,0.7457,0.5988,0.5027,0.5184,0.678
et,Extra Trees Classifier,0.8283,0.8872,0.3924,0.7887,0.5238,0.4332,0.4726,1.501
lr,Logistic Regression,0.7986,0.5895,0.2684,0.7201,0.3908,0.2992,0.3518,1.322
nb,Naive Bayes,0.7966,0.8336,0.3097,0.6671,0.4229,0.319,0.3537,0.644
rf,Random Forest Classifier,0.7785,0.8126,0.094,0.8658,0.1691,0.1286,0.2389,1.631
knn,K Neighbors Classifier,0.7742,0.6561,0.3112,0.5559,0.3988,0.2732,0.2908,0.868
ada,Ada Boost Classifier,0.7627,0.8378,0.0312,0.6568,0.0594,0.0384,0.1042,1.219
dummy,Dummy Classifier,0.7592,0.5,0.0,0.0,0.0,0.0,0.0,0.761
gbc,Gradient Boosting Classifier,0.7573,0.7938,0.0193,0.4486,0.0368,0.016,0.0467,1.739


# Normalize Data

In [5]:
setup_norm = ClassificationExperiment()
setup_norm.setup(df, target='y', experiment_name='adult', log_experiment=True,
                 normalize=True, normalize_method='zscore')

Unnamed: 0,Description,Value
0,Session id,6421
1,Target,y
2,Target type,Binary
3,Original data shape,"(32561, 15)"
4,Transformed data shape,"(32561, 65)"
5,Transformed train set shape,"(22792, 65)"
6,Transformed test set shape,"(9769, 65)"
7,Ordinal features,1
8,Numeric features,6
9,Categorical features,8


<pycaret.classification.oop.ClassificationExperiment at 0x2573adfc4c0>

In [6]:
best_model_norm = setup_norm.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8473,0.9031,0.5864,0.7269,0.6489,0.5528,0.5582,1.151
svm,SVM - Linear Kernel,0.8384,0.0,0.5883,0.6945,0.6363,0.5334,0.5369,0.853
lda,Linear Discriminant Analysis,0.8377,0.8891,0.5531,0.7092,0.6213,0.5201,0.5267,1.039
ridge,Ridge Classifier,0.8351,0.0,0.4872,0.7394,0.5871,0.4895,0.5062,0.781
et,Extra Trees Classifier,0.8253,0.8822,0.3859,0.7765,0.5151,0.423,0.4617,1.653
knn,K Neighbors Classifier,0.8229,0.8401,0.5697,0.6517,0.6077,0.4941,0.4961,0.939
rf,Random Forest Classifier,0.7812,0.8014,0.1088,0.8654,0.1926,0.147,0.257,1.375
ada,Ada Boost Classifier,0.7616,0.8259,0.0301,0.6064,0.0571,0.0351,0.093,1.452
dummy,Dummy Classifier,0.7592,0.5,0.0,0.0,0.0,0.0,0.0,1.764
lightgbm,Light Gradient Boosting Machine,0.759,0.8117,0.0313,0.4924,0.0587,0.031,0.073,0.94


# Top 5 Predictor

In [7]:
setup_top5 = ClassificationExperiment()
setup_top5.setup(df, target='y', experiment_name='adult', log_experiment=True,
                 n_features_to_select=5)

Unnamed: 0,Description,Value
0,Session id,8998
1,Target,y
2,Target type,Binary
3,Original data shape,"(32561, 15)"
4,Transformed data shape,"(32561, 65)"
5,Transformed train set shape,"(22792, 65)"
6,Transformed test set shape,"(9769, 65)"
7,Ordinal features,1
8,Numeric features,6
9,Categorical features,8


2023/03/19 18:03:30 INFO mlflow.tracking.fluent: Experiment with name 'adult-dataset' does not exist. Creating a new experiment.


<pycaret.classification.oop.ClassificationExperiment at 0x2573b38b0d0>

In [8]:
best_model_top5 = setup_top5.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.8393,0.8906,0.5535,0.7152,0.6236,0.5236,0.5308,1.335
ridge,Ridge Classifier,0.8386,0.0,0.4946,0.7502,0.5958,0.5003,0.5174,0.993
et,Extra Trees Classifier,0.8246,0.8825,0.3815,0.777,0.5112,0.4193,0.4589,1.856
lr,Logistic Regression,0.798,0.5815,0.2682,0.7163,0.3897,0.2977,0.3497,2.539
nb,Naive Bayes,0.7965,0.8347,0.3137,0.6642,0.4257,0.321,0.3546,0.956
rf,Random Forest Classifier,0.7896,0.8119,0.141,0.9087,0.2436,0.1914,0.3069,1.481
knn,K Neighbors Classifier,0.7762,0.6615,0.3115,0.5643,0.4013,0.2776,0.2962,1.238
ada,Ada Boost Classifier,0.768,0.8401,0.0501,0.7791,0.0935,0.0672,0.1563,1.245
dt,Decision Tree Classifier,0.7594,0.5103,0.0297,0.5187,0.056,0.0304,0.0755,0.978
lightgbm,Light Gradient Boosting Machine,0.7593,0.7991,0.0332,0.5327,0.0622,0.0335,0.0816,1.052


# Outlier Removal

In [9]:
setup_outr = ClassificationExperiment()
setup_outr.setup(df, target='y', experiment_name='adult', log_experiment=True,
                 remove_outliers=True)

Unnamed: 0,Description,Value
0,Session id,969
1,Target,y
2,Target type,Binary
3,Original data shape,"(32561, 15)"
4,Transformed data shape,"(31421, 65)"
5,Transformed train set shape,"(21652, 65)"
6,Transformed test set shape,"(9769, 65)"
7,Ordinal features,1
8,Numeric features,6
9,Categorical features,8


<pycaret.classification.oop.ClassificationExperiment at 0x2573b22fe20>

In [10]:
best_model_outr = setup_outr.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.8398,0.0,0.5128,0.7429,0.6065,0.5102,0.5241,1.14
lda,Linear Discriminant Analysis,0.8395,0.8933,0.5662,0.7092,0.6295,0.5287,0.5343,1.221
et,Extra Trees Classifier,0.8256,0.8836,0.3859,0.7782,0.5158,0.4239,0.4628,3.654
lr,Logistic Regression,0.7958,0.5981,0.2704,0.697,0.389,0.294,0.342,1.837
nb,Naive Bayes,0.7921,0.829,0.3097,0.6415,0.4174,0.3093,0.3398,1.056
rf,Random Forest Classifier,0.7745,0.8,0.0802,0.8195,0.1454,0.108,0.2094,3.206
knn,K Neighbors Classifier,0.7721,0.6604,0.3052,0.5482,0.3917,0.2653,0.2827,1.195
ada,Ada Boost Classifier,0.7618,0.8327,0.0271,0.6307,0.0519,0.0326,0.0924,1.37
dummy,Dummy Classifier,0.7592,0.5,0.0,0.0,0.0,0.0,0.0,1.057
gbc,Gradient Boosting Classifier,0.7568,0.7965,0.0177,0.4012,0.0336,0.0132,0.0373,1.739


# Gabungan

In [11]:
setup_all = ClassificationExperiment()
setup_all.setup(df, target='y', experiment_name='adult', log_experiment=True,
                normalize=True, normalize_method='zscore',
                n_features_to_select=5, remove_outliers=True)

Unnamed: 0,Description,Value
0,Session id,951
1,Target,y
2,Target type,Binary
3,Original data shape,"(32561, 15)"
4,Transformed data shape,"(31421, 65)"
5,Transformed train set shape,"(21652, 65)"
6,Transformed test set shape,"(9769, 65)"
7,Ordinal features,1
8,Numeric features,6
9,Categorical features,8


<pycaret.classification.oop.ClassificationExperiment at 0x2573b22ece0>

In [12]:
best_model_all = setup_all.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8507,0.9054,0.5957,0.7343,0.6577,0.5635,0.5686,1.714


Processing:   0%|          | 0/61 [00:00<?, ?it/s]