# PyCaret을 이용한 AutoML 기반 코드

[캐글 노트북](https://www.kaggle.com/qkrwlsdn96/automl-by-pycaret-base-pb-score-0-87669)

## 데이터 불러오기

In [2]:
import pandas as pd
# Read the training and test splits from the local Data/ directory.
train, test = map(pd.read_csv, ('Data/train.csv', 'Data/test.csv'))

## 데이터 구조 확인

In [3]:
# Print (rows, columns) for the training split first, then the test split.
for frame in (train, test):
    print(frame.shape)

(26049, 16)
(6512, 15)


In [4]:
# Preview the first three training rows to check column names and value formats.
train.head(n=3)

Unnamed: 0,id,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0,40,Private,168538,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,60,United-States,>50K
1,1,17,Private,101626,9th,5,Never-married,Machine-op-inspct,Own-child,White,Male,0,0,20,United-States,<=50K
2,2,18,Private,353358,Some-college,10,Never-married,Other-service,Own-child,White,Male,0,0,16,United-States,<=50K


## 함수 불러오기

In [5]:
import pycaret.classification as pyclf

## 실험 환경 구축

* PyCaret은 모델 학습 전에 `setup` 함수로 실험 환경을 먼저 구축해야 함
* `setup` 단계에서는 PyCaret이 자동으로 컬럼 형태를 인식

In [6]:
# `income` is the column to predict, so pass it explicitly as `target`.
# session_id pins the random seed to the value reported by the recorded run
# (session_id 6743 in the setup summary), so the split and the model scores
# are reproducible instead of changing on every execution.
# NOTE(review): `id` is not excluded here, so it appears to be passed to the
# models as a feature (6 numeric + 9 categorical = all 15 non-target columns).
# Consider ignore_features=['id'] after confirming that predict_model still
# returns the `id` column needed for the submission file.
clf = pyclf.setup(data=train, target='income', session_id=6743)

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,6743
1,Target Type,Binary
2,Label Encoded,"<=50K: 0, >50K: 1"
3,Original Data,"(26049, 16)"
4,Missing Values,False
5,Numeric Features,6
6,Categorical Features,9
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


## 모델 학습 및 비교

* `compare_models` 함수를 통해 15개의 기본 모델을 학습하고 성능 비교
* 대회 지표인 F1 기준으로 3개 모델 저장

In [7]:
# Train and compare the baseline classifiers, rank them by F1 (the competition
# metric) and keep the three best estimators as a list.
best_3 = pyclf.compare_models(
    n_select=3,
    sort='F1',
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Light Gradient Boosting Machine,0.8706,0.9267,0.6556,0.7754,0.7103,0.6278,0.6315,0.1777
1,Extreme Gradient Boosting,0.87,0.9262,0.6581,0.7715,0.7102,0.6271,0.6305,4.3144
2,CatBoost Classifier,0.8708,0.9281,0.6508,0.7792,0.7091,0.6269,0.6312,4.4876
3,Ada Boost Classifier,0.8597,0.9145,0.6191,0.7577,0.681,0.5923,0.5975,0.7389
4,Gradient Boosting Classifier,0.8643,0.9204,0.5892,0.7972,0.6774,0.5939,0.6049,2.3722
5,Random Forest Classifier,0.8447,0.8807,0.5912,0.7176,0.6481,0.5497,0.554,0.116
6,Linear Discriminant Analysis,0.8399,0.8936,0.571,0.711,0.6331,0.5323,0.5377,0.2298
7,Extra Trees Classifier,0.83,0.8793,0.6014,0.6645,0.6312,0.5212,0.5224,0.4005
8,Decision Tree Classifier,0.8114,0.7455,0.6179,0.6088,0.6131,0.4885,0.4886,0.1171
9,Ridge Classifier,0.8398,0.0,0.5205,0.7405,0.611,0.514,0.5268,0.0582


## 모델 앙상블

In [8]:
# Display the three selected estimators together with their hyperparameters.
best_3

[LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                importance_type='split', learning_rate=0.1, max_depth=-1,
                min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
                random_state=6743, reg_alpha=0.0, reg_lambda=0.0, silent=True,
                subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
               importance_type='gain', interaction_constraints='',
               learning_rate=0.300000012, max_delta_step=0, max_depth=6,
               min_child_weight=1, missing=nan, monotone_constraints='()',
               n_estimators=100, n_jobs=-1, num_parallel_tree=1,
               objective='binary:logistic', random_state=6743, reg_alpha=0,
               reg_lambda=1, scal

In [9]:
# Combine the three selected models into one soft-voting ensemble and
# evaluate it with 5-fold cross-validation.
blended = pyclf.blend_models(
    estimator_list=best_3,
    method='soft',
    fold=5,
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8687,0.9269,0.6553,0.7676,0.707,0.6231,0.6263
1,0.8755,0.9303,0.6659,0.7871,0.7215,0.642,0.6458
2,0.8684,0.9281,0.6591,0.7648,0.708,0.6237,0.6265
3,0.8656,0.9283,0.6308,0.7725,0.6945,0.6095,0.6147
4,0.8725,0.9269,0.6485,0.7868,0.711,0.6301,0.635
Mean,0.8701,0.9281,0.6519,0.7758,0.7084,0.6257,0.6297
SD,0.0035,0.0012,0.012,0.0095,0.0086,0.0106,0.0103


In [10]:
# Display the blended ensemble object and its member estimators.
blended

VotingClassifier(estimators=[('Light Gradient Boosting Machine_0',
                              LGBMClassifier(boosting_type='gbdt',
                                             class_weight=None,
                                             colsample_bytree=1.0,
                                             importance_type='split',
                                             learning_rate=0.1, max_depth=-1,
                                             min_child_samples=20,
                                             min_child_weight=0.001,
                                             min_split_gain=0.0,
                                             n_estimators=100, n_jobs=-1,
                                             num_leaves=31, objective=None,
                                             random_state=6743, reg_alpha=0.0,
                                             reg_la...
                                            n_estimators=100, n_jobs=-1,
                             

## 모델 예측

In [11]:
# No `data` argument is given, so the ensemble is scored on the validation
# part that setup split off from `train`; the metrics are shown as a table.
pred_holdout = pyclf.predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8686,0.9261,0.6464,0.7736,0.7043,0.6207,0.6248


## 전체 데이터에 대한 재학습

* 현재까지의 실험은 train 데이터를 train/valid 로 나눠 실험한 것
* 최적의 성능을 위해 전체 데이터에 재학습

In [12]:
# Refit the blended ensemble on the full training data (train + validation)
# before predicting on the test set.
final_model = pyclf.finalize_model(blended)

In [13]:
# Display the refitted ensemble object.
final_model

VotingClassifier(estimators=[('Light Gradient Boosting Machine_0',
                              LGBMClassifier(boosting_type='gbdt',
                                             class_weight=None,
                                             colsample_bytree=1.0,
                                             importance_type='split',
                                             learning_rate=0.1, max_depth=-1,
                                             min_child_samples=20,
                                             min_child_weight=0.001,
                                             min_split_gain=0.0,
                                             n_estimators=100, n_jobs=-1,
                                             num_leaves=31, objective=None,
                                             random_state=6743, reg_alpha=0.0,
                                             reg_la...
                                            n_estimators=100, n_jobs=-1,
                             

## Test set 예측

In [14]:
# Score the test set with the final model; the returned frame is the test
# data with `Label` and `Score` columns appended.
preds = pyclf.predict_model(final_model, data=test)
preds

Unnamed: 0,id,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,Label,Score
0,0,28,Private,67661,Some-college,10,Never-married,Adm-clerical,Other-relative,White,Female,0,0,40,United-States,<=50K,0.0039
1,1,40,Self-emp-inc,37869,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States,>50K,0.5363
2,2,20,Private,109952,Some-college,10,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,25,United-States,<=50K,0.0002
3,3,40,Private,114537,Assoc-voc,11,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States,>50K,0.6956
4,4,37,Private,51264,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,99,France,<=50K,0.3577
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6507,6507,35,Private,61343,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States,>50K,0.5166
6508,6508,41,Self-emp-inc,32185,Bachelors,13,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40,United-States,>50K,0.6123
6509,6509,39,Private,409189,5th-6th,3,Married-civ-spouse,Other-service,Husband,White,Male,0,0,40,Mexico,<=50K,0.0268
6510,6510,35,Private,180342,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K,0.2123


In [28]:
# Keep only the row id and the model score. .copy() makes `submission` an
# independent frame, so adding a column to it afterwards does not raise
# pandas' SettingWithCopyWarning.
submission = preds[['id', 'Score']].copy()
submission.head(2)

Unnamed: 0,id,Score
0,0,0.0039
1,1,0.5363


In [29]:
# Binarise the score at 0.5 (1 = predicted >50K, 0 = predicted <=50K).
# assign() builds a new frame instead of mutating `submission` in place, so no
# SettingWithCopyWarning is raised even when `submission` is a slice of
# another frame.
# NOTE(review): this relies on `Score` being the probability of the >50K
# class, which matches the recorded output (a row labelled <=50K has a score
# of 0.3577) -- confirm this holds for the installed PyCaret version.
submission = submission.assign(
    prediction=(submission['Score'] >= 0.5).astype(int)
)
submission.head(2)

Unnamed: 0,id,Score,prediction
0,0,0.0039,0
1,1,0.5363,1


In [30]:
# Final submission layout: the row id plus the binary prediction.
submission_fin = submission.loc[:, ['id', 'prediction']]
submission_fin.head(n=2)

Unnamed: 0,id,prediction
0,0,0
1,1,1


In [31]:
from pathlib import Path

# Create the output directory first: to_csv does not create missing parent
# directories and would fail with OSError on a fresh checkout.
submission_path = Path('submissions/submission_pycaret.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame index out of the file, so only the
# `id` and `prediction` columns are written.
submission_fin.to_csv(submission_path, index=False)

In [32]:
# Upload the CSV to the kakr-4th-competition competition with the Kaggle CLI
# (IPython shell escape); -m sets the submission message.
!kaggle competitions submit -c kakr-4th-competition -f submissions/submission_pycaret.csv -m "Message"

100%|██████████████████████████████████████| 43.4k/43.4k [00:04<00:00, 10.0kB/s]
Successfully submitted to [T-Academy X KaKr] 성인 인구조사 소득 예측 대회 