In [1]:
import os

import numpy as np
import pandas as pd
from pycaret.classification import *

In [2]:
# Project root that contains the click_stream/, query_string/, profile/ and
# modeling/ folders used below. Set the BMA_FINAL_DIR environment variable to
# run the notebook from another checkout; the original location is the default.
PATH = os.environ.get(
    'BMA_FINAL_DIR',
    '/Users/wonjae/Documents/GitHub/2020-fall-bma-final',
)

In [3]:
def _read_indexed_by_customer(csv_path):
    """Read one preprocessed CSV and use its ``CUS_ID`` column as the index."""
    table = pd.read_csv(csv_path)
    return table.set_index('CUS_ID')


# The three preprocessed feature tables, all keyed by CUS_ID so they can be
# joined on the index afterwards.
clickstream = _read_indexed_by_customer(f'{PATH}/click_stream/clickstream_preprocess.csv')
keyword = _read_indexed_by_customer(f'{PATH}/query_string/keyword_preprocess.csv')
profile = _read_indexed_by_customer(f'{PATH}/profile/profile_preprocess.csv')

In [4]:
# Build one row per CUS_ID: outer-join the three tables on their index so a
# customer present in any table is kept, then replace the resulting gaps by 0.
behaviour_features = clickstream.join(keyword, how='outer')
customer_table = behaviour_features.join(profile, how='outer')
cross_sectional_data = customer_table.fillna(0)

In [46]:
# AutoML with pycaret: experiment WITHOUT class re-balancing (fix_imbalance=False).
# GENDER and AGE are removed from the inputs; GROUP is the prediction target.
modeling_frame = cross_sectional_data.drop(columns=['GENDER', 'AGE'])

config = setup(
    data=modeling_frame,
    target='GROUP',
    session_id=123,  # fixed seed for the experiment
    fix_imbalance=False,
    ignore_low_variance=True,
    remove_multicollinearity=True,
    feature_selection=True,
    n_jobs=2,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,GROUP
2,Target Type,Multiclass
3,Label Encoded,"F20-: 0, F30: 1, F40+: 2, M20-: 3, M30: 4, M40+: 5"
4,Original Data,"(2500, 285)"
5,Missing Values,False
6,Numeric Features,284
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [6]:
# Register scikit-learn's log loss as an extra pycaret metric so a 'Log Loss'
# column appears in the score grids and can be passed to `optimize=`.
# It is scored on predicted probabilities, and lower values are better.
from sklearn.metrics import log_loss

add_metric(
    'logloss',
    'Log Loss',
    log_loss,
    target='pred_proba',
    greater_is_better=False,
)

Name                                                          Log Loss
Display Name                                                  Log Loss
Score Function                   <function log_loss at 0x7fab2c528af0>
Scorer               make_scorer(log_loss, greater_is_better=False,...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                False
Multiclass                                                        True
Custom                                                            True
Name: logloss, dtype: object

In [7]:
# Comparison: score pycaret's candidate classifiers on the experiment configured
# by the most recent setup() call and display the resulting grid (which includes
# the custom 'Log Loss' column). No `sort` argument is passed, so pycaret's
# default ranking metric applies — NOTE(review): the grid below is ordered by
# Accuracy, not by Log Loss.

compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Log Loss,TT (Sec)
rf,Random Forest Classifier,0.3454,0.6385,0.2447,0.3272,0.2882,0.1274,0.1382,1.6358,0.548
catboost,CatBoost Classifier,0.3385,0.6546,0.2557,0.3242,0.3026,0.134,0.1397,1.6577,55.034
xgboost,Extreme Gradient Boosting,0.3362,0.6373,0.2578,0.3142,0.3072,0.1383,0.1425,1.9617,7.114
lightgbm,Light Gradient Boosting Machine,0.3328,0.6336,0.2454,0.306,0.2941,0.1241,0.1295,2.129,3.088
et,Extra Trees Classifier,0.3322,0.629,0.2293,0.3098,0.2689,0.1058,0.1162,1.6509,0.214
gbc,Gradient Boosting Classifier,0.3202,0.6247,0.244,0.2946,0.2898,0.1158,0.1195,1.6871,13.628
lda,Linear Discriminant Analysis,0.319,0.6203,0.2635,0.3022,0.3056,0.1349,0.136,2.0427,0.046
qda,Quadratic Discriminant Analysis,0.2967,0.5231,0.181,0.1776,0.1947,0.0309,0.0402,21.833,0.043
ada,Ada Boost Classifier,0.2882,0.5843,0.2293,0.2664,0.2693,0.091,0.0926,1.7764,0.498
lr,Logistic Regression,0.2739,0.5771,0.1938,0.2335,0.2306,0.0439,0.0465,1.7755,0.925


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=123, verbose=0,
                       warm_start=False)

---

In [8]:
# Fit the baseline random forest, then let pycaret search its hyperparameters,
# selecting candidates by the custom 'Log Loss' metric registered earlier.
classifier = create_model(estimator='rf')
tuned_model = tune_model(
    estimator=classifier,
    optimize='Log Loss',
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Log Loss
0,0.2914,0.5,0.1667,0.0849,0.1315,0.0,0.0,1.6977
1,0.2914,0.5,0.1667,0.0849,0.1315,0.0,0.0,1.6977
2,0.2857,0.5,0.1667,0.0816,0.127,0.0,0.0,1.7015
3,0.2857,0.5,0.1667,0.0816,0.127,0.0,0.0,1.7036
4,0.2857,0.5,0.1667,0.0816,0.127,0.0,0.0,1.7035
5,0.2857,0.5,0.1667,0.0816,0.127,0.0,0.0,1.7035
6,0.2857,0.5,0.1667,0.0816,0.127,0.0,0.0,1.7068
7,0.2857,0.5,0.1667,0.0816,0.127,0.0,0.0,1.7016
8,0.2857,0.5,0.1667,0.0816,0.127,0.0,0.0,1.7016
9,0.2874,0.5,0.1667,0.0826,0.1283,0.0,0.0,1.7003


In [53]:
# The position of the train/test splits inside the tuple returned by setup()
# changes from run to run (the indices previously had to be looked up by hand),
# so fetch them by name from the active experiment instead.
# get_config() reads the most recent setup(); run this cell before calling
# setup() again for another experiment.
X_test = get_config('X_test')
y_test = get_config('y_test')

X_train = get_config('X_train')
y_train = get_config('y_train')

In [54]:
# Baseline (untuned) model: multiclass log loss on the hold-out split.
y_pred = classifier.predict_proba(X_test)

# One-hot encode into a NEW name so y_test keeps the raw labels and this cell
# stays re-runnable. Assumes the dummy columns come out in the same order as the
# predict_proba columns — TODO confirm against classifier.classes_.
y_test_onehot = pd.get_dummies(y_test)

# Clip exact zeros before taking the log: a probability of 0 for the true class
# would otherwise make the loss infinite. Values above the floor are unchanged.
y_pred_clipped = np.clip(y_pred, 1e-15, 1.0)

# Deliberately not called `log_loss`, which would shadow the sklearn function
# imported for add_metric().
baseline_log_loss = -(np.log(y_pred_clipped) * y_test_onehot).sum(axis=1).mean()

baseline_log_loss

1.5668546154370757

In [57]:
# Tuned model: multiclass log loss on the same hold-out split.
y_pred = tuned_model.predict_proba(X_test)

# One-hot encode into a NEW name instead of overwriting y_test (the previous
# version re-applied get_dummies to an already one-hot frame). Assumes the dummy
# columns are ordered like the predict_proba columns — TODO confirm against
# tuned_model.classes_.
y_test_onehot = pd.get_dummies(y_test)

# Clip exact zeros before taking the log so a zero probability for the true
# class cannot make the loss infinite.
y_pred_clipped = np.clip(y_pred, 1e-15, 1.0)

# Deliberately not called `log_loss`, which would shadow the sklearn function
# imported for add_metric().
tuned_log_loss = -(np.log(y_pred_clipped) * y_test_onehot).sum(axis=1).mean()

tuned_log_loss

1.668876012366958

In [55]:
# Display the baseline and the tuned estimator together so their
# hyperparameters can be compared in the cell output.
classifier, tuned_model

(RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                        oob_score=False, random_state=123, verbose=0,
                        warm_start=False),
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight={},
                        criterion='gini', max_depth=7, max_features='log2',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.3, min_impurity_split=None,
                        min_samples_leaf=4, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=140, n_jobs=2,
              

In [49]:
# Feature selection was enabled in setup(), so only the columns present in
# X_test are used by the model; remember them for later use.
selected_columns = X_test.columns

In [5]:
# AutoML with pycaret: same experiment as before, but WITH class re-balancing
# (fix_imbalance=True). GENDER and AGE are removed; GROUP is the target.
modeling_frame_smote = cross_sectional_data.drop(columns=['GENDER', 'AGE'])

config_smote = setup(
    data=modeling_frame_smote,
    target='GROUP',
    session_id=123,  # fixed seed for the experiment
    fix_imbalance=True,
    ignore_low_variance=True,
    remove_multicollinearity=True,
    feature_selection=True,
    n_jobs=2,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,GROUP
2,Target Type,Multiclass
3,Label Encoded,"F20-: 0, F30: 1, F40+: 2, M20-: 3, M30: 4, M40+: 5"
4,Original Data,"(2500, 285)"
5,Missing Values,False
6,Numeric Features,284
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [7]:
# Score pycaret's candidate classifiers on the experiment configured by the most
# recent setup() call (the fix_imbalance=True one) and display the grid.
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Log Loss,TT (Sec)
catboost,CatBoost Classifier,0.3368,0.6564,0.2849,0.3227,0.3234,0.1562,0.1577,1.6778,73.738
xgboost,Extreme Gradient Boosting,0.3265,0.6363,0.275,0.3173,0.3148,0.144,0.1451,1.9933,14.252
lightgbm,Light Gradient Boosting Machine,0.3236,0.6325,0.2598,0.3086,0.3045,0.1304,0.1327,2.0191,4.057
rf,Random Forest Classifier,0.3042,0.6159,0.2682,0.2986,0.2974,0.1277,0.1286,1.6867,0.987
gbc,Gradient Boosting Classifier,0.303,0.629,0.2629,0.2972,0.297,0.1237,0.1243,1.6997,28.965
qda,Quadratic Discriminant Analysis,0.295,0.5196,0.1781,0.1502,0.188,0.0256,0.0355,22.0484,0.132
et,Extra Trees Classifier,0.2922,0.6117,0.2516,0.2851,0.2856,0.1076,0.1081,1.7249,0.383
lda,Linear Discriminant Analysis,0.2784,0.6129,0.2688,0.3075,0.2855,0.1222,0.1236,2.3028,0.139
ada,Ada Boost Classifier,0.2499,0.5892,0.2334,0.2646,0.2514,0.0812,0.0822,1.7786,1.161
lr,Logistic Regression,0.2333,0.5756,0.2496,0.2712,0.233,0.085,0.0878,1.8894,1.772


<catboost.core.CatBoostClassifier at 0x7fcff85c6b80>

In [8]:
# Fit CatBoost (the model returned by the comparison on the re-balanced
# experiment) and tune THAT model. The earlier version passed `classifier`,
# i.e. the random forest from the non-rebalanced experiment, to tune_model().
classifier_smote = create_model('catboost')
tuned_model_smote = tune_model(classifier_smote, optimize='Log Loss')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Log Loss
0,0.2171,0.5892,0.1938,0.2316,0.2196,0.042,0.0425,1.7223
1,0.2857,0.6416,0.2683,0.2827,0.282,0.1081,0.1086,1.6381
2,0.3143,0.6223,0.2994,0.328,0.2968,0.155,0.1593,1.6683
3,0.2629,0.6163,0.2495,0.2841,0.2672,0.0964,0.0975,1.7014
4,0.2743,0.6196,0.2887,0.2594,0.2598,0.1104,0.1119,1.6697
5,0.3257,0.6856,0.2944,0.3493,0.3235,0.177,0.1803,1.6105
6,0.32,0.6544,0.3032,0.3441,0.3128,0.1675,0.1708,1.6517
7,0.3543,0.6947,0.3322,0.3668,0.3507,0.202,0.2044,1.6209
8,0.28,0.6185,0.2687,0.2779,0.2572,0.1133,0.1167,1.6664
9,0.3103,0.6779,0.2616,0.332,0.2977,0.1492,0.1543,1.6317


In [39]:
# The position of the train/test splits inside the tuple returned by setup()
# changes from run to run (the indices previously had to be looked up by hand),
# so fetch them by name from the active experiment instead.
# get_config() reads the most recent setup(), which at this point is the
# fix_imbalance=True experiment.
X_test = get_config('X_test')
y_test = get_config('y_test')

X_train = get_config('X_train')
y_train = get_config('y_train')

In [40]:
# Baseline (untuned) model of the re-balanced experiment: multiclass log loss
# on the hold-out split.
y_pred = classifier_smote.predict_proba(X_test)

# One-hot encode into a NEW name so y_test keeps the raw labels and this cell
# stays re-runnable. Assumes the dummy columns come out in the same order as the
# predict_proba columns — TODO confirm against classifier_smote.classes_.
y_test_onehot = pd.get_dummies(y_test)

# Clip exact zeros before taking the log so a zero probability for the true
# class cannot make the loss infinite. Values above the floor are unchanged.
y_pred_clipped = np.clip(y_pred, 1e-15, 1.0)

# Deliberately not called `log_loss`, which would shadow the sklearn function
# imported for add_metric().
smote_baseline_log_loss = -(np.log(y_pred_clipped) * y_test_onehot).sum(axis=1).mean()

smote_baseline_log_loss

1.5912318335345756

In [41]:
# Tuned model of the re-balanced experiment: multiclass log loss on the same
# hold-out split.
y_pred = tuned_model_smote.predict_proba(X_test)

# One-hot encode into a NEW name instead of overwriting y_test (the previous
# version re-applied get_dummies to an already one-hot frame). Assumes the dummy
# columns are ordered like the predict_proba columns — TODO confirm against
# tuned_model_smote.classes_.
y_test_onehot = pd.get_dummies(y_test)

# Clip exact zeros before taking the log so a zero probability for the true
# class cannot make the loss infinite.
y_pred_clipped = np.clip(y_pred, 1e-15, 1.0)

# Deliberately not called `log_loss`, which would shadow the sklearn function
# imported for add_metric().
smote_tuned_log_loss = -(np.log(y_pred_clipped) * y_test_onehot).sum(axis=1).mean()

smote_tuned_log_loss

1.6135326958519927

In [42]:
# Display the baseline and the tuned estimator of the re-balanced experiment.
classifier_smote, tuned_model_smote

(<catboost.core.CatBoostClassifier at 0x7fcff6d9e8b0>,
 <catboost.core.CatBoostClassifier at 0x7fcfb281bb80>)

In [43]:
# Feature selection was enabled in setup(), so only the columns present in
# X_test are used by the model; remember them for later use.
selected_columns_smote = X_test.columns

### SMOTE를 적용하지 않고 튜닝도 하지 않은 RandomForest 모델의 테스트 Log Loss가 가장 낮게 나옴 (1.5669; 튜닝 RF 1.6689, SMOTE 기본 모델 1.5912, SMOTE 튜닝 모델 1.6135)

In [66]:
import pickle

# Bundle the chosen estimator with the feature columns recorded for it, and
# serialise both together into the modeling/ folder.
selected_model = {
    'model': classifier,
    'column': selected_columns,
}

output_path = f"{PATH}/modeling/predict_selected_model.pkl"
with open(output_path, 'wb') as handle:
    pickle.dump(selected_model, handle)