In [2]:
import pandas as pd
from pycaret.classification import *

In [3]:
clickstream = pd.read_csv('./clickstream_preprocess.csv')\
                .set_index('CUS_ID')

keyword = pd.read_csv('./keyword_preprocess.csv')\
            .set_index('CUS_ID')

profile = pd.read_csv('./profile_preprocess.csv')\
    .set_index('CUS_ID')

In [4]:
cross_sectional_data = clickstream.join(keyword, how = 'outer')\
                                .join(profile, how = 'outer')\
                                .fillna(0)

In [7]:
# Auto ML with pycaret
A = setup(
    data = cross_sectional_data.drop(columns = ['GENDER','AGE']), 
    target = 'GROUP', 
    session_id = 123, 
    fix_imbalance = True,
    ignore_low_variance = True,
    remove_multicollinearity = True,
    feature_selection = True,
    n_jobs = 2
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,GROUP
2,Target Type,Multiclass
3,Label Encoded,"F20-: 0, F30: 1, F40+: 2, M20-: 3, M30: 4, M40+: 5"
4,Original Data,"(2500, 245)"
5,Missing Values,False
6,Numeric Features,244
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [10]:
from sklearn.metrics import log_loss
add_metric('logloss', 'Log Loss', log_loss, target = 'pred_proba', greater_is_better = False)

Name                                                          Log Loss
Display Name                                                  Log Loss
Score Function                      <function log_loss at 0x126caf310>
Scorer               make_scorer(log_loss, greater_is_better=False,...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                False
Multiclass                                                        True
Custom                                                            True
Name: logloss, dtype: object

In [12]:
# Comparison

compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Log Loss,TT (Sec)
catboost,CatBoost Classifier,0.3425,0.6458,0.2875,0.3278,0.3292,0.1627,0.1643,1.7095,63.135
gbc,Gradient Boosting Classifier,0.3248,0.636,0.2846,0.3205,0.3187,0.1511,0.1518,1.6743,22.521
xgboost,Extreme Gradient Boosting,0.323,0.631,0.2709,0.3081,0.309,0.1369,0.1383,2.0129,10.676
lightgbm,Light Gradient Boosting Machine,0.3191,0.6263,0.2594,0.2994,0.3009,0.1261,0.1279,2.0181,3.219
et,Extra Trees Classifier,0.3111,0.6185,0.2774,0.3039,0.3032,0.1304,0.1313,1.6979,0.374
rf,Random Forest Classifier,0.3002,0.6177,0.267,0.2909,0.292,0.1218,0.1226,1.6831,0.805
qda,Quadratic Discriminant Analysis,0.2973,0.5338,0.184,0.2013,0.2042,0.0375,0.0463,20.7255,0.12
lda,Linear Discriminant Analysis,0.2716,0.6178,0.2656,0.297,0.2766,0.1141,0.1155,2.1237,0.138
ada,Ada Boost Classifier,0.2544,0.5957,0.2405,0.267,0.2544,0.0847,0.0857,1.7786,0.94
lr,Logistic Regression,0.227,0.576,0.2452,0.2666,0.2255,0.079,0.0821,1.8491,1.496


<catboost.core.CatBoostClassifier at 0x130490be0>

---

In [13]:
classifier = create_model('gbc')
tuned_model = tune_model(classifier, optimize='Log Loss')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Log Loss
0,0.28,0.61,0.2626,0.282,0.2786,0.1054,0.1057,1.7184
1,0.3429,0.6581,0.2766,0.3194,0.3246,0.1608,0.1627,1.6419
2,0.2971,0.5857,0.2532,0.284,0.2893,0.1175,0.1177,1.7908
3,0.28,0.6144,0.2399,0.2603,0.2673,0.0926,0.0931,1.71
4,0.2571,0.6433,0.2538,0.2698,0.2604,0.0755,0.0758,1.6524
5,0.3657,0.6837,0.3558,0.3686,0.3664,0.2127,0.2129,1.5803
6,0.2914,0.6265,0.2459,0.2824,0.2855,0.1107,0.1109,1.7142
7,0.3543,0.6335,0.2905,0.3376,0.3418,0.1786,0.1797,1.6751
8,0.32,0.6186,0.2595,0.2986,0.3028,0.1303,0.1319,1.7057
9,0.3046,0.6646,0.2544,0.2949,0.2937,0.1269,0.1283,1.6435


In [62]:
# pycaret을 실행할 때마다 테스트 트레인의 인덱스가 변화함. 따라서 수기로 작성함.

X_test = A[5]
y_test = A[10]

X_train = A[9]
y_train = A[7]

In [63]:
y_pred = classifier.predict_proba(X_test) 
y_test = pd.get_dummies(y_test) 

log_loss = -(np.log(y_pred) * y_test).sum(axis = 1).mean()

log_loss

1.545684288444372

In [14]:
classifier, tuned_model

(GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                            learning_rate=0.1, loss='deviance', max_depth=3,
                            max_features=None, max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, n_estimators=100,
                            n_iter_no_change=None, presort='deprecated',
                            random_state=123, subsample=1.0, tol=0.0001,
                            validation_fraction=0.1, verbose=0,
                            warm_start=False),
 GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                            learning_rate=0.163, loss='deviance', max_depth=3,
                            max_features='log2', max_leaf_nodes=None,
                            min_impurity_decrease=0.5, min_impur

### log_loss가 1.54이므로 
```
GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                            learning_rate=0.163, loss='deviance', max_depth=3,
                            max_features='log2', max_leaf_nodes=None,
                            min_impurity_decrease=0.5, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=7,
                            min_weight_fraction_leaf=0.0, n_estimators=220,
                            n_iter_no_change=None, presort='deprecated',
                            random_state=123, subsample=0.95, tol=0.0001,
                            validation_fraction=0.1, verbose=0,
                            warm_start=False)
```
를 채택

In [49]:
# 또한, 변수 선정 기법을 사용했으므로, 다음과 같은 컬럼만 사용한다.
selected_columns = X_test.columns

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

In [18]:
classifier = GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                            learning_rate=0.163, loss='deviance', max_depth=3,
                            max_features='log2', max_leaf_nodes=None,
                            min_impurity_decrease=0.5, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=7,
                            min_weight_fraction_leaf=0.0, n_estimators=220,
                            n_iter_no_change=None, presort='deprecated',
                            random_state=123, subsample=0.95, tol=0.0001,
                            validation_fraction=0.1, verbose=0,
                            warm_start=False)

In [44]:
classifier.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.163, loss='deviance', max_depth=3,
                           max_features='log2', max_leaf_nodes=None,
                           min_impurity_decrease=0.5, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=7,
                           min_weight_fraction_leaf=0.0, n_estimators=220,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=123, subsample=0.95, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [46]:
clickstream_test = pd.read_csv('./preprocess_clickstream_test.csv')\
                .set_index('CUS_ID')

keyword_test = pd.read_csv('./preprocess_querystring_test.csv')\
            .set_index('CUS_ID')

In [47]:
test_set = clickstream_test.join(keyword_test, how = 'outer')\
                           .fillna(0)

In [50]:
test_set = test_set[selected_columns]

In [59]:
prediction = pd.DataFrame(classifier.predict_proba(test_set), 
                          columns = ['F20-', 'F30', 'F40', 'M20-', 'M30', 'M40+'],
                         index = test_set.index)

In [61]:
prediction.to_csv('./group_prediction.csv')