In [162]:
import pandas as pd
from pycaret.classification import *

In [16]:
clickstream = pd.read_csv('./clickstream_preprocess.csv')\
                .set_index('CUS_ID')

keyword = pd.read_csv('./keyword_preprocess.csv')\
            .set_index('CUS_ID')

profile = pd.read_csv('./profile_preprocess.csv')\
            .set_index('CUS_ID')

In [183]:
cross_sectional_data = clickstream.join(keyword, how = 'outer')\
                                  .join(profile, how = 'outer')\
                                  .fillna(0)

In [189]:
# Auto ML with pycaret

setup(
    data = cross_sectional_data.drop(columns = ['GENDER','AGE']), 
    target = 'GROUP', 
    session_id = 123, 
    pca = True,
    pca_components = 100,
    fix_imbalance = True,
    ignore_low_variance = True,
    remove_multicollinearity = True,
    feature_selection = True,
    feature_interaction = True,
    feature_ratio = True
)

IntProgress(value=0, description='Processing: ', max=3)

Text(value="Following data types have been inferred automatically, if they are correct press enter to continue…

Unnamed: 0,Data Type
SITE_CNT_mean,Numeric
SITE_CNT_sum,Numeric
SITE_CNT_mean_Apr,Numeric
SITE_CNT_mean_Aug,Numeric
SITE_CNT_mean_Dec,Numeric
...,...
WORD_COUNT_AVG_AT_ONCE_FOR_LABEL_36,Numeric
WORD_COUNT_AVG_AT_ONCE_FOR_LABEL_37,Numeric
WORD_COUNT_AVG_AT_ONCE_FOR_LABEL_38,Numeric
WORD_COUNT_AVG_AT_ONCE_FOR_LABEL_39,Numeric


 




ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt



In [186]:
# Comparison

compare_models()

IntProgress(value=0, description='Processing: ', max=79)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.3362,0.6306,0.2358,0.3076,0.2802,0.116,0.1251,0.268
lightgbm,Light Gradient Boosting Machine,0.3299,0.6204,0.2425,0.301,0.286,0.1162,0.1227,4.619
et,Extra Trees Classifier,0.3294,0.6275,0.2288,0.317,0.2705,0.1036,0.1129,0.163
gbc,Gradient Boosting Classifier,0.3231,0.623,0.2437,0.295,0.2902,0.1175,0.1215,7.578
xgboost,Extreme Gradient Boosting,0.3219,0.6218,0.2456,0.3011,0.2942,0.1184,0.1222,6.335
ridge,Ridge Classifier,0.3025,0.0,0.2343,0.2835,0.2798,0.1006,0.1026,0.027
lr,Logistic Regression,0.2979,0.5637,0.2125,0.259,0.2474,0.0709,0.0757,0.941
lda,Linear Discriminant Analysis,0.2927,0.6087,0.2432,0.2822,0.2838,0.1051,0.1057,0.05
ada,Ada Boost Classifier,0.2785,0.5886,0.2195,0.2546,0.2579,0.0774,0.0788,0.317
qda,Quadratic Discriminant Analysis,0.2384,0.5032,0.1838,0.2031,0.1884,0.0164,0.018,0.058


Exception in thread Thread-1009:
Traceback (most recent call last):
  File "/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/Users/kdchoi/Library/Python/3.8/lib/python/site-packages/joblib/externals/loky/process_executor.py", line 567, in run
    self.flag_executor_shutting_down()
  File "/Users/kdchoi/Library/Python/3.8/lib/python/site-packages/joblib/externals/loky/process_executor.py", line 756, in flag_executor_shutting_down
    self.kill_workers()
  File "/Users/kdchoi/Library/Python/3.8/lib/python/site-packages/joblib/externals/loky/process_executor.py", line 766, in kill_workers
    recursive_terminate(p)
  File "/Users/kdchoi/Library/Python/3.8/lib/python/site-packages/joblib/externals/loky/backend/utils.py", line 28, in recursive_terminate
    _recursive_terminate_without_psutil(process)
  File "/Users/kdchoi/Library/Python/3.8/lib/python/site-packages

KeyboardInterrupt: 

### model을 비교한 결과, random forest 모델이 가장 좋았다. 따라서 이를 통해 하이퍼 파라미터 튜닝을 진행해보려 한다.

In [None]:
model = create_model('rf')
tuned_model = tune_model(model)

In [None]:
model, tuned_model

In [118]:
from sklearn.metrics import log_loss
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split
import numpy as np

In [52]:
X = cross_sectional_data.drop(columns = ['GENDER', 'AGE', 'GROUP'])
y = cross_sectional_data[['GROUP']]

train_x, test_x, train_y, test_y = train_test_split(X, y, train_size = 0.7, random_state = 123)

In [190]:
model_log_loss = -((np.log(model.predict_proba(test_x)) * np.array(pd.get_dummies(test_y))).sum(axis = 1)).mean()

In [None]:
tuned_log_loss = -((np.log(tuned_model.predict_proba(test_x)) * np.array(pd.get_dummies(test_y))).sum(axis = 1)).mean()