# First Look
## Naive Train: 가맹점을 이용한 고객은 어떤 가맹점을 사용하는가

In [2]:
import os, glob

import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib as mpl

# Global plotting configuration for the rest of the notebook.
# NOTE(review): the bare 'seaborn' style name may not resolve on newer
# matplotlib releases (it appears to have been renamed) — confirm against
# the installed version.
plt.style.use('seaborn')
# Font family used for all text in figures. Presumably chosen so that Korean
# labels render; the font must be installed on the machine — TODO confirm.
plt.rcParams["font.family"] = 'AppleSDGothicNeoSB00'
# Draw negative numbers with an ASCII hyphen instead of the Unicode minus
# sign (presumably because the selected font lacks that glyph — verify).
mpl.rcParams['axes.unicode_minus'] = False

In [3]:
# Load up to four of the provided files into notebook-level globals, one
# variable per file, named after the file.
# NOTE(review): the [:4] slice still depends on the order glob returns, which
# is arbitrary — confirm the directory holds exactly the expected files.
pathList = glob.glob('../data/provided/*')
for p in pathList[:4]:
    # Work on the file name only, so dots or spaces in the directory part
    # (e.g. the leading '..') cannot leak into the variable name.
    filename = os.path.basename(p)
    # Variable name = last space-separated token of the file name, cut at
    # its first dot.
    name = filename.split(' ')[-1].split('.')[0]
    print(f'Load {name}')
    # Choose the reader from the extension rather than from the position in
    # the list, because glob makes no promise about ordering.
    extension = os.path.splitext(filename)[1].lower()
    if extension in ('.xls', '.xlsx', '.xlsm'):
        globals()[name] = pd.read_excel(p)
    else:
        globals()[name] = pd.read_csv(p, engine='python')

Load mrc_info
Load samp_train
Load samp_cst_feat
Load variable_dtype


In [5]:
def getCustomerGroup(group, complement=False):
    '''
    Collect customer IDs from the global ``samp_train`` by merchant membership.

    group: list of merchant IDs, matched against the 'MRC_ID_DI' column
    complement: boolean; when True, select the rows whose merchant is NOT in
        ``group`` instead of the rows whose merchant is in it
    ---
    Return: list of 'cst_id_di' values, one entry per selected row
        (the values are not de-duplicated)
    '''
    in_group = samp_train['MRC_ID_DI'].isin(group)
    row_mask = ~in_group if complement else in_group
    return samp_train.loc[row_mask, 'cst_id_di'].tolist()

In [11]:
# Build the raw training table in one chain:
#   1. keep the feature rows of customers returned by getCustomerGroup for
#      merchants other than 0,
#   2. left-join the samp_train columns on the customer ID,
#   3. drop the customer ID column, which is only needed as the join key.
train_raw = (
    samp_cst_feat
    .loc[samp_cst_feat['cst_id_di'].isin(getCustomerGroup(group=[0], complement=True))]
    .merge(samp_train, on='cst_id_di', how='left')
    .drop(columns='cst_id_di')
)

In [13]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import roc_auc_score


class model:
    '''
    Small wrapper around a classifier that evaluates it with K-fold AUROC,
    optionally after a grid search over a predefined parameter grid.

    Typical use: ``m = model(); m.getModel('xgb'); fitted = m.fit(X, y)``.
    '''

    def __init__(self, random_state=None):
        '''
        random_state: seed passed to the estimator and to the K-fold splitter;
            None falls back to 42
        '''
        # Always store a seed: the original only set it when the argument was
        # None, so any explicit seed made getModel() fail with AttributeError.
        self.random_state = 42 if random_state is None else random_state

    def getModel(self, estimator='xgb'):
        '''
        Instantiate the estimator and its search grid.

        estimator: str, currently only 'xgb' is supported
        ---
        Raises: ValueError for any other estimator name
        '''
        # Fail loudly instead of silently leaving self.model undefined.
        if estimator != 'xgb':
            raise ValueError(
                f"unsupported estimator {estimator!r}; only 'xgb' is available")
        self.estimator = estimator
        self.model = XGBClassifier(random_state=self.random_state, n_jobs=-1)
        self.params = self._getParams(estimator=estimator)
        print('XGBoost loaded')

    def fit(self, X, y, k=3, optimize=False, verbose=0):
        '''
        X: pandas.core.frame.DataFrame
        y: pandas.core.series.Series, numpy.ndarray, iterable object
        k: number of cross-validation folds
        optimize: if True run the grid search, otherwise plain K-fold training
        verbose: verbosity forwarded to the grid search
        ---
        Return: the fitted estimator
        '''
        if optimize:
            return self._trainCV(X=X, y=y, k=k, verbose=verbose)
        return self._train(X=X, y=y, k=k)

    def _train(self, X, y, k):
        '''
        Score the model with shuffled K-fold AUROC, then refit it on all data.

        Stores the mean in ``self.score`` and the per-fold values in
        ``self.score_list``; returns the model fitted on the full (X, y).
        '''
        # The folds are sliced positionally with .iloc, so wrap targets that
        # are not pandas objects (ndarray, list, other iterables).
        if not hasattr(y, 'iloc'):
            y = pd.Series(y.ravel() if isinstance(y, np.ndarray) else list(y))

        cv = KFold(n_splits=k, random_state=self.random_state, shuffle=True)
        AUC_list = []
        for train_idx, valid_idx in cv.split(X):
            X_train, X_valid = X.iloc[train_idx, :], X.iloc[valid_idx, :]
            y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
            self.model.fit(X_train, y_train)
            # Column 1 of predict_proba is the score passed to roc_auc_score.
            AUC_list.append(
                roc_auc_score(
                    y_true=y_valid,
                    y_score=self.model.predict_proba(X_valid)[:, 1]))

        self.score = np.mean(AUC_list)
        self.score_list = AUC_list
        print('mean of AUROC:', self.score)
        # Final fit on every row so the returned model uses all the data.
        self.model.fit(X, y)
        return self.model

    def _trainCV(self, X, y, k, verbose=0):
        '''
        Grid-search ``self.params`` with shuffled K-fold AUROC.

        Stores the best mean AUROC in ``self.score`` and the winning
        parameters in ``self.best_params``; returns the best estimator found.
        '''
        cv = KFold(n_splits=k, random_state=self.random_state, shuffle=True)
        gridCV = GridSearchCV(self.model, param_grid=self.params,
                              cv=cv, scoring='roc_auc', verbose=verbose)
        gridCV.fit(X, y)

        # Keep the outcome on the instance, mirroring what _train records.
        self.score = gridCV.best_score_
        self.best_params = gridCV.best_params_
        print(f'best params of {self.estimator}:', gridCV.best_params_)
        print(f'best AUROC of {self.estimator}:', gridCV.best_score_)
        return gridCV.best_estimator_

    def _getParams(self, estimator='xgb'):
        '''
        Return the grid-search parameter grid for ``estimator``.

        Raises KeyError when no grid is defined for the given name.
        '''
        # Values are rounded plain floats so the grid (and the printed best
        # parameters) do not carry floating-point noise such as
        # 0.9000000000000001.
        param_tank = {
            'xgb': {
                'booster': ['gbtree'],
                # 0.025, 0.05, ..., 0.2 (the leading 0 is dropped)
                'learning_rate': np.round(np.linspace(0, 0.2, 9)[1:], 3).tolist(),
                'gamma': [3, 4, 5],
                'colsample_bytree': [round(0.5 + 0.1 * i, 1) for i in range(4)],
                'max_depth': [6, 7, 8],
                'subsample': [round(0.6 + 0.1 * i, 1) for i in range(4)],
            },
        }
        return param_tank[estimator]