# First Look
## Naive Train: 가맹점 미이용 고객 vs 가맹점 이용 고객

In [17]:
import os, glob

import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib as mpl

plt.style.use('seaborn')
plt.rcParams["font.family"] = 'AppleSDGothicNeoSB00'
mpl.rcParams['axes.unicode_minus'] = False

In [31]:
numVAR = pd.read_csv('../data/processed/가맹점이용여부_유의미한VAR.csv')
catVAR = ['VAR165', 'VAR066', 'VAR124', 'VAR107', 'VAR111']

In [23]:
pathList = glob.glob('../data/provided/*')
for idx, p in enumerate(pathList[:4]):
    name = p.split(' ')[-1].split('.')[0]
    print(f'Load {name}')
    if idx < 3:
        globals()[name] = pd.read_csv(p, engine='python')
    else:
        globals()[name] = pd.read_excel(p)

Load mrc_info
Load samp_train
Load samp_cst_feat
Load variable_dtype


### 학습 데이터 구축
Problem Type: Binary(가맹점을 이용했는가(1), 이용하지 않았는가(0))

In [40]:
train_raw = samp_cst_feat.merge(samp_train, how='left', on='cst_id_di').drop('cst_id_di', axis=1)
train_raw.loc[train_raw['MRC_ID_DI']!=0, :] = 1

# train_raw.to_csv('../data/processed/train_FranchiseOrNot.csv', index=False)

In [143]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import roc_auc_score


class model:
    def __init__(self, random_state=None):
        if random_state is None:
            self.random_state=42
        return
    
    def getModel(self, estimator='xgb'):
        if estimator == 'xgb':
            self.estimator = estimator
            self.model = XGBClassifier(random_state=self.random_state, n_jobs=-1)
            self.params = self._getParams(estimator=estimator)
            print('XGBoost loaded')
    
    def fit(self, X, y, k=3, optimize=False, verbose=0):
        '''
        X: pandas.core.frame.DataFrame
        y: pandas.core.series.Series, numpy.ndarray, iterable object
        '''
        if optimize:
            return self._trainCV(X=X, y=y, k=k, verbose=verbose)
        else:
            return self._train(X=X, y=y, k=k)
    
    def _train(self, X, y, k):
        cv = KFold(n_splits=k, random_state=self.random_state, shuffle=True)
        AUC_list = []
        for train_idx, valid_idx in cv.split(X):
            X_train, X_valid = X.iloc[train_idx, :], X.iloc[valid_idx, :]
            y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
            self.model.fit(X_train, y_train)
            AUC_list.append(
                roc_auc_score(
                    y_true=y_valid, 
                    y_score=self.model.predict_proba(X_valid)[:, 1]))
            
        print('mean of AUROC:', np.mean(AUC_list))
        self.score = np.mean(AUC_list)
        self.score_list = AUC_list
        self.model.fit(X, y)
        return self.model
    
    def _trainCV(self, X, y, k, verbose=0):
        cv = KFold(n_splits=k, random_state=self.random_state, shuffle=True)
        gridCV = GridSearchCV(self.model, param_grid=self.params, 
                              cv=cv, scoring='roc_auc', verbose=verbose)
        gridCV.fit(X, y)
        
        print(f'best params of {self.estimator}:', gridCV.best_params_)
        print(f'best AUROC of {self.estimator}:', gridCV.best_score_)
        return gridCV.best_estimator_
        
    def _getParams(self, estimator='xgb'):
        param_tank = {'xgb': {'booster': ['gbtree'], 
                              'learning_rate': list(np.round(np.linspace(0, 0.2, 9)[1:], 3)), 
                              'gamma': [3, 4, 5], 
                              'colsample_bytree': [0.5+0.1*i for i in range(4)],
                              'max_depth':[6,7,8], 
                              'subsample': [0.6+0.1*i for i in range(4)]}}
        return param_tank[estimator]

In [None]:
class DataGenerator:
    def __init__(self):
        pass
    def drop_features(self, drops=None): # mutable 변수를 초기값으로 넣는 것은 일반적으로 좋지 않음
        if drops == None:
            self.data.drop(["pr7", "pr10", "pr11", "pr12", "pr15", "pr16"], axis=1, inplace=True)
        elif len(drops) == 0:
            pass
        else:
            self.data.drop(drops, axis=1, inplace=True)
        return self.data

    def mean_encoding(self, option=None, drop=False): 
        if option is not None:
            feature_list = self.fts_dic[option]
            if option == "g" or option == "s":
                name_list = []
                for fts in feature_list:
                    name = f"mean_encoding_{fts}"
                    name_list.append(name)
                    temp = (
                        self.data.groupby([fts])
                        .mean()["payed"]
                        .to_frame(name)
                        .reset_index()
                    )
                    self.data = pd.merge(self.data, temp, on=fts, how="left")
                self.data[f"mean_encoding_{option}"] = self.data[
                    self.data.columns[
                        self.data.columns.str.startswith(f"mean_encoding_{option}")
                    ].tolist()
                ].mean(axis=1)
                if drop:
                    self.data.drop(name_list, axis=1, inplace=True)
            else:
                name_list = []
                for fts in feature_list:
                    name_list.append(fts)
                    temp = (
                        self.data.groupby([fts])
                        .mean()["payed"]
                        .to_frame(f"mean_encoding_{fts}")
                        .reset_index()
                    )
                    self.data = pd.merge(self.data, temp, on=fts, how="left")
                if drop:
                    self.data.drop(name_list, axis=1, inplace=True)
        return self.data

In [145]:
MOD = model()
MOD.getModel(estimator='xgb')

X = train_raw.drop('MRC_ID_DI', axis=1)
y = train_raw['MRC_ID_DI']
MOD.fit(X=X, y=y, optimize=False, k=10)

XGBoost loaded
mean of AUROC: 1.0


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=-1, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)