In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import inspect

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import warnings

warnings.filterwarnings('ignore')

## **1. Data Preparation**

In [2]:
# data read
train = pd.read_csv('train.csv')

In [3]:
train_1 = train.astype('category').copy()
randomstate = 123

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32769 entries, 0 to 32768
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   ACTION            32769 non-null  int64
 1   RESOURCE          32769 non-null  int64
 2   MGR_ID            32769 non-null  int64
 3   ROLE_ROLLUP_1     32769 non-null  int64
 4   ROLE_ROLLUP_2     32769 non-null  int64
 5   ROLE_DEPTNAME     32769 non-null  int64
 6   ROLE_TITLE        32769 non-null  int64
 7   ROLE_FAMILY_DESC  32769 non-null  int64
 8   ROLE_FAMILY       32769 non-null  int64
 9   ROLE_CODE         32769 non-null  int64
dtypes: int64(10)
memory usage: 2.5 MB


In [5]:
train_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32769 entries, 0 to 32768
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   ACTION            32769 non-null  category
 1   RESOURCE          32769 non-null  category
 2   MGR_ID            32769 non-null  category
 3   ROLE_ROLLUP_1     32769 non-null  category
 4   ROLE_ROLLUP_2     32769 non-null  category
 5   ROLE_DEPTNAME     32769 non-null  category
 6   ROLE_TITLE        32769 non-null  category
 7   ROLE_FAMILY_DESC  32769 non-null  category
 8   ROLE_FAMILY       32769 non-null  category
 9   ROLE_CODE         32769 non-null  category
dtypes: category(10)
memory usage: 1.2 MB


In [6]:
# ready for encoding
train_2 = train_1.iloc[:,1:].copy()
target_y = train_1.iloc[:,0].copy()

각 숫자에 의미는 연속형이 아닌 명목형이라 분류형으로 진행  
예상 - 라벨인코딩이 적합
> 이유 원핫인코딩은 너무 많은 피쳐수로 과적합을 유발할 것 같음

In [7]:
# LabelEncoding
from sklearn.preprocessing import LabelEncoder

def label_encoding(x, columns):
    """ 데이터프레임의 열별 라벨인코딩을 진행해주는 함수
    Args :
      x = 데이터프레임
      columns = 데이터프레임 내 라벨인코딩 수행할 열이름
      
    Return :
      {columns : 0,1,2,3,4,5...}
    """
    x_dict={}
    encoding_X = x[columns]
    encoder = LabelEncoder()
    
    encoder.fit(encoding_X)
    
    x_list = encoder.transform(encoding_X)
    x_dict[columns] = x_list.tolist()

    return x_dict    

def label_encoding_concat(main):
    """ 라벨인코딩 된 열을 데이터프레임으로 합치는 함수
    Args :
      main = 라벨인코딩 된 열
      
    Return :
      [{columns : 0,1,2,3,4,5...},{columns : 0,1,2,3,4,5...}...]
    """
    x_train_dict_1 = {}
    for i in main.columns:
        x_train_label = label_encoding(main, i)
        x_train_dict_1.update(x_train_label)
        
        x_train_df = pd.DataFrame(x_train_dict_1)
    
    return x_train_df

In [8]:
# OneHotEncoding
def make_dummy(dataframe_value):
    """ 원핫인코딩으로 만들어주는 함수
    Args :
      dataframe_value = 원핫인코딩으로 바꿀 데이터프레임 
      
    Return :
      [{columns : 1,0,0,0,0,0...},{columns : 0,1,0,0,0,0...}...]
    """
    dummy_X = pd.get_dummies(dataframe_value)
    return dummy_X

In [9]:
# 중복행 role_title 삭제
train_3 = train_2.drop('ROLE_TITLE', axis=1)
train_3

Unnamed: 0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,39353,85475,117961,118300,123472,117906,290919,117908
1,17183,1540,117961,118343,123125,118536,308574,118539
2,36724,14457,118219,118220,117884,267952,19721,117880
3,36135,5396,117961,118343,119993,240983,290919,118322
4,42680,5905,117929,117930,119569,123932,19793,119325
...,...,...,...,...,...,...,...,...
32764,23497,16971,117961,118300,119993,240983,290919,118322
32765,25139,311198,91261,118026,122392,173805,249618,121145
32766,34924,28805,117961,118327,120299,152038,118612,124924
32767,80574,55643,118256,118257,117945,280788,292795,119082


In [10]:
# RAW 데이터로 label & dummy encoding
X_label_encoding = label_encoding_concat(train_2)
X_train_dummies = make_dummy(train_2)

In [11]:
# 리스트로 모아주기
x_data_for_target_y = [X_label_encoding, X_train_dummies]

## **2. ready for modeling**

In [12]:
# 여러 모델들을 한번에 돌릴 수 있게 함수화
def modeling(X, y,  
             eval_metric  = 'auc', 
             randomstate = 123):
    
    # train & test 데이터 세팅
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=randomstate)
    X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=randomstate)
    evals = [(X_tr, y_tr),(X_val, y_val)]

    # LightGBM Classifier
    lgb = LGBMClassifier()
    lgb.fit(X_train, y_train,
            eval_metric = eval_metric,
            eval_set = evals,
            verbose = 0)
    lgb_pred_proba = lgb.predict_proba(X_test)[:,1]

    lgb_auc = roc_auc_score(y_test, lgb_pred_proba)

    # RandomForestClassifier
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    rf_pred_proba = rf.predict_proba(X_test)[:,1]

    rf_auc = roc_auc_score(y_test, rf_pred_proba)
    
    # XGBClassifier
    xgb = XGBClassifier()
    xgb.fit(X_train, y_train, eval_set = evals, verbose = 0)
    xgb_pred_proba = xgb.predict_proba(X_test)[:,1]

    xgb_auc = roc_auc_score(y_test, xgb_pred_proba)

    # CatBoostClassifier
    cbc = CatBoostClassifier()
    cbc.fit(X_train, y_train, eval_set = evals, verbose = 0)
    cbc_pred_proba = cbc.predict_proba(X_test)[:,1]

    cbc_auc = roc_auc_score(y_test, cbc_pred_proba)
    
    # LogisticRegression
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(X)
    
    # LR용 train & test 데이터 세팅
    X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(data_scaled, y, test_size=0.2, random_state=randomstate)
    
    lr = LogisticRegression()
    lr.fit(X_train_2, y_train_2)
    lr_pred_proba = lr.predict_proba(X_test_2)[:,1]

    lr_auc = roc_auc_score(y_test_2, lr_pred_proba)

    print(f'LightGBM의 AUC_Score : {lgb_auc:.4f}')
    print(f'XGBClassifier의 AUC_Score : {xgb_auc:.4f}')
    print(f'RandomForestClassifier의 AUC_Score : {rf_auc:.4f}')
    print(f'CatBoostClassifier의 AUC_Score : {cbc_auc:.4f}')
    print(f'LogisticRegression의 AUC_Score : {lr_auc:.4f}')

In [15]:
for function in range(len(x_data_for_target_y)):
    main_Title = x_data_for_target_y[function]
    
    title_name = [title for title, value in locals().items() if value is main_Title][0]
    target = [title for title, value in locals().items() if value is target_y][0]
    
    X = x_data_for_target_y[function]
    y = target_y
    
    print(f'\n {title_name} / {target}')
    modeling(X, y)


 X_label_encoding / target_y
LightGBM의 AUC_Score : 0.8395
XGBClassifier의 AUC_Score : 0.8494
RandomForestClassifier의 AUC_Score : 0.8669
CatBoostClassifier의 AUC_Score : 0.8513
LogisticRegression의 AUC_Score : 0.5582

 X_train_dummies / target_y
LightGBM의 AUC_Score : 0.8476
XGBClassifier의 AUC_Score : 0.8350
RandomForestClassifier의 AUC_Score : 0.8815
CatBoostClassifier의 AUC_Score : 0.8477
LogisticRegression의 AUC_Score : 0.7810


### 1차 모델링 결과
- 다 비슷비슷해서 뚜렷한 차이가 느껴지지 않는다.
- LogisticRegression은 버려도 될 것 같다.
- 그나마 RandomForest가 높게 나오는 것 같아 사용하기로 결정

### 적용할 특성 공학 : get_dummies
* 위 결과 외에도 다른 random값으로 여러번 돌려봤으나, get_dummies가 값이 제일 높았다.
  - True or False를 찾는 분류 문제이며, 구조가 단순해 get_dummies보다 label_encoding이 더 적합할 것이라 생각하고 돌려봤다.
  - 하지만 미묘한 차이로 get_dummies가 더 높은 점수가 나왔다. 왜인지는 좀 더 공부해봐야 될 것 같다.