# 앙상블들의 앙상블 (모델 Stacking : Stacked generalization)
 - 서로 다른 분류기 형태 간의 앙상블들의 앙상블
 - 동일 형식 분류기를 사용한 부트스트랩 표본을 통한 앙상블들의 앙상블

In [1]:
import pandas as pd
pd.options.display.max_columns=None

---

### 데이터 로딩

IBM에서 제공했던 HR 데이터를 활용하겠습니다.

IBM kaggle 데이터 : https://www.kaggle.com/pavansubhasht/ibm-hr-analytics-attrition-dataset

In [2]:
datasets = pd.read_csv('./inputs/HR-Employee-Attrition.csv')
datasets.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


In [3]:
datasets.shape

(1470, 35)

데이터를 살펴보면 categorycal features, numerical features가 함께 있습니다.

In [4]:
datasets.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears   

Taget variable : *Attrition*

Yes / No -> 1 / 0으로 변경합니다
- 1 : 퇴직 Yes
- 0 : 퇴직 No

In [5]:
datasets['Attrition_idx'] = datasets['Attrition']\
    .apply(lambda x: 1 if x == 'Yes' else 0)
datasets.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition_idx
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5,1
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7,0
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0,1
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2,0


### Column 전처리

In [6]:
col_names = datasets.columns
col_names

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'Attrition_idx'],
      dtype='object')

필요없는 변수들이 있다 : *EmployeeCount*, *EmployeeNumber*, *Over18*, *StandardHours*

In [7]:
print(datasets.Over18.value_counts())
print(datasets.EmployeeCount.value_counts())
print(datasets.StandardHours.value_counts())

Y    1470
Name: Over18, dtype: int64
1    1470
Name: EmployeeCount, dtype: int64
80    1470
Name: StandardHours, dtype: int64


In [8]:
# Target은 feature에서 제외한다.
col_names = col_names\
    .drop(['Attrition_idx', 'Attrition', 'Over18', 
           'EmployeeCount', 'EmployeeNumber', 'StandardHours'])

Categorical column을 다루어보자.

Catagorical column을 numerical column을 나누어보자.

In [9]:
categorical_features = []
numerical_features = []
target = 'Attrition_idx'

# feature를 2가지 형태로 구분한다.
for col in col_names:
    if datasets[col].dtype == 'O':
        categorical_features.append(col)
    else:
        numerical_features.append(col)

In [10]:
print('Categorical feature의 수 :', len(categorical_features))
print('Numerical feature의 수 :', len(numerical_features))

Categorical feature의 수 : 7
Numerical feature의 수 : 23


In [11]:
categorical_features

['BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'OverTime']

In [12]:
numerical_features

['Age',
 'DailyRate',
 'DistanceFromHome',
 'Education',
 'EnvironmentSatisfaction',
 'HourlyRate',
 'JobInvolvement',
 'JobLevel',
 'JobSatisfaction',
 'MonthlyIncome',
 'MonthlyRate',
 'NumCompaniesWorked',
 'PercentSalaryHike',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'WorkLifeBalance',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager']

<font color=red>Categorical 데이터를 one-hot vector로 변경</font>하자. Pandas에서 `get_dummies`를 이용하자.
- Train, test set을 구분하지 않고 원핫벡터를 만드는 경우 : 해당 feature의 모든 원소들을 아는 경우, 예를 들어 회사 부서, 국가 코드의 경우에 해당한다.
- Train, test set을 구분하고 train set으로만 원핫벡터를 만드는 경우 : 해당 feature의 원소가 test set에 없는 경우가 존재할 수 있다.

In [13]:
categorical_datasets = pd.get_dummies(datasets[categorical_features])
categorical_datasets.head()

Unnamed: 0,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,Gender_Female,Gender_Male,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_No,OverTime_Yes
0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1
1,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0
2,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1
3,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1
4,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0


In [14]:
numerical_datasets = datasets[numerical_features]
numerical_datasets.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1102,1,2,2,94,3,2,4,5993,19479,8,11,3,1,0,8,0,1,6,4,0,5
1,49,279,8,1,3,61,2,2,2,5130,24907,1,23,4,4,1,10,3,3,10,7,1,7
2,37,1373,2,2,4,92,2,1,3,2090,2396,6,15,3,2,0,7,3,3,0,0,0,0
3,33,1392,3,4,4,56,3,1,3,2909,23159,1,11,3,3,0,8,3,3,8,7,3,0
4,27,591,2,1,1,40,3,1,2,3468,16632,9,12,3,4,1,6,3,3,2,2,2,2


Categorical dataset과 numerical dataset을 합친다. 모델의 input으로 사용할 feature이다.

In [15]:
X = pd.concat([categorical_datasets, numerical_datasets], axis=1)
X.head()

Unnamed: 0,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,Gender_Female,Gender_Male,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_No,OverTime_Yes,Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,41,1102,1,2,2,94,3,2,4,5993,19479,8,11,3,1,0,8,0,1,6,4,0,5
1,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,49,279,8,1,3,61,2,2,2,5130,24907,1,23,4,4,1,10,3,3,10,7,1,7
2,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,37,1373,2,2,4,92,2,1,3,2090,2396,6,15,3,2,0,7,3,3,0,0,0,0
3,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,33,1392,3,4,4,56,3,1,3,2909,23159,1,11,3,3,0,8,3,3,8,7,3,0
4,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,27,591,2,1,1,40,3,1,2,3468,16632,9,12,3,4,1,6,3,3,2,2,2,2


In [16]:
y = datasets[target]
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Attrition_idx, dtype: int64

### Train set과 test set을 구분

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
x_train, x_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.3, random_state=42)

### 모델의 성능을 평가하기 위한 metric들을 정의한 helper 함수 정의

In [19]:
from sklearn.metrics import accuracy_score, classification_report

In [20]:
# Help 함수
def get_metric(y_label, pred, set_type):
    # 1. Confusion Matrix
    print('\n {} Confusion Matrix :'.format(set_type))
    display(pd.crosstab(y_label, pred, rownames=['Actual'], colnames=['Predict']))

    # 2. Accuracy
    print('\n {} accuracy :'.format(set_type), accuracy_score(y_label, pred))

    # 3. Classification Report
    print('\n {} Classification Report : \n'.format(set_type), classification_report(y_label, pred))

### <font color=red>Out-of-fold를 사용한 Stacking을 위한 helper 함수 정의</font>

In [21]:
from sklearn.model_selection import StratifiedKFold

splitter = StratifiedKFold(n_splits=5, shuffle=False, random_state=42)

In [22]:
print('cross validation의 길이', len(list(splitter.split(x_train, y_train))))

cross validation의 길이 5


In [23]:
# cv에서 순서가 보장되는지 확인!
list(splitter.split(x_train, y_train))[0]

(array([ 199,  200,  201,  202,  203,  205,  206,  207,  208,  209,  210,
         211,  212,  213,  214,  215,  216,  217,  219,  220,  221,  222,
         223,  225,  226,  227,  229,  230,  231,  232,  233,  234,  236,
         237,  238,  239,  240,  241,  242,  244,  245,  247,  249,  250,
         251,  252,  253,  254,  255,  256,  257,  258,  259,  260,  261,
         262,  263,  264,  265,  266,  267,  268,  269,  270,  271,  272,
         273,  274,  275,  276,  277,  278,  279,  280,  281,  282,  283,
         284,  285,  286,  287,  288,  289,  290,  291,  292,  293,  294,
         295,  296,  297,  298,  299,  300,  301,  302,  303,  304,  305,
         306,  307,  308,  309,  310,  311,  312,  313,  314,  315,  316,
         317,  318,  319,  320,  321,  322,  323,  324,  325,  326,  327,
         328,  329,  330,  331,  332,  333,  334,  335,  336,  337,  338,
         339,  340,  341,  342,  343,  344,  345,  346,  347,  348,  349,
         350,  351,  352,  353,  354, 

In [24]:
# Out-of-fold
def get_oof(classifier, x_train, y_train, x_test):
    oof_val = []
    oof_test = pd.DataFrame()
    clf_name = classifier.__class__.__name__
    
    for i, (train_indice, test_indice) in enumerate(splitter.split(x_train, y_train)):
        x_cv_train = x_train.iloc[train_indice]
        y_cv_train = y_train.iloc[train_indice]
        x_cv_val = x_train.iloc[test_indice]
        
        # 4개의 fold로 학습 (첫 번째 layer 학습)
        classifier.fit(x_cv_train, y_cv_train)
        # 1개의 fold로 예측 (두 번째 layer의 input)
        oof_val.extend(classifier.predict_proba(x_cv_val)[:,1])
        # 최종 성능 측정을 위해서 각 cross validation classifier 별로 예측값을 모은다.
        oof_test[i] = classifier.predict_proba(x_test)[:,1]
        
    oof_train = pd.DataFrame({clf_name: oof_val})
    oof_test[clf_name + '_mean'] = oof_test.mean(axis=1)
    
    return oof_train, oof_test

---

## 서로 다른 분류기 형태 간의 앙상블들의 앙상블

- 훈련 데이터에 각 네 가지 분류기를 별도로 적용(로지스틱 회귀, 의사결정 트리, 램덤 포레스트, 에이다 부스트)
- 네 가지 분류기에 관한 확률을 계산한다. 그리고 부류 1에 관한 확률만 메타 분류기에 이용한다. 부류 0의 확률 + 부류 1의 확률 = 1이므로 하나의 확률만 계산해도 충분하다. 그렇지 않으면 다중공선성 문제가 발생한다.
- 최종 0/1 출력에 관한 4개의 확률(각 분류기에서 얻은 값) 간의 관계를 모델링하기 위해 로지스틱 회귀를 메타 분류기로 사용했다.
- 메타 분류기에 사용된 4개 변수 모두에 관한 계수를 계산하고 새로운 데이터에 적용해 관측값을 최종 부류로 분류하기 위한 계산을 한다.

### 1. 분류기에 대하여 학습을 진행 (Out_of_fold 기법을 활용하여)

In [25]:
from sklearn.model_selection import cross_val_predict

In [26]:
# Decision tree 강의에서 클래스 가중치를 구했습니다.
c_weight = {0: 0.3, 1: 0.7}

> 로지스틱 회귀 분류기 out-of-fold

In [27]:
from sklearn.linear_model import LogisticRegression
lr_classifier = LogisticRegression(fit_intercept=True, class_weight=c_weight, random_state=42)

In [28]:
lr_train_oof_pred, lr_test_oof_pred = get_oof(lr_classifier, x_train, y_train, x_test)

In [29]:
print(len(lr_train_oof_pred))
lr_train_oof_pred.head()

1029


Unnamed: 0,LogisticRegression
0,0.002691
1,0.173167
2,0.737308
3,0.43694
4,0.138797


In [30]:
print(len(lr_test_oof_pred))
lr_test_oof_pred.head()

441


Unnamed: 0,0,1,2,3,4,LogisticRegression_mean
0,0.162808,0.183174,0.116924,0.176768,0.172023,0.162339
1,0.027554,0.028848,0.023524,0.043728,0.037672,0.032265
2,0.648121,0.343309,0.472608,0.373633,0.451662,0.457867
3,0.00833,0.009205,0.010312,0.027672,0.014404,0.013984
4,0.12168,0.141694,0.142263,0.083988,0.139296,0.125784


In [31]:
# 로지스틱 회귀 분류기 모델만 이용하여 예측값을 구한다.
lr_classifier.fit(x_train, y_train)
train_pred = lr_classifier.predict(x_train)
test_pred = lr_classifier.predict(x_test)

# 로지스틱 회귀 분류기 모델의 성능을 구한다.
get_metric(y_train, train_pred, 'Train')
print('=' * 60)
get_metric(y_test, test_pred, 'Test')


 Train Confusion Matrix :


Predict,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,779,74
1,62,114



 Train accuracy : 0.8678328474246841

 Train Classification Report : 
              precision    recall  f1-score   support

          0       0.93      0.91      0.92       853
          1       0.61      0.65      0.63       176

avg / total       0.87      0.87      0.87      1029


 Test Confusion Matrix :


Predict,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,339,41
1,31,30



 Test accuracy : 0.8367346938775511

 Test Classification Report : 
              precision    recall  f1-score   support

          0       0.92      0.89      0.90       380
          1       0.42      0.49      0.45        61

avg / total       0.85      0.84      0.84       441



> 결정 트리 분류기 out-of-fold

In [32]:
from sklearn.tree import DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight=c_weight,
    random_state=42
)

In [33]:
dt_train_oof_pred, dt_test_oof_pred = get_oof(dt_classifier, x_train, y_train, x_test)

In [34]:
print(len(dt_train_oof_pred))
dt_train_oof_pred.head()

1029


Unnamed: 0,DecisionTreeClassifier
0,0.049296
1,0.252351
2,0.903226
3,0.049296
4,0.475728


In [35]:
print(len(dt_test_oof_pred))
dt_test_oof_pred.head()

441


Unnamed: 0,0,1,2,3,4,DecisionTreeClassifier_mean
0,0.252351,0.291667,0.088129,0.0,0.0,0.126429
1,0.252351,0.089921,0.088129,0.072165,0.06271,0.113055
2,0.823529,1.0,0.35,0.84,0.795455,0.761797
3,0.0,0.089921,0.088129,0.04749,0.06271,0.05765
4,0.014463,0.091205,0.237288,0.5,0.0,0.168591


In [36]:
# 결정 트리 분류기 모델만을 이용하여 예측값을 생성한다.
dt_classifier.fit(x_train, y_train)
train_pred = dt_classifier.predict(x_train)
test_pred = dt_classifier.predict(x_test)

# 결정 트리 분류기 모델의 성능을 평가해보자.
get_metric(y_train, train_pred, 'Train')
print('=' * 60)
get_metric(y_test, test_pred, 'Test')


 Train Confusion Matrix :


Predict,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,787,66
1,56,120



 Train accuracy : 0.8814382896015549

 Train Classification Report : 
              precision    recall  f1-score   support

          0       0.93      0.92      0.93       853
          1       0.65      0.68      0.66       176

avg / total       0.88      0.88      0.88      1029


 Test Confusion Matrix :


Predict,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,345,35
1,36,25



 Test accuracy : 0.8390022675736961

 Test Classification Report : 
              precision    recall  f1-score   support

          0       0.91      0.91      0.91       380
          1       0.42      0.41      0.41        61

avg / total       0.84      0.84      0.84       441



> 랜덤 포레스트 분류기 out-of-fold

In [37]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(
    n_estimators=10000,
    max_depth=6,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight=c_weight,
    random_state=42)

In [38]:
rf_train_oof_pred, rf_test_oof_pred = get_oof(rf_classifier, x_train, y_train, x_test)

In [39]:
# 랜덤 포레스트 분류기만을 이용하여 예측값을 구합니다.
rf_classifier.fit(x_train, y_train)
train_pred = rf_classifier.predict(x_train)
test_pred = rf_classifier.predict(x_test)

# 랜덤 포레스트 분류기의 성능을 측정합니다.
get_metric(y_train, train_pred, 'Train')
print('=' * 60)
get_metric(y_test, test_pred, 'Test')


 Train Confusion Matrix :


Predict,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,846,7
1,61,115



 Train accuracy : 0.9339164237123421

 Train Classification Report : 
              precision    recall  f1-score   support

          0       0.93      0.99      0.96       853
          1       0.94      0.65      0.77       176

avg / total       0.93      0.93      0.93      1029


 Test Confusion Matrix :


Predict,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,373,7
1,53,8



 Test accuracy : 0.8639455782312925

 Test Classification Report : 
              precision    recall  f1-score   support

          0       0.88      0.98      0.93       380
          1       0.53      0.13      0.21        61

avg / total       0.83      0.86      0.83       441



> 에이다 부스트 분류기

In [40]:
from sklearn.ensemble import AdaBoostClassifier
ab_classifier = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1, class_weight=c_weight),
    n_estimators=5000,
    learning_rate=0.05,
    random_state=42
)

In [41]:
ab_train_oof_pred, ab_test_oof_pred = get_oof(ab_classifier, x_train, y_train, x_test)

In [42]:
# 에이다 부스트 분류기만을 이용하여 예측값을 생성합니다.
ab_classifier.fit(x_train, y_train)
train_pred = ab_classifier.predict(x_train)
test_pred = ab_classifier.predict(x_test)

# 에이다 부스트 분류기의 성능을 측정합니다.
get_metric(y_train, train_pred, 'Train')
print('=' * 60)
get_metric(y_test, test_pred, 'Test')


 Train Confusion Matrix :


Predict,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,809,44
1,25,151



 Train accuracy : 0.9329446064139941

 Train Classification Report : 
              precision    recall  f1-score   support

          0       0.97      0.95      0.96       853
          1       0.77      0.86      0.81       176

avg / total       0.94      0.93      0.93      1029


 Test Confusion Matrix :


Predict,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,342,38
1,33,28



 Test accuracy : 0.8390022675736961

 Test Classification Report : 
              precision    recall  f1-score   support

          0       0.91      0.90      0.91       380
          1       0.42      0.46      0.44        61

avg / total       0.84      0.84      0.84       441



### 2. 분류기의 결과들을 결합

In [43]:
ensemble = pd.DataFrame()

부류 1(퇴직자)에 관한 확률을 이용하여 앙상블을 수행합니다.

> 로지스틱 회귀 분류기

In [44]:
# 로지스틱 회귀 분류기에 대한 결과값을, 부류 1이 될 확률만 리턴받습니다.
ensemble['lr_output_one'] = lr_train_oof_pred
ensemble.head()

Unnamed: 0,lr_output_one
0,0.002691
1,0.173167
2,0.737308
3,0.43694
4,0.138797


> 결정 트리 분류기 

In [45]:
# 결정 트리 분류기에 대한 결과값을, 부류 1이 될 확률만 리턴받습니다.
ensemble['dt_output_one'] = dt_train_oof_pred
ensemble.head()

Unnamed: 0,lr_output_one,dt_output_one
0,0.002691,0.049296
1,0.173167,0.252351
2,0.737308,0.903226
3,0.43694,0.049296
4,0.138797,0.475728


> 랜덤 포레스트 분류기

In [46]:
# 랜덤 포레스트 분류기에 대한 결과값을, 부류 1이 될 확률만 리턴받습니다.
ensemble['rf_output_one'] = rf_train_oof_pred
ensemble.head()

Unnamed: 0,lr_output_one,dt_output_one,rf_output_one
0,0.002691,0.049296,0.079142
1,0.173167,0.252351,0.128049
2,0.737308,0.903226,0.529802
3,0.43694,0.049296,0.125879
4,0.138797,0.475728,0.119156


> 에이다 부스트 분류기

In [47]:
# 에이다 부스트 분류기에 대한 결과값을, 부류 1이 될 확률만 리턴받습니다.
ensemble['ab_output_one'] = ab_train_oof_pred
ensemble.head()

Unnamed: 0,lr_output_one,dt_output_one,rf_output_one,ab_output_one
0,0.002691,0.049296,0.079142,0.487058
1,0.173167,0.252351,0.128049,0.49091
2,0.737308,0.903226,0.529802,0.500735
3,0.43694,0.049296,0.125879,0.494841
4,0.138797,0.475728,0.119156,0.496151


In [48]:
ensemble = pd.concat([ensemble, pd.DataFrame(y_train).reset_index(drop=True)], axis=1)

In [49]:
ensemble.head(5)

Unnamed: 0,lr_output_one,dt_output_one,rf_output_one,ab_output_one,Attrition_idx
0,0.002691,0.049296,0.079142,0.487058,0
1,0.173167,0.252351,0.128049,0.49091,0
2,0.737308,0.903226,0.529802,0.500735,1
3,0.43694,0.049296,0.125879,0.494841,0
4,0.138797,0.475728,0.119156,0.496151,0


### 3. 메타 분류기 학습

In [50]:
meta_classifier = LogisticRegression(fit_intercept=False)

In [51]:
meta_classifier.fit(
    ensemble[['lr_output_one', 'dt_output_one', 'rf_output_one', 'ab_output_one']],
    ensemble['Attrition_idx'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### 4. 테스트 셋에 적용 가능하도록 테스트 셋에 대한 예측값 구하기

In [52]:
ensemble_test = pd.DataFrame()

In [53]:
# 로지스틱회귀 분류기에 대한 결과값을, 부류 1이 될 확률만 리턴받습니다.
ensemble_test['lr_output_one'] = lr_test_oof_pred['LogisticRegression_mean']
ensemble_test['dt_output_one'] = dt_test_oof_pred['DecisionTreeClassifier_mean']
ensemble_test['rf_output_one'] = rf_test_oof_pred['RandomForestClassifier_mean']
ensemble_test['ab_output_one'] = ab_test_oof_pred['AdaBoostClassifier_mean']
ensemble_test.head()

Unnamed: 0,lr_output_one,dt_output_one,rf_output_one,ab_output_one
0,0.162339,0.126429,0.226182,0.49806
1,0.032265,0.113055,0.110086,0.493439
2,0.457867,0.761797,0.484164,0.502463
3,0.013984,0.05765,0.109196,0.495544
4,0.125784,0.168591,0.132689,0.495352


### 5. 메타분류기 성능 측정

In [54]:
# 학습 셋에 대한 예측값
train_pred = meta_classifier.predict(
    ensemble[['lr_output_one', 'dt_output_one', 'rf_output_one', 'ab_output_one']],)

In [55]:
# 테스트 셋에 대한 예측값
test_pred = meta_classifier.predict(
    ensemble_test[['lr_output_one', 'dt_output_one', 'rf_output_one', 'ab_output_one']],)

In [56]:
get_metric(y_train, train_pred, 'Train')
print('=' * 60)
get_metric(y_test, test_pred, 'Test')


 Train Confusion Matrix :


Predict,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,839,14
1,130,46



 Train accuracy : 0.8600583090379009

 Train Classification Report : 
              precision    recall  f1-score   support

          0       0.87      0.98      0.92       853
          1       0.77      0.26      0.39       176

avg / total       0.85      0.86      0.83      1029


 Test Confusion Matrix :


Predict,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,376,4
1,48,13



 Test accuracy : 0.8820861678004536

 Test Classification Report : 
              precision    recall  f1-score   support

          0       0.89      0.99      0.94       380
          1       0.76      0.21      0.33        61

avg / total       0.87      0.88      0.85       441



### 6. 개별 분류기에 대한 평가

각각의 분류기에 대해서 평가 : 로지스틱 회귀 분류기를 메타 분류기로 사용했으면 각 계수(coefficients)를 살펴본다.

In [57]:
meta_classifier.coef_

array([[ 2.72324283,  0.07557159,  1.90903959, -5.85258971]])

에이다부스트의 경우 성능을 끌어내리고 있는것처럼 보인다.

따라서, 에이다 부스트의 파라미터를 조절해보고 성능을 살펴보거나, 에이다 부스트를 제거하고 성능을 살펴본다.

## 동일 형식 분류기를 사용한 부트스트랩 표본을 통한 앙상블들의 앙상블

- 훈련 데이터에서 부트스트랩 표본을 추출한다.
- 각 표본에 관해 매번 에이다 부스트 모델을 학습한다.
- 개별 에이다 부스트 모델의 결과는 bagging 분류기를 통해 합쳐진다.

분산의 축소가 여전히 성능을 향상시키는 매우 유연한 모델에 관해 적절하다(의사 결정 트리, 랜덤 포레스트 등).

### 1. 기본 분류기 및 부트스트랩 샘플을 학습시킬 모델 생성

In [58]:
# 에이다 부스트 모델에 사용할 기본 분류기(의사결정 그루터기)
base_learner = DecisionTreeClassifier(max_depth=1, class_weight=c_weight)

In [59]:
# 부트스트랩 샘플을 학습시킬 에이다 부스트 모델
ab_classifier = AdaBoostClassifier(base_estimator=base_learner,
                                   n_estimators=500,
                                   learning_rate=0.05,
                                   random_state=42)

In [60]:
ab_classifier.fit(x_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight={0: 0.3, 1: 0.7}, criterion='gini',
            max_depth=1, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=0.05, n_estimators=500, random_state=42)

에이다 부스트의 기본성능을 살펴보자.

In [61]:
train_pred = ab_classifier.predict(x_train)
test_pred = ab_classifier.predict(x_test)

In [62]:
get_metric(y_train, train_pred, 'Train')
print('=' * 60)
get_metric(y_test, test_pred, 'Test')


 Train Confusion Matrix :


Predict,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,806,47
1,63,113



 Train accuracy : 0.8931000971817298

 Train Classification Report : 
              precision    recall  f1-score   support

          0       0.93      0.94      0.94       853
          1       0.71      0.64      0.67       176

avg / total       0.89      0.89      0.89      1029


 Test Confusion Matrix :


Predict,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,349,31
1,34,27



 Test accuracy : 0.8526077097505669

 Test Classification Report : 
              precision    recall  f1-score   support

          0       0.91      0.92      0.91       380
          1       0.47      0.44      0.45        61

avg / total       0.85      0.85      0.85       441



### 2. Bootstrapping samples를 생성하고 각 샘플에 분류기를 학습

`BaggingClassifier`는 내부에 bootstrap 기능이 있고, 학습시킬 분류기를 1개 지정할 수 있다. 따라서 이것을 사용하면 된다.

In [63]:
from sklearn.ensemble import BaggingClassifier

bag_classifier = BaggingClassifier(
    base_estimator=ab_classifier,
    n_estimators = 50,
    max_samples=1.0,
    max_features=1.0,
    bootstrap=True,
    bootstrap_features=False,
    n_jobs=-1,
    random_state=42)

In [64]:
bag_classifier.fit(x_train, y_train)

BaggingClassifier(base_estimator=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight={0: 0.3, 1: 0.7}, criterion='gini',
            max_depth=1, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_sam...None,
            splitter='best'),
          learning_rate=0.05, n_estimators=500, random_state=42),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=50, n_jobs=-1, oob_score=False,
         random_state=42, verbose=0, warm_start=False)

### 3. 성능 측정

In [65]:
train_pred = bag_classifier.predict(x_train)
test_pred = bag_classifier.predict(x_test)

In [66]:
get_metric(y_train, train_pred, 'Train')
print('=' * 60)
get_metric(y_test, test_pred, 'Test')


 Train Confusion Matrix :


Predict,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,824,29
1,69,107



 Train accuracy : 0.9047619047619048

 Train Classification Report : 
              precision    recall  f1-score   support

          0       0.92      0.97      0.94       853
          1       0.79      0.61      0.69       176

avg / total       0.90      0.90      0.90      1029


 Test Confusion Matrix :


Predict,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,359,21
1,36,25



 Test accuracy : 0.8707482993197279

 Test Classification Report : 
              precision    recall  f1-score   support

          0       0.91      0.94      0.93       380
          1       0.54      0.41      0.47        61

avg / total       0.86      0.87      0.86       441

