In [33]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

import pandas as pd
import numpy as np
import typing

In [2]:
TARGET = 'Attrition'

In [7]:
data = pd.read_csv('../mldata/ibm_hr_data.csv')

In [8]:
target = data[TARGET]
data.drop(TARGET, axis=1, inplace=True)

In [9]:
CATE_COLS = []
NUM_COLS = []
for col in data.columns.tolist():
    if data[col].dtype == 'object':
        CATE_COLS.append(col)
    else:
        NUM_COLS.append(col)

In [10]:
CATE_COLS

['BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'Over18',
 'OverTime']

In [11]:
NUM_COLS

['Age',
 'DailyRate',
 'DistanceFromHome',
 'Education',
 'EmployeeCount',
 'EmployeeNumber',
 'EnvironmentSatisfaction',
 'HourlyRate',
 'JobInvolvement',
 'JobLevel',
 'JobSatisfaction',
 'MonthlyIncome',
 'MonthlyRate',
 'NumCompaniesWorked',
 'PercentSalaryHike',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StandardHours',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'WorkLifeBalance',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager']

In [12]:
class ReshapedLabelEncoder(LabelEncoder):
    def fit_transform(self, y, *args):
        return super().fit_transform(y).reshape(-1, 1)
    
reshape_label_encoder = ReshapedLabelEncoder()
one_hot_encoder = OneHotEncoder()
pipeline = Pipeline([
    ('label_encoder', reshape_label_encoder),
    ('one_hot_encoder', one_hot_encoder)
])

label_encoder = LabelEncoder()
target = label_encoder.fit_transform(target.values.reshape(-1, 1))

  y = column_or_1d(y, warn=True)


In [13]:
# Columns:
def feature_transform(df, cate_cols, numer_cols, scalers: typing.List, is_train_set=True):
    features = np.array([])
    
    # 1. Categorical column : 변수의 중요도를 보기 위해서는 get_dummy를 사용하는것이 좋다.
    reshape_label_encoder = ReshapedLabelEncoder()
    one_hot_encoder = OneHotEncoder()
    pipeline = Pipeline([
        ('label_encoder', reshape_label_encoder),
        ('one_hot_encoder', one_hot_encoder)
    ])
    for col in cate_cols:
        # Nan 값이 존재하는 경우, dummy string 사용
        df[col].fillna('DUMMY', inplace=True)
        oh = pipeline.fit_transform(df[col]).toarray()
        if len(features) == 0:
            features = oh
        else:
            features = np.hstack([features, oh])
    
    # 2. Numeric column
    for idx, col in enumerate(numer_cols):
        # Nan 값이 존재하는 경우, 0을 사용
        df[col].fillna(0, inplace=True)
        if is_train_set:
            scaler = StandardScaler()
            sf = scaler.fit_transform(df[col].astype('float').values.reshape(-1, 1))    
            scalers.append(scaler)
        else:
            scaler = scalers[idx]
            sf = scaler.transform(df[col].astype('float').values.reshape(-1, 1))
        features = np.hstack([features, sf])
    return features, scalers

In [14]:
train_data, test_data, train_labels, test_labels = train_test_split(data, target, test_size=0.2)

In [15]:
scalers = []
train_x, scalers = feature_transform(train_data, CATE_COLS, NUM_COLS, scalers, True)
print(train_x.shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://

(1176, 55)


In [16]:
test_x, _ = feature_transform(test_data, CATE_COLS, NUM_COLS, scalers, False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://

In [17]:
print(test_x.shape)

(294, 55)


### Decision tree

In [18]:
d_tree = DecisionTreeClassifier(criterion='gini', 
                                max_depth=5, 
                                min_samples_split=2, 
                                min_samples_leaf=1, 
                                random_state=42, class_weight={0:0.3, 1:0.7})

In [19]:
d_tree.fit(train_x, train_labels)

DecisionTreeClassifier(class_weight={0: 0.3, 1: 0.7}, criterion='gini',
            max_depth=5, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

In [20]:
train_pred = d_tree.predict(train_x)
test_pred = d_tree.predict(test_x)

In [21]:
print('====================')
print('\nTrain accuracy', accuracy_score(train_labels, train_pred))
print('\nTest accuracy', accuracy_score(test_labels, test_pred))
print('\nComplete report of training data\n', classification_report(train_labels, train_pred))
print('\nComplete report of testing data\n', classification_report(test_labels, test_pred))
print('\nTrain confusion matrix\n', pd.crosstab(train_pred, train_labels, rownames=['Prediction'], colnames=['Actual']))
print('\nTest confusion matrix\n', pd.crosstab(test_pred, test_labels, rownames=['Prediction'], colnames=['Actual']))


Train accuracy 0.8818027210884354

Test accuracy 0.8333333333333334

Complete report of training data
              precision    recall  f1-score   support

          0       0.93      0.93      0.93       991
          1       0.62      0.63      0.63       185

avg / total       0.88      0.88      0.88      1176


Complete report of testing data
              precision    recall  f1-score   support

          0       0.89      0.91      0.90       242
          1       0.53      0.48      0.51        52

avg / total       0.83      0.83      0.83       294


Train confusion matrix
 Actual        0    1
Prediction          
0           920   68
1            71  117

Test confusion matrix
 Actual        0   1
Prediction         
0           220  27
1            22  25


### Bagging

In [22]:
bg_tree = BaggingClassifier(base_estimator=d_tree, 
                            n_estimators=5000, 
                            max_samples=0.67,
                            max_features=1.0, 
                            bootstrap=True, 
                            bootstrap_features=False, 
                            n_jobs=-1, 
                            random_state=42)

In [23]:
bg_tree.fit(train_x, train_labels)
train_pred = bg_tree.predict(train_x)
test_pred = bg_tree.predict(test_x)

In [24]:
print('====================')
print('\nTrain accuracy', accuracy_score(train_labels, train_pred))
print('\nTest accuracy', accuracy_score(test_labels, test_pred))
print('\nComplete report of training data\n', classification_report(train_labels, train_pred))
print('\nComplete report of testing data\n', classification_report(test_labels, test_pred))
print('\nTrain confusion matrix\n', pd.crosstab(train_pred, train_labels, rownames=['Prediction'], colnames=['Actual']))
print('\nTest confusion matrix\n', pd.crosstab(test_pred, test_labels, rownames=['Prediction'], colnames=['Actual']))


Train accuracy 0.9226190476190477

Test accuracy 0.8571428571428571

Complete report of training data
              precision    recall  f1-score   support

          0       0.92      0.99      0.96       991
          1       0.95      0.54      0.69       185

avg / total       0.92      0.92      0.91      1176


Complete report of testing data
              precision    recall  f1-score   support

          0       0.86      0.98      0.92       242
          1       0.78      0.27      0.40        52

avg / total       0.85      0.86      0.83       294


Train confusion matrix
 Actual        0   1
Prediction         
0           986  86
1             5  99

Test confusion matrix
 Actual        0   1
Prediction         
0           238  38
1             4  14


### Random forest

In [26]:
rf_tree = RandomForestClassifier(n_estimators=5000, 
                                 criterion='gini',
                                 max_depth=5, 
                                 min_samples_split=2,
                                 min_samples_leaf=1,
                                 class_weight={0:0.3, 1:0.7})
rf_tree.fit(train_x, train_labels)

RandomForestClassifier(bootstrap=True, class_weight={0: 0.3, 1: 0.7},
            criterion='gini', max_depth=5, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=5000, n_jobs=1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [28]:
rf_tree.fit(train_x, train_labels)
train_pred = rf_tree.predict(train_x)
test_pred = rf_tree.predict(test_x)

In [29]:
print('====================')
print('\nTrain accuracy', accuracy_score(train_labels, train_pred))
print('\nTest accuracy', accuracy_score(test_labels, test_pred))
print('\nComplete report of training data\n', classification_report(train_labels, train_pred))
print('\nComplete report of testing data\n', classification_report(test_labels, test_pred))
print('\nTrain confusion matrix\n', pd.crosstab(train_pred, train_labels, rownames=['Prediction'], colnames=['Actual']))
print('\nTest confusion matrix\n', pd.crosstab(test_pred, test_labels, rownames=['Prediction'], colnames=['Actual']))


Train accuracy 0.9047619047619048

Test accuracy 0.8673469387755102

Complete report of training data
              precision    recall  f1-score   support

          0       0.90      0.99      0.95       991
          1       0.94      0.42      0.58       185

avg / total       0.91      0.90      0.89      1176


Complete report of testing data
              precision    recall  f1-score   support

          0       0.87      0.99      0.92       242
          1       0.88      0.29      0.43        52

avg / total       0.87      0.87      0.84       294


Train confusion matrix
 Actual        0    1
Prediction          
0           986  107
1             5   78

Test confusion matrix
 Actual        0   1
Prediction         
0           240  37
1             2  15


In [37]:
pipeline = Pipeline([
    ('rf', RandomForestClassifier(criterion='gini', class_weight={0:0.3, 1:0.7}))
])

parameters = {
    'rf__n_estimators': (2000, 3000, 5000),
    'rf__max_depth': (5, 15, 30),
    'rf__min_samples_split': (2, 3),
    'rf__min_samples_leaf': (1, 2),
}

In [38]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=5, verbose=1, scoring='accuracy')

In [39]:
grid_search.fit(train_x, train_labels)
test_pred = grid_search.predict(test_x)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   40.8s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  4.3min finished


In [40]:
print('Best Training Score: %0.3f' %grid_search.best_score_)
best_parameters = grid_search.best_estimator_.get_params()
print('\nBest Parameters Set :')
for param_name in sorted(best_parameters.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))
print('\nTrain accuracy', accuracy_score(train_labels, train_pred))
print('\nTest accuracy', accuracy_score(test_labels, test_pred))
print('\nComplete report of training data\n', classification_report(train_labels, train_pred))
print('\nComplete report of testing data\n', classification_report(test_labels, test_pred))
print('\nTrain confusion matrix\n', pd.crosstab(train_pred, train_labels, rownames=['Prediction'], colnames=['Actual']))
print('\nTest confusion matrix\n', pd.crosstab(test_pred, test_labels, rownames=['Prediction'], colnames=['Actual']))

Best Training Score: 0.865

Best Parameters Set :
	memory: None
	rf: RandomForestClassifier(bootstrap=True, class_weight={0: 0.3, 1: 0.7},
            criterion='gini', max_depth=5, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=2000, n_jobs=1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)
	rf__bootstrap: True
	rf__class_weight: {0: 0.3, 1: 0.7}
	rf__criterion: 'gini'
	rf__max_depth: 5
	rf__max_features: 'auto'
	rf__max_leaf_nodes: None
	rf__min_impurity_decrease: 0.0
	rf__min_impurity_split: None
	rf__min_samples_leaf: 1
	rf__min_samples_split: 2
	rf__min_weight_fraction_leaf: 0.0
	rf__n_estimators: 2000
	rf__n_jobs: 1
	rf__oob_score: False
	rf__random_state: None
	rf__verbose: 0
	rf__warm_start: False
	steps: [('rf', RandomForestClassifier(bootstrap=True, class_weight={0: 0.3, 