# Introduction to Ensembling/Stacking in Python

https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python

In [0]:
# Install the Kaggle command-line client.
!pip install kaggle

# Create the ~/.kaggle directory and copy the API credentials file into it.
# NOTE(review): the source path assumes Google Drive is already mounted at /content/drive — confirm.
!mkdir -p ~/.kaggle
!cp /content/drive/My\ Drive/kaggle.json ~/.kaggle/

# Restrict the credentials file (not the directory) to owner read/write (mode 600).
!chmod 600 ~/.kaggle/kaggle.json

In [0]:
# download titatnic dataset
!kaggle comptetitions download -c titanic

In [1]:
# import
import pandas as pd
import numpy as np
import re # used for regular expressions

import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt

# make interactive graph
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')


import sklearn
# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                             GradientBoostingClassifier, ExtraTreesClassifier)

from sklearn.svm import SVC
from sklearn.model_selection import KFold


# Using plotly in google colab notebook


https://www.youtube.com/watch?v=F2pAu9PzgEE

In [0]:
# importing and using plotly

from IPython.display import Image
from plotly.offline import iplot # NOTE: this import looks unnecessary

plotly를 사용하는 셀마다 아래의 `configure_plotly_browser_state()` 함수를 먼저 호출해야 함

In [0]:
def configure_plotly_browser_state():
    """Display the require.js configuration used to render plotly output.

    Emits an HTML snippet that loads require.js and maps the ``plotly``
    module name to the plotly CDN bundle. Takes no arguments and returns
    None; the only effect is the displayed HTML.
    """
    import IPython
    # `display` is not imported here; this relies on the notebook kernel
    # making it available as a global name.
    display(IPython.core.display.HTML('''<script src="/static/components/requirejs/require.js"></script>
    <script>
    requirejs.config({
    paths: {
    base: '/static/base',
    plotly: 'https://cdn.plot.ly/plotly-1.5.1.min.js?noext',
    }
    });
    </script>
    '''))

# 시작

## Feature Exploration, Engineering and Cleaning


- explore data on hand
- identify possible feature engineering
- numerically encode any categorical features

### apply 함수 + len 함수

In [0]:
# New feature: number of characters in each passenger's name
train['Name_length'] = train['Name'].map(len)

### apply 함수 + lambda if 함수

In [0]:
# Flag is 0 when the Cabin entry is a plain float (presumably NaN for a
# missing cabin — verify against the data) and 1 for anything else.
train['has_cabin'] = train['Cabin'].map(lambda cabin: 1 if type(cabin) != float else 0)

### pandas qcut 과 cut 의 차이

https://stackoverflow.com/questions/30211923/what-is-the-difference-between-pandas-qcut-and-pandas-cut

`qcut`은 quantile(분위수) 기준으로 나누기 때문에, 모든 bin에 (전체 데이터 수 / bin의 개수)만큼 데이터가 들어가도록 bin의 경계를 조정함. 따라서 bin마다 구간의 폭이 다름.

`cut`은 값의 min ~ max 범위를 동일한 폭의 bin으로 나눔. 따라서 bin마다 들어 있는 데이터 수가 다름.

In [3]:
# Example: split 30 random normal samples into 5 quantile-based bins
factors = np.random.randn(30)
quantile_bins = pd.qcut(factors, q=5)
quantile_bins.value_counts()
# For comparison with equal-width bins: pd.cut(factors, 5).value_counts()

(-1.553, -0.678]    6
(-0.678, -0.274]    6
(-0.274, 0.127]     6
(0.127, 0.633]      6
(0.633, 2.051]      6
dtype: int64

In [0]:
# Fare is split into 4 quantile bins, Age into 5 equal-width bins
train['CategoricalFare'] = pd.qcut(x=train['Fare'], q=4)
train['CategoricalAge'] = pd.cut(x=train['Age'], bins=5)

### 승객 이름에서 Title 만 빼내기(정규표현식)

In [0]:
# Define function to extract titles from passenger names
def get_title(name):
    """Return the title (e.g. ``Mr``) found in a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period.

    Args:
        name: Passenger name string.

    Returns:
        The matched title without the trailing period, or ``""`` when the
        pattern does not occur in ``name``.
    """
    # Raw string so that "\." reaches the regex engine as an escaped literal
    # dot; in a normal string literal "\." is an invalid escape sequence.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it
    if title_search:
        return title_search.group(1)
    return ""

# Derive a 'Title' column from 'Name' for every frame in full_data
for dataset in full_data:
    dataset['Title'] = dataset['Name'].map(get_title)

### 데이터 안에 원하는 대상 replace

In [0]:
# Collapse the uncommon titles into a single 'Rare' group and fold the
# spelling variants into 'Miss' / 'Mrs'
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for dataset in full_data:
    grouped_titles = dataset['Title'].replace(rare_titles, 'Rare')
    dataset['Title'] = grouped_titles.replace(title_aliases)


### mapping을 통해 categorical 숫자로 바꾸기

In [0]:
# Encode the categorical columns as integers in every frame of full_data.
# Looping here keeps the cell from depending on whichever `dataset` an
# earlier loop happened to leave behind (which would encode one frame only).
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for dataset in full_data:
    # Sex: female -> 0, male -> 1
    dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Title: titles in the mapping -> 1..5, anything else -> NaN -> 0
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

### 범위를 지정해서 숫자 주기

In [0]:
# Replace the continuous fare with an ordinal code (0-3) in every frame of
# full_data, rather than only in the `dataset` left over from an earlier loop.
for dataset in full_data:
    # The order of these assignments is safe: the codes already written
    # (0, 1, 2) are all <= 7.91, so none of the later conditions re-match them.
    dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
    dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

### drop - axis 1 => 열

In [0]:
# Remove the two helper bin columns (a column-wise drop)
train = train.drop(columns=['CategoricalAge', 'CategoricalFare'])

### Pair plot 사용하기

In [0]:
# Pairwise plots of the selected columns, coloured by the 'Survived' column.
# `height` and `fill` are the current names of the deprecated `size`
# (pairplot) and `shade` (kde plot) arguments.
g = sns.pairplot(train[['Survived', 'Pclass', 'Sex', 'Age', 'Parch',
                        'Fare', 'Embarked', 'FamilySize', 'Title']],
                 hue='Survived', palette='seismic', height=1.2,
                 diag_kind='kde', diag_kws=dict(fill=True), plot_kws=dict(s=10))
# Hide the x tick labels on every subplot
g.set(xticklabels=[])

## Sklearn Classifier

### KFold 바뀐 부분

In [0]:
# Old API, kept for reference:
# kf = KFOLD(ntrain, n_folds= NFOLDS, random_state =SEED)
#
# Current API: the data is passed to split(), not to the constructor.
# random_state is left out because it has no effect unless shuffle=True, and
# recent scikit-learn releases raise a ValueError when it is set without shuffling.
kf = KFold(n_splits=NFOLDS)
# split() returns a one-shot generator. Materialise it as a list so the same
# folds can be iterated again each time `kf` is looped over.
kf = list(kf.split(np.arange(ntrain)))


### SKlearnHelper

In [0]:
# Class to extend the Sklearn classifier
class SklearnHelper(object):
    """Thin wrapper that gives classifiers a uniform train/predict interface.

    Args:
        clf: Classifier class (not an instance). It is instantiated as
            ``clf(**params)`` with ``random_state`` added to the parameters.
        seed: Value passed to the classifier as ``random_state``.
        params: Optional dict of keyword arguments for ``clf``. The dict is
            copied, so the caller's object is left unmodified. ``None`` means
            no extra parameters.
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy the parameters so that the default params=None works and so
        # that adding random_state does not leak into a dict the caller may
        # reuse for another model.
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped classifier; returns None."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped classifier's predictions for ``x``."""
        return self.clf.predict(x)

    def fit(self, x, y):
        """Fit the wrapped classifier and return whatever its ``fit`` returns."""
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        """Fit on ``(x, y)`` and return the result's ``feature_importances_``.

        Note that this re-fits the wrapped classifier as a side effect.
        """
        return self.clf.fit(x, y).feature_importances_

### Out of Fold Prediction - 아직도 잘 모르겠음

In [0]:
def get_oof(clf, x_train, y_train, x_test):
    """Compute out-of-fold predictions for one model.

    For every ``(train_index, test_index)`` pair produced by the module-level
    ``kf``, the model is trained on the ``train_index`` rows and then predicts
    (a) the ``test_index`` rows, which fill the matching entries of the
    out-of-fold train vector, and (b) all of ``x_test``. The per-fold test
    predictions are averaged.

    Relies on the module-level names ``kf``, ``ntrain``, ``ntest`` and
    ``NFOLDS``.

    Args:
        clf: Object exposing ``train(x, y)`` and ``predict(x)``.
        x_train: Training features, indexable by an array of row indices.
        y_train: Training labels, indexable the same way.
        x_test: Test features passed to ``clf.predict``.

    Returns:
        Tuple ``(oof_train, oof_test)``, each reshaped to a single column.

    Raises:
        ValueError: If ``kf`` yields fewer than ``NFOLDS`` folds, for example
            because it is a generator that an earlier call already consumed.
    """
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    # Uninitialised buffer: every row must be written by the loop below.
    oof_test_skf = np.empty((NFOLDS, ntest))

    folds_seen = 0
    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)
        folds_seen = i + 1

    # Without this check, missing folds would leave uninitialised rows in
    # oof_test_skf and the mean below would silently average garbage.
    if folds_seen != NFOLDS:
        raise ValueError(
            "kf yielded %d fold(s) but NFOLDS is %d; if kf is a generator it "
            "may already have been consumed" % (folds_seen, NFOLDS)
        )

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
    

## 모델 만들기

### Params 저장해두기 dict 형태

In [0]:
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    # warm_start stays off: when one estimator is fitted repeatedly (e.g. once
    # per cross-validation fold), warm_start=True with an unchanged
    # n_estimators keeps the trees from the first fit instead of training on
    # the new data.
    'warm_start': False,
    # Alternative that was left disabled: 'max_features': 0.2
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'verbose': 0
}


### SklearnHelper 클래스 이용해서 빠르게 모델 만들기 

In [0]:
# Random forest model wrapped in SklearnHelper, configured from rf_params
rf = SklearnHelper(
    clf=RandomForestClassifier,
    seed=SEED,
    params=rf_params,
)

### 각 모델에서 나온 결과값에 대한 df 만들기

In [0]:
# Fit the random forest on the training data and read its feature importances
rf_feature = rf.feature_importances(x_train, y_train)
cols = train.columns.values  # column names of train

# Gather the importances of each model side by side, one column per model
importance_columns = {
    'features': cols,
    'Random Forest feature importances': rf_feature,
    'Extra Trees feature importances': et_feature,
    'AdaBoost feature importances': ada_feature,
    'Gradient Boosting feature importances': gb_feature,
}
feature_df = pd.DataFrame(importance_columns)

### plotly 그래프 그리기

In [0]:
# Scatter plot of the random forest feature importances
configure_plotly_browser_state()
py.init_notebook_mode(connected=True)

feature_names = feature_df['features'].values
rf_importances = feature_df['Random Forest feature importances'].values

# Fixed marker size; marker colour follows the importance value
marker_style = dict(
    sizemode='diameter',
    sizeref=1,
    size=25,
    color=rf_importances,
    colorscale='Portland',
    showscale=True,
)
trace = go.Scatter(
    x=feature_names,
    y=rf_importances,
    mode='markers',
    marker=marker_style,
    text=feature_names,
)
data = [trace]

# Only the y axis is customised; the x axis keeps plotly's defaults
y_axis_style = dict(title='Feature Importance', ticklen=5, gridwidth=2)
layout = go.Layout(
    autosize=True,
    title='Random Forest Feature Importance',
    hovermode='closest',
    yaxis=y_axis_style,
    showlegend=False,
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter2010')


### row wise mean 구하기

In [0]:
# axis=1 computes the mean row-wise. numeric_only=True restricts it to the
# numeric importance columns; without it the string-valued 'features' column
# makes pandas 2.x raise a TypeError.
feature_df['mean'] = feature_df.mean(axis=1, numeric_only=True)

## Second Level Prediction from First Level Output

### XGBoost 사용하기

In [0]:
# XGBoost classifier fitted on x_train / y_train, then used to predict x_test.
# Not set (left commented out): learning_rate = 0.02, gamma = 1
xgb_settings = dict(
    n_estimators=2000,
    max_depth=4,
    min_child_weight=2,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
)
gbm = xgb.XGBClassifier(**xgb_settings)
gbm = gbm.fit(x_train, y_train)
predictions = gbm.predict(x_test)