# Model Save & Load --> Voting

In [10]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pylab as plt
import time
import seaborn as sns

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# 평가용
from sklearn.metrics import accuracy_score

# 지정한 것으로 할 것인가
from sklearn.model_selection import GridSearchCV
# 랜덤으로 돌릴 것인가
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC

  return f(*args, **kwds)


In [11]:
# Legacy approach: scikit-learn used to ship its own copy of joblib as
# sklearn.externals.joblib. That module was deprecated and later removed, so fall
# back to the standalone joblib package when the legacy import is unavailable.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

In [12]:
# Output a pickle file for the model
#joblib.dump(RFC_gs_best, 'rf_model.pkl')

In [13]:
#rf_best_p = joblib.load('rf_model.pkl')

In [14]:
# 신 방식
from joblib import dump, load

In [15]:
#dump(RFC_gs_best, 'rf_model.pkl')

---------------------------------------------------

In [16]:
# Load the previously saved models from disk (paths are relative to the
# notebook's working directory).
# NOTE(review): joblib.load unpickles the file contents, which can execute
# arbitrary code -- only load model files that come from a trusted source.
rf_model = load('rf_model.pkl')
knn_model = load('knn_model.pkl')
svm_model = load('svm_model.pkl')

In [17]:
def encode_feature(df):
    """Label-encode the ``Cabin``, ``Sex`` and ``Embarked`` columns.

    A fresh ``LabelEncoder`` is created for every column, so each column gets
    its own independent value -> integer (0, 1, 2, ...) mapping.

    Args:
        df: DataFrame containing the three columns. It is modified in place.

    Returns:
        The same DataFrame, with the three columns replaced by integer codes.
    """
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        # The constructor call used to be split across two lines
        # (``preprocessing.Label`` / ``Encoder()``), which fails at runtime.
        le = preprocessing.LabelEncoder()
        # fit_transform learns the mapping and applies it in one step.
        # Assigning through df[feature] replaces the whole column, so the
        # result carries the integer dtype of the codes; df.loc[:, feature] = ...
        # writes into the existing (object) column on recent pandas versions.
        df[feature] = le.fit_transform(df[feature])

    return df

In [18]:
def titanic_fillna(df):
    """Fill the missing values of the Titanic feature columns.

    ``Age`` is filled with the column mean, ``Cabin`` and ``Embarked`` with the
    placeholder ``'N'`` and ``Fare`` with ``0``.

    Args:
        df: DataFrame with ``Age``, ``Cabin``, ``Embarked`` and ``Fare``
            columns. It is modified in place.

    Returns:
        The same DataFrame object, with the four columns filled.
    """
    fill_values = {
        'Age': df['Age'].mean(),
        'Cabin': 'N',
        'Embarked': 'N',
        'Fare': 0,
    }
    for column, value in fill_values.items():
        # Assign the filled column back instead of calling
        # df[column].fillna(..., inplace=True): that form operates on a
        # temporary Series and does not update df under pandas Copy-on-Write.
        df[column] = df[column].fillna(value)

    return df

In [19]:
def drop_feature(df):
    """Remove the ``PassengerId``, ``Name`` and ``Ticket`` columns in place.

    Args:
        df: DataFrame containing the three columns; it is mutated.

    Returns:
        The same DataFrame object without those columns.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(columns=unused_columns, inplace=True)
    return df

In [20]:
def format_feature(df):
    """Shorten ``Cabin`` to its first character and label-encode the categoricals.

    Args:
        df: DataFrame with ``Cabin``, ``Sex`` and ``Embarked`` columns. It is
            modified in place.

    Returns:
        The same DataFrame, with the three columns replaced by integer codes.
    """
    # Step 1: keep only the first character of each Cabin value. str() is
    # applied first, so a value that is still NaN becomes 'n'.
    df['Cabin'] = df['Cabin'].apply(lambda x: str(x)[:1])

    # Step 2: encode every categorical column with its own LabelEncoder.
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = preprocessing.LabelEncoder()
        # Replace the column via df[feature] = ...; df.loc[:, feature] = ...
        # writes into the existing object column on pandas >= 2.0, which can
        # leave the encoded integers stored with object dtype.
        df[feature] = le.fit_transform(df[feature])

    return df

### 한 방에 묶는 함수

In [21]:
def titanic_transform(df):
    """Run the preprocessing steps in order: fill NaNs, drop columns, encode.

    Args:
        df: Raw feature DataFrame; it is passed through ``titanic_fillna``,
            ``drop_feature`` and ``format_feature`` in that order.

    Returns:
        The DataFrame returned by the last step.
    """
    for step in (titanic_fillna, drop_feature, format_feature):
        df = step(df)
    return df

In [22]:
# Read the training CSV; the path is relative to the notebook's working directory.
data_train = pd.read_csv('../data/titanic_train.csv')

In [23]:
# Separate the raw frame into the feature columns and the 'Survived' target.
X_titanic_train = data_train.drop(columns="Survived")
y_titanic_train = data_train["Survived"]

In [24]:
# Preview the first three rows of the features before any transformation.
X_titanic_train.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [25]:
# Build the transformed features from the untouched raw frame rather than from
# X_titanic_train itself: titanic_transform drops columns in place, so feeding it
# its own previous output raises a KeyError when this cell is run a second time.
X_titanic_train = titanic_transform(data_train.drop("Survived", axis=1))
X_titanic_train.head(3)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,1,22.0,1,0,7.25,7,3
1,1,0,38.0,1,0,71.2833,2,0
2,3,0,26.0,0,0,7.925,7,3


In [26]:
# Plain 80/20 split without stratification; random_state pins the shuffle so the
# split is reproducible. The same four names are reassigned by the stratified
# split further down.
X_train, X_test, y_train, y_test = train_test_split(X_titanic_train, 
                                                    y_titanic_train,
                                                    test_size=0.2,
                                                    random_state=11)

- train 셋의 비율 확인  
원래 데이터 비율과 뽑아놓은 데이터들의 비율이 조금씩 다르다

In [27]:
# Share of each Survived class in the full label set.
y_titanic_train.value_counts().div(len(y_titanic_train))

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [28]:
# Share of each Survived class in the training split.
y_train.value_counts().div(len(y_train))

0    0.605337
1    0.394663
Name: Survived, dtype: float64

In [29]:
# Share of each Survived class in the test split.
y_test.value_counts().div(len(y_test))

0    0.659218
1    0.340782
Name: Survived, dtype: float64

비율을 유지하고자 하는 기준을 설정(y_titanic_train)  

In [30]:
# Same 80/20 split, but stratified on the target so that the train and test
# splits keep (approximately) the class ratio of y_titanic_train.
X_train, X_test, y_train, y_test = train_test_split(X_titanic_train, 
                                                    y_titanic_train,
                                                    stratify = y_titanic_train,
                                                    test_size=0.2,
                                                    random_state=11)

In [31]:
# Print the class proportions of the full label set, the training split and the
# test split, in that order, to confirm the stratified split kept the ratio.
print(
    *(labels.value_counts() / len(labels)
      for labels in (y_titanic_train, y_train, y_test)),
    sep="\n",
)

0    0.616162
1    0.383838
Name: Survived, dtype: float64
0    0.616573
1    0.383427
Name: Survived, dtype: float64
0    0.614525
1    0.385475
Name: Survived, dtype: float64


In [32]:
# Predict the test split with each loaded model, then score against y_test.
# NOTE(review): the models were fitted outside this section; whether X_test was
# unseen during that training cannot be confirmed from here.
SVM_predictions, RF_predictions, KNN_predictions = (
    model.predict(X_test) for model in (svm_model, rf_model, knn_model)
)
accuracy_svm, accuracy_rf, accuracy_knn = (
    accuracy_score(y_test, predicted)
    for predicted in (SVM_predictions, RF_predictions, KNN_predictions)
)
print("SVM에서 GridSearchCV로 찾은 결과 : {0:.4f}".format(accuracy_svm))
print("RF에서 GridSearchCV로 찾은 결과 : {0:.4f}".format(accuracy_rf))
print("KNN에서 GridSearchCV로 찾은 결과 : {0:.4f}".format(accuracy_knn))

SVM에서 GridSearchCV로 찾은 결과 : 0.7207
RF에서 GridSearchCV로 찾은 결과 : 0.8436
KNN에서 GridSearchCV로 찾은 결과 : 0.7263


# Voting

In [33]:
from sklearn.ensemble import VotingClassifier

In [34]:
# Hard voting: every estimator casts one class vote per sample and the majority
# label is the ensemble prediction.
# NOTE: VotingClassifier.fit() fits clones of the supplied estimators on X_train,
# so the fitted state loaded from the .pkl files is not what eclf predicts with.
eclf = VotingClassifier(estimators=[('knn', knn_model),('rf',rf_model),('svm',svm_model)],
                       voting='hard')

# Fit the ensemble on the training split and score it on the test split.
eclf.fit(X_train, y_train)
eclf_pred = eclf.predict(X_test)
accuracy_voting = accuracy_score(y_test, eclf_pred)
print("Hard Voting에서 GridSearchCV로 찾은 결과 : {0:.4f}".format(accuracy_voting))

Hard Voting에서 GridSearchCV로 찾은 결과 : 0.7598


- X_test에다가 옆에 칼럼을 추가해 knn, rfc, svm best모델 predict 결과값들을 저장(예측정답지)

In [35]:
# Copy the test features and append each loaded model's prediction as a column.
x_test_model_total = X_test.copy(deep=True).assign(
    knnc=knn_model.predict(X_test),
    rfc=rf_model.predict(X_test),
    svm=svm_model.predict(X_test),
)

In [36]:
# Show the first rows, including the three per-model prediction columns.
x_test_model_total.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,knnc,rfc,svm
212,3,1,22.0,0,0,7.25,7,3,0,0,0
456,1,1,65.0,0,0,26.55,4,3,0,0,0
557,1,1,29.699118,0,0,227.525,7,0,1,0,1
763,1,0,36.0,1,2,120.0,1,3,1,1,1
682,3,1,20.0,0,0,9.225,7,3,0,0,0


In [37]:
def voting(df, model_cols=('knnc', 'rfc', 'svm')):
    """Add a ``voting`` column with the majority vote of binary predictions.

    Each column in ``model_cols`` is expected to hold 0/1 predictions. A row
    gets ``voting == 1`` when a strict majority of the models predicted 1
    (2 of 3 with the default columns), otherwise 0. With an even number of
    models a tie therefore resolves to 0.

    Args:
        df: DataFrame holding the prediction columns. It is modified in place.
        model_cols: Names of the prediction columns to combine. Defaults to the
            three columns used in this notebook.

    Returns:
        The same DataFrame with the ``voting`` column added.
    """
    model_cols = list(model_cols)
    # Smallest number of 1-votes that constitutes a strict majority.
    threshold = len(model_cols) // 2 + 1
    # skipna=False mirrors plain column addition: a missing prediction makes
    # the row sum NaN, which compares False and yields 0.
    votes_for_one = df[model_cols].sum(axis=1, skipna=False)
    df['voting'] = (votes_for_one >= threshold).astype('int64')

    return df

In [38]:
# Adds the 'voting' column to x_test_model_total in place and displays the frame.
voting(x_test_model_total)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,knnc,rfc,svm,voting
212,3,1,22.000000,0,0,7.2500,7,3,0,0,0,0
456,1,1,65.000000,0,0,26.5500,4,3,0,0,0,0
557,1,1,29.699118,0,0,227.5250,7,0,1,0,1,1
763,1,0,36.000000,1,2,120.0000,1,3,1,1,1,1
682,3,1,20.000000,0,0,9.2250,7,3,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
119,3,0,2.000000,4,2,31.2750,7,3,0,0,1,0
4,3,1,35.000000,0,0,8.0500,7,3,0,0,0,0
352,3,1,15.000000,1,1,7.2292,7,0,1,1,1,1
499,3,1,24.000000,0,0,7.7958,7,3,0,0,0,0


In [39]:
import collections

- `Counter`는 각 값이 몇 번 등장했는지 세어 dict 형태(`{값: 횟수}`)로 보여준다. 빈도가 높은 순서로 정렬된 결과가 필요하면 `most_common()`을 사용한다.

In [40]:
# Counter tallies how many times each value occurs in the iterable.
collections.Counter([1,0,0,0])

Counter({1: 1, 0: 3})

In [41]:
# most_common() returns (value, count) pairs ordered from most to least frequent.
collections.Counter([0,0,1,1,1]).most_common()

[(1, 3), (0, 2)]

In [42]:
import collections, numpy
def vote_func(x, best_model_idx = 0):
    """Return the majority vote among the predictions in ``x``.

    Args:
        x: Sequence of per-model predictions for one sample (a list, tuple or
            a pandas Series row).
        best_model_idx: Position in ``x`` of the model whose prediction is used
            when no value receives at least two votes.

    Returns:
        The most frequent value when it occurs at least twice; otherwise the
        prediction at position ``best_model_idx``.
    """
    # Materialise the votes so the fallback below is always positional. Indexing
    # a label-indexed Series with an integer (x[0]) is a label lookup on recent
    # pandas versions and fails for rows labelled 'knnc' / 'rfc' / 'svm'.
    votes = list(x)
    winner, winner_count = collections.Counter(votes).most_common(1)[0]

    if winner_count >= 2:
        return winner

    # No value was chosen by two or more models: defer to the designated model.
    print("Check!! Voting Result!!")
    return votes[best_model_idx]

In [43]:
# Row-wise majority vote over the three per-model prediction columns.
x_test_model_total['ensemble'] = (
    x_test_model_total[['knnc', 'rfc', 'svm']].apply(vote_func, axis=1)
)

In [44]:
# Attach the true labels next to the predictions for side-by-side comparison.
x_test_model_total['y_real'] = y_test

In [45]:
# Columns to display: per-model predictions, the ensemble vote and the true label.
show_col_list = ['knnc','rfc','svm','ensemble','y_real']

In [46]:
# Display only the comparison columns selected in show_col_list.
x_test_model_total.loc[:,show_col_list]

Unnamed: 0,knnc,rfc,svm,ensemble,y_real
212,0,0,0,0,0
456,0,0,0,0,0
557,1,0,1,1,0
763,1,1,1,1,1
682,0,0,0,0,0
...,...,...,...,...,...
119,0,0,1,0,0
4,0,0,0,0,0
352,1,1,1,1,0
499,0,0,0,0,0


In [52]:
import xgboost as xgb
from xgboost import XGBClassifier

In [53]:
# Base XGBoost classifier; n_jobs=-1 is passed to the estimator itself.
XGBC = XGBClassifier(n_jobs=-1)
# Search space for the randomized search: only n_estimators is varied (3 values).
xgc_param_grid = {
    'n_estimators' : [10, 30, 50]
}

In [54]:
# Randomized search with 7-fold CV, scored by accuracy.
# NOTE(review): n_iter_search (10) is larger than the 3 candidates defined in
# xgc_param_grid, so the search cannot sample 10 distinct settings -- consider
# lowering n_iter or widening the grid.
n_iter_search = 10
xgv = RandomizedSearchCV(XGBC, 
                             xgc_param_grid, 
                             cv=7,
                             n_jobs=-1,
                             random_state=1234,
                             scoring='accuracy',
                            n_iter=n_iter_search)
xgv.fit(X_train, y_train)



RandomizedSearchCV(cv=7, error_score='raise-deprecating',
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=-1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=0, reg_alpha=0,
                                           reg_lambda=1, scale_pos_weight=1,
                                           seed=None, silent=None, subsample=1,
                                           verbosity=1),
         

In [55]:
# Parameter grid for the exhaustive search over the XGBoost classifier.
parameters = {
    'n_estimators' : [50, 100, 150],
    # base_score is the initial prediction; for the binary:logistic objective it
    # is a probability and must lie strictly between 0 and 1. The former values
    # [0, 5] are both invalid and look like a typo for the default 0.5.
    'base_score' : [0.5],
    'booster' : ['gbtree']
}

In [59]:
# Exhaustive 5-fold search over `parameters`, scored by accuracy.
# The search object is only constructed here; nothing is fitted in this cell.
grid_xgb = GridSearchCV(
    estimator=XGBC,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

In [60]:
from lightgbm import LGBMClassifier

In [61]:
# Base LightGBM classifier with a fixed seed for reproducible fits.
LGB = LGBMClassifier(random_state =1234, n_jobs = -1)
# Search space for the randomized search (3 x 3 = 9 candidates).
# The key must be spelled 'n_estimators': the former 'n_estimator' is not a
# recognised LightGBM parameter name, so the search never actually varied the
# number of trees.
lb_param_grid = {
    'n_estimators' : [100, 200, 300],
    'learning_rate' : [0.1, 0.05, 0.01]
}

In [62]:
# Randomized search with 7-fold CV, scored by accuracy; verbose=1 prints progress.
# NOTE(review): 10 iterations are requested but lb_param_grid only defines 9
# candidates (the fit log reports "9 candidates"), so this effectively tries
# every combination.
n_iter_search = 10
LGB_cif = RandomizedSearchCV(LGB, 
                             lb_param_grid, 
                             cv=7,
                             n_jobs=-1,
                             verbose = 1,
                             random_state=1234,
                             scoring='accuracy',
                            n_iter=n_iter_search)
LGB_cif.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 7 folds for each of 9 candidates, totalling 63 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  63 out of  63 | elapsed:    1.7s finished


RandomizedSearchCV(cv=7, error_score='raise-deprecating',
                   estimator=LGBMClassifier(boosting_type='gbdt',
                                            class_weight=None,
                                            colsample_bytree=1.0,
                                            importance_type='split',
                                            learning_rate=0.1, max_depth=-1,
                                            min_child_samples=20,
                                            min_child_weight=0.001,
                                            min_split_gain=0.0,
                                            n_estimators=100, n_jobs=-1,
                                            num_leaves=31, objective=None,
                                            random_state=1234, reg_alpha=0.0,
                                            reg_lambda=0.0, silent=True,
                                            subsample=1.0,
                                            s