In [3]:
# 계속 바뀔 수가 있다.
import numpy as np
import pandas as pd


%matplotlib inline
import matplotlib.pylab as plt
import time
import seaborn as sns

### 함수로 만들어서 필요한 부분 모듈화 시키기

In [4]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [5]:
def encode_feature(df):
    features = ['Cabin','Sex','Embarked']
    for feature in features:
        # 각기 컬럼별로 수행할 인코더 설정 -> 컬럼별로 다르게 되니 매 for마다
        # 새로운 것이 호출됨
        # 0,1,2,3,... 숫자로 바꿔주는 인코딩 작업 수행
        le = preprocessing.LabelEncoder()
        # 인코딩할 컬럼 설정 및 그에 맞는 학습 수행 -> fit
        le = le.fit(df.loc[:,feature])
        # 실제 학습한 내용을 적용 -> transform
        df.loc[:,feature] = le.transform(df.loc[:,feature])
        
    return df

In [6]:
def titanic_fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    
    return df

In [7]:
def drop_feature(df):
    df.drop(['PassengerId','Name','Ticket'], axis=1, inplace=True)
    return df

In [8]:
def format_feature(df):
    # 1번 부분
    # Cabin에서 필요 정보 변형 부분
    df.loc[:,'Cabin'] = df.loc[:,'Cabin'].apply(lambda x: str(x)[:1])
    
    # 2번 부분
    # 변경할 컬럼 리스트
    features = ['Cabin','Sex','Embarked']
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(df.loc[:,feature])
        df.loc[:,feature] = le.transform(df.loc[:,feature])
    
    return df

### 한 방에 묶는 함수

In [9]:
def titanic_transform(df):
    df = titanic_fillna(df)
    df = drop_feature(df)
    df = format_feature(df)
    return df

In [10]:
data_train = pd.read_csv('../data/titanic_train.csv')

In [11]:
y_titanic_train = data_train.loc[:,'Survived']
X_titanic_train = data_train.drop("Survived", axis=1)

In [12]:
X_titanic_train.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [13]:
X_titanic_train = titanic_transform(X_titanic_train)
X_titanic_train.head(3)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,1,22.0,1,0,7.25,7,3
1,1,0,38.0,1,0,71.2833,2,0
2,3,0,26.0,0,0,7.925,7,3


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_titanic_train, 
                                                    y_titanic_train,
                                                    test_size=0.2,
                                                    random_state=11)

- train 셋의 비율 확인  
원래 데이터 비율과 뽑아놓은 데이터들의 비율이 조금씩 다르다

In [15]:
y_titanic_train.value_counts()/len(y_titanic_train)

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [16]:
y_train.value_counts()/len(y_train)

0    0.605337
1    0.394663
Name: Survived, dtype: float64

In [17]:
y_test.value_counts()/len(y_test)

0    0.659218
1    0.340782
Name: Survived, dtype: float64

비율을 유지하고자 하는 기준을 설정(y_titanic_train)  

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_titanic_train, 
                                                    y_titanic_train,
                                                    stratify = y_titanic_train,
                                                    test_size=0.2,
                                                    random_state=11)

In [19]:
print(y_titanic_train.value_counts()/len(y_titanic_train))
print(y_train.value_counts()/len(y_train))
print(y_test.value_counts()/len(y_test))

0    0.616162
1    0.383838
Name: Survived, dtype: float64
0    0.616573
1    0.383427
Name: Survived, dtype: float64
0    0.614525
1    0.385475
Name: Survived, dtype: float64


In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# 평가용
from sklearn.metrics import accuracy_score

- knn으로 간단히 확인

In [21]:
from sklearn.neighbors import KNeighborsClassifier

# KNN으로 해보자

In [22]:
###### 평가용 library ######
# 지정한 것으로 할 것인가
from sklearn.model_selection import GridSearchCV
# 랜덤으로 돌릴 것인가
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

In [23]:
knn = KNeighborsClassifier();

In [24]:
scores = cross_val_score(knn,
                        X_train,
                        y_train,
                        cv=5)
for iter_count, accuracy in enumerate(scores):
    print("knn 교차검증 {0} & 정확도 {1:.4f}".format(iter_count, accuracy))
print("-----------------------------------")
print("knn 교차검증 평균 정확도 : {0:.4f}".format(scores.mean()))

knn 교차검증 0 & 정확도 0.7413
knn 교차검증 1 & 정확도 0.6853
knn 교차검증 2 & 정확도 0.7483
knn 교차검증 3 & 정확도 0.7676
knn 교차검증 4 & 정확도 0.7376
-----------------------------------
knn 교차검증 평균 정확도 : 0.7360


In [25]:
tuned_parameters = {
    'n_neighbors': [1,3,5,7],
    'weights' : ['uniform','distance']
}

In [26]:
n_iter_search = 10
knn_rgs = RandomizedSearchCV(knn, 
                             tuned_parameters, 
                             cv=7, 
                             n_jobs=-1,
                             verbose = 1,
                             n_iter = n_iter_search,
                             random_state=1234,
                             scoring='accuracy')

In [27]:
knn_rgc = knn_rgs.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 7 folds for each of 8 candidates, totalling 56 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done  41 out of  56 | elapsed:    5.3s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done  56 out of  56 | elapsed:    5.7s finished


In [28]:
knn_rgc.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='distance')

In [29]:
print(knn_rgc.best_params_)
print(knn_rgc.best_score_)

{'weights': 'distance', 'n_neighbors': 7}
0.7429775280898876


In [30]:
tuned_parameters = {
    'n_neighbors': [3,5,7],
    'weights' : ['distance']
}

In [31]:
model_knn = GridSearchCV(knn,
                        param_grid=tuned_parameters,
                        cv=7,
                        scoring='accuracy',
                        n_jobs=-1)

In [32]:
model_knn.fit(X_train,y_train)

GridSearchCV(cv=7, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'n_neighbors': [3, 5, 7], 'weights': ['distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [33]:
print(model_knn.best_score_)

0.7429775280898876


### k = 7, weights='distance' 에서 best estimator

In [34]:
model_knn.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='distance')

In [35]:
knn_best = model_knn.best_estimator_

knn_predictions = knn_best.predict(X_test)
accuracy = accuracy_score(y_test, knn_predictions)
print("knn에서 GridSearchCV로 찾은 결과 : {0:.4f}".format(accuracy))

knn에서 GridSearchCV로 찾은 결과 : 0.7263


# RandomForest를 적용해보자

In [36]:
from sklearn import tree

In [37]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [38]:
RF_ci = RandomForestClassifier();

In [39]:
scores = cross_val_score(RF_ci,
                        X_train,
                        y_train,
                        cv=5,
                        scoring='accuracy')
for iter_count, accuracy in enumerate(scores):
    print("knn 교차검증 {0} & 정확도 {1:.4f}".format(iter_count, accuracy))
print("-----------------------------------")
print("knn 교차검증 평균 정확도 : {0:.4f}".format(scores.mean()))

knn 교차검증 0 & 정확도 0.8042
knn 교차검증 1 & 정확도 0.7483
knn 교차검증 2 & 정확도 0.8322
knn 교차검증 3 & 정확도 0.8169
knn 교차검증 4 & 정확도 0.8014
-----------------------------------
knn 교차검증 평균 정확도 : 0.8006




In [40]:
tuned_parameters = {
    'n_estimators': [300,400,500,600,700,800],
    'max_features' : (np.arange(3, 8)),
    'criterion' : ['gini','entropy'],
    'bootstrap' : [True, False]
}

In [41]:
n_iter_search = 10
RF_rgs = RandomizedSearchCV(RF_ci, 
                             tuned_parameters, 
                             cv=7,
                             n_jobs=-1,
                             random_state=1234,
                             scoring='accuracy')

In [42]:
X_train.shape

(712, 8)

In [43]:
RF_rgc = RF_rgs.fit(X_train, y_train)

In [44]:
RF_rgc.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=7, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [45]:
print(RF_rgc.best_params_)
print(RF_rgc.best_score_)

{'n_estimators': 300, 'max_features': 7, 'criterion': 'entropy', 'bootstrap': True}
0.8174157303370787


In [46]:
tuned_parameters = {
    'n_estimators': np.arange(200,400,10),
    'max_features' : np.arange(3, 8),
    'criterion' : ['entropy'],
    'bootstrap' : [True]
}

In [47]:
model_RF = GridSearchCV(RF_ci,
                        param_grid=tuned_parameters,
                        cv=7,
                        scoring='accuracy',
                        n_jobs=-1)

In [48]:
model_RF.fit(X_train,y_train)

GridSearchCV(cv=7, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=Fa...
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid=

In [49]:
model_RF.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=6, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=330,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [50]:
print(model_RF.best_score_)

0.8202247191011236


### max_feature: 6 / n_estimators = 280 / criterion = 'entropy' / bootstrap = True

In [51]:
RF_best = model_RF.best_estimator_

RF_predictions = RF_best.predict(X_test)
accuracy = accuracy_score(y_test, RF_predictions)
print("RF에서 GridSearchCV로 찾은 결과 : {0:.4f}".format(accuracy))

RF에서 GridSearchCV로 찾은 결과 : 0.8436


# SVM으로 찾아보자

In [52]:
from sklearn.svm import SVC

In [53]:
svm_model = SVC()

In [54]:
tuned_parameters = {
    #'C': (np.arange(0.01,10,0.1))
    'C':[11],
    'kernel' : ['rbf'],
#     'degree' : [2,3,4],
    'gamma': [0.1, 1, 10]
}

In [55]:
clf = RandomizedSearchCV(svm_model, tuned_parameters, cv=5, n_jobs=-1, random_state=1234)

In [56]:
svm_rgc = clf.fit(X_train, y_train)



In [57]:
svm_rgc.best_estimator_

SVC(C=11, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [58]:
svm_rgc.best_score_

0.7134831460674157

In [59]:
svm_best = svm_rgc.best_estimator_

SVM_predictions = svm_best.predict(X_test)
accuracy = accuracy_score(y_test, SVM_predictions)
print("SVM에서 GridSearchCV로 찾은 결과 : {0:.4f}".format(accuracy))

SVM에서 GridSearchCV로 찾은 결과 : 0.7207


# 모델 저장  및 로드

In [62]:
from joblib import dump, load

In [64]:
# Output a pickle file for the model
RFC_gs_best = model_RF.best_estimator_
dump(RFC_gs_best, 'rf_model.pkl')
dump(svm_best, 'svm_model.pkl')
dump(knn_best, 'knn_model.pkl')

['knn_model.pkl']