In [81]:
import os
import warnings

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
warnings.filterwarnings('ignore')


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import train_test_split

#classification(분류)
from sklearn.tree import DecisionTreeClassifier      #의사결정트리
from sklearn.ensemble import RandomForestClassifier  #랜덤포레스트
from sklearn.neighbors import KNeighborsClassifier   #KNN(K_Nearst_Neighbor)K-최근접이웃
from sklearn.linear_model import LogisticRegression  #로지스틱회귀
from sklearn.svm import SVC                          #SVM(Support_Vector_Machine)서포트벡터머신


from sklearn.metrics import confusion_matrix      #혼동행렬
from sklearn.metrics import accuracy_score        #정확도
from sklearn.metrics import precision_score     #정밀도
from sklearn.metrics import recall_score        #재현율
from sklearn.metrics import f1_score            #f1
from sklearn.metrics import roc_auc_score       #roc_auc
from sklearn.metrics import classification_report #종합


#교차검증
from sklearn.model_selection import GridSearchCV

In [82]:
# Fill in missing values
def fillna(df):
    """Fill missing values in the Age, Cabin and Embarked columns of ``df``.

    Age gaps are replaced with the column mean; Cabin and Embarked gaps are
    replaced with the literal string 'nan'.

    The columns are assigned back onto ``df`` instead of calling
    ``fillna(inplace=True)`` on a column selection. That form is chained
    assignment: pandas 2.x warns about it, and with Copy-on-Write enabled it
    leaves ``df`` unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'Age', 'Cabin' and 'Embarked' columns. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three columns filled.
    """
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('nan')
    df['Embarked'] = df['Embarked'].fillna('nan')
    return df

# Remove the columns that are not used as features
def drop_features(df):
    """Drop the PassengerId, Name and Ticket columns from ``df``.

    The columns are removed in place and the same ``df`` object is returned.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(columns=unused_columns, inplace=True)
    return df

# Convert the categorical columns to integer codes
def format_features(df):
    """Label-encode the Cabin, Sex and Embarked columns of ``df``.

    Cabin is first cut down to its first character. Each of the three columns
    is then replaced by the integer labels produced by a LabelEncoder fitted
    on that same column. ``df`` is modified in place and returned.
    """
    # Keep only the leading character of each cabin value.
    df['Cabin'] = df['Cabin'].str.slice(stop=1)
    for column in ('Cabin', 'Sex', 'Embarked'):
        # A fresh encoder per column: fit on the column and convert it to
        # integer labels in one step.
        df[column] = LabelEncoder().fit_transform(df[column])
    return df

# Single entry point that chains the preprocessing steps above
def transform_features(df):
    """Apply fillna, then drop_features, then format_features to ``df``.

    Returns whatever format_features returns for the processed frame.
    """
    return format_features(drop_features(fillna(df)))

In [83]:
# Directory that holds titanic_train.csv. Set the TITANIC_DATA_DIR environment
# variable to point somewhere else; the default keeps the original location so
# existing setups continue to work.
path = os.environ.get('TITANIC_DATA_DIR', '/Users/Leo Kim/')
df = pd.read_csv(os.path.join(path, 'titanic_train.csv'))

# Target (dependent variable): the 'Survived' column.
y = df['Survived']

# Every column except the target. drop() without inplace returns a new frame,
# so df itself keeps the 'Survived' column.
df_features = df.drop('Survived', axis=1)

# Clean and encode the predictors. transform_features modifies the frame it is
# given and returns it, so x and df_features are the same object afterwards.
x = transform_features(df_features)

In [84]:
# Split the data into training and test sets with train_test_split.
# - test_size sets the share of rows that goes to the test set (default 0.25).
# - train_size can be set as well (default: 1 - test_size).
# - random_state is the seed used for shuffling (an int is accepted).
# - shuffle controls whether rows are shuffled before splitting (default True).
# - stratify is useful for classification: it keeps the split from being
#   skewed toward one class (default None, i.e. not used here).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=123, test_size=0.3
)

### GridSearch_CV

In [85]:
# Estimator to tune: logistic regression created with its default arguments.
clf = LogisticRegression()

In [86]:
# Candidate values of the C argument to try in the grid search.
parameters = dict(C=[0.001, 0.01, 0.1, 1, 10, 100])

In [87]:
# Grid search over `parameters` for `clf`, evaluated with cv=5.
grid_clf = GridSearchCV(clf, param_grid=parameters, cv=5)

In [88]:
# Run the search on the training split only; x_test / y_test are not used here.
grid_clf.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]})

In [89]:
# Cross-validation results as a table, best mean test score first.
# iloc[:, 4:] leaves out the first four columns of cv_results_
# (presumably the fit/score timing statistics; verify for your scikit-learn version).
(
    pd.DataFrame(grid_clf.cv_results_)
    .iloc[:, 4:]
    .sort_values(by='mean_test_score', ascending=False)
)

Unnamed: 0,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,1.0,{'C': 1},0.8,0.76,0.776,0.790323,0.814516,0.788168,0.01887,1
5,100.0,{'C': 100},0.8,0.76,0.768,0.790323,0.814516,0.786568,0.020131,2
4,10.0,{'C': 10},0.8,0.752,0.776,0.782258,0.814516,0.784955,0.021325,3
2,0.1,{'C': 0.1},0.792,0.752,0.768,0.758065,0.822581,0.778529,0.025905,4
1,0.01,{'C': 0.01},0.664,0.76,0.704,0.733871,0.75,0.722374,0.034803,5
0,0.001,{'C': 0.001},0.616,0.704,0.712,0.709677,0.693548,0.687045,0.036089,6


In [90]:
# Report the best parameter combination and its mean cross-validated score,
# one per line.
print(
    'best_param: {0}'.format(grid_clf.best_params_),
    'best_score: {0:.4f}'.format(grid_clf.best_score_),
    sep='\n',
)

best_param: {'C': 1}
best_score: 0.7882


### example

In [91]:
# One instance of each classifier to tune. The tree-based models get a fixed
# random_state; KNN and SVC are created with their default arguments.
# The line for knn_clf used to end in a stray backslash, which joined it with
# the svm_clf line and made the cell a SyntaxError.
dt_clf = DecisionTreeClassifier(random_state=0)
rf_clf = RandomForestClassifier(random_state=0)
knn_clf = KNeighborsClassifier()
svm_clf = SVC()

In [101]:
# Hyper-parameter grid for the decision tree.
# NOTE(review): every min_samples_leaf value here is >= 8, so switching
# min_samples_split between 2 and 3 is unlikely to change the fitted trees.
# The recorded results show identical scores for both values; consider
# dropping one of them.

dt_parameters = dict(
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[2, 3],
)

In [93]:
# Hyper-parameter grid for the random forest (n_estimators is fixed at 100).

rf_parameters = dict(
    n_estimators=[100],
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
)

In [94]:
# Hyper-parameter grid for KNN: odd neighbour counts from 1 to 9.

knn_parameters = dict(n_neighbors=[1, 3, 5, 7, 9])

In [95]:
# Hyper-parameter grid for the SVM: the same four values for C and for gamma.

svm_parameters = dict(
    C=[0.01, 0.1, 1, 10],
    gamma=[0.01, 0.1, 1, 10],
)

In [102]:
def grid_search(clf, parameters, x_train, y_train, cv=5):
    """Run a cross-validated grid search and report the best setting.

    Prints the best parameter combination and its mean cross-validated score,
    then returns the per-combination results.

    Parameters
    ----------
    clf : estimator
        Estimator handed to GridSearchCV.
    parameters : dict
        Parameter grid, passed as ``param_grid``.
    x_train, y_train
        Training features and target the search is fitted on.
    cv : optional
        Cross-validation setting forwarded to GridSearchCV. Defaults to 5,
        the value that used to be hard-coded, so existing calls are unchanged.

    Returns
    -------
    pandas.DataFrame
        ``cv_results_`` without its first four columns, sorted by
        ``mean_test_score`` in descending order.
    """
    # Local name differs from the notebook-level `grid_clf` to avoid shadowing it.
    search = GridSearchCV(clf, param_grid=parameters, cv=cv)
    search.fit(x_train, y_train)

    print('best_param: {0}'.format(search.best_params_))
    print('best_score: {0:.4f}'.format(search.best_score_))

    results = pd.DataFrame(search.cv_results_)
    # Skip the first four columns (presumably the fit/score timing statistics;
    # verify for your scikit-learn version) and put the best score on top.
    return results.iloc[:, 4:].sort_values(by='mean_test_score', ascending=False)

In [103]:
# Tune the decision tree: grid_search prints the best setting and the cell
# displays the returned results table.
grid_search(dt_clf, dt_parameters, x_train, y_train)

best_param: {'max_depth': 6, 'min_samples_leaf': 8, 'min_samples_split': 2}
best_score: 0.7961


Unnamed: 0,param_max_depth,param_min_samples_leaf,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,6,8,2,"{'max_depth': 6, 'min_samples_leaf': 8, 'min_s...",0.856,0.76,0.792,0.83871,0.733871,0.796116,0.046035,1
1,6,8,3,"{'max_depth': 6, 'min_samples_leaf': 8, 'min_s...",0.856,0.76,0.792,0.83871,0.733871,0.796116,0.046035,1
7,8,8,3,"{'max_depth': 8, 'min_samples_leaf': 8, 'min_s...",0.856,0.76,0.792,0.830645,0.741935,0.796116,0.042509,3
19,12,8,3,"{'max_depth': 12, 'min_samples_leaf': 8, 'min_...",0.856,0.76,0.792,0.830645,0.741935,0.796116,0.042509,3
18,12,8,2,"{'max_depth': 12, 'min_samples_leaf': 8, 'min_...",0.856,0.76,0.792,0.830645,0.741935,0.796116,0.042509,3
13,10,8,3,"{'max_depth': 10, 'min_samples_leaf': 8, 'min_...",0.856,0.76,0.792,0.830645,0.741935,0.796116,0.042509,3
12,10,8,2,"{'max_depth': 10, 'min_samples_leaf': 8, 'min_...",0.856,0.76,0.792,0.830645,0.741935,0.796116,0.042509,3
6,8,8,2,"{'max_depth': 8, 'min_samples_leaf': 8, 'min_s...",0.856,0.76,0.792,0.830645,0.741935,0.796116,0.042509,3
14,10,12,2,"{'max_depth': 10, 'min_samples_leaf': 12, 'min...",0.808,0.736,0.776,0.822581,0.798387,0.788194,0.030171,9
21,12,12,3,"{'max_depth': 12, 'min_samples_leaf': 12, 'min...",0.808,0.736,0.776,0.822581,0.798387,0.788194,0.030171,9


In [98]:
# Tune the random forest: grid_search prints the best setting and the cell
# displays the returned results table.
grid_search(rf_clf, rf_parameters, x_train, y_train)

best_param: {'max_depth': 10, 'min_samples_leaf': 8, 'n_estimators': 100}
best_score: 0.8187


Unnamed: 0,param_max_depth,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
6,10,8,100,"{'max_depth': 10, 'min_samples_leaf': 8, 'n_es...",0.832,0.784,0.808,0.822581,0.846774,0.818671,0.021441,1
9,12,8,100,"{'max_depth': 12, 'min_samples_leaf': 8, 'n_es...",0.832,0.776,0.808,0.822581,0.846774,0.817071,0.024102,2
3,8,8,100,"{'max_depth': 8, 'min_samples_leaf': 8, 'n_est...",0.84,0.784,0.808,0.814516,0.83871,0.817045,0.020864,3
0,6,8,100,"{'max_depth': 6, 'min_samples_leaf': 8, 'n_est...",0.84,0.776,0.808,0.806452,0.822581,0.810606,0.021114,4
1,6,12,100,"{'max_depth': 6, 'min_samples_leaf': 12, 'n_es...",0.824,0.76,0.792,0.798387,0.806452,0.796168,0.021028,5
7,10,12,100,"{'max_depth': 10, 'min_samples_leaf': 12, 'n_e...",0.824,0.752,0.792,0.798387,0.806452,0.794568,0.023836,6
10,12,12,100,"{'max_depth': 12, 'min_samples_leaf': 12, 'n_e...",0.824,0.752,0.792,0.798387,0.806452,0.794568,0.023836,6
4,8,12,100,"{'max_depth': 8, 'min_samples_leaf': 12, 'n_es...",0.824,0.752,0.8,0.790323,0.806452,0.794555,0.023946,8
2,6,18,100,"{'max_depth': 6, 'min_samples_leaf': 18, 'n_es...",0.784,0.752,0.8,0.790323,0.798387,0.784942,0.01745,9
8,10,18,100,"{'max_depth': 10, 'min_samples_leaf': 18, 'n_e...",0.784,0.736,0.8,0.798387,0.790323,0.781742,0.023586,10


In [99]:
# Tune KNN: grid_search prints the best setting and the cell displays the
# returned results table.
grid_search(knn_clf, knn_parameters, x_train, y_train)

best_param: {'n_neighbors': 5}
best_score: 0.7126


Unnamed: 0,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,5,{'n_neighbors': 5},0.688,0.76,0.728,0.693548,0.693548,0.712619,0.027622,1
3,7,{'n_neighbors': 7},0.696,0.704,0.728,0.75,0.653226,0.706245,0.032585,2
4,9,{'n_neighbors': 9},0.632,0.704,0.696,0.75,0.645161,0.685432,0.042667,3
1,3,{'n_neighbors': 3},0.688,0.752,0.64,0.709677,0.629032,0.683742,0.045291,4
0,1,{'n_neighbors': 1},0.672,0.656,0.672,0.717742,0.677419,0.679032,0.020642,5


In [100]:
# Tune the SVM: grid_search prints the best setting and the cell displays the
# returned results table.
grid_search(svm_clf, svm_parameters, x_train, y_train)

best_param: {'C': 10, 'gamma': 0.01}
best_score: 0.7352


Unnamed: 0,param_C,param_gamma,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
12,10.0,0.01,"{'C': 10, 'gamma': 0.01}",0.744,0.704,0.712,0.75,0.766129,0.735226,0.023515,1
13,10.0,0.1,"{'C': 10, 'gamma': 0.1}",0.68,0.712,0.664,0.766129,0.717742,0.707974,0.035242,2
8,1.0,0.01,"{'C': 1, 'gamma': 0.01}",0.672,0.712,0.704,0.701613,0.645161,0.686955,0.024912,3
9,1.0,0.1,"{'C': 1, 'gamma': 0.1}",0.632,0.672,0.648,0.701613,0.669355,0.664594,0.023604,4
10,1.0,1.0,"{'C': 1, 'gamma': 1}",0.672,0.656,0.64,0.620968,0.645161,0.646826,0.016946,5
4,0.1,0.01,"{'C': 0.1, 'gamma': 0.01}",0.608,0.624,0.616,0.637097,0.66129,0.629277,0.01867,6
14,10.0,1.0,"{'C': 10, 'gamma': 1}",0.664,0.64,0.616,0.604839,0.596774,0.624323,0.024608,7
11,1.0,10.0,"{'C': 1, 'gamma': 10}",0.64,0.624,0.624,0.604839,0.620968,0.622761,0.011177,8
15,10.0,10.0,"{'C': 10, 'gamma': 10}",0.64,0.624,0.624,0.604839,0.604839,0.619535,0.013347,9
0,0.01,0.01,"{'C': 0.01, 'gamma': 0.01}",0.608,0.608,0.608,0.604839,0.612903,0.608348,0.002586,10


-----