In [1]:
def fillna(df):
    """Fill missing values in the Titanic columns used downstream.

    Age is filled with the column mean, Cabin and Embarked with the
    placeholder category 'N', and Fare with 0.

    Args:
        df: DataFrame containing 'Age', 'Cabin', 'Embarked' and 'Fare' columns.

    Returns:
        The same DataFrame object, modified in place.
    """
    # Assign the filled column back instead of calling
    # df[col].fillna(..., inplace=True): the latter mutates a temporary Series,
    # which is deprecated and leaves df untouched under pandas copy-on-write.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Fare'] = df['Fare'].fillna(0)
    return df
def drop_features(df):
    """Remove the identifier-like columns from the frame.

    Drops 'PassengerId', 'Name' and 'Ticket' from ``df`` in place and
    returns the same DataFrame object.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(labels=unused_columns, axis=1, inplace=True)
    return df
def format_features(df):
    """Label-encode the categorical columns of the frame.

    'Cabin' is first reduced to its leading character; then 'Cabin', 'Sex'
    and 'Embarked' are each replaced by integer codes from a fresh
    LabelEncoder. The classes learned for each column are printed.

    Returns:
        The same DataFrame object, modified in place.
    """
    from sklearn.preprocessing import LabelEncoder

    # Keep only the first character of every cabin value.
    df['Cabin'] = df['Cabin'].str.get(0)

    for column in ('Cabin', 'Sex', 'Embarked'):
        encoder = LabelEncoder()
        encoded = encoder.fit_transform(df[column])
        df[column] = encoded
        # Show which original value each integer code stands for.
        print(encoder.classes_)
    return df
def transform_features():
    """Load 'titanic_train.csv' and run the full preprocessing pipeline.

    The raw frame is passed through fillna, drop_features and
    format_features, in that order, and the result is returned.
    """
    import pandas as pd

    return (
        pd.read_csv('titanic_train.csv')
        .pipe(fillna)
        .pipe(drop_features)
        .pipe(format_features)
    )

In [2]:
# Load and preprocess the training data; format_features prints the classes
# learned by each label encoder as a side effect.
titanic_df = transform_features()

['A' 'B' 'C' 'D' 'E' 'F' 'G' 'N' 'T']
['female' 'male']
['C' 'N' 'Q' 'S']


In [3]:
# Preview the first two preprocessed rows.
titanic_df.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.0,1,0,7.25,7,3
1,1,1,0,38.0,1,0,71.2833,2,0


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Features: every column except the target.
X_df = titanic_df.drop(columns=['Survived'])
y_df = titanic_df['Survived'] # target labels (the "answers")

In [6]:
# Hold out 20% of the rows as a test split; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=11)

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [11]:
# The three classifiers compared in this notebook.
dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11) # original note: uses 100 estimators, so expected to be more accurate than the single tree above
lr_clf = LogisticRegression(solver='liblinear')

In [18]:
# Fit the decision tree on the training split and report accuracy on the
# held-out test split.
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
print(f'DecisionTreeClassifier 정확도:{accuracy_score(y_test, dt_pred):.4f}')

DecisionTreeClassifier 정확도:0.7877


In [19]:
# Fit the random forest on the training split and report accuracy on the
# held-out test split.
rf_clf.fit(X_train,y_train)
rf_pred = rf_clf.predict(X_test)
# The label names the model actually evaluated here (it previously said
# DecisionTreeClassifier, a copy-paste slip).
print(f'RandomForestClassifier 정확도:{accuracy_score(y_test, rf_pred):.4f}')

DecisionTreeClassifier 정확도:0.8547


In [20]:
# Fit the logistic regression on the training split and report accuracy on
# the held-out test split.
lr_clf.fit(X_train,y_train)
lr_pred = lr_clf.predict(X_test)
# The label names the model actually evaluated here (it previously said
# DecisionTreeClassifier, a copy-paste slip).
print(f'LogisticRegression 정확도:{accuracy_score(y_test, lr_pred):.4f}')

DecisionTreeClassifier 정확도:0.8659


In [21]:
# Cross-validation: (stratified) KFold, GridSearchCV
from sklearn.model_selection import KFold

In [44]:
def exec_kfold(clf, fold=5, X=None, y=None):
    """Run un-shuffled K-fold cross-validation and print per-fold accuracy.

    A fresh clone of ``clf`` is fitted on every fold, so the estimator passed
    in is never refitted here and any earlier fit on it stays intact.

    Args:
        clf: scikit-learn classifier used as the template for each fold.
        fold: number of folds passed to KFold as ``n_splits``.
        X: feature matrix; defaults to the notebook-level ``X_df``.
        y: target labels; defaults to the notebook-level ``y_df``.

    Returns:
        None. The accuracy of every fold and their mean are printed.
    """
    import numpy as np
    from sklearn.base import clone

    # Fall back to the notebook globals so existing exec_kfold(clf) calls
    # keep working; convert once instead of on every iteration.
    features = np.asarray(X_df if X is None else X)
    labels = np.asarray(y_df if y is None else y)

    kfold = KFold(n_splits=fold)
    scores = []
    # enumerate supplies the fold number alongside each (train, test) index pair
    for iter_count, (train_index, test_index) in enumerate(kfold.split(features)):
        # Clone so the caller's estimator is not overwritten by this fold's fit.
        fold_clf = clone(clf)
        fold_clf.fit(features[train_index], labels[train_index])
        pred = fold_clf.predict(features[test_index])
        accuracy = accuracy_score(labels[test_index], pred)
        scores.append(accuracy)
        print(f'교차검증 {iter_count} 정확도 : {accuracy}')
    print(f'평균정확도:{np.mean(scores)}')

In [45]:
exec_kfold(dt_clf) # for comparison, the single hold-out split scored 0.7877

교차검증 0 정확도 : 0.7541899441340782
교차검증 1 정확도 : 0.7808988764044944
교차검증 2 정확도 : 0.7865168539325843
교차검증 3 정확도 : 0.7696629213483146
교차검증 4 정확도 : 0.8202247191011236
평균정확도:0.782298662984119


In [46]:
exec_kfold(rf_clf) # for comparison, the random forest scored 0.8547 on the single hold-out split

교차검증 0 정확도 : 0.7932960893854749
교차검증 1 정확도 : 0.8089887640449438
교차검증 2 정확도 : 0.8370786516853933
교차검증 3 정확도 : 0.7752808988764045
교차검증 4 정확도 : 0.8595505617977528
평균정확도:0.8148389931579938


In [47]:
exec_kfold(lr_clf)# for comparison, the logistic regression scored 0.8659 on the single hold-out split

교차검증 0 정확도 : 0.7932960893854749
교차검증 1 정확도 : 0.7921348314606742
교차검증 2 정확도 : 0.7752808988764045
교차검증 3 정확도 : 0.7471910112359551
교차검증 4 정확도 : 0.8426966292134831
평균정확도:0.7901198920343984


In [51]:
from sklearn.model_selection import cross_val_score
import numpy as np

In [52]:
# 5-fold cross-validation of the decision tree via cross_val_score.
# NOTE(review): the scores differ from exec_kfold, presumably because
# cross_val_score picks stratified folds for classifiers when cv is an int;
# confirm against the scikit-learn docs.
score = cross_val_score(dt_clf, X_df, y_df, cv=5)
print(f'{score} 평균:{np.mean(score)}')
# exec_kfold(dt_clf) results, kept for comparison:
# fold 0 accuracy : 0.7541899441340782
# fold 1 accuracy : 0.7808988764044944
# fold 2 accuracy : 0.7865168539325843
# fold 3 accuracy : 0.7696629213483146
# fold 4 accuracy : 0.8202247191011236
# mean accuracy: 0.782298662984119

[0.74301676 0.7752809  0.79213483 0.78651685 0.84269663] 평균:0.7879291946519366


In [53]:
# 5-fold cross-validation of the random forest via cross_val_score.
score = cross_val_score(rf_clf, X_df, y_df, cv=5)
print(f'{score} 평균:{np.mean(score)}')
# exec_kfold(rf_clf) results, kept for comparison:
# fold 0 accuracy : 0.7932960893854749
# fold 1 accuracy : 0.8089887640449438
# fold 2 accuracy : 0.8370786516853933
# fold 3 accuracy : 0.7752808988764045
# fold 4 accuracy : 0.8595505617977528
# mean accuracy: 0.8148389931579938

[0.79329609 0.79775281 0.84831461 0.76404494 0.86516854] 평균:0.8137153976523758


In [54]:
# 5-fold cross-validation of the logistic regression via cross_val_score.
score = cross_val_score(lr_clf, X_df, y_df, cv=5)
print(f'{score} 평균:{np.mean(score)}')
# exec_kfold(lr_clf) results, kept for comparison:
# fold 0 accuracy : 0.7932960893854749
# fold 1 accuracy : 0.7921348314606742
# fold 2 accuracy : 0.7752808988764045
# fold 3 accuracy : 0.7471910112359551
# fold 4 accuracy : 0.8426966292134831
# mean accuracy: 0.7901198920343984

[0.7877095  0.79213483 0.7752809  0.76404494 0.82022472] 평균:0.7878789780930262


In [55]:
from sklearn.model_selection import GridSearchCV

In [56]:
# Hyperparameter names differ between models, so this grid is used only for
# the first two classifiers above (the decision tree and the random forest).
parameter = {
    'max_depth' : [2,3,5,10],
    'min_samples_split' : [2,3,5],
    'min_samples_leaf' : [1,5,8]
}

In [57]:
# Grid search over `parameter` for the decision tree, scored by accuracy.
grid_clf = GridSearchCV(dt_clf, param_grid=parameter, scoring='accuracy')
# TODO (from the original note): look up and annotate the remaining
# GridSearchCV options: n_jobs (how much parallelism to use), refit (re-fit
# the best configuration once more), cv=None, verbose=0.
grid_clf.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=11),
             param_grid={'max_depth': [2, 3, 5, 10],
                         'min_samples_leaf': [1, 5, 8],
                         'min_samples_split': [2, 3, 5]},
             scoring='accuracy')

In [58]:
grid_clf.best_params_ # the parameter combination that performed best

{'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 2}

In [59]:
grid_clf.best_score_ # score obtained with the best parameter combination above

0.7991825076332119

In [60]:
# Predict on the held-out test split with the best estimator from the search.
pred = grid_clf.best_estimator_.predict(X_test)

In [61]:
# Test-split accuracy of the tuned decision tree.
accuracy_score(y_test, pred)

0.8715083798882681

In [62]:
# Same grid search for the random forest.
# NOTE: this rebinds grid_clf, so the decision-tree search results from the
# earlier cell are no longer reachable through that name.
grid_clf = GridSearchCV(rf_clf, param_grid=parameter, scoring='accuracy')
grid_clf.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(random_state=11),
             param_grid={'max_depth': [2, 3, 5, 10],
                         'min_samples_leaf': [1, 5, 8],
                         'min_samples_split': [2, 3, 5]},
             scoring='accuracy')

In [63]:
# Best parameter combination found for the random forest.
grid_clf.best_params_ 

{'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}

In [64]:
# Score obtained with the best random-forest parameter combination.
grid_clf.best_score_ 

0.8145966709346991

In [67]:
# Test-split accuracy of the tuned random forest.
pred = grid_clf.best_estimator_.predict(X_test)
accuracy_score(y_test,pred)

0.88268156424581

In [None]:
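# In binary classification, accuracy is a sufficient metric when the classes are balanced; when they are imbalanced, metrics other than accuracy are also needed.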