# F1_score
- 정밀도와 재현율을 결합한 평가 지표
    - F1 = 2/(1/recall + 1/precision) = 2 * (precision * recall) / (precision + recall)

In [5]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

def fillna(df):
    """Fill the missing values of the Titanic feature columns in place.

    Missing ``Age`` values become the column mean, missing ``Cabin`` and
    ``Embarked`` values become the placeholder ``'N'``, and missing ``Fare``
    values become ``0``.

    Args:
        df: DataFrame containing ``Age``, ``Cabin``, ``Embarked`` and ``Fare``
            columns. It is modified in place.

    Returns:
        The same ``df`` object, with the four columns filled.

    Raises:
        KeyError: If one of the four columns is absent.
    """
    # Assign the filled column back instead of calling
    # df[col].fillna(..., inplace=True): that form operates on an intermediate
    # object, pandas warns about it, and with Copy-on-Write enabled it leaves
    # df untouched.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Fare'] = df['Fare'].fillna(0)
    return df

def drop_features(df):
    """Remove the ``PassengerId``, ``Name`` and ``Ticket`` columns in place.

    Args:
        df: DataFrame containing the three columns. It is modified in place.

    Returns:
        The same ``df`` object without those columns.
    """
    # PassengerId, Name and Ticket are judged to have no effect on survival.
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(columns=unused_columns, inplace=True)
    return df

def format_features(df):
    """Label-encode the categorical columns of ``df`` in place.

    ``Cabin`` is first reduced to the first character of each value, then
    ``Cabin``, ``Sex`` and ``Embarked`` are each passed through their own
    ``LabelEncoder``.

    Args:
        df: DataFrame containing ``Cabin``, ``Sex`` and ``Embarked`` columns.
            It is modified in place.

    Returns:
        The same ``df`` object with the three columns replaced by the
        encoder output.
    """
    # Keep only the leading character of each cabin string.
    df['Cabin'] = df['Cabin'].str[0]
    for column in ('Cabin', 'Sex', 'Embarked'):
        # A fresh encoder per column, fitted on that column's own values.
        encoder = LabelEncoder()
        df[column] = encoder.fit_transform(df[column])
    return df

def transform_features(df):
    """Run the preprocessing steps on ``df`` and return the result.

    The steps are applied in this order: ``fillna``, ``drop_features``,
    ``format_features``.

    Args:
        df: Raw feature DataFrame accepted by ``fillna``.

    Returns:
        Whatever ``format_features`` returns for the processed frame.
    """
    return format_features(drop_features(fillna(df)))
    
def get_clf_eval(y_test, predictions):
    """Print the evaluation metrics for a set of predictions.

    Prints, in order, the confusion matrix, accuracy, precision, recall and
    F1 score computed from ``y_test`` and ``predictions``.

    Args:
        y_test: True labels.
        predictions: Predicted labels to compare against ``y_test``.

    Returns:
        None. The metrics are written to standard output only.
    """
    print('Confusion Matrix\n', confusion_matrix(y_test, predictions))
    print('Accuracy: ', accuracy_score(y_test, predictions))
    print('Precision: ', precision_score(y_test, predictions))
    print('Recall: ', recall_score(y_test, predictions))
    # f1_score comes from sklearn.metrics and must be imported at file level.
    print('F1_score: ', f1_score(y_test, predictions))

# Load the Titanic training set and separate the target from the features.
titanic_df = pd.read_csv(r'C:\Users\arceu\Desktop\Machine-Learning\data\titanic_train.csv')
y_titanic_df = titanic_df['Survived']
# Preprocess the frame that drop() returns once 'Survived' has been removed.
X_titanic_df = transform_features(titanic_df.drop(columns='Survived'))

# Hold out 20% of the rows for testing. No random_state is passed here,
# unlike the later split that passes random_state=11.
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df,
    y_titanic_df,
    test_size=0.2,
)

# Fit a logistic regression and report the metrics of its predictions.
clf = LogisticRegression().fit(X_train, y_train)
pred = clf.predict(X_test)
get_clf_eval(y_test, pred)

pred_prob = clf.predict_proba(X_test)

# Put the predicted label next to the predicted probabilities, one row per
# test sample.
result = np.hstack((pred_prob, pred[:, np.newaxis]))

from sklearn.preprocessing import Binarizer

# Split again, this time with random_state=11.
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df,
    y_titanic_df,
    test_size=0.2,
    random_state=11,
)

clf = LogisticRegression().fit(X_train, y_train)

pred = clf.predict(X_test)
pred_prob = clf.predict_proba(X_test)
# Take the probability column at index 1 and make it a single-column 2-D
# array, which is the input passed to Binarizer below.
pred_prob2 = pred_prob[:, 1][:, np.newaxis]

# Adjust the threshold to find the best evaluation metric among
# accuracy, precision and recall.
ths = [0.4, 0.45, 0.5, 0.55, 0.6]
for th in ths:
    binarizer = Binarizer(threshold=th)
    custom_pred = binarizer.fit_transform(pred_prob2)
    print('Threshold: {}'.format(th))
    get_clf_eval(y_test, custom_pred)
    print()

Confusion Matrix
 [[96 18]
 [21 44]]
Accuracy:  0.7821229050279329
Precision:  0.7096774193548387
Recall:  0.676923076923077
F1_score:  0.6929133858267716
Threshold: 0.4
Confusion Matrix
 [[97 21]
 [11 50]]
Accuracy:  0.8212290502793296
Precision:  0.704225352112676
Recall:  0.819672131147541
F1_score:  0.7575757575757576

Threshold: 0.45
Confusion Matrix
 [[105  13]
 [ 13  48]]
Accuracy:  0.8547486033519553
Precision:  0.7868852459016393
Recall:  0.7868852459016393
F1_score:  0.7868852459016392

Threshold: 0.5
Confusion Matrix
 [[108  10]
 [ 14  47]]
Accuracy:  0.8659217877094972
Precision:  0.8245614035087719
Recall:  0.7704918032786885
F1_score:  0.7966101694915254

Threshold: 0.55
Confusion Matrix
 [[111   7]
 [ 16  45]]
Accuracy:  0.8715083798882681
Precision:  0.8653846153846154
Recall:  0.7377049180327869
F1_score:  0.7964601769911505

Threshold: 0.6
Confusion Matrix
 [[113   5]
 [ 17  44]]
Accuracy:  0.8770949720670391
Precision:  0.8979591836734694
Recall:  0.7213114754098361
