In [36]:
# [Preprocessing Functions]

def fillna(df):
    """Fill missing values in the Age, Cabin, Embarked and Fare columns.

    ``Age`` gaps are replaced with the column mean, ``Cabin`` and ``Embarked``
    gaps with the placeholder ``'N'``, and ``Fare`` gaps with ``0``.

    Each filled column is assigned back onto ``df`` instead of calling
    ``df[col].fillna(..., inplace=True)``: that chained in-place form works on
    a temporary Series, emits a FutureWarning on pandas 2.x and leaves ``df``
    unchanged once Copy-on-Write is active.

    Args:
        df: DataFrame holding ``Age``, ``Cabin``, ``Embarked`` and ``Fare``
            columns. It is modified in place.

    Returns:
        The same ``df`` object, with the four columns filled.

    Raises:
        KeyError: If one of the four columns is missing.
    """
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Fare'] = df['Fare'].fillna(0)
    return df

def drop_features(df):
    """Remove the PassengerId, Name and Ticket columns from ``df``.

    Args:
        df: DataFrame containing the three columns. It is modified in place.

    Returns:
        The same ``df`` object, without the three columns.

    Raises:
        KeyError: If any of the three columns is not present.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(labels=unused_columns, axis='columns', inplace=True)
    return df

def format_features(df):
    """Label-encode the Cabin, Sex and Embarked columns of ``df``.

    ``Cabin`` is first reduced to the first character of each value. Each of
    the three columns is then replaced by the integer codes produced by a
    freshly fitted ``LabelEncoder``, and the classes that encoder learned are
    printed.

    Args:
        df: DataFrame containing ``Cabin``, ``Sex`` and ``Embarked`` columns.
            It is modified in place.

    Returns:
        The same ``df`` object, with the three columns integer-encoded.
    """
    from sklearn.preprocessing import LabelEncoder

    # Keep only the leading character of each cabin value.
    df['Cabin'] = df.Cabin.str.get(0)

    for column in ('Cabin', 'Sex', 'Embarked'):
        encoder = LabelEncoder()
        df[column] = encoder.fit_transform(df[column])
        # Show which original value each integer code stands for.
        print(encoder.classes_)
    return df

def transform_features(df):
    """Run the complete preprocessing sequence on ``df``.

    The frame is passed through ``fillna``, then ``drop_features``, then
    ``format_features``, in that order.

    Args:
        df: Raw feature DataFrame accepted by ``fillna``.

    Returns:
        Whatever ``format_features`` returns for the processed frame.
    """
    # Innermost call runs first: fill gaps -> drop columns -> encode.
    return format_features(drop_features(fillna(df)))

def get_clf_eval(y_test, pred):
    """Print the confusion matrix, accuracy, precision and recall.

    All four metrics are computed before anything is printed. The three
    scores are written on one line, each formatted to four decimals and
    followed by a single space (no trailing newline).

    Args:
        y_test: True labels.
        pred: Predicted labels, aligned with ``y_test``.

    Returns:
        None. The results are only printed.
    """
    # Metrics reported: accuracy, precision, sensitivity (recall).
    from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

    confusion = confusion_matrix(y_test, pred)
    scores = (
        ("accuracy", accuracy_score(y_test, pred)),
        ("precision", precision_score(y_test, pred)),
        ("recall", recall_score(y_test, pred)),
    )

    print(f"Confusion Matrix\n{confusion}")
    for label, score in scores:
        print(f"{label}: {score: .4f}", end=' ')




In [78]:
import os
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

# Workflow: load the raw CSV, preprocess the features, split into train/test.

# [Load file]
# Directory holding the Titanic CSVs. Set the TITANIC_DATA_DIR environment
# variable to point somewhere else; the default is this machine's local copy.
DATA_DIR = Path(os.environ.get("TITANIC_DATA_DIR", "C:/Users/js2-3/Desktop/data/titanic"))
df = pd.read_csv(DATA_DIR / "train.csv")

# [Preprocessing]
# Separate the target column from the features before transforming them.
y = df.Survived
x = df.drop(columns=['Survived'])
x = transform_features(x)  # run the preprocessing pipeline
# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=11)



['A' 'B' 'C' 'D' 'E' 'F' 'G' 'N' 'T']
['female' 'male']
['C' 'N' 'Q' 'S']


In [93]:
from sklearn.model_selection import train_test_split
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score

# dt_clf = DecisionTreeClassifier(random_state=11)
# rf_clf = RandomForestClassifier(random_state=11)


# Fit a logistic-regression classifier (solver capped at 150 iterations) on
# the training split, predict the held-out split, and print the metrics.
lr_clf = LogisticRegression(max_iter=150).fit(X_train, y_train)
pred_lr = lr_clf.predict(X_test)
get_clf_eval(y_test=y_test, pred=pred_lr)

Confusion Matrix
[[104  14]
 [ 13  48]]
accuracy:  0.8492 precision:  0.7742 recall:  0.7869 

In [90]:
# Probability estimates from the fitted classifier for every row of X_test
# (the rendered output shows one column per class).
pred_proba = lr_clf.predict_proba(X_test)

In [94]:
# Peek at the first three predicted labels.
pred_lr[:3]

array([1, 0, 0], dtype=int64)

In [96]:
# Display the probability estimates computed for X_test.
# NOTE(review): this renders every row; a slice such as pred_proba[:5] would
# keep the cell output short.
pred_proba

array([[0.46203017, 0.53796983],
       [0.87872733, 0.12127267],
       [0.87717511, 0.12282489],
       [0.88251544, 0.11748456],
       [0.85526601, 0.14473399],
       [0.88216984, 0.11783016],
       [0.88846031, 0.11153969],
       [0.20877482, 0.79122518],
       [0.78289495, 0.21710505],
       [0.36914729, 0.63085271],
       [0.89973075, 0.10026925],
       [0.87508071, 0.12491929],
       [0.87716914, 0.12283086],
       [0.88841448, 0.11158552],
       [0.43698012, 0.56301988],
       [0.85905344, 0.14094656],
       [0.90373136, 0.09626864],
       [0.73345034, 0.26654966],
       [0.72478645, 0.27521355],
       [0.17162115, 0.82837885],
       [0.75361145, 0.24638855],
       [0.61897074, 0.38102926],
       [0.85469626, 0.14530374],
       [0.8146844 , 0.1853156 ],
       [0.88804438, 0.11195562],
       [0.76550859, 0.23449141],
       [0.8596044 , 0.1403956 ],
       [0.92580965, 0.07419035],
       [0.71963478, 0.28036522],
       [0.69554739, 0.30445261],
       [0.

In [None]:
import numpy as np

# Bare reference to np.concatenate: evaluating it only displays the function
# object, nothing is concatenated here.
# NOTE(review): looks like an unfinished cell — confirm what was meant to be joined.
np.concatenate