In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score

In [4]:
# Load the Titanic passenger table from the Data Science Dojo public datasets
# repository. `pd` is already imported in the notebook's import cell, so it is
# not re-imported here.
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = pd.read_csv(url)
# Preview the first two rows to confirm the columns loaded as expected.
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [5]:
def fillna(df):
    """Return a copy of ``df`` with missing values filled in four columns.

    - 'Age'      -> mean of the non-missing ages
    - 'Cabin'    -> the placeholder string 'N'
    - 'Embarked' -> the placeholder string 'N'
    - 'Fare'     -> 0

    The input frame is not modified.
    """
    mean_age = df['Age'].mean()
    # assign() works on a copy, so the caller's frame stays untouched.
    return df.assign(
        Age=df['Age'].fillna(mean_age),
        Cabin=df['Cabin'].fillna('N'),
        Embarked=df['Embarked'].fillna('N'),
        Fare=df['Fare'].fillna(0),
    )

def drop_features(df):
    """Return ``df`` without the 'PassengerId', 'Name' and 'Ticket' columns.

    A new frame is returned; ``df`` itself is left unchanged. A KeyError is
    raised by pandas if any of the three columns is absent.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    return df.drop(columns=unused_columns)

def format_features(df):
    """Return a copy of ``df`` with its categorical columns integer-encoded.

    'Cabin' is first truncated to its leading character, then 'Cabin', 'Sex'
    and 'Embarked' are each replaced by the integer codes produced by a
    fresh ``LabelEncoder`` fitted on that column.
    """
    encoded = df.copy()
    # Keep only the first character of each cabin string.
    encoded['Cabin'] = encoded['Cabin'].str.slice(stop=1)
    for column in ('Cabin', 'Sex', 'Embarked'):
        # A separate encoder per column, so the codes are independent across columns.
        encoded[column] = LabelEncoder().fit_transform(encoded[column])
    return encoded

# Run the preprocessing helpers defined above, in order.
def transform_features(df):
    """Apply ``fillna``, ``drop_features`` and ``format_features`` in sequence.

    Each step receives the previous step's output; the final frame is returned.
    """
    result = df
    for step in (fillna, drop_features, format_features):
        result = step(result)
    return result

In [6]:
def get_clf_eval(y_test, pred):
    """Print the confusion matrix, then accuracy, precision and recall.

    Parameters
    ----------
    y_test : true labels.
    pred : predicted labels, compared against ``y_test``.

    Output is the confusion matrix, a line of 20 asterisks, and one line
    with the three scores separated by spaces. Nothing is returned.
    """
    # Compute everything before printing so a metric error produces no partial output.
    matrix = confusion_matrix(y_test, pred)
    scores = [
        metric(y_test, pred)
        for metric in (accuracy_score, precision_score, recall_score)
    ]

    print(matrix)
    print('*' * 20)
    print(*scores)

In [7]:
# Run the preprocessing pipeline (fill missing values, drop unused columns,
# encode categoricals) on the raw frame; `df` itself is left untouched.
titanic_df = transform_features(df)

In [8]:
# Separate the feature matrix from the 'Survived' target column.
X_titanic_df = titanic_df.drop(columns='Survived')
y_titanic_df = titanic_df['Survived']

In [9]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df,
    y_titanic_df,
    test_size=0.2,
    random_state=0,
)

# 성능비교 - 로지스틱회귀

In [10]:
# Build and train the logistic regression classifier.
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training are chained.
lr_clf = LogisticRegression(max_iter=2000).fit(X_train, y_train)
pred = lr_clf.predict(X_test)

# Report confusion matrix, accuracy, precision and recall on the test split.
get_clf_eval(y_test, pred)

[[92 18]
 [16 53]]
********************
0.8100558659217877 0.7464788732394366 0.7681159420289855


In [11]:
from sklearn.base import BaseEstimator
import numpy as np
class MyDummyClassifier(BaseEstimator):
    """Rule-based baseline that predicts from the 'Sex' column alone.

    Rows whose 'Sex' value equals 1 are predicted as 0; every other row is
    predicted as 1. Nothing is learned from the training data.

    NOTE(review): in this notebook 'Sex' is label-encoded by
    ``format_features``, so 1 presumably corresponds to 'male' -- confirm
    against the encoder's ``classes_`` before relying on that reading.
    """

    def fit(self, X, y):
        """No-op: the decision rule is fixed, so there is nothing to learn.

        Returns ``self`` to follow the scikit-learn estimator convention,
        which allows ``clf.fit(X, y).predict(X)`` style chaining.
        """
        return self

    def predict(self, X):
        """Return one prediction per row of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a 'Sex' column.

        Returns
        -------
        numpy.ndarray
            Float array of shape ``(n_samples, 1)`` holding 0.0 where
            ``X['Sex'] == 1`` and 1.0 everywhere else. The column-vector
            shape is kept for backward compatibility with existing callers.
        """
        # Vectorized comparison replaces the per-row `.iloc` Python loop;
        # the result is identical (anything that is not exactly 1 maps to 1.0).
        predicted_positive = (X['Sex'] != 1).to_numpy()
        return predicted_positive.astype(float).reshape(-1, 1)

In [13]:
# Baseline check: score the fixed Sex-based rule on the same held-out split
# so its accuracy can be compared with the trained model's.
myclf = MyDummyClassifier()
myclf.fit(X_train, y_train)  # fit learns nothing; called to mirror the usual estimator workflow
my_pred = myclf.predict(X_test)
accuracy_score(y_test, my_pred)

0.7877094972067039

# 랜덤포레스트, KNN의 정밀도, 재현율 비교하기

# 분류모델의 임계치 확인

In [18]:
# predict_proba returns one probability column per class, ordered as in
# lr_clf.classes_; column 1 is taken as the positive-class probability.
pred_proba = lr_clf.predict_proba(X_test)
pos_proba = pred_proba[:, 1]

# Label a sample positive when its positive-class probability reaches the
# custom threshold, instead of the default decision made by predict().
threshold = 0.4
custom_proba = (pos_proba >= threshold).astype(int)
confusion_matrix(y_test, custom_proba)

array([[86, 24],
       [13, 56]])

In [19]:
# Report confusion matrix, accuracy, precision and recall for the
# threshold-adjusted predictions (custom_proba holds 0/1 labels, not probabilities).
get_clf_eval(y_test, custom_proba)

[[86 24]
 [13 56]]
********************
0.7932960893854749 0.7 0.8115942028985508
