# 정리노트 - 로지스틱 회귀

In [None]:
#option1 - breast cancer dataset
import pandas as pd

# Load the raw CSV.
df = pd.read_csv("data/breastCancer.csv")

# Binary label: 1 where diagnosis is "M", 0 for every other value ("B" included).
df["target"] = (df["diagnosis"] == "M").astype("int64")

# Drop the "Unnamed: 32" column together with the identifier and the raw
# label column, so that only feature columns and "target" remain.
df = df.drop(columns=["Unnamed: 32", "id", "diagnosis"])

# Every remaining column except the label is used as a predictor.
x_vars = [col for col in df.columns if col != "target"]
# Further preprocessing is omitted here.

In [None]:
#option2 - Titanic dataset
import pandas as pd

df = pd.read_csv("data/모의고사 2회/titanic.csv")
df = df.rename(columns={"survived": "target"})

# Impute missing values: median for the numeric columns, most frequent value
# for the categorical one. The result is assigned back to the column because
# `df[col].fillna(..., inplace=True)` is chained assignment: it operates on a
# temporary object and does not update `df` when pandas Copy-on-Write is active.
df["age"] = df["age"].fillna(df["age"].median())
df["fare"] = df["fare"].fillna(df["fare"].median())
df["embarked"] = df["embarked"].fillna(df["embarked"].mode()[0])

# Integer-encode the categorical columns with explicit lookup tables.
# Mapping (instead of pre-filling with "" and overwriting by mask) keeps the
# columns numeric; any category missing from the table becomes NaN rather
# than an empty string hidden inside an object column.
sex_codes = {"male": 1, "female": 2}
embarked_codes = {"S": 1, "C": 2, "Q": 3}
df["sex_1"] = df["sex"].map(sex_codes)
df["embarked_1"] = df["embarked"].map(embarked_codes)

# Predictor columns used by the model.
x_vars = ["pclass", "sex_1", "age", "sibsp", "parch", "fare", "embarked_1"]

In [None]:
from sklearn.preprocessing import StandardScaler  # standardization utility

# Standardize every predictor column and write the scaled values back into df.
# NOTE(review): the scaler is fit on the full DataFrame, i.e. before the
# train/test split performed later in this file, so held-out rows contribute
# to the scaling statistics. Consider fitting on the training rows only.
scaled_features = StandardScaler().fit_transform(df[x_vars])
df[x_vars] = scaled_features

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation.
# - random_state fixes the shuffle so the metrics and the ROC-derived
#   threshold computed later are reproducible between runs.
# - stratify keeps the 0/1 ratio of "target" the same in both partitions.
train_data, test_data = train_test_split(
    df,
    test_size=0.3,
    shuffle=True,
    random_state=42,
    stratify=df["target"],
)

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the training partition.
model = LogisticRegression()
model_fit = model.fit(train_data[x_vars], train_data["target"])
coef = model_fit.coef_
intercept = model_fit.intercept_[0]

# Build one summary row per term: the raw coefficient (beta) and exp(beta).
# The intercept row comes first and never carries an "opinion" entry.
summary_rows = [
    {
        "X": "intercept",
        "beta": intercept,
        "exp(beta) = odds": np.exp(intercept),
    }
]
for feature_name, beta in zip(x_vars, coef[0]):
    exp_beta = np.exp(beta)
    row = {"X": feature_name, "beta": beta, "exp(beta) = odds": exp_beta}
    # exp(beta) >= 1 means the coefficient is non-negative, i.e. the feature
    # pushes the predicted probability of target == 1 upward.
    # NOTE(review): the label text below is specific to the breast cancer
    # dataset even though the Titanic cell may have produced df.
    if exp_beta >= 1:
        row["opinion"] = "유방암 확률을 높이는 요소"
    summary_rows.append(row)

coef_r = pd.DataFrame(summary_rows)
# coef_r

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# predict_proba: column 0 is the probability of class 0, column 1 of class 1.
predict_proba = model.predict_proba(test_data[x_vars])

# Predict once and reuse the result for every metric.
y_true = test_data["target"]
y_pred = model.predict(test_data[x_vars])

# Results are stored under names that differ from the imported metric
# functions; rebinding e.g. `recall_score` to its own float result would make
# the function unavailable to any later code in the session.
acc_score = accuracy_score(y_true, y_pred)
rec_score = recall_score(y_true, y_pred)
prec_score = precision_score(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)
report = classification_report(y_true, y_pred)

print("acc_score : {:.2f}, recall_score : {:.2f}, precision_score : {:.2f}".format(acc_score, rec_score, prec_score))
print(conf_matrix)
print(report)

In [None]:
# Use the ROC curve of the fitted model to look for the decision threshold
# that maximizes TPR - FPR, with the aim of improving accuracy.
# NOTE(review): the curve is computed on test_data, so a threshold chosen here
# and then scored on the same rows gives an optimistic estimate.
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Probability of the positive class (column 1) for each test row.
positive_proba = model.predict_proba(test_data[x_vars])[:, 1]
fpr, tpr, thresholds = roc_curve(test_data["target"], positive_proba)

# One row per candidate threshold, plus the TPR - FPR gap used for selection.
roc = pd.DataFrame(
    {
        "FPR": fpr,
        "TPR": tpr,
        "Threshold": thresholds,
        "TPR-FPR": tpr - fpr,
    }
)
display(roc)
plt.plot(roc["FPR"], roc["TPR"])

# Position of the row with the largest TPR - FPR gap (first one on ties).
maxima = np.argmax(roc["TPR-FPR"].to_numpy())
best_threshold = roc["Threshold"].iloc[maxima]
print("TPR - FPR 최적 threshold값 : {}".format(best_threshold))

In [None]:
from sklearn.preprocessing import Binarizer

# Threshold selected from the ROC table (column 2 is "Threshold").
custom_threshold = roc.iloc[maxima, 2]
pred_proba_flatten = model.predict_proba(test_data[x_vars])[:, 1].reshape(-1, 1)

# Apply the threshold with ">=" so the predictions reproduce the ROC operating
# point: roc_curve counts a sample as positive when score >= threshold, and
# each finite threshold is itself one of the observed scores. Binarizer uses a
# strict ">" comparison, which would label that boundary sample as negative.
custom_predict = (pred_proba_flatten >= custom_threshold).astype(int).ravel()

# Baseline report using the model's built-in decision rule.
print("오리지날")
print(classification_report(test_data["target"],
                            model.predict(test_data[x_vars]))+"\n\n")

# Report using the ROC-derived threshold.
print("threshold : {}".format(custom_threshold))
print(classification_report(test_data["target"], custom_predict)+"\n\n")