## Logistic Regression

In [13]:
import numpy as np
import plotly.graph_objects as go

# Evaluate the logistic function on an evenly spaced grid of 200 scores.
z = np.linspace(-10, 10, 200)
sigmoid = 1 / (1 + np.exp(-z))

# One purple curve showing how a raw score is squashed into a probability.
sigmoid_trace = go.Scatter(
    x=z,
    y=sigmoid,
    mode="lines",
    line={"color": "purple", "width": 3},
    name="Sigmoid",
)
fig = go.Figure()
fig.add_trace(sigmoid_trace)

# Pin the y-axis to [0, 1] so the asymptotes sit on the plot borders.
fig.update_layout(
    title="Sigmoid Function (점수 → 확률 변환기)",
    xaxis_title="z (모델 점수)",
    yaxis_title="sigmoid(z)",
    template="plotly_dark",
    yaxis={"range": [0, 1]},
)
fig.show()

## 타이타닉 전처리(대충)

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def fillna(df):
    """Return a copy of ``df`` with the missing values of four columns filled.

    The input frame is left untouched; all replacements are made on the
    returned frame only.

    Args:
        df: DataFrame containing the columns ``Age``, ``Cabin``, ``Embarked``
            and ``Fare``.

    Returns:
        A new DataFrame with the same columns in the same order, where
        ``Age`` gaps hold the mean of the observed ages, ``Cabin`` and
        ``Embarked`` gaps hold the placeholder ``'N'`` and ``Fare`` gaps
        hold ``0``.

    Raises:
        KeyError: if any of the four columns is absent from ``df``.
    """
    # assign() builds a new frame, so the caller's data is never modified.
    return df.assign(
        Age=df['Age'].fillna(df['Age'].mean()),
        Cabin=df['Cabin'].fillna('N'),
        Embarked=df['Embarked'].fillna('N'),
        Fare=df['Fare'].fillna(0),
    )

def drop_features(df):
    """Return ``df`` without the ``PassengerId``, ``Name`` and ``Ticket`` columns.

    The result is a new DataFrame; ``df`` itself keeps all of its columns.
    A ``KeyError`` is raised if any of the three columns is missing.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    return df.drop(columns=unused_columns)

def format_features(df):
    """Return a copy of ``df`` with its categorical columns label-encoded.

    ``Cabin`` is first shortened to its leading character, then ``Cabin``,
    ``Sex`` and ``Embarked`` are each replaced by the codes produced by a
    freshly fitted ``LabelEncoder``. The input frame is not modified.

    Args:
        df: DataFrame containing string-valued ``Cabin``, ``Sex`` and
            ``Embarked`` columns.

    Returns:
        A new DataFrame with the three columns replaced by encoder output.
    """
    # Work on a copy so the caller's frame keeps its original string values.
    encoded = df.copy()
    encoded['Cabin'] = encoded['Cabin'].str[:1]
    for feature in ['Cabin', 'Sex', 'Embarked']:
        # NOTE: the fitted encoder is discarded, so the value-to-code mapping
        # cannot be re-applied to another frame later on.
        encoded[feature] = LabelEncoder().fit_transform(encoded[feature])
    return encoded

def transform_features(df):
    """Run the whole preprocessing chain on ``df`` and return the result.

    The stages run in a fixed order: ``fillna`` first, then
    ``drop_features``, and finally ``format_features``.
    """
    filled = fillna(df)
    trimmed = drop_features(filled)
    return format_features(trimmed)

# Read the raw CSV; the full frame (label included) stays in titanic_df.
titanic_df = pd.read_csv('../day_01/titanic_train.csv')

# Target vector: the 'Survived' column.
y_titanic_df = titanic_df['Survived']

# Feature matrix: every other column, passed through the preprocessing chain.
X_titanic_df = transform_features(titanic_df.drop(columns=['Survived']))

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; stratifying on the label keeps the
# class ratio the same in both splits, and the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df, y_titanic_df, test_size=0.2, stratify=y_titanic_df, random_state=42
)

# max_iter is raised well above the default so the solver has room to converge
# (presumably needed because the features are not scaled; confirm if changed).
lr = LogisticRegression(max_iter=5000, random_state=42)
lr.fit(X_train, y_train)

# Predicted probabilities: column 1 of predict_proba is the second entry of
# lr.classes_ (the Survived == 1 class, assuming the labels are 0/1).
y_proba = lr.predict_proba(X_test)[:,1]

# Show the first ten probabilities as a table (pandas is already imported by
# the preprocessing cell that builds X_titanic_df).
pd.DataFrame({"예측 확률": y_proba[:10]})

Unnamed: 0,예측 확률
0,0.076887
1,0.052487
2,0.176848
3,0.040405
4,0.56441
5,0.469982
6,0.726112
7,0.411799
8,0.38567
9,0.144943


## 임곗값에 따른 정밀도(Precision)와 재현율(Recall)

In [4]:
from sklearn.metrics import precision_score, recall_score

# Report precision and recall at three decision thresholds.
for thr in (0.3, 0.5, 0.7):
    # A sample is labelled positive when its probability reaches the threshold.
    y_pred = (y_proba >= thr).astype(int)
    # zero_division=0 yields 0 instead of a warning when a score is undefined.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    print(f"Threshold={thr}")
    print("  Precision:", precision)
    print("  Recall   :", recall)

Threshold=0.3
  Precision: 0.6627906976744186
  Recall   : 0.8260869565217391
Threshold=0.5
  Precision: 0.7741935483870968
  Recall   : 0.6956521739130435
Threshold=0.7
  Precision: 0.9393939393939394
  Recall   : 0.4492753623188406


In [5]:
from sklearn.metrics import f1_score

# Thresholds 0.1 .. 0.9; rounding removes the float noise left by arange.
thr_list = np.round(np.arange(0.1, 1.0, 0.1), 2)

# One record per threshold: binarise the probabilities, then score the labels.
rows = []
for thr in thr_list:
    y_hat = (y_proba >= thr).astype(int)
    rows.append({
        "Threshold": thr,
        "Precision": precision_score(y_test, y_hat, zero_division=0),
        "Recall": recall_score(y_test, y_hat, zero_division=0),
        "F1": f1_score(y_test, y_hat, zero_division=0),
    })

thr_df = pd.DataFrame(rows, columns=["Threshold","Precision","Recall","F1"])

# Line style for each metric curve, in the order the traces are added.
curve_styles = {
    "Precision": dict(dash="dash", color="cyan"),
    "Recall": dict(color="magenta"),
    "F1": dict(color="yellow"),
}

fig = go.Figure()
for metric, line_style in curve_styles.items():
    fig.add_trace(go.Scatter(x=thr_df["Threshold"], y=thr_df[metric],
                             mode="lines", name=metric, line=line_style))

# Scores are bounded by [0, 1], so fix the y-axis to that range.
fig.update_layout(title="임곗값 변화에 따른 Precision / Recall / F1",
                  xaxis_title="Threshold", yaxis_title="Score",
                  template="plotly_dark", yaxis=dict(range=[0,1]))
fig.show()

# 로지스틱 회귀: 확률 기반 분류기
# Sigmoid 함수 -> 점수(z)를 확률로 변환
# 임곗값에 따라 Precision과 Recall 값이 줄다리기(trade-off)
# 정책에 맞게 임곗값을 조정