In [None]:
# HW05.ipynb
# homeworks/HW05/HW05.ipynb

# 2.3.1 Загрузка данных и первичный анализ
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the dataset (path is relative to the notebook's working directory)
df = pd.read_csv("S05-hw-dataset.csv")

# Initial inspection: first rows, column dtypes, summary statistics, target balance
print("Первые строки датасета:")
display(df.head())

print("\nИнформация о столбцах и типах:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line.
df.info()

print("\nБазовые статистики числовых признаков:")
display(df.describe())

print("\nРаспределение таргета default:")
# normalize=True reports class shares instead of raw counts
print(df['default'].value_counts(normalize=True))

In [None]:
# 2.3.2 Feature matrix and target vector
# Target: the `default` column.
y = df['default']
# Features: every column except the target and the `client_id` column.
X = df.drop(['default', 'client_id'], axis=1)

# Quick look at the observed range of debt_to_income
dti_column = X['debt_to_income']
print("Диапазон debt_to_income:", dti_column.min(), dti_column.max())

In [None]:
# 2.3.3 Train/test split and baseline model
# Hold out 20% of the rows; stratify=y keeps the class proportions of the
# target the same in both parts, random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Baseline: always predicts the most frequent class seen in y_train.
dummy = DummyClassifier(strategy="most_frequent", random_state=42)
dummy.fit(X_train, y_train)
y_pred_dummy = dummy.predict(X_test)
# Column 1 is taken as the score of the positive class
# (assumes a binary target whose second class label is the positive one).
y_proba_dummy = dummy.predict_proba(X_test)[:, 1]

# The "\n" escape keeps the literal on one physical line; a raw line break
# inside "..." is an unterminated string literal.
print("\nDummyClassifier результаты:")
print("Accuracy:", accuracy_score(y_test, y_pred_dummy))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_dummy))

In [None]:
# 2.3.4 Logistic regression and hyperparameter selection
# Scaling lives inside the pipeline, so it is re-fitted on whatever data the
# pipeline is fitted on (each CV training fold, then the full training split).
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(max_iter=1000, random_state=42))
])

# Choose C by stratified 5-fold cross-validation on the TRAINING split only.
# Selecting C by its score on X_test would leak the test set into model
# selection and make the final test metrics optimistically biased.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_c = None
best_roc = float("-inf")  # any real score beats the sentinel, so best_c is always set
for C in [0.01, 0.1, 1.0, 10.0]:
    pipe.set_params(logreg__C=C)
    # Mean ROC-AUC over the validation folds; cross_val_score fits clones,
    # so `pipe` itself stays unfitted here.
    roc = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="roc_auc").mean()
    print(f"C={C}: ROC-AUC={roc:.4f}")
    if roc > best_roc:
        best_roc = roc
        best_c = C

# The value reported here is the cross-validated ROC-AUC of the chosen C.
print(f"\nЛучший C: {best_c} с ROC-AUC={best_roc:.4f}")

# Refit on the whole training split with the selected C and evaluate once
# on the untouched test split.
pipe.set_params(logreg__C=best_c)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
y_proba = pipe.predict_proba(X_test)[:, 1]
test_roc_auc = roc_auc_score(y_test, y_proba)

print("\nLogisticRegression результаты:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", test_roc_auc)
print("\nClassification report:")
print(classification_report(y_test, y_pred))

# ROC curve on the test split
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'LogReg (AUC={test_roc_auc:.3f})')
plt.plot([0, 1], [0, 1], 'k--')  # diagonal = chance-level classifier
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC-кривая LogisticRegression")
plt.legend()
plt.grid()
plt.tight_layout()
# savefig does not create missing directories; make sure the target exists
# so the cell survives a fresh checkout / Restart & Run All.
Path("figures").mkdir(parents=True, exist_ok=True)
plt.savefig("figures/roc_curve_logreg.png")
plt.show()

In [None]:
# 2.3.5 Baseline vs. logistic regression comparison
# One row per model, both metrics computed on the same held-out test split.
results = pd.DataFrame({
    "Model": ["DummyClassifier", "LogisticRegression"],
    "Accuracy": [accuracy_score(y_test, y_pred_dummy), accuracy_score(y_test, y_pred)],
    "ROC-AUC": [roc_auc_score(y_test, y_proba_dummy), roc_auc_score(y_test, y_proba)]
})
# The "\n" escape keeps the literal on one physical line; a raw line break
# inside "..." is an unterminated string literal.
print("\nСравнение моделей:")
display(results)