# Логистическая регрессия

Сравнение собственных реализаций логистической регрессии (метод Ньютона–Рафсона и IRLS) с эталонной реализацией из scikit-learn на датасете Breast Cancer Wisconsin.

# Подготовка данных


In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression as SklearnLogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from source.logistic_regression import LogisticRegression

# Load the dataset as a (features, labels) pair of arrays.
features, labels = load_breast_cancer(return_X_y=True)

# Recode class 0 as -1 in place, so the targets used below are -1 / 1.
zero_class_mask = labels == 0
labels[zero_class_mask] = -1

# Hold out 30% of the samples for evaluation; the fixed seed keeps the
# split reproducible between runs.
split_options = {"test_size": 0.3, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, **split_options
)

# Learn the min/max scaling from the training split only, then apply the
# same transformation to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# Метод Ньютона-Рафсона


In [2]:
# Train the project's LogisticRegression with the 'nr' solver and predict
# labels for the held-out split.
model_nr = LogisticRegression(solver='nr', max_iter=200, tol=1e-3)
model_nr.fit(X_train, y_train)
y_pred_nr = model_nr.predict(X_test)

# Confusion matrix first, followed by a blank separator line.
print("Матрица ошибок:", confusion_matrix(y_test, y_pred_nr), "", sep="\n")

# Accuracy takes no averaging option; the remaining metrics are averaged
# over both classes, weighted by class support.
print(f"Accuracy: {accuracy_score(y_test, y_pred_nr):.3f}")
for metric_label, metric_fn in (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-score", f1_score),
):
    print(f"{metric_label}: {metric_fn(y_test, y_pred_nr, average='weighted'):.3f}")


Матрица ошибок:
[[61  2]
 [10 98]]

Accuracy: 0.930
Precision: 0.935
Recall: 0.930
F1-score: 0.931


# Метод IRLS


In [3]:
# Train the project's LogisticRegression with the 'irls' solver and predict
# labels for the held-out split.
model_irls = LogisticRegression(solver='irls', max_iter=200, tol=1e-3)
model_irls.fit(X_train, y_train)
y_pred_irls = model_irls.predict(X_test)

# Confusion matrix first, followed by a blank separator line.
print("Матрица ошибок:", confusion_matrix(y_test, y_pred_irls), "", sep="\n")

# Accuracy takes no averaging option; the remaining metrics are averaged
# over both classes, weighted by class support.
print(f"Accuracy: {accuracy_score(y_test, y_pred_irls):.3f}")
for metric_label, metric_fn in (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-score", f1_score),
):
    print(f"{metric_label}: {metric_fn(y_test, y_pred_irls, average='weighted'):.3f}")


Матрица ошибок:
[[ 57   6]
 [  3 105]]

Accuracy: 0.947
Precision: 0.947
Recall: 0.947
F1-score: 0.947


# Логистическая регрессия, эталонная реализация sklearn


In [4]:
# Reference model: scikit-learn's LogisticRegression trained on the same
# scaled features and the same -1 / 1 targets as the custom solvers.
#
# scikit-learn accepts arbitrary class labels and `predict` returns them
# unchanged, so no conversion to {0, 1} (and back) is needed. Training on
# `y_train` directly also keeps the predictions in the same integer label
# space as `y_test`, instead of the floats produced by `(y + 1) / 2`.
sklearn_model = SklearnLogisticRegression(max_iter=200, random_state=42)
sklearn_model.fit(X_train, y_train)

y_pred_sklearn = sklearn_model.predict(X_test)

print("Матрица ошибок:")
print(confusion_matrix(y_test, y_pred_sklearn))
print()
# Precision / recall / F1 are averaged over both classes, weighted by support.
print(f"Accuracy: {accuracy_score(y_test, y_pred_sklearn):.3f}")
print(f"Precision: {precision_score(y_test, y_pred_sklearn, average='weighted'):.3f}")
print(f"Recall: {recall_score(y_test, y_pred_sklearn, average='weighted'):.3f}")
print(f"F1-score: {f1_score(y_test, y_pred_sklearn, average='weighted'):.3f}")


Матрица ошибок:
[[ 58   5]
 [  1 107]]

Accuracy: 0.965
Precision: 0.966
Recall: 0.965
F1-score: 0.965


# Сравнение методов


In [5]:
# Test-set accuracy of each model, computed from the predictions made in
# the cells above.
acc_nr = accuracy_score(y_test, y_pred_nr)
acc_irls = accuracy_score(y_test, y_pred_irls)
acc_sklearn = accuracy_score(y_test, y_pred_sklearn)

# Absolute accuracy gap for every pair of methods.
diff_nr_sklearn = abs(acc_nr - acc_sklearn)
diff_irls_sklearn = abs(acc_irls - acc_sklearn)
diff_nr_irls = abs(acc_nr - acc_irls)

# Table 1: accuracy per method.
accuracy_rows = (
    ("Ньютон-Рафсон", acc_nr),
    ("IRLS", acc_irls),
    ("sklearn", acc_sklearn),
)
print("| Метод            | Accuracy |")
print("|------------------|----------|")
for method_name, method_accuracy in accuracy_rows:
    print(f"| {method_name:<16} | {method_accuracy:>8.3f} |")

print()

# Table 2: pairwise accuracy differences.
difference_rows = (
    ("NR vs sklearn", diff_nr_sklearn),
    ("IRLS vs sklearn", diff_irls_sklearn),
    ("NR vs IRLS", diff_nr_irls),
)
print("| Пара методов     | Разница Accuracy|")
print("|------------------|-----------------|")
for pair_name, accuracy_gap in difference_rows:
    print(f"| {pair_name:<16} | {accuracy_gap:>15.3f} |")


| Метод            | Accuracy |
|------------------|----------|
| Ньютон-Рафсон    |    0.930 |
| IRLS             |    0.947 |
| sklearn          |    0.965 |

| Пара методов     | Разница Accuracy|
|------------------|-----------------|
| NR vs sklearn    |           0.035 |
| IRLS vs sklearn  |           0.018 |
| NR vs IRLS       |           0.018 |
