# Лабораторная работа №2. Логистическая и линейная регрессия

Используются те же датасеты, что и в ЛР1:

- **Классификация:** съедобность грибов
- **Регрессия:** популярность треков

Цели лабораторной:

1. Построить и исследовать модели логистической регрессии для задачи классификации.
2. Построить и исследовать модели линейной и гребневой (Ridge) регрессии для задачи регрессии.
3. Сравнить результаты с KNN из ЛР1 по выбранным метрикам.


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    mean_absolute_error, mean_squared_error, r2_score,
    classification_report
)

# One fixed seed, reused by the train/test splits below, so reruns give the same partitions.
RANDOM_STATE = 42
# Also seed NumPy's global random generator with the same value.
np.random.seed(seed=RANDOM_STATE)


In [3]:

# Locations of the two CSV files, relative to the notebook's working directory.
mushroom_path = "../data/mushroom-data.csv"
spotify_path = "../data/modified-spotify-dataset.csv"

# Read each dataset with pandas' default CSV settings.
mushroom_df = pd.read_csv(mushroom_path)
spotify_df = pd.read_csv(spotify_path)

# Report the (rows, columns) shape of each frame.
for caption, frame in (
    ("Mushrooms shape:", mushroom_df),
    ("Spotify shape:", spotify_df),
):
    print(caption, frame.shape)

# Trailing expression: as the last statement of a notebook cell it displays
# the first rows of both frames.
(mushroom_df.head(), spotify_df.head())


Mushrooms shape: (8124, 23)
Spotify shape: (6987, 159)


(  Mushroom_quality cap_shape cap_surface cap_color bruises odor  \
 0                p         x           s         n       t    p   
 1                e         x           s         y       t    a   
 2                e         b           s         w       t    l   
 3                p         x           y         w       t    p   
 4                e         x           s         g       f    n   
 
   gill_attachment gill_spacing gill_size gill_color  ...  \
 0               f            c         n          k  ...   
 1               f            c         b          k  ...   
 2               f            c         b          n  ...   
 3               f            c         n          n  ...   
 4               f            w         b          k  ...   
 
   stalk_surface_below_ring stalk_color_above_ring stalk_color_below_ring  \
 0                        s                      w                      w   
 1                        s                      w                  

In [4]:
# Label column of the mushroom data; every other column is used as a feature.
target_col_mush = "Mushroom_quality"

y_mush = mushroom_df[target_col_mush]
X_mush = mushroom_df.drop(columns=target_col_mush)

# 80/20 hold-out split, stratified on the label and seeded for reproducibility.
mush_split = train_test_split(
    X_mush,
    y_mush,
    test_size=0.2,
    stratify=y_mush,
    random_state=RANDOM_STATE,
)
X_mush_train, X_mush_test, y_mush_train, y_mush_test = mush_split

print("Train mushrooms:", X_mush_train.shape, "Test:", X_mush_test.shape)


Train mushrooms: (6499, 22) Test: (1625, 22)


In [5]:
# Regression target column.
target_col_spot = "popularity"

# Identifier/text columns and "popularity_level" are excluded from the features.
# NOTE(review): "popularity_level" is presumably derived from the target, so
# keeping it would leak the answer -- confirm against the dataset description.
cols_to_drop = [
    "track_id", "track_name", "artist_name", "album_name", "popularity_level"
]
# Keep only the names that are actually present so drop() cannot raise KeyError.
cols_to_drop = [c for c in cols_to_drop if c in spotify_df.columns]

features_spot_df = spotify_df.drop(columns=[target_col_spot] + cols_to_drop)
y_spot = spotify_df[target_col_spot]

# "number" matches every numeric dtype (int32, float32, unsigned, ...), not only
# int64/float64, so a numeric column can no longer fall outside both lists.
numeric_features_spot = features_spot_df.select_dtypes(include="number").columns.tolist()
categorical_features_spot = features_spot_df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

# The preprocessing step further down only receives the columns named in these
# two lists, so report anything that ended up in neither instead of losing it
# silently (e.g. datetime columns).
assigned_features_spot = set(numeric_features_spot) | set(categorical_features_spot)
unassigned_features_spot = [
    c for c in features_spot_df.columns if c not in assigned_features_spot
]
if unassigned_features_spot:
    print("WARNING: Spotify columns not used as features:", unassigned_features_spot)

# 80/20 hold-out split; no stratification because the target is continuous.
X_spot_train, X_spot_test, y_spot_train, y_spot_test = train_test_split(
    features_spot_df, y_spot, test_size=0.2, random_state=RANDOM_STATE
)

print("Numeric features (Spotify):", len(numeric_features_spot))
print("Categorical features (Spotify):", len(categorical_features_spot))
print("Train Spotify:", X_spot_train.shape, "Test:", X_spot_test.shape)


Numeric features (Spotify): 156
Categorical features (Spotify): 1
Train Spotify: (5589, 157) Test: (1398, 157)


In [6]:
# Baseline logistic regression for the mushroom data.
# Preprocessing: one-hot encode every feature column; categories that were not
# seen while fitting are ignored at prediction time (handle_unknown="ignore").
mush_preprocess = ColumnTransformer(
    [("cat", OneHotEncoder(handle_unknown="ignore"), list(X_mush_train.columns))]
)

logreg_clf_baseline = Pipeline(
    [
        ("preprocess", mush_preprocess),
        ("model", LogisticRegression(max_iter=1000, n_jobs=-1)),
    ]
)

# fit() returns the fitted pipeline, so training and test-set prediction chain.
y_mush_pred_lr = logreg_clf_baseline.fit(X_mush_train, y_mush_train).predict(X_mush_test)

# Accuracy plus macro-averaged precision / recall / F1 on the hold-out set.
acc_lr = accuracy_score(y_mush_test, y_mush_pred_lr)
prec_lr, rec_lr, f1_lr = (
    metric(y_mush_test, y_mush_pred_lr, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)

print("=== Logistic Regression (Mushrooms, baseline) ===")
print(f"Accuracy: {acc_lr:.4f}")
print(f"Precision (macro): {prec_lr:.4f}")
print(f"Recall (macro): {rec_lr:.4f}")
print(f"F1-macro: {f1_lr:.4f}")

# Per-class breakdown of the same predictions.
print("\nClassification report:")
print(classification_report(y_mush_test, y_mush_pred_lr))


=== Logistic Regression (Mushrooms, baseline) ===
Accuracy: 0.9994
Precision (macro): 0.9994
Recall (macro): 0.9994
F1-macro: 0.9994

Classification report:
              precision    recall  f1-score   support

           e       1.00      1.00      1.00       842
           p       1.00      1.00      1.00       783

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



### Улучшенный бейзлайн логистической регрессии

Гипотеза: подбор силы L2‑регуляризации может улучшить стабильность и качество модели. Параметр `C` — величина, обратная силе регуляризации: чем меньше `C`, тем сильнее штраф.


In [7]:
# Tuned logistic regression: grid-search the C hyperparameter.

# Only C varies; penalty and solver are pinned to a single value each.
param_grid_logreg = {
    "model__C": [0.01, 0.1, 1.0, 10.0],
    "model__penalty": ["l2"],
    "model__solver": ["lbfgs"],
}

logreg_clf_pipeline = Pipeline(
    [
        ("preprocess", mush_preprocess),
        ("model", LogisticRegression(max_iter=1000, n_jobs=-1)),
    ]
)

# 3-fold cross-validation on the training part, scored by macro F1.
grid_logreg = GridSearchCV(
    estimator=logreg_clf_pipeline,
    param_grid=param_grid_logreg,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
)
grid_logreg.fit(X_mush_train, y_mush_train)
print("Лучшие параметры (LogReg Mushrooms):", grid_logreg.best_params_)

# Evaluate the best pipeline found by the search on the hold-out set.
best_logreg = grid_logreg.best_estimator_
y_mush_pred_lr_best = best_logreg.predict(X_mush_test)

acc_lr_b = accuracy_score(y_mush_test, y_mush_pred_lr_best)
prec_lr_b, rec_lr_b, f1_lr_b = (
    metric(y_mush_test, y_mush_pred_lr_best, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)

print("=== Logistic Regression (Mushrooms, improved) ===")
print(f"Accuracy: {acc_lr_b:.4f}")
print(f"Precision (macro): {prec_lr_b:.4f}")
print(f"Recall (macro): {rec_lr_b:.4f}")
print(f"F1-macro: {f1_lr_b:.4f}")


Лучшие параметры (LogReg Mushrooms): {'model__C': 1.0, 'model__penalty': 'l2', 'model__solver': 'lbfgs'}
=== Logistic Regression (Mushrooms, improved) ===
Accuracy: 0.9994
Precision (macro): 0.9994
Recall (macro): 0.9994
F1-macro: 0.9994


In [9]:
# Baseline linear regression for the Spotify data.

# Imported here rather than in the first cell; later cells reuse this name.
from sklearn.metrics import root_mean_squared_error

# Standardize the numeric columns and one-hot encode the categorical ones.
spot_preprocess_lin = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_features_spot),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features_spot),
    ]
)

linreg_baseline = Pipeline(
    [
        ("preprocess", spot_preprocess_lin),
        ("model", LinearRegression()),
    ]
)

# fit() returns the fitted pipeline, so training and test-set prediction chain.
y_spot_pred_lr = linreg_baseline.fit(X_spot_train, y_spot_train).predict(X_spot_test)

# Hold-out MAE, RMSE and R^2, computed in that order.
mae_lr, rmse_lr, r2_lr = (
    metric(y_spot_test, y_spot_pred_lr)
    for metric in (mean_absolute_error, root_mean_squared_error, r2_score)
)

print("=== Linear Regression (Spotify, baseline) ===")
print(f"MAE: {mae_lr:.4f}")
print(f"RMSE: {rmse_lr:.4f}")
print(f"R2: {r2_lr:.4f}")


=== Linear Regression (Spotify, baseline) ===
MAE: 0.3047
RMSE: 0.4895
R2: 0.7705


### 3.1. Улучшенный бейзлайн: Ridge-регрессия

Гипотеза: добавление L2‑регуляризации (Ridge) повысит устойчивость модели к мультиколлинеарности и большому числу признаков и за счёт этого может улучшить метрики на тестовой выборке.


In [11]:
# Ridge regression on the Spotify data with a grid search over alpha.

# Candidate regularization strengths.
# NOTE(review): the recorded run picked alpha=0.1, the smallest value in this
# grid -- consider extending the grid downwards to check the optimum is inside.
param_grid_ridge = {
    "model__alpha": [0.1, 1.0, 10.0, 50.0]
}

ridge_pipeline = Pipeline(
    [
        ("preprocess", spot_preprocess_lin),
        ("model", Ridge()),
    ]
)

# 3-fold cross-validation on the training part; the scorer is negated MAE,
# so a larger score means a smaller error.
grid_ridge = GridSearchCV(
    estimator=ridge_pipeline,
    param_grid=param_grid_ridge,
    scoring="neg_mean_absolute_error",
    cv=3,
    n_jobs=-1,
)
grid_ridge.fit(X_spot_train, y_spot_train)
print("Лучшие параметры (Ridge Spotify):", grid_ridge.best_params_)

# Evaluate the best pipeline found by the search on the hold-out set.
best_ridge = grid_ridge.best_estimator_
y_spot_pred_ridge = best_ridge.predict(X_spot_test)

# root_mean_squared_error must already be in scope from the earlier
# linear-regression cell, which imports it.
mae_ridge, rmse_ridge, r2_ridge = (
    metric(y_spot_test, y_spot_pred_ridge)
    for metric in (mean_absolute_error, root_mean_squared_error, r2_score)
)

print("=== Ridge Regression (Spotify, improved) ===")
print(f"MAE: {mae_ridge:.4f}")
print(f"RMSE: {rmse_ridge:.4f}")
print(f"R2: {r2_ridge:.4f}")


Лучшие параметры (Ridge Spotify): {'model__alpha': 0.1}
=== Ridge Regression (Spotify, improved) ===
MAE: 0.3043
RMSE: 0.4893
R2: 0.7707


In [13]:
# Side-by-side comparison with the KNN results.
# The KNN metrics are hard-coded constants, not recomputed in this notebook
# (presumably copied from the ЛР1 run -- verify before relying on them).
knn_mush_acc_baseline = knn_mush_f1_baseline = 1.0
knn_mush_acc_improved = knn_mush_f1_improved = 1.0

knn_spot_mae_baseline, knn_spot_rmse_baseline, knn_spot_r2_baseline = 0.357071, 0.560296, 0.685171
knn_spot_mae_improved, knn_spot_rmse_improved, knn_spot_r2_improved = 0.380839, 0.585831, 0.655822

# Classification: one row per model, columns are the shared metrics.
results_mush_lr = pd.DataFrame(
    dict(
        model=["KNN_baseline", "KNN_improved", "LogReg_baseline", "LogReg_improved"],
        accuracy=[knn_mush_acc_baseline, knn_mush_acc_improved, acc_lr, acc_lr_b],
        f1_macro=[knn_mush_f1_baseline, knn_mush_f1_improved, f1_lr, f1_lr_b],
    )
)

print("=== Mushrooms: KNN vs Logistic Regression ===")
print(results_mush_lr)

# Regression: same layout with MAE / RMSE / R2.
results_spot_lr = pd.DataFrame(
    dict(
        model=["KNN_baseline", "KNN_improved", "LinReg_baseline", "Ridge_improved"],
        MAE=[knn_spot_mae_baseline, knn_spot_mae_improved, mae_lr, mae_ridge],
        RMSE=[knn_spot_rmse_baseline, knn_spot_rmse_improved, rmse_lr, rmse_ridge],
        R2=[knn_spot_r2_baseline, knn_spot_r2_improved, r2_lr, r2_ridge],
    )
)

print("\n=== Spotify: KNN vs Linear/Ridge ===")
print(results_spot_lr)


=== Mushrooms: KNN vs Logistic Regression ===
             model  accuracy  f1_macro
0     KNN_baseline  1.000000  1.000000
1     KNN_improved  1.000000  1.000000
2  LogReg_baseline  0.999385  0.999384
3  LogReg_improved  0.999385  0.999384

=== Spotify: KNN vs Linear/Ridge ===
             model       MAE      RMSE        R2
0     KNN_baseline  0.357071  0.560296  0.685171
1     KNN_improved  0.380839  0.585831  0.655822
2  LinReg_baseline  0.304748  0.489485  0.770496
3   Ridge_improved  0.304321  0.489291  0.770679
