# 1. LogisticRegression

# 2. 실습

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, roc_auc_score, roc_curve, auc

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, cross_val_predict
from sklearn.utils import resample
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import scipy.stats
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

import time

# Data

- 데이터 출처
  - https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset/data

In [2]:
# Load the raw stroke table, drop the row identifier column and exclude rows
# whose gender is recorded as 'Other'.
data = (
    pd.read_csv("healthcare-dataset-stroke-data.csv")
    .drop(columns=["id"])
    .loc[lambda frame: frame["gender"] != "Other"]
)

# Recode the 0/1 indicator columns as "No"/"Yes" labels on a copy, so `data`
# keeps the original numeric coding.
# The columns are replaced wholesale (df[cols] = ...) instead of being written
# through .loc[:, cols]: writing strings into integer columns in place is a
# deprecated incompatible-dtype assignment, and DataFrame.applymap is
# deprecated as well. np.where applies the same rule as the old lambda
# (value == 1 -> "Yes", anything else -> "No").
binary_cols = ["hypertension", "heart_disease", "stroke"]
df = data.copy()
df[binary_cols] = np.where(data[binary_cols] == 1, "Yes", "No")
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,No,Yes,Yes,Private,Urban,228.69,36.6,formerly smoked,Yes
1,Female,61.0,No,No,Yes,Self-employed,Rural,202.21,,never smoked,Yes
2,Male,80.0,No,Yes,Yes,Private,Rural,105.92,32.5,never smoked,Yes
3,Female,49.0,No,No,Yes,Private,Urban,171.23,34.4,smokes,Yes
4,Female,79.0,Yes,No,Yes,Self-employed,Rural,174.12,24.0,never smoked,Yes


## A. 전처리

In [3]:
# Separate the predictors from the target; the "stroke" labels are encoded as
# integers by LabelEncoder.
X = df.drop(columns="stroke")
y = LabelEncoder().fit_transform(df["stroke"])

# Column groups by dtype. These snapshots are taken before imputation and are
# reused later for their column names.
X_num = X.select_dtypes(include="number")
X_cat = X.select_dtypes(exclude="number")

# Fill missing values: column mean for numeric features, most frequent value
# for the non-numeric ones.
# NOTE(review): both imputers are fit on every row, i.e. before the
# train/test split performed further down, so held-out rows contribute to the
# imputation statistics. Consider moving imputation into the model pipeline.
for subset, strategy in ((X_num, "mean"), (X_cat, "most_frequent")):
    X[subset.columns] = SimpleImputer(strategy=strategy).fit_transform(subset)

In [4]:
scaler = StandardScaler()

onehot = OneHotEncoder(drop = 'first', handle_unknown='ignore', sparse_output=False)

# Each branch imputes before it scales/encodes. Keeping the imputers inside the
# transformer means a fitted pipeline can also handle missing values in rows it
# has not seen, and the imputation statistics are learned from whatever data
# the pipeline is fit on (e.g. each training fold during cross-validation).
# On data that has no missing values the imputers leave the values unchanged.
numeric_branch = Pipeline([('imputer', SimpleImputer(strategy="mean")),
                           ('scaler', scaler)])
categorical_branch = Pipeline([('imputer', SimpleImputer(strategy="most_frequent")),
                               ('onehot', onehot)])

# Branch names ('scaler', 'onehot') and the column selections are unchanged.
ct = ColumnTransformer([('scaler', numeric_branch, X_num.columns),
                        ('onehot', categorical_branch, X_cat.columns)],
                       remainder='passthrough', n_jobs=-1)

ct

# 1. `LogisticRegression()`

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [5]:
# Hold out 20% of the rows for testing; the split is stratified on y and
# seeded so it is reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [6]:
# L1-penalised logistic regression fitted with the 'saga' solver.
predictr = LogisticRegression(
    penalty='l1',
    solver='saga',
    max_iter=1000,
    random_state=42,
)

# Preprocessing followed by the classifier. The step name "Lasso" is the
# prefix used for hyper-parameter keys such as 'Lasso__C'.
pipeline_steps = [('ct', ct), ("Lasso", predictr)]
pipe = Pipeline(pipeline_steps)
pipe

In [7]:
# Search space for the randomized search: the regularisation parameter C of
# the "Lasso" pipeline step, sampled log-uniformly between the two bounds.
C_LOWER, C_UPPER = 1e-4, 1e4
param_dist = {'Lasso__C': scipy.stats.loguniform(C_LOWER, C_UPPER)}

In [8]:
start_time = time.time()

# Randomized search over the pipeline's hyper-parameters using 5-fold
# cross-validated accuracy.
predictr_rs = RandomizedSearchCV(pipe, 
                                 param_dist,
                                 n_iter = 50, # sklearn's default is 10; 50 candidates are drawn here
                                 cv=5, 
                                 scoring='accuracy',
                                 random_state=42, 
                                 refit = True,
                                 n_jobs = -1)

predictr_rs.fit(X_train, y_train)

# Re-fit `pipe` itself with the selected hyper-parameters so that `pipe` (and
# the transformer object it holds) is in a fitted state for later cells.
# The parameters are applied exactly as selected: rounding C to two decimals
# would change the tuned value and turns any C below 0.005 into 0.0, which
# LogisticRegression rejects (the search range starts at 1e-4).
pipe.set_params(**predictr_rs.best_params_).fit(X_train, y_train)

end_time = time.time()
print("코드 실행 시간: {:.1f} 초".format(end_time - start_time))



코드 실행 시간: 16.5 초


In [9]:
start_time = time.time()

# 5-fold cross-validation of the pipeline on the training split, collecting
# accuracy and ROC AUC for every fold.
cv_metrics = ['accuracy', 'roc_auc']
scores = cross_validate(pipe, X_train, y_train,
                        scoring=cv_metrics,
                        cv=5,
                        n_jobs=-1)

# Mean score across folds, rounded to four decimals.
accuracy_score_rs = scores['test_accuracy'].mean().round(4)
roc_auc_score_rs = scores['test_roc_auc'].mean().round(4)

end_time = time.time()
elapsed_seconds = end_time - start_time
print("코드 실행 시간: {:.1f} 초".format(elapsed_seconds))

코드 실행 시간: 0.1 초


In [23]:
# Accuracy of the fitted pipeline on the held-out test split.
# Scoring through `pipe` runs preprocessing and classifier together, instead of
# calling the standalone `ct` object and the classifier step separately (which
# only works because fitting `pipe` also fitted `ct` as a side effect).
# The built-in round() works whether score() returns a NumPy scalar or a plain
# Python float; a plain float has no .round() method.
accuracy_score_rs_test = round(pipe.score(X_test, y_test), 4)

In [24]:
# Side-by-side summary. The value printed under "train" is the mean
# cross-validated accuracy computed on the training split; "test" is the
# accuracy on the held-out split.
summary_values = (accuracy_score_rs, accuracy_score_rs_test)
print('train: %s, test: %s' % summary_values)

train: 0.9513, test: 0.9511
