In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
df=pd.read_csv(r"C:\고대 3-1\비트\python\dataset.csv")
df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
# 결측값 처리
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())  # bmi의 결측값을 평균으로 대체
df['smoking_status'] = df['smoking_status'].fillna('Unknown')  # smoking_status의 결측값을 'Unknown'으로 대체

# Oversampling을 통한 데이터 불균형 해소
positive = df[df['stroke'] == 1]
negative = df[df['stroke'] == 0]

oversampled_positive = positive.sample(len(negative), replace=True, random_state=42) #양성 클래스 개수를 음성 클래스 개수만큼 복제제
balanced_df = pd.concat([negative, oversampled_positive])

# 입력 데이터와 타겟 데이터 분리
X = balanced_df.drop(columns=['stroke'])
y = balanced_df['stroke']

# 범주형 변수 인코딩
categrocial_cols=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
for col in categrocial_cols:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# 학습용과 테스트용 데이터 나누기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
#정상적으로 나뉘었는지 확인인
print('Labels counts in y:', np.bincount(y))
print('Labels counts in y_train:', np.bincount(y_train))
print('Labels counts in y_test:', np.bincount(y_test))

#X freature 표준화
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)


Labels counts in y: [42617 42617]
Labels counts in y_train: [29831 29832]
Labels counts in y_test: [12786 12785]


In [None]:
########################SVM 모델 학습#########################
from sklearn.svm import SVC
#svm 모델 학습
svm_model = SVC(kernel='rbf', C=1.0, random_state=42)
svm_model.fit(X_train, y_train)

print('SVM accuracy:', svm_model.score(X_test, y_test))
#SVM accuracy: 0.8448242149309765

SVM accuracy: 0.8448242149309765


In [None]:
# SVM AUCROC AUCPR 점수 계산
from sklearn.model_selection import cross_val_score

roc_scores = cross_val_score(svm_model, X_train, y_train, cv=3, scoring='roc_auc')
pr_scores = cross_val_score(svm_model, X_train, y_train, cv=3, scoring='average_precision')

print("평균 AUCROC:", roc_scores.mean())
print("평균 AUCPR :", pr_scores.mean())
#평균 AUCROC: 0.9039329164678082
#평균 AUCPR : 0.8445722790493368

평균 AUCROC: 0.9039329164678082
평균 AUCPR : 0.8445722790493368


In [None]:
##################MLP 모델 학습######################
from sklearn.neural_network import MLPClassifier

mlp=MLPClassifier(hidden_layer_sizes=(64,32), max_iter=500, random_state=42)
mlp.fit(X_train, y_train)
print('MLP accuracy:', mlp.score(X_test, y_test))

#MLP accuracy: 0.9773180556098705

##MLPClassifier(
#   hidden_layer_sizes=(128, 64, 32),activation='relu',solver='adam',alpha=0.001,  # L2 규제
# learning_rate='adaptive',max_iter=500,early_stopping=True,random_state=42)

MLP accuracy: 0.9773180556098705




In [None]:
#MLP AUCROC AUCPR 점수 계산
from sklearn.model_selection import cross_val_score

roc_scores = cross_val_score(mlp, X_train, y_train, cv=3, scoring='roc_auc')
pr_scores = cross_val_score(mlp, X_train, y_train, cv=3, scoring='average_precision')

print("평균 AUCROC:", roc_scores.mean())
print("평균 AUCPR :", pr_scores.mean())
#평균 AUCROC: 0.9786021348528643
#평균 AUCPR : 0.9515377954508728

평균 AUCROC: 0.9786021348528643
평균 AUCPR : 0.9515377954508728
