In [7]:
import pymysql
import tensorflow as tf
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
# GPU 설정
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.set_visible_devices(gpus[0], 'GPU')
        tf.config.experimental.set_memory_growth(gpus[0], True)
        print("GPU 설정 완료:", gpus[0])
    except RuntimeError as e:
        print(e)

# 데이터베이스 연결 함수
def get_db():
    db = pymysql.connect(
        host='human-mysql.mysql.database.azure.com',  # Azure MySQL Host
        port=3306,  # Port number (MySQL default is 3306)
        user='human',  # Username
        passwd='!q1w2e3r4',  # Password
        db='humandb',  # Database name
        ssl_ca=r'/home/azureuser/Desktop/config/DigiCertGlobalRootG2.crt.pem'  # SSL certificate path
    )
    return db

# DB에서 데이터 로드
db_connection = get_db()
cursor = db_connection.cursor()

query = "SELECT * FROM humandb.modeling_final"
cursor.execute(query)
columns = [desc[0] for desc in cursor.description]  # 컬럼 이름 가져오기
all_data = cursor.fetchall()  # 데이터 가져오기

# Pandas DataFrame으로 변환
df = pd.DataFrame(all_data, columns=columns)

# 데이터 확인
print("데이터셋의 크기:", df.shape)
print("컬럼명:", df.columns)

데이터셋의 크기: (57739, 17)
컬럼명: Index(['Index', 'HeartRate', 'BreathRate', 'SPO2', 'SkinTemperature',
       'SleepPhase', 'SleepScore', 'WalkingSteps', 'StressIndex',
       'ActivityIntensity', 'CaloricExpenditure', '심박', '호흡', '피부온도', '혈중산소농도',
       '일상', '상태'],
      dtype='object')


In [8]:
# 다중 라벨 컬럼 선택
y = df[['상태']]  # 다중 라벨 대상

# 입력 데이터(X) 설정
x = df.drop(columns=['Index', '심박', '호흡', '피부온도', '혈중산소농도','일상','상태'])  # 독립 변수


# 데이터의 shape 확인
print("x (입력 데이터) shape:", x.shape)
print("y (출력 데이터) shape:", y.shape)

x (입력 데이터) shape: (57739, 10)
y (출력 데이터) shape: (57739, 1)


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 데이터 분할
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42, stratify=y_train)


In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

# LightGBM Dataset 변환
dtrain = lgb.Dataset(x_train, label=y_train)
dval = lgb.Dataset(x_val, label=y_val, reference=dtrain)

# 하이퍼파라미터 설정
params = {
    "objective": "multiclass",  # LightGBM에서 다중 클래스 분류
    "num_class": 3,  # 클래스 개수
    "metric": "multi_logloss",  # LightGBM에서 multi-logloss 사용
    "learning_rate": 0.02,
    "max_depth": 4,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "random_state": 42
}

# 모델 학습
results = {}
lightgbm_model = lgb.train(
    params=params,
    train_set=dtrain,
    num_boost_round=50,
    valid_sets=[dtrain, dval],
    valid_names=["train", "val"],# LightGBM에서는 리스트로 설정
    callbacks=[lgb.early_stopping(stopping_rounds=10)]
)

# 예측 수행
y_pred_probs = lightgbm_model.predict(x_test)  # 확률값 반환
y_pred = np.argmax(y_pred_probs, axis=1)  # 가장 확률이 높은 클래스로 변환

# 성능 평가
accuracy = accuracy_score(y_test, y_pred)
print(f"테스트 정확도: {accuracy:.4f}")

print("\nClassification Report:\n", classification_report(y_test, y_pred))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.067646 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1140
[LightGBM] [Info] Number of data points in the train set: 36952, number of used features: 10
[LightGBM] [Info] Start training from score -0.466546
[LightGBM] [Info] Start training from score -1.146533
[LightGBM] [Info] Start training from score -2.898633
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[50]	train's multi_logloss: 0.180332	val's multi_logloss: 0.18019
테스트 정확도: 0.9992

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7243
           1       1.00      1.00      1.00      3669
           2       1.00      0.99      0.99       636

    accuracy                           1.00     115

In [11]:
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score

# 예측 확률값
y_pred_probs = lightgbm_model.predict(x_test)  # 각 클래스에 대한 확률값 반환
y_pred = np.argmax(y_pred_probs, axis=1)  # 가장 높은 확률을 가진 클래스로 변환

# AUROC 계산 (다중 클래스의 경우 평균 방법 선택)
auroc = roc_auc_score(y_test, y_pred_probs, multi_class="ovr")  # "ovr" (One-vs-Rest 방식)

# Log Loss 계산
logloss = log_loss(y_test, y_pred_probs)

# Accuracy 계산
accuracy = accuracy_score(y_test, y_pred)

# 결과 출력
print(f"AUROC Score: {auroc:.4f}")
print(f"Log Loss: {logloss:.4f}")
print(f"Accuracy: {accuracy:.4f}")

AUROC Score: 0.9998
Log Loss: 0.1807
Accuracy: 0.9992


In [14]:
# LightGBM Feature Importance 추출
lgb_feature_importance = lightgbm_model.feature_importance(importance_type="gain")

# Feature Importance 정리 (내림차순 정렬)
df_lgb_importance = pd.DataFrame({
    "Feature": df.drop(columns=['Index', '심박', '호흡', '피부온도', '혈중산소농도', '일상', '상태']).columns,  # 피처 이름
    "LightGBM Importance": lgb_feature_importance
}).sort_values(by="LightGBM Importance", ascending=False)  # 내림차순 정렬

# 출력
print("LightGBM Feature Importance:")
print(df_lgb_importance)

# CSV 파일로 저장
df_lgb_importance.to_csv("lightgbm_feature_importance.csv", index=False)


LightGBM Feature Importance:
              Feature  LightGBM Importance
4          SleepPhase        574424.518798
5          SleepScore        320222.153327
3     SkinTemperature        186371.530828
0           HeartRate         60790.590656
1          BreathRate         35573.442870
6        WalkingSteps          7636.966065
9  CaloricExpenditure          3266.455635
7         StressIndex           742.444622
8   ActivityIntensity           358.218578
2                SPO2            17.642501


In [10]:
# import joblib

# joblib.dump(lightgbm_model, "model1.pkl")  # 저장
# # model = joblib.load("model1.pkl")  # 불러오기


In [11]:
# 상태(Status) 값이 1 또는 2인 데이터만 필터링
df_filtered = df[df["상태"].isin([1, 2])]

# ✅ 필터링된 데이터 출력
from IPython.display import display
display(df_filtered)

Unnamed: 0,Index,HeartRate,BreathRate,SPO2,SkinTemperature,SleepPhase,SleepScore,WalkingSteps,StressIndex,ActivityIntensity,CaloricExpenditure,심박,호흡,피부온도,혈중산소농도,일상,상태
36213,356,80,24,98,-1.1,1,72,0,0,0,0,1,1,0,0,0,1
36214,357,77,21,97,0.2,1,72,0,0,0,0,1,1,0,0,0,1
36215,358,77,19,98,0.4,0,72,0,0,0,0,1,0,0,0,0,1
36216,359,75,21,98,0.5,0,72,0,0,0,0,1,1,0,0,0,1
36217,360,77,13,98,0.5,2,72,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57734,42949687163,86,22,98,-2.6,0,42,82,0,159,49,0,0,1,0,0,2
57735,42949687247,93,25,98,0.2,0,43,0,0,0,0,0,1,0,0,0,2
57736,51539607860,78,25,98,-1.3,1,12,0,0,0,0,0,1,0,0,0,2
57737,51539607862,76,25,98,-1.1,1,12,0,0,0,0,0,1,0,0,0,2


In [12]:
# 다중 라벨 컬럼 선택
y = df_filtered[['심박', '호흡', '피부온도', '혈중산소농도']]  # 다중 라벨 대상

# 입력 데이터(X) 설정
x = df_filtered.drop(columns=['Index', '심박', '호흡', '피부온도', '혈중산소농도', '일상','상태'])  # 독립 변수

# 데이터 분할
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)


# 데이터의 shape 확인
print("x (입력 데이터) shape:", x_train.shape)
print("y (출력 데이터) shape:", y_train.shape)

x (입력 데이터) shape: (13776, 10)
y (출력 데이터) shape: (13776, 4)


In [13]:
models = []
for i in range(y_train.shape[1]):  # 각 라벨별 개별 모델 학습
    print(f"Training LightGBM Model for Label {i+1}...")

    # LightGBM Dataset 변환
    dtrain = lgb.Dataset(x_train, label=y_train.iloc[:, i])  # 🔥 iloc 사용
    dval = lgb.Dataset(x_val, label=y_val.iloc[:, i], reference=dtrain)  # 🔥 iloc 사용

    params = {
        "objective": "binary",  
        "metric": "binary_logloss",
        "learning_rate": 0.015,
        "max_depth": 4,
        "subsample": 0.7,
        "colsample_bytree": 0.7,
        "random_state": 42,
        "device": "gpu"  # GPU 가속
    }

    # LightGBM Booster 모델 학습
    model = lgb.train(
        params=params,
        train_set=dtrain,
        num_boost_round=100,
        valid_sets=[dtrain, dval],  
        valid_names=["train", "val"],  
        callbacks=[lgb.early_stopping(stopping_rounds=3)]  
    )
    
    models.append(model)


Training LightGBM Model for Label 1...
[LightGBM] [Info] Number of positive: 9026, number of negative: 4750
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1138
[LightGBM] [Info] Number of data points in the train set: 13776, number of used features: 10
[LightGBM] [Info] Using GPU Device: NVIDIA A10-4Q, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 9 dense feature groups (0.16 MB) transferred to GPU in 0.063281 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655197 -> initscore=0.641965
[LightGBM] [Info] Start training from score 0.641965
Training until validation scores don't improve for 3 rounds
Did not meet early stopping. Best iteration is:
[100]	train's binary_logloss: 0.175922	val's binary_logloss: 0.179881
Training LightGBM Model for Label 2...
[LightGBM] [Info] Number of

In [14]:
from sklearn.metrics import hamming_loss, f1_score, jaccard_score

# 모든 모델(4개)에서 예측 수행
y_pred_probs = np.array([model.predict(x_test) for model in models]).T  # (샘플 개수, 라벨 개수) 형태 변환
y_pred = (y_pred_probs > 0.5).astype(int)  # 확률값을 0 또는 1로 변환

# Hamming Loss (낮을수록 좋음)
hamming = hamming_loss(y_test, y_pred)

# Micro F1-Score (전체 샘플 기반)
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Macro F1-Score (각 클래스별 평균)
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Jaccard Score (Intersection over Union)
jaccard = jaccard_score(y_test, y_pred, average='samples')

# 결과 출력
print(f"Hamming Loss: {hamming:.4f}")
print(f"Micro F1-Score: {micro_f1:.4f}")
print(f"Macro F1-Score: {macro_f1:.4f}")
print(f"Jaccard Score: {jaccard:.4f}")

Hamming Loss: 0.0063
Micro F1-Score: 0.9902
Macro F1-Score: 0.9776
Jaccard Score: 0.9880


In [None]:
# import lightgbm as lgb
# import joblib  # 여러 개의 모델을 하나의 파일에 저장

# # 모델 저장 경로 설정
# model_path = "multi_label.pkl"

# # 4개 모델을 하나의 파일로 저장
# joblib.dump(models, model_path)
# print(f" All models saved in {model_path}")


 All models saved in multi_label.pkl


In [None]:

# 다중 라벨 예측 수행
y_pred_probs = np.zeros((x_test.shape[0], len(models)))  # (샘플 수, 라벨 수) 크기의 확률값 배열
for i, model in enumerate(models):
    y_pred_probs[:, i] = model.predict(x_test)  # x_test를 직접 예측

# 확률값을 0.5 기준으로 이진화
y_pred_binary = np.round(y_pred_probs).astype(int)

# 각 라벨별 정확도 평가
accuracies = []
for i in range(y_test.shape[1]):
    acc = accuracy_score(y_test.iloc[:, i], y_pred_binary[:, i])  # 🔥 iloc 사용
    accuracies.append(acc)
    print(f"라벨 {i+1} 정확도: {acc:.4f}")

# 전체 평균 정확도
overall_accuracy = np.mean(accuracies)
print(f"\n전체 평균 정확도: {overall_accuracy:.4f}")

라벨 1 정확도: 0.9912
라벨 2 정확도: 0.9958
라벨 3 정확도: 0.9926
라벨 4 정확도: 0.9951

전체 평균 정확도: 0.9937


In [None]:
# 예측 수행
y_pred_probs = np.zeros((x_test.shape[0], len(models)))  
for i, model in enumerate(models):
    y_pred_probs[:, i] = model.predict(x_test)

# 확률값을 0.5 기준으로 이진화
y_pred_binary = np.round(y_pred_probs).astype(int)

# 기존 라벨과 예측 라벨 비교를 위한 데이터프레임 생성
df_comparison = pd.DataFrame({
    "Sample_Index": range(len(y_test)),
    "True_Labels": [list(row) for row in y_test.values],
    "Predicted_Labels": [list(row) for row in y_pred_binary]
})

# 데이터프레임 직접 출력
from IPython.display import display
display(df_comparison)



Unnamed: 0,Sample_Index,True_Labels,Predicted_Labels
0,0,"[0, 1, 1, 0]","[0, 1, 1, 0]"
1,1,"[0, 0, 1, 0]","[0, 0, 1, 0]"
2,2,"[1, 0, 0, 0]","[1, 0, 0, 0]"
3,3,"[0, 1, 0, 0]","[0, 1, 0, 0]"
4,4,"[0, 0, 1, 0]","[0, 0, 1, 0]"
...,...,...,...
4301,4301,"[1, 0, 0, 0]","[1, 0, 0, 0]"
4302,4302,"[1, 0, 0, 0]","[1, 0, 0, 0]"
4303,4303,"[0, 1, 0, 0]","[0, 1, 0, 0]"
4304,4304,"[1, 0, 0, 0]","[1, 0, 0, 0]"
