In [1]:
from google.colab import drive

# Google Drive 마운트
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd

In [3]:
# 1. 데이터 로드
df_X_train = pd.read_csv('/content/drive/My Drive/PhalangesOutlinesCorrect/X_train_sample2.csv')
df_y_train = pd.read_csv('/content/drive/My Drive/PhalangesOutlinesCorrect/y_train_sample2.csv')
df_X_test = pd.read_csv('/content/drive/MyDrive/PhalangesOutlinesCorrect/X_test_original.csv')
df_y_test = pd.read_csv('/content/drive/MyDrive/PhalangesOutlinesCorrect/y_test_original.csv')

In [4]:
len(df_y_train)

246

In [5]:
X_train = df_X_train.values
y_train = df_y_train.values.reshape(-1)  # 1차원 배열(벡터)로 변환
X_test = df_X_test.values
y_test = df_y_test.values.reshape(-1)  # 1차원 배열(벡터)로 변환

In [6]:
# 소수 클래스와 다수 클래스 식별
target_counts = pd.Series(y_train).value_counts()
minority_class = target_counts.idxmin()
majority_class = target_counts.idxmax()

# 소수 클래스 샘플 추출
X_minority = X_train[y_train == minority_class]
X_major = X_train[y_train == majority_class]

In [7]:
!pip install dtaidistance



In [8]:
from dtaidistance import dtw
from scipy.spatial.distance import euclidean
import random
from tqdm import tqdm
from dtaidistance.dtw import distance  # 함수 이름은 distance임

In [9]:
from dtaidistance.dtw import distance as dtw_distance  # 함수 이름을 바꿔서 import

def calculate_all_dtw_distances(X):
    distances = []
    n = len(X)
    for i in range(n):
        for j in range(i + 1, n):
            dist_val = dtw_distance(X[i], X[j])  # 함수 호출
            distances.append((i, j, dist_val))   # 결과 저장
    return distances


In [10]:
# DTW 거리 계산 실행
X_all = np.vstack((X_minority, X_major))  # 전체 학습 데이터
y_all = np.hstack((np.full(len(X_minority), minority_class), np.full(len(X_major), majority_class)))

print(f"전체 샘플 수: {len(X_all)}, 소수 클래스: {minority_class}, 다수 클래스: {majority_class}")

# 모든 샘플 간의 DTW 거리 계산
dtw_distances = calculate_all_dtw_distances(X_all)

# 거리 행렬 초기화
n_samples = len(X_all)
dtw_matrix = np.zeros((n_samples, n_samples))

# 거리 행렬 채우기
for i, j, dist_val in dtw_distances:
    dtw_matrix[i, j] = dist_val
    dtw_matrix[j, i] = dist_val  # 대칭 행렬

print("DTW 거리 행렬 계산 완료:", dtw_matrix.shape)


전체 샘플 수: 246, 소수 클래스: 0, 다수 클래스: 1
DTW 거리 행렬 계산 완료: (246, 246)


In [11]:
k = 3  # 홀수로 설정 (예: 3, 5, 7 등)

safe_minority_indices = []  # 안전 샘플 인덱스
risky_minority_indices = []  # 위험 샘플 인덱스

minority_indices = np.where(y_all == minority_class)[0]
majority_indices = np.where(y_all == majority_class)[0]

for idx in minority_indices:
    distances = dtw_matrix[idx]

    # 자기 자신 제외하고 가까운 이웃 k개 선택
    nearest_indices = np.argsort(distances)[1:k+1]

    # 이웃의 클래스 확인
    neighbor_labels = y_all[nearest_indices]
    minority_count = np.sum(neighbor_labels == minority_class)
    majority_count = np.sum(neighbor_labels == majority_class)

    # 안전 / 위험 분류
    if minority_count < majority_count:
        safe_minority_indices.append(idx)
    else:
        risky_minority_indices.append(idx)

print(f"총 소수 클래스 수: {len(minority_indices)}")
print(f"안전 샘플 수: {len(safe_minority_indices)}")
print(f"위험 샘플 수: {len(risky_minority_indices)}")


총 소수 클래스 수: 12
안전 샘플 수: 11
위험 샘플 수: 1


In [12]:
# 증강할 데이터 수
n_augment = len(X_major) - len(X_minority)  # 다수 - 소수
n_safe = len(safe_minority_indices)

# 한 쌍에서 생성해야 할 수
n_per_pair = n_augment // n_safe
remainder = n_augment % n_safe

print(f"총 생성할 샘플 수: {n_augment} (각 쌍당 {n_per_pair}개, 남는 수 {remainder})")


총 생성할 샘플 수: 222 (각 쌍당 20개, 남는 수 2)


In [13]:

augmented_data = []
random.seed(42)

# 안전 샘플 쌍 조합을 만들고 증강
for i, idx in enumerate(safe_minority_indices):
    for _ in range(n_per_pair):
        # 다른 안전 샘플 하나 무작위 선택
        other_idx = random.choice([j for j in safe_minority_indices if j != idx])

        # 선형 보간 (alpha: 0~1 사이 랜덤)
        alpha = np.random.rand()
        new_sample = alpha * X_all[idx] + (1 - alpha) * X_all[other_idx]
        augmented_data.append(new_sample)

# 남는 remainder 개수도 랜덤하게 더 생성
for _ in range(remainder):
    idx1, idx2 = random.sample(safe_minority_indices, 2)
    alpha = np.random.rand()
    new_sample = alpha * X_all[idx1] + (1 - alpha) * X_all[idx2]
    augmented_data.append(new_sample)

# 최종 증강 데이터와 레이블 결합
X_aug = np.array(augmented_data)
y_aug = np.full(len(X_aug), minority_class)

# 기존 학습 데이터와 결합
X_synthetic = np.vstack((X_all, X_aug))
y_synthetic = np.hstack((y_all, y_aug))

print("증강 완료!")
print(f"최종 데이터 크기: {X_synthetic.shape}, 레이블 분포: {pd.Series(y_synthetic).value_counts().to_dict()}")


증강 완료!
최종 데이터 크기: (468, 80), 레이블 분포: {0: 234, 1: 234}


In [14]:
# 교차 검증을 위한 설정
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.metrics import precision_score, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from keras.models import Sequential
from keras.layers import LSTM, Dense, Conv1D, Flatten
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.metrics import specificity_score # specificity_score 임포트


In [15]:
def predict_with_logistic_regression(X_train, y_train, X_test):
    model = LogisticRegression()
    model.fit(X_train, y_train)
    return model.predict(X_test)

def predict_with_cart(X_train, y_train, X_test):
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    return model.predict(X_test)

def predict_with_knn(X_train, y_train, X_test, k=3):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    return model.predict(X_test)

def predict_with_xgboost(X_train, y_train, X_test):
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    model.fit(X_train, y_train)
    return model.predict(X_test)

def predict_with_lstm(X_train, y_train, X_test):
    model = Sequential()
    model.add(LSTM(50, activation='relu', input_shape=(X_train.shape[1], 1)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

    model.fit(X_train_reshaped, y_train, epochs=50, batch_size=32, verbose=0)
    return (model.predict(X_test_reshaped) > 0.5).astype(int)

def predict_with_cnn(X_train, y_train, X_test):
    model = Sequential()
    model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

    model.fit(X_train_reshaped, y_train, epochs=50, batch_size=32, verbose=0)
    return (model.predict(X_test_reshaped) > 0.5).astype(int)


In [17]:
# 예측 실행 및 평가
predictions = {
    "Logistic Regression": predict_with_logistic_regression(X_synthetic, y_synthetic, X_test),
    "CART": predict_with_cart(X_synthetic, y_synthetic, X_test),
    "KNN": predict_with_knn(X_synthetic, y_synthetic, X_test),
    "XGBoost": predict_with_xgboost(X_synthetic, y_synthetic, X_test),
    "LSTM": predict_with_lstm(X_synthetic, y_synthetic, X_test),
    "CNN": predict_with_cnn(X_synthetic, y_synthetic, X_test)
}

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.

  super().__init__(**kwargs)


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [18]:

# 성능 지표 저장을 위한 리스트
results = []

for model_name, pred_y in predictions.items():
    accuracy = accuracy_score(y_test, pred_y)
    recall = recall_score(y_test, pred_y)
    f1 = f1_score(y_test, pred_y)
    specificity = specificity_score(y_test, pred_y)
    conf_matrix = confusion_matrix(y_test, pred_y)
    results.append([accuracy, f1, recall, specificity, conf_matrix])


In [19]:
# 성능 지표를 DataFrame으로 변환
results_df = pd.DataFrame(results, columns=["Accuracy", "F1", "Recall", "Specificity", "Confusion Matrix"], index=predictions.keys())
model_results = results_df.T
# 결과를 출력
print("\n모델 성능 비교 결과:")
print(model_results)

# 결과를 CSV 파일로 저장
model_results.to_csv("/content/drive/My Drive/PhalangesOutlinesCorrect/results/bsmote_soso_result.csv")


모델 성능 비교 결과:
                     Logistic Regression                    CART  \
Accuracy                        0.611888                 0.62704   
F1                              0.740856                0.752705   
Recall                          0.904943                0.925856   
Specificity                      0.14759                0.153614   
Confusion Matrix  [[49, 283], [50, 476]]  [[51, 281], [39, 487]]   

                                      KNN                 XGBoost  \
Accuracy                         0.681818                0.648019   
F1                               0.769231                0.773273   
Recall                           0.865019                0.979087   
Specificity                      0.391566                0.123494   
Confusion Matrix  [[130, 202], [71, 455]]  [[41, 291], [11, 515]]   

                                      LSTM                     CNN  
Accuracy                          0.547786                 0.63986  
F1                      