In [1]:
from google.colab import drive

# Google Drive 마운트
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install tslearn
!pip install imbalanced-learn  # SMOTE를 사용하기 위해 imbalanced-learn 설치
!pip install dtaidistance

Collecting tslearn
  Downloading tslearn-0.6.3-py3-none-any.whl.metadata (14 kB)
Downloading tslearn-0.6.3-py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tslearn
Successfully installed tslearn-0.6.3
Collecting dtaidistance
  Downloading dtaidistance-2.3.13-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading dtaidistance-2.3.13-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dtaidistance
Successfully installed dtaidistance-2.3.13


In [3]:
import numpy as np
import pandas as pd
from collections import defaultdict
from dtaidistance import dtw
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Conv1D, Flatten
from scipy.stats import mode
import matplotlib.pyplot as plt
import seaborn as sns
from tslearn.metrics import dtw_path
from imblearn.metrics import specificity_score
# LabelEncoder 임포트(수정 클래스가 1부터 시작해서 클래스 레이블이 0부터 시작 만들어주는 것)
from sklearn.preprocessing import LabelEncoder

In [4]:
df_X_train = pd.read_csv('/content/drive/My Drive/PhalangesOutlinesCorrect/X_train_Worms.csv')
df_y_train = pd.read_csv('/content/drive/My Drive/PhalangesOutlinesCorrect/y_train_Worms.csv')
df_X_test = pd.read_csv('/content/drive/MyDrive/PhalangesOutlinesCorrect/X_test_Worms.csv')
df_y_test = pd.read_csv('/content/drive/MyDrive/PhalangesOutlinesCorrect/y_test_Worms.csv')

In [5]:
# 데이터 전처리
X_train = df_X_train.values
y_train = df_y_train.values.reshape(-1)  # 1차원 배열(벡터)로 변환
X_test = df_X_test.values
y_test = df_y_test.values.reshape(-1)  # 1차원 배열(벡터)로 변환

In [6]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(181, 900)
(181,)
(77, 900)
(77,)


In [26]:
from collections import defaultdict
from dtaidistance import dtw

# DTW 기반 데이터 증강 함수
def generate_dtw_synthetic_data(X_class, num_samples):
    synthetic_data = []
    for _ in range(num_samples):
        idx1, idx2 = np.random.choice(len(X_class), 2, replace=False)
        a, b = X_class[idx1], X_class[idx2]

        path = dtw.warping_path(a, b)

        a_to_b_map = defaultdict(list)
        for i, j in path:
            a_to_b_map[i].append(b[j])
        a_mapped = [np.mean([(a[i] + bj) / 2 for bj in a_to_b_map[i]]) if i in a_to_b_map else a[i]
                    for i in range(len(a))]

        b_to_a_map = defaultdict(list)
        for i, j in path:
            b_to_a_map[j].append(a[i])
        b_mapped = [np.mean([(b[j] + ai) / 2 for ai in b_to_a_map[j]]) if j in b_to_a_map else b[j]
                    for j in range(len(b))]

        dtw_aug = np.mean([a_mapped, b_mapped], axis=0)
        synthetic_data.append(dtw_aug)

    return np.array(synthetic_data)

In [7]:
# 2. 클래스별로 증강이 필요한 개수 계산
class_counts = pd.Series(y_train).value_counts()
max_count = class_counts.max()

In [8]:
class_counts

Unnamed: 0,count
1,76
4,32
2,31
3,25
5,17


In [12]:
# 2. 평균 클래스 목표 수 (전체 샘플 수의 2배 / 클래스 수)
N = len(X_train)
target_total = N * 2
class_aver = target_total / len(class_counts)

In [13]:
len(X_train)

181

In [14]:
class_aver

72.4

In [15]:
# 3. 증강이 필요 없는 클래스 및 필요 클래스 분리
non_aug_classes = {}
aug_classes = {}

for cls, count in class_counts.items():
    if count >= class_aver:
        non_aug_classes[cls] = count
    else:
        aug_classes[cls] = count

In [18]:
print('증강 필요 없음 : ', non_aug_classes, '증강 해야될 클래스 : ', aug_classes)

증강 필요 없음 :  {1: 76} 증강 해야될 클래스 :  {4: 32, 2: 31, 3: 25, 5: 17}


In [19]:
# 4. 남은 목표 수 및 증강 클래스 수 계산
non_aug_sum = sum(non_aug_classes.values())
remaining_target = target_total - non_aug_sum
num_aug_classes = len(aug_classes)

In [21]:
remaining_target

286

In [22]:
num_aug_classes

4

In [23]:
# 5. 클래스별 목표 수 재산정
class_targets = {}
for cls, count in aug_classes.items():
    class_targets[cls] = int(remaining_target / num_aug_classes)

In [24]:
class_targets

{4: 71, 2: 71, 3: 71, 5: 71}

In [27]:
# 6. 클래스별 증강 수행
X_augmented_list = []
y_augmented_list = []

for cls, current_count in aug_classes.items():
    target = class_targets[cls]
    n_to_generate = target - current_count
    if n_to_generate > 0:
        X_class = X_train[y_train == cls]
        X_aug = generate_dtw_synthetic_data(X_class, n_to_generate)
        y_aug = np.full(len(X_aug), cls)
        X_augmented_list.append(X_aug)
        y_augmented_list.append(y_aug)

In [28]:
print("==== 증강 제외 클래스 ====")
for cls, count in non_aug_classes.items():
    print(f"Class {cls}: {count}개")

print("\n==== 증강 대상 클래스 및 목표 샘플 수 ====")
for cls in aug_classes:
    print(f"Class {cls}: 기존 {aug_classes[cls]}개 → 목표 {class_targets[cls]}개 → 생성 {class_targets[cls] - aug_classes[cls]}개")


==== 증강 제외 클래스 ====
Class 1: 76개

==== 증강 대상 클래스 및 목표 샘플 수 ====
Class 4: 기존 32개 → 목표 71개 → 생성 39개
Class 2: 기존 31개 → 목표 71개 → 생성 40개
Class 3: 기존 25개 → 목표 71개 → 생성 46개
Class 5: 기존 17개 → 목표 71개 → 생성 54개


In [29]:
# 7. 전체 학습 데이터 구성
if X_augmented_list:
    X_augmented_all = np.concatenate(X_augmented_list, axis=0)
    y_augmented_all = np.concatenate(y_augmented_list, axis=0)
    X_synthetic = np.concatenate((X_train, X_augmented_all), axis=0)
    y_synthetic = np.concatenate((y_train, y_augmented_all), axis=0)
else:
    X_synthetic = X_train
    y_synthetic = y_train

In [46]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# 클래스 인코딩
le = LabelEncoder()
y_synthetic_encoded = le.fit_transform(y_synthetic)
y_test_encoded = le.transform(y_test)

# 클래스 수 정확히 반영
num_classes = len(np.unique(y_synthetic_encoded))

# One-hot encoding (LSTM/CNN에 사용)
y_synthetic_cat = to_categorical(y_synthetic_encoded, num_classes=num_classes)
y_test_cat = to_categorical(y_test_encoded, num_classes=num_classes)


In [40]:
def predict_with_logistic_regression(X_train, y_train, X_test):
    model = LogisticRegression()
    model.fit(X_train, y_train)
    return model.predict(X_test)

def predict_with_cart(X_train, y_train, X_test):
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    return model.predict(X_test)

def predict_with_knn(X_train, y_train, X_test, k=3):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    return model.predict(X_test)

def predict_with_xgboost(X_train, y_train, X_test, label_encoder=None):
    from xgboost import XGBClassifier

    model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    model.fit(X_train, y_train)
    pred_y = model.predict(X_test)

    if label_encoder is not None:
        pred_y = label_encoder.inverse_transform(pred_y)

    return pred_y

def predict_with_lstm_multiclass(X_train, y_train_cat, X_test):
    model = Sequential()
    model.add(LSTM(50, activation='relu', input_shape=(X_train.shape[1], 1)))
    model.add(Dense(num_classes, activation='softmax'))  # 출력층 변경
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

    model.fit(X_train_reshaped, y_train_cat, epochs=50, batch_size=32, verbose=0)

    pred_prob = model.predict(X_test_reshaped)
    return np.argmax(pred_prob, axis=1)  # 예측 클래스 추출


def predict_with_cnn_multiclass(X_train, y_train_cat, X_test):
    model = Sequential()
    model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(num_classes, activation='softmax'))  # 출력층 변경
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

    model.fit(X_train_reshaped, y_train_cat, epochs=50, batch_size=32, verbose=0)

    pred_prob = model.predict(X_test_reshaped)
    return np.argmax(pred_prob, axis=1)


In [47]:
predictions = {
    "Logistic Regression": predict_with_logistic_regression(X_synthetic, y_synthetic, X_test),
    "CART": predict_with_cart(X_synthetic, y_synthetic, X_test),
    "KNN": predict_with_knn(X_synthetic, y_synthetic, X_test),
    "XGBoost": predict_with_xgboost(X_synthetic, y_synthetic_encoded, X_test, label_encoder=le),
    "LSTM": predict_with_lstm_multiclass(X_synthetic, y_synthetic_cat, X_test),
    "CNN": predict_with_cnn_multiclass(X_synthetic, y_synthetic_cat, X_test)
}


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.

  super().__init__(**kwargs)


[1m2/3[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m0s[0m 314ms/step



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 607ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step


In [48]:
# 성능 지표 저장을 위한 리스트
results = []
for model_name, pred_y in predictions.items():
    accuracy = accuracy_score(y_test_encoded, pred_y)
    recall = recall_score(y_test_encoded, pred_y, average='macro')
    f1 = f1_score(y_test_encoded, pred_y, average='macro')
    specificity = specificity_score(y_test_encoded, pred_y, average='macro')
    conf_matrix = confusion_matrix(y_test_encoded, pred_y)
    results.append([accuracy, f1, recall, specificity, conf_matrix])

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [49]:
from collections import Counter

# 증강 전 클래스별 샘플 수
original_class_distribution = Counter(y_train)
print("증강 전 클래스별 샘플 수:")
for cls, count in original_class_distribution.items():
    print(f"클래스 {cls}: {count}개")

# 증강 후 클래스별 샘플 수
augmented_class_distribution = Counter(y_synthetic)
print("\n증강 후 클래스별 샘플 수:")
for cls, count in augmented_class_distribution.items():
    print(f"클래스 {cls}: {count}개")

증강 전 클래스별 샘플 수:
클래스 1: 76개
클래스 2: 31개
클래스 3: 25개
클래스 4: 32개
클래스 5: 17개

증강 후 클래스별 샘플 수:
클래스 1: 76개
클래스 2: 71개
클래스 3: 71개
클래스 4: 71개
클래스 5: 71개


In [50]:
# 성능 지표를 DataFrame으로 변환
results_df = pd.DataFrame(results, columns=["Accuracy", "F1", "Recall", "Specificity", "Confusion Matrix"], index=predictions.keys())
model_results = results_df.T
# 결과를 출력
print("\n모델 성능 비교 결과:")
print(model_results)

# 결과를 CSV 파일로 저장
model_results.to_csv("/content/drive/My Drive/PhalangesOutlinesCorrect/results/(다중)dtw_tsaug_models_worms_result.csv")


모델 성능 비교 결과:
                                                Logistic Regression  \
Accuracy                                                   0.077922   
F1                                                         0.075766   
Recall                                                     0.080769   
Specificity                                                0.827329   
Confusion Matrix  [[0, 12, 1, 5, 8, 7], [0, 3, 1, 3, 3, 3], [0, ...   

                                                               CART  \
Accuracy                                                    0.12987   
F1                                                         0.120888   
Recall                                                     0.148077   
Specificity                                                0.836068   
Confusion Matrix  [[0, 14, 5, 5, 7, 2], [0, 4, 3, 0, 2, 4], [0, ...   

                                                                KNN  \
Accuracy                                                   0.