In [1]:
from google.colab import drive
import os

# Google Drive 마운트
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install tslearn
!pip install imbalanced-learn  # SMOTE를 사용하기 위해 imbalanced-learn 설치


Collecting tslearn
  Downloading tslearn-0.6.3-py3-none-any.whl.metadata (14 kB)
Downloading tslearn-0.6.3-py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tslearn
Successfully installed tslearn-0.6.3


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, recall_score, confusion_matrix
from urllib.request import urlopen
from io import StringIO
from google.colab import drive
import random
from imblearn.metrics import specificity_score
from scipy.stats import mode
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Conv1D, Flatten
from tslearn.metrics import dtw_path
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from collections import defaultdict

In [4]:
df_X_train = pd.read_csv('/content/drive/My Drive/PhalangesOutlinesCorrect/X_train_Worms.csv')
df_y_train = pd.read_csv('/content/drive/My Drive/PhalangesOutlinesCorrect/y_train_Worms.csv')
df_X_test = pd.read_csv('/content/drive/MyDrive/PhalangesOutlinesCorrect/X_test_Worms.csv')
df_y_test = pd.read_csv('/content/drive/MyDrive/PhalangesOutlinesCorrect/y_test_Worms.csv')

In [5]:
X_train = df_X_train.values
y_train = df_y_train.values.reshape(-1)
X_test = df_X_test.values
y_test = df_y_test.values.reshape(-1)

In [7]:
# DTW 기반 평균 시계열 생성 함수
def generate_synthetic_data_class(X_class, num_samples):
    synthetic_data = []
    for _ in range(num_samples):
        idx1, idx2 = np.random.choice(len(X_class), 2, replace=False)
        new_sample = (X_class[idx1] + X_class[idx2]) / 2  # 단순 평균
        synthetic_data.append(new_sample)
    return np.array(synthetic_data)

In [8]:
# 2. 클래스별로 증강이 필요한 개수 계산
class_counts = pd.Series(y_train).value_counts()
# 2. 평균 클래스 목표 수 (전체 샘플 수의 2배 / 클래스 수)
N = len(X_train)
target_total = N * 2
class_aver = target_total / len(class_counts)
# 3. 증강이 필요 없는 클래스 및 필요 클래스 분리
non_aug_classes = {}
aug_classes = {}

for cls, count in class_counts.items():
    if count >= class_aver:
        non_aug_classes[cls] = count
    else:
        aug_classes[cls] = count

# 4. 남은 목표 수 및 증강 클래스 수 계산
non_aug_sum = sum(non_aug_classes.values())
remaining_target = target_total - non_aug_sum
num_aug_classes = len(aug_classes)

# 5. 클래스별 목표 수 재산정
class_targets = {}
for cls, count in aug_classes.items():
    class_targets[cls] = int(remaining_target / num_aug_classes)

# 6. 클래스별 증강 수행
X_augmented_list = []
y_augmented_list = []

for cls, current_count in aug_classes.items():
    target = class_targets[cls]
    n_to_generate = target - current_count
    if n_to_generate > 0:
        X_class = X_train[y_train == cls]
        X_aug = generate_synthetic_data_class(X_class, n_to_generate)
        y_aug = np.full(len(X_aug), cls)
        X_augmented_list.append(X_aug)
        y_augmented_list.append(y_aug)

In [9]:
print("==== 증강 제외 클래스 ====")
for cls, count in non_aug_classes.items():
    print(f"Class {cls}: {count}개")

print("\n==== 증강 대상 클래스 및 목표 샘플 수 ====")
for cls in aug_classes:
    print(f"Class {cls}: 기존 {aug_classes[cls]}개 → 목표 {class_targets[cls]}개 → 생성 {class_targets[cls] - aug_classes[cls]}개")


==== 증강 제외 클래스 ====
Class 1: 76개

==== 증강 대상 클래스 및 목표 샘플 수 ====
Class 4: 기존 32개 → 목표 71개 → 생성 39개
Class 2: 기존 31개 → 목표 71개 → 생성 40개
Class 3: 기존 25개 → 목표 71개 → 생성 46개
Class 5: 기존 17개 → 목표 71개 → 생성 54개


In [10]:
# 7. 전체 학습 데이터 구성
if X_augmented_list:
    X_augmented_all = np.concatenate(X_augmented_list, axis=0)
    y_augmented_all = np.concatenate(y_augmented_list, axis=0)
    X_synthetic = np.concatenate((X_train, X_augmented_all), axis=0)
    y_synthetic = np.concatenate((y_train, y_augmented_all), axis=0)
else:
    X_synthetic = X_train
    y_synthetic = y_train

In [16]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Label 인코딩
le = LabelEncoder()
y_synthetic_encoded = le.fit_transform(y_synthetic)
y_test_encoded = le.transform(y_test)
num_classes = len(np.unique(y_synthetic_encoded))

# One-hot encoding
y_synthetic_cat = to_categorical(y_synthetic_encoded, num_classes=num_classes)
y_test_cat = to_categorical(y_test_encoded, num_classes=num_classes)


In [19]:
def predict_with_logistic_regression(X_train, y_train, X_test):
    model = LogisticRegression()
    model.fit(X_train, y_train)
    return model.predict(X_test)

def predict_with_cart(X_train, y_train, X_test):
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    return model.predict(X_test)

def predict_with_knn(X_train, y_train, X_test, k=3):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    return model.predict(X_test)

def predict_with_xgboost(X_train, y_train, X_test, label_encoder=None):
    from xgboost import XGBClassifier

    model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    model.fit(X_train, y_train)
    pred_y = model.predict(X_test)

    if label_encoder is not None:
        pred_y = label_encoder.inverse_transform(pred_y)

    return pred_y

def predict_with_lstm(X_train, y_train, X_test, num_classes):
    model = Sequential()
    model.add(LSTM(50, activation='relu', input_shape=(X_train.shape[1], 1)))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

    model.fit(X_train_reshaped, y_train, epochs=50, batch_size=32, verbose=0)
    pred_prob = model.predict(X_test_reshaped)
    return np.argmax(pred_prob, axis=1)


def predict_with_cnn(X_train, y_train, X_test, num_classes):
    model = Sequential()
    model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

    model.fit(X_train_reshaped, y_train, epochs=50, batch_size=32, verbose=0)
    pred_prob = model.predict(X_test_reshaped)
    return np.argmax(pred_prob, axis=1)


In [20]:
predictions = {
    "Logistic Regression": predict_with_logistic_regression(X_synthetic, y_synthetic_encoded, X_test),
    "CART": predict_with_cart(X_synthetic, y_synthetic_encoded, X_test),
    "KNN": predict_with_knn(X_synthetic, y_synthetic_encoded, X_test),
    "XGBoost": predict_with_xgboost(X_synthetic, y_synthetic_encoded, X_test),
    "LSTM": predict_with_lstm(X_synthetic, y_synthetic_cat, X_test, num_classes),
    "CNN": predict_with_cnn(X_synthetic, y_synthetic_cat, X_test, num_classes)
}


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.

  super().__init__(**kwargs)


[1m2/3[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m0s[0m 137ms/step



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 184ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step


In [21]:
# 성능 지표 저장을 위한 리스트
results = []
for model_name, pred_y in predictions.items():
    accuracy = accuracy_score(y_test_encoded, pred_y)
    recall = recall_score(y_test_encoded, pred_y, average='macro')
    f1 = f1_score(y_test_encoded, pred_y, average='macro')
    specificity = specificity_score(y_test_encoded, pred_y, average='macro')
    conf_matrix = confusion_matrix(y_test_encoded, pred_y)
    results.append([accuracy, f1, recall, specificity, conf_matrix])

In [22]:
from collections import Counter

# 증강 전 클래스별 샘플 수
original_class_distribution = Counter(y_train)
print("증강 전 클래스별 샘플 수:")
for cls, count in original_class_distribution.items():
    print(f"클래스 {cls}: {count}개")

# 증강 후 클래스별 샘플 수
augmented_class_distribution = Counter(y_synthetic)
print("\n증강 후 클래스별 샘플 수:")
for cls, count in augmented_class_distribution.items():
    print(f"클래스 {cls}: {count}개")

증강 전 클래스별 샘플 수:
클래스 1: 76개
클래스 2: 31개
클래스 3: 25개
클래스 4: 32개
클래스 5: 17개

증강 후 클래스별 샘플 수:
클래스 1: 76개
클래스 2: 71개
클래스 3: 71개
클래스 4: 71개
클래스 5: 71개


In [23]:
# 성능 지표를 DataFrame으로 변환
results_df = pd.DataFrame(results, columns=["Accuracy", "F1", "Recall", "Specificity", "Confusion Matrix"], index=predictions.keys())
model_results = results_df.T
# 결과를 출력
print("\n모델 성능 비교 결과:")
print(model_results)

# 결과를 CSV 파일로 저장
model_results.to_csv("/content/drive/My Drive/PhalangesOutlinesCorrect/results/(다중)tsaug_models_result.csv")


모델 성능 비교 결과:
                                                Logistic Regression  \
Accuracy                                                   0.350649   
F1                                                         0.312383   
Recall                                                     0.365711   
Specificity                                                0.829261   
Confusion Matrix  [[13, 2, 4, 8, 6], [3, 1, 3, 4, 2], [4, 1, 3, ...   

                                                               CART  \
Accuracy                                                   0.402597   
F1                                                         0.368952   
Recall                                                     0.417541   
Specificity                                                0.853598   
Confusion Matrix  [[14, 1, 10, 5, 3], [3, 2, 1, 6, 1], [2, 0, 5,...   

                                                                KNN  \
Accuracy                                                   0.

In [None]:
# y_test의 샘플링 비율 및 개수 계산
y_test_series = pd.Series(y_test)
class_counts = y_test_series.value_counts() # 클래스별 개수 계산
sampling_ratio = y_test_series.value_counts(normalize=True)  # normalize=True를 사용하여 비율 계산

print("\ny_test 샘플링 비율 및 개수:")
for class_value, ratio in sampling_ratio.items():
    count = class_counts[class_value]
    print(f"클래스 {class_value}: 비율 = {ratio:.4f} (개수 = {count})")