In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install fastdtw

Collecting fastdtw
  Downloading fastdtw-0.3.4.tar.gz (133 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/133.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m133.1/133.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fastdtw
  Building wheel for fastdtw (setup.py) ... [?25l[?25hdone
  Created wheel for fastdtw: filename=fastdtw-0.3.4-cp311-cp311-linux_x86_64.whl size=542088 sha256=56e4e749bfd815de582b4fd0ed630107a14419943978c1946df637f946a74375
  Stored in directory: /root/.cache/pip/wheels/5c/8a/f6/fd3df9a9714677410a5ccbf3ca519e66db4a54a1c46ea95332
Successfully built fastdtw
Installing collected packages: fastdtw
Successfully installed fastdtw-0.3.4


In [3]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from imblearn.metrics import specificity_score # specificity_score 임포트
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Conv1D, Flatten
import seaborn as sns
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
import random
import numpy as np

In [4]:
df_X_train = pd.read_csv('/content/drive/My Drive/PhalangesOutlinesCorrect/X_train_Worms.csv')
df_y_train = pd.read_csv('/content/drive/My Drive/PhalangesOutlinesCorrect/y_train_Worms.csv')
df_X_test = pd.read_csv('/content/drive/MyDrive/PhalangesOutlinesCorrect/X_test_Worms.csv')
df_y_test = pd.read_csv('/content/drive/MyDrive/PhalangesOutlinesCorrect/y_test_Worms.csv')
X_train = df_X_train.values
y_train = df_y_train.values.reshape(-1)  # 1차원 배열(벡터)로 변환
X_test = df_X_test.values
y_test = df_y_test.values.reshape(-1)  # 1차원 배열(벡터)로 변환


In [5]:
# 1~5. 클래스별 증강이 필요한 개수 계산 및 목표 수 설정
from collections import defaultdict

class_counts = pd.Series(y_train).value_counts()
total_samples = len(X_train)
target_total = total_samples * 2
avg_target = target_total / len(class_counts)

non_aug_classes = {}
aug_classes = {}
for cls, count in class_counts.items():
    if count >= avg_target:
        non_aug_classes[cls] = count
    else:
        aug_classes[cls] = count

remaining_target = target_total - sum(non_aug_classes.values())
class_targets = {cls: int(remaining_target / len(aug_classes)) for cls in aug_classes}


In [6]:
remaining_target

286

In [7]:
class_targets

{4: 71, 2: 71, 3: 71, 5: 71}

In [10]:
# 6~9. 증강 수행 (DTW 기반 이웃 찾기 및 선형 보간)
def find_k_neighbors_dtw(X_class, k):
    neighbors_dict = {}
    for i, A in enumerate(X_class):
        distances = []
        for j, B in enumerate(X_class):
            if i == j:
                continue
            dist, _ = fastdtw(A.flatten(), B.flatten())
            distances.append((j, dist))
        distances.sort(key=lambda x: x[1])
        neighbors_dict[i] = [idx for idx, _ in distances[:k]]
    return neighbors_dict


In [11]:
def linear_interpolation(A, B, alpha):
    return alpha * A + (1 - alpha) * B

X_aug_list, y_aug_list = [], []

for cls in aug_classes:
    X_class = X_train[y_train == cls]
    target = class_targets[cls]
    n_to_generate = target - len(X_class)
    if n_to_generate <= 0:
        continue

    neighbor_dict = find_k_neighbors_dtw(X_class, k=5)

    for _ in range(n_to_generate):
        idx_a = np.random.randint(0, len(X_class))
        A = X_class[idx_a]
        neighbors = neighbor_dict[idx_a]
        idx_b = np.random.choice(neighbors)
        B = X_class[idx_b]

        alpha = np.random.uniform(0.2, 0.8)
        S = linear_interpolation(A, B, alpha)
        X_aug_list.append(S)
        y_aug_list.append(cls)

In [17]:
# 최종 데이터 생성
X_synthetic = np.vstack([X_train] + X_aug_list)
y_synthetic = np.concatenate([y_train, y_aug_list])

In [12]:
print("==== 증강 제외 클래스 ====")
for cls, count in non_aug_classes.items():
    print(f"Class {cls}: {count}개")

print("\n==== 증강 대상 클래스 및 목표 샘플 수 ====")
for cls in aug_classes:
    print(f"Class {cls}: 기존 {aug_classes[cls]}개 → 목표 {class_targets[cls]}개 → 생성 {class_targets[cls] - aug_classes[cls]}개")


==== 증강 제외 클래스 ====
Class 1: 76개

==== 증강 대상 클래스 및 목표 샘플 수 ====
Class 4: 기존 32개 → 목표 71개 → 생성 39개
Class 2: 기존 31개 → 목표 71개 → 생성 40개
Class 3: 기존 25개 → 목표 71개 → 생성 46개
Class 5: 기존 17개 → 목표 71개 → 생성 54개


In [18]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Label 인코딩
le = LabelEncoder()
y_synthetic_encoded = le.fit_transform(y_synthetic)
y_test_encoded = le.transform(y_test)
num_classes = len(np.unique(y_synthetic_encoded))

# One-hot encoding
y_synthetic_cat = to_categorical(y_synthetic_encoded, num_classes=num_classes)
y_test_cat = to_categorical(y_test_encoded, num_classes=num_classes)


In [19]:
def predict_with_logistic_regression(X_train, y_train, X_test):
    model = LogisticRegression()
    model.fit(X_train, y_train)
    return model.predict(X_test)

def predict_with_cart(X_train, y_train, X_test):
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    return model.predict(X_test)

def predict_with_knn(X_train, y_train, X_test, k=3):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    return model.predict(X_test)

def predict_with_xgboost(X_train, y_train, X_test, label_encoder=None):
    from xgboost import XGBClassifier

    model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    model.fit(X_train, y_train)
    pred_y = model.predict(X_test)

    if label_encoder is not None:
        pred_y = label_encoder.inverse_transform(pred_y)

    return pred_y

def predict_with_lstm(X_train, y_train, X_test, num_classes):
    model = Sequential()
    model.add(LSTM(50, activation='relu', input_shape=(X_train.shape[1], 1)))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

    model.fit(X_train_reshaped, y_train, epochs=50, batch_size=32, verbose=0)
    pred_prob = model.predict(X_test_reshaped)
    return np.argmax(pred_prob, axis=1)


def predict_with_cnn(X_train, y_train, X_test, num_classes):
    model = Sequential()
    model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

    model.fit(X_train_reshaped, y_train, epochs=50, batch_size=32, verbose=0)
    pred_prob = model.predict(X_test_reshaped)
    return np.argmax(pred_prob, axis=1)


In [21]:
predictions = {
    "Logistic Regression": predict_with_logistic_regression(X_synthetic, y_synthetic_encoded, X_test),
    "CART": predict_with_cart(X_synthetic, y_synthetic_encoded, X_test),
    "KNN": predict_with_knn(X_synthetic, y_synthetic_encoded, X_test),
    "XGBoost": predict_with_xgboost(X_synthetic, y_synthetic_encoded, X_test),
    "LSTM": predict_with_lstm(X_synthetic, y_synthetic_cat, X_test, num_classes),
    "CNN": predict_with_cnn(X_synthetic, y_synthetic_cat, X_test, num_classes)
}


# 성능 지표 저장을 위한 리스트
results = []
for model_name, pred_y in predictions.items():
    accuracy = accuracy_score(y_test_encoded, pred_y)
    recall = recall_score(y_test_encoded, pred_y, average='macro')
    f1 = f1_score(y_test_encoded, pred_y, average='macro')
    specificity = specificity_score(y_test_encoded, pred_y, average='macro')
    conf_matrix = confusion_matrix(y_test_encoded, pred_y)
    results.append([accuracy, f1, recall, specificity, conf_matrix])

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.

  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 163ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step


In [22]:
from collections import Counter

# 증강 전 클래스별 샘플 수
original_class_distribution = Counter(y_train)
print("증강 전 클래스별 샘플 수:")
for cls, count in original_class_distribution.items():
    print(f"클래스 {cls}: {count}개")

# 증강 후 클래스별 샘플 수
augmented_class_distribution = Counter(y_synthetic)
print("\n증강 후 클래스별 샘플 수:")
for cls, count in augmented_class_distribution.items():
    print(f"클래스 {cls}: {count}개")

증강 전 클래스별 샘플 수:
클래스 1: 76개
클래스 2: 31개
클래스 3: 25개
클래스 4: 32개
클래스 5: 17개

증강 후 클래스별 샘플 수:
클래스 1: 76개
클래스 2: 71개
클래스 3: 71개
클래스 4: 71개
클래스 5: 71개


In [None]:
# 성능 지표를 DataFrame으로 변환
results_df = pd.DataFrame(results, columns=["Accuracy", "F1", "Recall", "Specificity", "Confusion Matrix"], index=predictions.keys())
model_results = results_df.T
# 결과를 출력
print("\n모델 성능 비교 결과:")
print(model_results)

# 결과를 CSV 파일로 저장
model_results.to_csv("/content/drive/My Drive/PhalangesOutlinesCorrect/results/(다중)tsmote_models_result.csv")


모델 성능 비교 결과:
                       Logistic Regression                     CART  \
Accuracy                          0.632867                 0.656177   
F1                                0.710212                 0.758395   
Recall                             0.73384                 0.880228   
Specificity                       0.472892                 0.301205   
Confusion Matrix  [[157, 175], [140, 386]]  [[100, 232], [63, 463]]   

                                      KNN                 XGBoost  \
Accuracy                         0.699301                0.678322   
F1                               0.778351                0.786708   
Recall                           0.861217                0.967681   
Specificity                      0.442771                 0.21988   
Confusion Matrix  [[147, 185], [73, 453]]  [[73, 259], [17, 509]]   

                                      LSTM                       CNN  
Accuracy                           0.65035                  0.602564  
F1