In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from imblearn.metrics import specificity_score # specificity_score 임포트
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Conv1D, Flatten
import seaborn as sns
from scipy.spatial.distance import euclidean
import random
import numpy as np

In [5]:
df_X_train = pd.read_csv('/content/drive/My Drive/PhalangesOutlinesCorrect/X_train_Worms.csv')
df_y_train = pd.read_csv('/content/drive/My Drive/PhalangesOutlinesCorrect/y_train_Worms.csv')
df_X_test = pd.read_csv('/content/drive/MyDrive/PhalangesOutlinesCorrect/X_test_Worms.csv')
df_y_test = pd.read_csv('/content/drive/MyDrive/PhalangesOutlinesCorrect/y_test_Worms.csv')
X_train = df_X_train.values
y_train = df_y_train.values.reshape(-1)  # 1차원 배열(벡터)로 변환
X_test = df_X_test.values
y_test = df_y_test.values.reshape(-1)  # 1차원 배열(벡터)로 변환


In [6]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Label 인코딩
le = LabelEncoder()
y_synthetic_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)
num_classes = len(np.unique(y_synthetic_encoded))

# One-hot encoding
y_synthetic_cat = to_categorical(y_synthetic_encoded, num_classes=num_classes)
y_test_cat = to_categorical(y_test_encoded, num_classes=num_classes)


In [7]:
def predict_with_logistic_regression(X_train, y_train, X_test):
    model = LogisticRegression()
    model.fit(X_train, y_train)
    return model.predict(X_test)

def predict_with_cart(X_train, y_train, X_test):
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    return model.predict(X_test)

def predict_with_knn(X_train, y_train, X_test, k=3):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    return model.predict(X_test)

def predict_with_xgboost(X_train, y_train, X_test, label_encoder=None):
    from xgboost import XGBClassifier

    model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    model.fit(X_train, y_train)
    pred_y = model.predict(X_test)

    if label_encoder is not None:
        pred_y = label_encoder.inverse_transform(pred_y)

    return pred_y

def predict_with_lstm(X_train, y_train, X_test, num_classes):
    model = Sequential()
    model.add(LSTM(50, activation='relu', input_shape=(X_train.shape[1], 1)))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

    model.fit(X_train_reshaped, y_train, epochs=50, batch_size=32, verbose=0)
    pred_prob = model.predict(X_test_reshaped)
    return np.argmax(pred_prob, axis=1)


def predict_with_cnn(X_train, y_train, X_test, num_classes):
    model = Sequential()
    model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

    model.fit(X_train_reshaped, y_train, epochs=50, batch_size=32, verbose=0)
    pred_prob = model.predict(X_test_reshaped)
    return np.argmax(pred_prob, axis=1)


In [8]:
predictions = {
    "Logistic Regression": predict_with_logistic_regression(X_train, y_synthetic_encoded, X_test),
    "CART": predict_with_cart(X_train, y_synthetic_encoded, X_test),
    "KNN": predict_with_knn(X_train, y_synthetic_encoded, X_test),
    "XGBoost": predict_with_xgboost(X_train, y_synthetic_encoded, X_test),
    "LSTM": predict_with_lstm(X_train, y_synthetic_cat, X_test, num_classes),
    "CNN": predict_with_cnn(X_train, y_synthetic_cat, X_test, num_classes)
}


# 성능 지표 저장을 위한 리스트
results = []
for model_name, pred_y in predictions.items():
    accuracy = accuracy_score(y_test_encoded, pred_y)
    recall = recall_score(y_test_encoded, pred_y, average='macro')
    f1 = f1_score(y_test_encoded, pred_y, average='macro')
    specificity = specificity_score(y_test_encoded, pred_y, average='macro')
    conf_matrix = confusion_matrix(y_test_encoded, pred_y)
    results.append([accuracy, f1, recall, specificity, conf_matrix])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.

  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 215ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step


In [9]:
# 성능 지표를 DataFrame으로 변환
results_df = pd.DataFrame(results, columns=["Accuracy", "F1", "Recall", "Specificity", "Confusion Matrix"], index=predictions.keys())
model_results = results_df.T
# 결과를 출력
print("\n모델 성능 비교 결과:")
print(model_results)

# 결과를 CSV 파일로 저장
model_results.to_csv("/content/drive/My Drive/PhalangesOutlinesCorrect/results/(다중)증강안함.csv")


모델 성능 비교 결과:
                                                Logistic Regression  \
Accuracy                                                   0.350649   
F1                                                         0.305043   
Recall                                                     0.356387   
Specificity                                                 0.82784   
Confusion Matrix  [[14, 1, 4, 8, 6], [3, 1, 3, 4, 2], [4, 1, 3, ...   

                                                               CART  \
Accuracy                                                   0.467532   
F1                                                         0.427157   
Recall                                                     0.438228   
Specificity                                                0.853046   
Confusion Matrix  [[19, 2, 4, 3, 5], [5, 3, 2, 2, 1], [3, 2, 5, ...   

                                                                KNN  \
Accuracy                                                   0.