In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, ClassifierMixin
from xgboost import XGBClassifier
import time



data = pd.read_csv('data_pca.csv')

# Minimum Distance Classifier
class MinimumDistanceClassifier(BaseEstimator, ClassifierMixin):
    def fit(self, X, y):
        self.class_means_ = {label: X[y == label].mean(axis=0) for label in np.unique(y)}
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        distances = np.array([
            np.linalg.norm(X - mean, axis=1) for mean in self.class_means_.values()
        ]).T
        return self.classes_[np.argmin(distances, axis=1)]

# Classifier Modules
def knn_classifier(X_train, X_test, y_train, y_test, n_neighbors=5):
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    start = time.time()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    end = time.time()
    return accuracy_score(y_test, y_pred), end - start

def min_dist_classifier(X_train, X_test, y_train, y_test):
    clf = MinimumDistanceClassifier()
    start = time.time()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    end = time.time()
    return accuracy_score(y_test, y_pred), end - start

def svm_classifier(X_train, X_test, y_train, y_test, kernel='linear'):
    clf = SVC(kernel=kernel, probability=True)
    start = time.time()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    end = time.time()
    return accuracy_score(y_test, y_pred), end - start

def ann_classifier(X_train, X_test, y_train, y_test, hidden_layer_sizes=(100,)):
    clf = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, max_iter=300, random_state=42)
    start = time.time()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    end = time.time()
    return accuracy_score(y_test, y_pred), end - start

def naive_bayes_classifier(X_train, X_test, y_train, y_test):
    clf = GaussianNB()
    start = time.time()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    end = time.time()
    return accuracy_score(y_test, y_pred), end - start

def xgboost_classifier(X_train, X_test, y_train, y_test):
    clf = XGBClassifier(eval_metric='logloss')  # Removed use_label_encoder
    start = time.time()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    end = time.time()
    return accuracy_score(y_test, y_pred), end - start

# Evaluation Module
def evaluate_classifiers(X, y, test_size=0.3, random_state=42):
    results = []
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Standardize Features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    classifiers = {
        "KNN": knn_classifier,
        "Minimum Distance": min_dist_classifier,
        "SVM": svm_classifier,
        "ANN": ann_classifier,
        "Naive Bayes": naive_bayes_classifier,
        "XGBoost": xgboost_classifier
    }

    for name, func in classifiers.items():
        accuracy, runtime = func(X_train, X_test, y_train, y_test)
        results.append({"Classifier": name, "Accuracy": accuracy, "Runtime (s)": runtime})

    return pd.DataFrame(results)

if __name__ == "__main__":


    # Assuming `data` is a Pandas DataFrame
    # Separate features (X) and labels (y)
    X = data.iloc[:, :-1].values  # All columns except the last as features
    y = data.iloc[:, -1].values   # The last column as labels

    # Ensure labels are integers (required by some classifiers)
    y = y.astype(int)

    # Evaluate Classifiers
    results_df = evaluate_classifiers(X, y)

    # Print Results
    print(results_df)



         Classifier  Accuracy  Runtime (s)
0               KNN  0.843137     0.114703
1  Minimum Distance  0.817647     0.001092
2               SVM  0.850980     2.185329
3               ANN  0.858824     2.876361
4       Naive Bayes  0.800000     0.007266
5           XGBoost  0.860784     0.915887


In [5]:

from sklearn.impute import SimpleImputer

if __name__ == "__main__":
    # Load Dataset
    data = pd.read_csv('data-2.csv')

    # Separate features (X) and labels (y)
    X = data.iloc[:, :-1].values  # All columns except the last as features
    y = data.iloc[:, -1].values   # The last column as labels

    # Handle missing values
    imputer = SimpleImputer(strategy='mean')  # Replace NaN with column mean
    X = imputer.fit_transform(X)

    # Ensure labels are integers (required by some classifiers)
    y = y.astype(int)

    # Evaluate Classifiers
    results_df = evaluate_classifiers(X, y)

    # Print Results
    print(results_df)


         Classifier  Accuracy  Runtime (s)
0               KNN  0.850980     0.044084
1  Minimum Distance  0.694118     0.010402
2               SVM  0.817647     1.763977
3               ANN  0.841176     7.277627
4       Naive Bayes  0.105882     0.007295
5           XGBoost  0.892157     0.511757


In [9]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier


# Fine-tuned KNN Classifier
def knn_classifier(X_train, X_test, y_train, y_test):
    params = {'n_neighbors': [5, 7, 9, 11], 'weights': ['uniform', 'distance']}
    grid_search = GridSearchCV(KNeighborsClassifier(), params, cv=3)
    start = time.time()
    grid_search.fit(X_train, y_train)
    clf = grid_search.best_estimator_
    y_pred = clf.predict(X_test)
    end = time.time()
    return accuracy_score(y_test, y_pred), end - start, clf

# Fine-tuned SVM Classifier
def svm_classifier(X_train, X_test, y_train, y_test):
    params = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}
    grid_search = GridSearchCV(SVC(probability=True), params, cv=3)
    start = time.time()
    grid_search.fit(X_train, y_train)
    clf = grid_search.best_estimator_
    y_pred = clf.predict(X_test)
    end = time.time()
    return accuracy_score(y_test, y_pred), end - start, clf

# Fine-tuned ANN Classifier
def ann_classifier(X_train, X_test, y_train, y_test):
    params = {
        'hidden_layer_sizes': [(100,), (150, 100)],
        'learning_rate_init': [1, 0.01],
        'max_iter': [500]
    }
    grid_search = GridSearchCV(MLPClassifier(random_state=42), params, cv=3)
    start = time.time()
    grid_search.fit(X_train, y_train)
    clf = grid_search.best_estimator_
    y_pred = clf.predict(X_test)
    end = time.time()
    return accuracy_score(y_test, y_pred), end - start, clf

# Fine-tuned XGBoost Classifier
def xgboost_classifier(X_train, X_test, y_train, y_test):
    params = {
        'n_estimators': [50, 100, 150],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
    grid_search = GridSearchCV(XGBClassifier(eval_metric='mlogloss', verbosity=0), params, cv=3)
    start = time.time()
    grid_search.fit(X_train, y_train)
    clf = grid_search.best_estimator_
    y_pred = clf.predict(X_test)
    end = time.time()
    return accuracy_score(y_test, y_pred), end - start, clf

# Ensemble Classifier: Voting Classifier with XGBoost and KNN
def ensemble_classifier(X_train, X_test, y_train, y_test, models):
    start = time.time()
    voting_clf = VotingClassifier(estimators=models, voting='soft')
    voting_clf.fit(X_train, y_train)
    y_pred = voting_clf.predict(X_test)
    end = time.time()
    return accuracy_score(y_test, y_pred), end - start

# Evaluation Module
def evaluate_classifiers(X, y, test_size=0.3, random_state=42):
    results = []
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Standardize Features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    classifiers = {
        "KNN": knn_classifier,
        "SVM": svm_classifier,
        "ANN": ann_classifier,
        "XGBoost": xgboost_classifier
    }

    # Evaluate individual classifiers
    trained_models = []
    for name, func in classifiers.items():
        accuracy, runtime, model = func(X_train, X_test, y_train, y_test)
        results.append({"Classifier": name, "Accuracy": accuracy, "Runtime (s)": runtime})
        trained_models.append((name, model))

    # Ensemble Classifier
    ensemble_accuracy, ensemble_runtime = ensemble_classifier(X_train, X_test, y_train, y_test, trained_models)
    results.append({"Classifier": "Ensemble (XGBoost + KNN + SVM)", "Accuracy": ensemble_accuracy, "Runtime (s)": ensemble_runtime})

    return pd.DataFrame(results)

if __name__ == "__main__":
    # Load Dataset
    data = pd.read_csv('data-2.csv')

    # Separate features (X) and labels (y)
    X = data.iloc[:, :-1].values  # All columns except the last as features
    y = data.iloc[:, -1].values   # The last column as labels

    # Handle missing values
    imputer = SimpleImputer(strategy='mean')  # Replace NaN with column mean
    X = imputer.fit_transform(X)

    # Ensure labels are integers (required by some classifiers)
    y = y.astype(int)

    # Evaluate Classifiers
    results_df = evaluate_classifiers(X, y)

    # Print Results
    print(results_df)


                       Classifier  Accuracy  Runtime (s)
0                             KNN  0.850980     0.260188
1                             SVM  0.860784    18.354815
2                             ANN  0.835294     7.258079
3                         XGBoost  0.888235   159.974781
4  Ensemble (XGBoost + KNN + SVM)  0.858824     2.002827
