In [None]:
# ========================== IMPORT LIBRARIES ==========================
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score

# ========================== LOAD DATA ==========================
# Read the pre-split feature matrices and label vectors from CSV.
print("\n🔹 Loading the split data...")

train_x_file = r"D:\Amrita\Sem-4\Machine Learning Lab\End Sem Project\Excel\5. Splitting\X_train.csv"
train_y_file = r"D:\Amrita\Sem-4\Machine Learning Lab\End Sem Project\Excel\5. Splitting\y_train.csv"
test_x_file = r"D:\Amrita\Sem-4\Machine Learning Lab\End Sem Project\Excel\5. Splitting\X_test.csv"
test_y_file = r"D:\Amrita\Sem-4\Machine Learning Lab\End Sem Project\Excel\5. Splitting\y_test.csv"

# Labels are flattened to 1D because scikit-learn estimators expect a 1D target.
# A stray "Unnamed: 0" column (typically a saved DataFrame index) is dropped
# first: otherwise ravel() would interleave those index values with the labels
# and double the array length. errors="ignore" makes this a no-op when the
# column is absent.
X_train = pd.read_csv(train_x_file)
y_train = (
    pd.read_csv(train_y_file)
    .drop(columns=["Unnamed: 0"], errors="ignore")
    .values.ravel()
)
X_test = pd.read_csv(test_x_file)
y_test = (
    pd.read_csv(test_y_file)
    .drop(columns=["Unnamed: 0"], errors="ignore")
    .values.ravel()
)

# ========================== REMOVE EXTRA COLUMN IF NEEDED ==========================
# Strip the stray "Unnamed: 0" column from each feature frame (in place) and
# report every removal.
for frame_label, frame in (("X_train", X_train), ("X_test", X_test)):
    if "Unnamed: 0" not in frame.columns:
        continue
    frame.drop(columns=["Unnamed: 0"], inplace=True)
    print(f"⚠ Removed extra column 'Unnamed: 0' from {frame_label}")

# The test frame must expose the training columns in the training order.
train_columns = X_train.columns
if tuple(train_columns) != tuple(X_test.columns):
    print("\n⚠ Feature names in X_Train and X_Test do not match! Reordering X_Test...")
    # Selecting by the training columns reorders X_test; pandas raises
    # KeyError if X_test lacks any of them.
    X_test = X_test[train_columns]

# ========================== HYPERPARAMETER TUNING ==========================

# Search space: maps a display name to the base estimator ("model") and the
# candidate hyperparameter values ("params") that the search samples from.
# Every estimator with a stochastic component is seeded with 42 so repeated
# runs give the same result.
models = {
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],
        },
    },
    "Decision Tree": {
        "model": DecisionTreeClassifier(random_state=42),
        "params": {
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],
            "criterion": ["gini", "entropy"],
        },
    },
    "Logistic Regression": {
        # random_state matters for the "liblinear" solver, which shuffles the data.
        "model": LogisticRegression(random_state=42),
        "params": {
            "C": [0.01, 0.1, 1, 10, 100],
            "solver": ["lbfgs", "liblinear"],
        },
    },
    "KNN": {
        "model": KNeighborsClassifier(),
        "params": {
            "n_neighbors": [3, 5, 7, 9],
            "weights": ["uniform", "distance"],
            "metric": ["euclidean", "manhattan"],
        },
    },
    "SVM": {
        # probability=True fits an internal cross-validated calibration whose
        # data shuffling is driven by random_state; seed it for reproducibility.
        "model": SVC(probability=True, random_state=42),
        "params": {
            "C": [0.1, 1, 10, 100],
            "kernel": ["linear", "rbf", "poly"],
            "gamma": ["scale", "auto"],
        },
    },
}

best_models = {}
# Test-set accuracy per tuned model, cached so the final selection does not
# have to call predict() on the test set again for every model.
test_accuracies = {}

for name, config in models.items():
    print(f"\n🔹 Tuning {name} using RandomizedSearchCV...")

    # Randomized search: 10 sampled parameter combinations, each scored by
    # 5-fold cross-validated accuracy on the training data.
    search = RandomizedSearchCV(
        config["model"],
        config["params"],
        n_iter=10,
        cv=5,
        scoring="accuracy",
        n_jobs=-1,
        random_state=42,
    )
    search.fit(X_train, y_train)

    # With the default refit=True, best_estimator_ has already been refit on
    # the whole training set with the best parameters; no extra fit is needed.
    best_model = search.best_estimator_
    best_models[name] = best_model

    # Evaluate the tuned model on the held-out test set
    y_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    test_accuracies[name] = acc

    print(f"✅ Best Parameters for {name}: {search.best_params_}")
    print(f"🎯 Accuracy after tuning: {acc:.4f}")

# ========================== FINAL BEST MODEL SELECTION ==========================
# Pick the model with the highest cached test accuracy. Ties go to the model
# tuned first, as max() returns the first maximum in insertion order.
# NOTE(review): choosing the winner on the test set makes the reported accuracy
# an optimistic estimate; consider selecting on search.best_score_ (CV accuracy)
# and keeping the test set only for the final report.
best_model_name = max(test_accuracies, key=test_accuracies.get)
best_model = best_models[best_model_name]

print(f"\n🏆 Best Model: {best_model_name} with accuracy {test_accuracies[best_model_name]:.4f}")