In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score, balanced_accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten
import keras_tuner as kt

# All datasets live under one project "Excel" directory; every path below is
# assembled from it by plain string concatenation so the resulting Windows
# paths are the same regardless of the OS running the notebook.
_EXCEL_DIR = r"D:\Amrita\Sem-4\Machine Learning Lab\End Sem Project\Excel"
_MERGED_DIR = _EXCEL_DIR + r"\2. Merged Data"
_SPLIT_DIR = _EXCEL_DIR + r"\5. Splitting"

# Raw (merged) data: one CSV per split
raw_train_path = _MERGED_DIR + r"\train.csv"
raw_test_path = _MERGED_DIR + r"\test.csv"

# Feature-extracted data: features and labels are stored in separate CSVs
X_train_path = _SPLIT_DIR + r"\X_train.csv"
y_train_path = _SPLIT_DIR + r"\y_train.csv"
X_test_path = _SPLIT_DIR + r"\X_test.csv"
y_test_path = _SPLIT_DIR + r"\y_test.csv"

# Load raw data
def load_raw_data():
    """Read the raw train/test CSVs and split each into features and target.

    The files are read from ``raw_train_path`` and ``raw_test_path``. In each
    file the last column is taken as the target and all preceding columns as
    the features.

    Returns:
        A 4-tuple ``(X_train, y_train, X_test, y_test)``.
    """
    # Train is read before test, matching the order the tuple is returned in.
    frames = [pd.read_csv(csv_path) for csv_path in (raw_train_path, raw_test_path)]
    (X_train, y_train), (X_test, y_test) = (
        (frame.iloc[:, :-1], frame.iloc[:, -1]) for frame in frames
    )
    return X_train, y_train, X_test, y_test

# Load feature-extracted data
def load_feature_data():
    """Read the pre-split, feature-extracted dataset from its four CSV files.

    Feature files are returned as read by ``pd.read_csv``; label files are
    flattened to a 1-D array via ``.values.ravel()``.

    Returns:
        A 4-tuple ``(X_train, y_train, X_test, y_test)``.
    """
    def _flat_labels(csv_path):
        # Collapse the label CSV's 2-D values into a flat 1-D array.
        return pd.read_csv(csv_path).values.ravel()

    train_features = pd.read_csv(X_train_path)
    train_labels = _flat_labels(y_train_path)
    test_features = pd.read_csv(X_test_path)
    test_labels = _flat_labels(y_test_path)
    return train_features, train_labels, test_features, test_labels

# Standardize data
def preprocess_data(X_train, X_test):
    """Standardize train and test features with statistics from the train set.

    Args:
        X_train: Training features; must expose ``.columns``.
        X_test: Test features; indexed by ``X_train.columns`` so that it has
            the same columns in the same order as the training data.

    Returns:
        ``(X_train_scaled, X_test_scaled)`` as produced by ``StandardScaler``.
    """
    # Align the test columns first so a missing column fails before any fitting.
    aligned_test = X_test[X_train.columns]
    # The scaler only ever sees the training data when it is fitted; the test
    # data is transformed with those same fitted statistics.
    fitted_scaler = StandardScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(aligned_test)

# Hyperparameter tuning for traditional models
def tune_hyperparameters(model, param_grid, X_train, y_train, scoring='accuracy', cv=5, n_jobs=-1):
    """Grid-search ``param_grid`` for ``model`` and return the refitted best estimator.

    Args:
        model: Unfitted scikit-learn estimator.
        param_grid: Mapping of parameter name to the list of values to try.
        X_train: Training features.
        y_train: Training labels.
        scoring: Metric used to rank parameter combinations. Defaults to
            ``'accuracy'`` (the previous hard-coded value). For imbalanced
            labels a caller may prefer e.g. ``'balanced_accuracy'`` or
            ``'f1_macro'``.
        cv: Cross-validation setting forwarded to ``GridSearchCV``
            (default 5, the previous hard-coded value).
        n_jobs: Parallel jobs forwarded to ``GridSearchCV`` (default -1, the
            previous hard-coded value).

    Returns:
        ``grid_search.best_estimator_``: the estimator with the best-scoring
        parameters, as refitted by ``GridSearchCV``.
    """
    grid_search = GridSearchCV(model, param_grid, scoring=scoring, cv=cv, n_jobs=n_jobs)
    grid_search.fit(X_train, y_train)
    print(f"Best parameters for {model.__class__.__name__}: {grid_search.best_params_}")
    return grid_search.best_estimator_

# CNN model function for tuning
def build_cnn_model(hp):
    """Build and compile a small 1-D CNN binary classifier for KerasTuner.

    Three hyperparameters are drawn from ``hp``: the number of convolution
    filters (16-128, step 16), the width of the hidden dense layer (32-256,
    step 32) and the Adam learning rate (0.01, 0.001 or 0.0001).

    NOTE: the input length is read from the module-level ``X_train_raw``, so
    that variable must be assigned before the tuner calls this function.

    Args:
        hp: Hyperparameter container supplied by the tuner.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output unit.
    """
    # Hyperparameters are requested in the same order as before:
    # filters, units, learning_rate.
    conv_filters = hp.Int('filters', min_value=16, max_value=128, step=16)
    n_features = X_train_raw.shape[1]
    hidden_units = hp.Int('units', min_value=32, max_value=256, step=32)
    learning_rate = hp.Choice('learning_rate', [0.01, 0.001, 0.0001])

    model = Sequential([
        Conv1D(
            filters=conv_filters,
            kernel_size=3,
            activation='relu',
            input_shape=(n_features, 1),
        ),
        Flatten(),
        Dense(units=hidden_units, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Function to train and evaluate models
def train_and_evaluate_models(X_train, y_train, X_test, y_test, dataset_type="Feature Extracted"):
    """Tune, fit and report each classical classifier on the given split.

    For every model in the table below, ``tune_hyperparameters`` grid-searches
    the listed parameter values on the training data, the best estimator
    predicts the test set, and a ``classification_report`` is printed.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
        dataset_type: Label used in the printed heading for each model.

    Returns:
        None. Results are printed only.
    """
    # Display name -> (unfitted estimator, grid of parameter values to search).
    models = {
        "Decision Tree": (DecisionTreeClassifier(), {'max_depth': [5, 10, 20]}),
        "Random Forest": (RandomForestClassifier(), {'n_estimators': [50, 100, 200]}),
        "KNN": (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7]}),
        # The default max_iter=100 made lbfgs stop at its iteration limit
        # (ConvergenceWarning) on this data; give the solver room to converge.
        "Logistic Regression": (LogisticRegression(max_iter=1000), {'C': [0.1, 1, 10]}),
        "SVM (Linear)": (SVC(kernel='linear', probability=True), {'C': [0.1, 1, 10]}),
        "SVM (RBF)": (SVC(kernel='rbf', probability=True), {'C': [0.1, 1, 10]}),
        "ANN (ReLU)": (MLPClassifier(max_iter=500), {'hidden_layer_sizes': [(50,), (100,), (100, 50)]})
    }

    for name, (model, params) in models.items():
        best_model = tune_hyperparameters(model, params, X_train, y_train)
        y_pred = best_model.predict(X_test)
        print(f"\n{name} - {dataset_type} Data")
        print(classification_report(y_test, y_pred))

# Run for Feature Extracted Data
# Load the pre-split feature CSVs, standardize them (the scaler is fitted on
# the training split only), then tune and report every classical model.
X_train, y_train, X_test, y_test = load_feature_data()
X_train, X_test = preprocess_data(X_train, X_test)
train_and_evaluate_models(X_train, y_train, X_test, y_test, dataset_type="Feature Extracted")

# Run for Raw Data (CNN Only)
# NOTE: build_cnn_model reads the global X_train_raw to size its input layer,
# so these assignments must run before the tuner is created and searched.
X_train_raw, y_train_raw, X_test_raw, y_test_raw = load_raw_data()
X_train_raw, X_test_raw = preprocess_data(X_train_raw, X_test_raw)
# Append a trailing axis of size 1 so each sample has shape (n_features, 1),
# matching the Conv1D input_shape declared in build_cnn_model.
X_train_raw = X_train_raw.reshape((X_train_raw.shape[0], X_train_raw.shape[1], 1))
X_test_raw = X_test_raw.reshape((X_test_raw.shape[0], X_test_raw.shape[1], 1))

# Random search over the hyperparameters declared in build_cnn_model: at most
# 5 trials, ranked by validation accuracy; tuner state is written under
# 'cnn_tuning'. validation_split=0.2 is forwarded to the search call.
tuner = kt.RandomSearch(build_cnn_model, objective='val_accuracy', max_trials=5, directory='cnn_tuning')
tuner.search(X_train_raw, y_train_raw, epochs=10, validation_split=0.2)

# Report the single best hyperparameter set found by the search.
# NOTE(review): in the code shown here X_test_raw / y_test_raw are never used
# after the reshape, so the CNN is tuned but not evaluated on the test split.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best CNN Hyperparameters: Filters: {best_hps.get('filters')}, Units: {best_hps.get('units')}, Learning Rate: {best_hps.get('learning_rate')}")

print("🚀 Model training and evaluation complete!")


Best parameters for DecisionTreeClassifier: {'max_depth': 5}

Decision Tree - Feature Extracted Data
              precision    recall  f1-score   support

           0       0.51      0.04      0.07      6607
           1       0.82      0.99      0.90     29429

    accuracy                           0.82     36036
   macro avg       0.67      0.52      0.49     36036
weighted avg       0.77      0.82      0.75     36036

Best parameters for RandomForestClassifier: {'n_estimators': 200}

Random Forest - Feature Extracted Data
              precision    recall  f1-score   support

           0       0.37      0.19      0.25      6607
           1       0.84      0.93      0.88     29429

    accuracy                           0.79     36036
   macro avg       0.60      0.56      0.57     36036
weighted avg       0.75      0.79      0.76     36036

Best parameters for KNeighborsClassifier: {'n_neighbors': 7}

KNN - Feature Extracted Data
              precision    recall  f1-score   su

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best parameters for LogisticRegression: {'C': 0.1}

Logistic Regression - Feature Extracted Data
              precision    recall  f1-score   support

           0       0.43      0.06      0.10      6607
           1       0.82      0.98      0.90     29429

    accuracy                           0.81     36036
   macro avg       0.63      0.52      0.50     36036
weighted avg       0.75      0.81      0.75     36036

