# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Read the raw dataset from disk into a DataFrame.
df = pd.read_csv('your_data.csv')

# Quick sanity check: report the frame's dimensions and preview its leading rows.
dataset_shape = df.shape
leading_rows = df.head()
print(f"Dataset Shape: {dataset_shape}")
print(f"First 5 Rows:\n{leading_rows}")

# Pull out the label series, then keep every other column as the feature matrix.
y = df['target_column']
X = df.drop(columns=['target_column'])

# Identify categorical and numerical features by dtype.
# Columns matching neither selector are not routed to any transformer and are
# therefore dropped by the ColumnTransformer below (its default remainder='drop').
numeric_features = X.select_dtypes(include=[np.number]).columns
categorical_features = X.select_dtypes(include=[np.object_]).columns

# Numerical columns: fill missing values with the column mean, then standardize.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode.
# NOTE(review): combining drop='first' with handle_unknown='ignore' is only
# accepted by newer scikit-learn releases -- confirm the installed version.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

# Combine preprocessing steps.
# The column lists must be the names defined above; the previous
# `num_features` / `cat_features` were never defined and raised NameError.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numeric_features),
        ('cat', cat_transformer, categorical_features)
    ]
)

# Candidate classifiers to compare, listed as (display name, estimator) pairs.
candidate_estimators = [
    ('LogisticRegression', LogisticRegression(max_iter=1000)),
    ('KNeighborsClassifier', KNeighborsClassifier()),
    ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=42)),
]

# Wrap every candidate in a pipeline that runs the shared preprocessing step
# first; the estimator is always registered under the step name 'model'.
models = {
    label: Pipeline(steps=[('preprocessor', preprocessor), ('model', estimator)])
    for label, estimator in candidate_estimators
}

# Split data into training and testing sets (80% / 20%, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Evaluate models using cross-validation on the training portion only, so the
# held-out test set plays no part in model selection.
print("Evaluating models using cross-validation...\n")
model_scores = {}  # model name -> mean cross-validated accuracy
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for model_name, model_pipeline in models.items():
    cv_scores = cross_val_score(model_pipeline, X_train, y_train, cv=kf, scoring='accuracy', n_jobs=-1)
    mean_score = np.mean(cv_scores)
    model_scores[model_name] = mean_score

    # np.quantile returns an array, which cannot be formatted with ':.4f'
    # (TypeError); unpack the two bounds and format each one separately.
    # These are empirical percentiles of the per-fold accuracies.
    lower_bound, upper_bound = np.quantile(cv_scores, [0.025, 0.975])

    print(f"{model_name}: Average Accuracy = {mean_score:.4f}")
    print(f"{model_name}: Standard Deviation of Accuracy = {cv_scores.std():.4f}")
    print(f"{model_name}: 95% Confidence Interval (2.5th and 97.5th Percentiles) = [{lower_bound:.4f}, {upper_bound:.4f}]")

# Select the best model based on CV scores (highest mean accuracy).
best_model_name = max(model_scores, key=model_scores.get)
print("\n" + "="*150)
print(f"Best model based on cross-validation scores: {best_model_name}")
print("="*150)

# Hyperparameter grids, keyed by model name.
# Parameter names carry the 'model__' prefix because the estimator is the
# pipeline step named 'model'.
param_grids = {
    'KNeighborsClassifier': {
        'model__n_neighbors': [3, 5, 7, 9],
        'model__weights': ['uniform', 'distance'],
        'model__metric': ['euclidean', 'manhattan'],
    },
    'LogisticRegression': {
        'model__C': [0.1, 1, 10],
        'model__solver': ['liblinear', 'lbfgs'],
    },
    'DecisionTreeClassifier': {
        'model__max_depth': [None, 10, 20, 30],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
    },
}

# Fail fast with a clear message if a model has no grid, instead of leaving
# `param_grid` undefined and hitting a NameError later at the grid search.
if best_model_name not in param_grids:
    raise ValueError(f"No hyperparameter grid defined for model: {best_model_name}")

# Define hyperparameter grid for the selected model
param_grid = param_grids[best_model_name]

# Horizontal rules reused by the report sections below.
narrow_rule = "=" * 50
wide_rule = "=" * 150

# Tune the winning pipeline: exhaustive search over its grid, scored by
# accuracy with the same fold splitter used for model comparison.
print(f"\nPerforming grid search for {best_model_name}...")
best_pipeline = models[best_model_name]
grid_search = GridSearchCV(
    estimator=best_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=kf,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning hyperparameter combination and its cross-validated score.
print("\n" + narrow_rule)
print(f"Best parameters for {best_model_name}: {grid_search.best_params_}")
print(f"Best cross-validated Accuracy: {grid_search.best_score_:.4f}")
print(narrow_rule)

# Score the refit best estimator on the held-out test set.
final_model = grid_search.best_estimator_
y_pred = final_model.predict(X_test)
test_accuracy = final_model.score(X_test, y_test)

print("\n" + wide_rule)
print(f"Test Accuracy for the best model ({best_model_name}): {test_accuracy:.4f}")
print(wide_rule)

# Per-class breakdown of the test-set predictions.
print("\nConfusion Matrix:")
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)
print("\nClassification Report:")
test_report = classification_report(y_test, y_pred)
print(test_report)

# Final status line.
print("\nPipeline training, hyperparameter tuning, and testing completed successfully!")