# ML Classification Pipeline

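This notebook demonstrates a minimal scikit-learn classification pipeline: it loads a dataset (iris or breast cancer), engineers and scales the features, trains three classifiers, and compares their accuracy.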

In [None]:
# Title: Machine Learning Pipeline for Classification (Iris / Breast Cancer)

# Import necessary libraries and custom modules
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Add the 'src' directory to the Python path to import modules
sys.path.append(os.path.abspath('../src'))

from preprocess import load_and_prepare_data, scale_features
from models import get_logistic_regression_model, get_knn_model, get_svm_model
from evaluation import evaluate_model, plot_confusion_matrix, plot_model_comparison

print("All modules imported successfully!")

# Datasets this notebook is set up to analyse.
datasets_to_run = ['iris', 'breast_cancer']

# Every cell below is an ordinary top-level cell that runs once against
# `dataset`; a `for` loop in this cell cannot wrap code that lives in later
# cells. The loop that used to be here only printed a banner per entry and left
# `dataset` bound to the final one, so the output wrongly suggested that each
# dataset had been processed.
# Select the dataset explicitly instead (the last entry, which is the value the
# later cells received before). To analyse another dataset, change this line
# and re-run the notebook from this cell onwards.
dataset = datasets_to_run[-1]

banner_rule = "======================================================"
print(banner_rule)
print(f"  RUNNING PIPELINE FOR: {dataset.upper()}")
print(banner_rule)

In [None]:
# 1. Load and prepare the data
# NOTE(review): `engineer_features` is used below but was never imported anywhere
# in this notebook, so this cell raised NameError on a fresh kernel. It is
# assumed to live in src/preprocess.py next to the other data helpers — confirm,
# then move this import into the import cell at the top.
from preprocess import engineer_features

# Keep the freshly loaded split under its own names so the pre-engineering data
# stays available for inspection instead of being overwritten.
X_train_raw, X_test_raw, y_train, y_test = load_and_prepare_data(dataset_name=dataset)

print("Data loaded and split.")
print(f"Training data shape: {X_train_raw.shape}")
print(f"Testing data shape: {X_test_raw.shape}")
print("\nFirst 5 rows of training features:")
print(X_train_raw.head())

# --- Feature Engineering ---
# The same transformation is applied to both splits so their columns match.
X_train = engineer_features(X_train_raw)
X_test = engineer_features(X_test_raw)

# Scale once, after feature engineering, so the scaled frames include the new
# features. (An earlier scaling pass on the raw split was computed and then
# overwritten without ever being used, so it has been dropped.)
X_train_scaled, X_test_scaled = scale_features(X_train, X_test)

In [None]:
# Exploratory data analysis on the training split.
# `assign` returns a new frame, so X_train itself is left unmodified.
train_df = X_train.assign(target=y_train)

print("--- Exploratory Data Analysis (EDA) ---")

# 1. Get a quick overview
print("Dataset Info:")
train_df.info()

print("\nDescriptive Statistics:")
print(train_df.describe())

# 2. Check for class imbalance
print("\nClass Distribution:")
print(train_df['target'].value_counts())

# 3. Visualize distributions and correlations
# A pairplot draws one panel per pair of variables, so the panel count grows
# quadratically with the number of columns. Restrict it to the first few numeric
# feature columns so it stays readable and quick on wide datasets.
MAX_PAIRPLOT_FEATURES = 5
pairplot_features = list(
    X_train.select_dtypes(include='number').columns[:MAX_PAIRPLOT_FEATURES]
)

print("\nGenerating Pairplot...")
sns.pairplot(train_df, vars=pairplot_features, hue='target', diag_kind='kde')
plt.suptitle(f'Pairplot of Features for {dataset}', y=1.02)
plt.show()

# Correlation Matrix Heatmap (all numeric columns, including the target)
print("\nGenerating Correlation Matrix...")
plt.figure(figsize=(12, 10))
correlation_matrix = train_df.corr(numeric_only=True)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title(f'Correlation Matrix of Features for {dataset}')
plt.show()

print("\n--- EDA Complete ---")

In [None]:
# 2. Scale the features
# Produce scaled copies of both splits and preview the training one.
scaled_splits = scale_features(X_train, X_test)
X_train_scaled, X_test_scaled = scaled_splits

print("Features scaled.")
print("\nFirst 5 rows of scaled training features:")
scaled_preview = X_train_scaled.head()
print(scaled_preview)

In [None]:
# 3. Initialize the models
# Build the display-name -> estimator mapping in one place; the hyperparameters
# are 3 neighbours for KNN and a linear kernel with C=0.5 for the SVM.
models = {
    "Logistic Regression": get_logistic_regression_model(),
    "K-Nearest Neighbors": get_knn_model(n_neighbors=3),
    "Support Vector Machine": get_svm_model(C=0.5, kernel='linear'),
}

# Keep the individual handles available under their short names as well.
log_reg = models["Logistic Regression"]
knn = models["K-Nearest Neighbors"]
svm = models["Support Vector Machine"]

print("Models initialized.")

In [None]:
# 4. Train and evaluate each model
# `results` maps each model's display name to the fitted estimator and the
# value returned by evaluate_model for it.
results = {}

for model_label, estimator in models.items():
    print(f"\nTraining {model_label}...")
    estimator.fit(X_train_scaled, y_train)

    score = evaluate_model(estimator, X_test_scaled, y_test, model_label)
    results[model_label] = {'model': estimator, 'accuracy': score}

# Parallel lists (same order as `models`) used by the comparison plot.
model_names = list(results)
accuracies = [entry['accuracy'] for entry in results.values()]

In [None]:
# 5. Visualize the results
# NOTE: the output file names below do not include the dataset name, so running
# the notebook for a different dataset overwrites the previously saved plots.
VISUALIZATIONS_DIR = '../visualizations'

# exist_ok=True creates the directory when it is missing and does nothing when
# it already exists, replacing the separate exists-check-then-create sequence.
os.makedirs(VISUALIZATIONS_DIR, exist_ok=True)

# Plot confusion matrices: one per trained model, predicted on the scaled test split.
for name, result in results.items():
    y_pred = result['model'].predict(X_test_scaled)
    plot_confusion_matrix(
        y_test,
        y_pred,
        name,
        save_path=f'{VISUALIZATIONS_DIR}/{name.lower().replace(" ", "_")}_confusion_matrix.png'
    )

# Plot model comparison
plot_model_comparison(
    model_names,
    accuracies,
    save_path=f'{VISUALIZATIONS_DIR}/model_accuracy_comparison.png'
)

print("\nAll plots have been generated and saved to the 'visualizations' folder.")

In [None]:
# 6. Conclusion
# Pick the entry with the highest stored accuracy (first one wins on ties).
best_model_name = max(results, key=lambda k: results[k]['accuracy'])
best_accuracy = results[best_model_name]['accuracy']

# Derive the wording from this run instead of hard-coding "Three" and "Iris",
# which was wrong whenever `dataset` was not iris or the model list changed.
dataset_label = dataset.replace('_', ' ').title()
summary_rule = "=" * 50

print(summary_rule)
print("PROJECT SUMMARY")
print(summary_rule)
print(f"{len(model_names)} classification models were trained and evaluated on the {dataset_label} dataset.")
print(f"The models compared were: {', '.join(model_names)}.")
print(f"\nBased on accuracy, the best performing model was the **{best_model_name}**.")
print(f"It achieved an accuracy of **{best_accuracy:.2%}** on the test set.")
print(summary_rule)

In [None]:
# 7. Persist the best model
# NOTE(review): `save_model` is called below but was never imported anywhere in
# this notebook, so this cell raised NameError on a fresh kernel. It is assumed
# to live in src/models.py alongside the model factories — confirm, then move
# this import into the import cell at the top.
from models import save_model

MODELS_DIR = '../models'

# Create the 'models' directory if it doesn't exist (no-op when it already does).
# `os` comes from the import cell at the top of the notebook.
os.makedirs(MODELS_DIR, exist_ok=True)

# Get the best model object from the results of the training cell
best_model_object = results[best_model_name]['model']

# File name is derived from the model's display name, e.g. "best_<name>_model.pkl"
model_filename = f'{MODELS_DIR}/best_{best_model_name.lower().replace(" ", "_")}_model.pkl'

# Save the best model
save_model(best_model_object, model_filename)

# --- Demonstrate loading it back ---
# loaded_model = load_model(model_filename)
# You could even test the loaded model to ensure it works:
# loaded_model_accuracy = evaluate_model(loaded_model, X_test_scaled, y_test, "Loaded Model")