# Hyperparameter Tuning

This notebook covers:
- GridSearchCV for systematic parameter search
- RandomizedSearchCV for efficient parameter exploration
- Model optimization and comparison
- Best parameter selection


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import warnings
# Silence every warning so the long search logs stay readable.
# NOTE: this blanket filter also hides anything the estimators would warn about
# (e.g. convergence problems); narrow or remove it while debugging a model.
warnings.filterwarnings('ignore')

# Set style for better plots
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only know the legacy 'seaborn' name, so pick whichever exists
# instead of failing on an unknown style.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
sns.set_palette("husl")


In [None]:
# Load the data
# Features and target live in two separate CSV files.
X_selected = pd.read_csv('data/X_selected.csv')
target_frame = pd.read_csv('data/y_target.csv')
y = target_frame.to_numpy().ravel()  # flatten the target table into a 1-D array

print(
    "Data loaded successfully!",
    f"Features shape: {X_selected.shape}",
    f"Target shape: {y.shape}",
    sep="\n",
)

# Split the data
# 80/20 hold-out split, stratified on the target and seeded for repeatability.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, **split_options)

print(f"\nTrain set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")


In [None]:
# Define parameter grids for each model
# Each value is handed unchanged to GridSearchCV as `param_grid`, which accepts
# either a single dict (full Cartesian product of its lists) or a list of dicts
# (the union of several smaller products).
param_grids = {
    'Logistic Regression': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        'max_iter': [1000, 2000]
    },
    'Decision Tree': {
        'max_depth': [3, 5, 7, 10, None],
        'min_samples_split': [2, 5, 10, 20],
        'min_samples_leaf': [1, 2, 4, 8],
        'criterion': ['gini', 'entropy']
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [3, 5, 7, 10, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    },
    # SVM is split per kernel: `degree` only affects the 'poly' kernel and
    # `gamma` is unused by the 'linear' kernel, so a single flat grid would fit
    # many candidates that are identical models (216 combinations instead of
    # the 4 + 24 + 72 = 100 distinct ones listed here).
    'SVM': [
        {
            'kernel': ['linear'],
            'C': [0.1, 1, 10, 100]
        },
        {
            'kernel': ['rbf'],
            'C': [0.1, 1, 10, 100],
            'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]
        },
        {
            'kernel': ['poly'],
            'C': [0.1, 1, 10, 100],
            'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
            'degree': [2, 3, 4]
        }
    ]
}

# Initialize models
# Every estimator is seeded so repeated runs produce the same fits.
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    # probability=True makes SVC expose predict_proba
    'SVM': SVC(random_state=42, probability=True)
}

print("Parameter grids defined for hyperparameter tuning")
print("=" * 50)
for model_name, params in param_grids.items():
    # Normalise to a list of sub-grids so dict and list-of-dict grids print alike.
    sub_grids = params if isinstance(params, list) else [params]
    distinct_params = {param for sub_grid in sub_grids for param in sub_grid}
    print(f"{model_name}: {len(distinct_params)} parameter categories")
    for position, sub_grid in enumerate(sub_grids, start=1):
        indent = "  "
        if len(sub_grids) > 1:
            print(f"  sub-grid {position}:")
            indent = "    "
        for param, values in sub_grid.items():
            print(f"{indent}{param}: {values}")
    print()


In [None]:
# 1. GridSearchCV for systematic parameter search
print("1. GridSearchCV Hyperparameter Tuning:")
print("=" * 40)

# Options shared by every exhaustive search: 5-fold CV scored with F1,
# parallelised with n_jobs=-1, progress printed at verbosity level 1.
grid_search_settings = dict(cv=5, scoring='f1', n_jobs=-1, verbose=1)

grid_results, grid_models = {}, {}

for model_name in models:
    model = models[model_name]
    print(f"\nTuning {model_name}...")

    # fit() returns the search object itself, so construct and fit in one step.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        **grid_search_settings,
    ).fit(X_train, y_train)

    # Keep the winning configuration, its mean CV score, and the refitted estimator.
    tuned_estimator = grid_search.best_estimator_
    grid_results[model_name] = dict(
        best_params=grid_search.best_params_,
        best_score=grid_search.best_score_,
        best_estimator=tuned_estimator,
    )
    grid_models[model_name] = tuned_estimator

    print(f"Best parameters: {grid_results[model_name]['best_params']}")
    print(f"Best CV score: {grid_results[model_name]['best_score']:.4f}")

print("\nGridSearchCV completed for all models!")


In [None]:
# 2. RandomizedSearchCV for efficient parameter exploration
print("\n2. RandomizedSearchCV Hyperparameter Tuning:")
print("=" * 45)

# Create smaller parameter distributions for RandomizedSearchCV
# All entries are plain lists, so each search draws `n_iter` distinct
# combinations from the implied grid rather than from continuous distributions.
random_param_distributions = {
    'Logistic Regression': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        # Pinned to the smallest budget used by the exhaustive grid in this
        # notebook. Without it the estimator falls back to its much lower
        # default iteration cap, so the two search strategies would be compared
        # under different solver budgets (and warnings are silenced, so
        # non-convergence would go unnoticed).
        'max_iter': [1000]
    },
    'Decision Tree': {
        'max_depth': [3, 5, 7, 10, None],
        'min_samples_split': [2, 5, 10, 20],
        'min_samples_leaf': [1, 2, 4, 8],
        'criterion': ['gini', 'entropy']
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [3, 5, 7, 10, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
        'kernel': ['rbf', 'linear', 'poly']
    }
}

random_results = {}
random_models = {}

for model_name, model in models.items():
    print(f"\nRandom tuning {model_name}...")

    # Create RandomizedSearchCV
    # Same CV scheme and metric as the exhaustive search so scores are comparable;
    # random_state fixes which candidates get sampled.
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=random_param_distributions[model_name],
        n_iter=20,  # Number of parameter settings sampled
        cv=5,
        scoring='f1',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )

    # Fit the random search
    random_search.fit(X_train, y_train)

    # Store results: winning configuration, its mean CV score, refitted estimator
    random_results[model_name] = {
        'best_params': random_search.best_params_,
        'best_score': random_search.best_score_,
        'best_estimator': random_search.best_estimator_
    }
    random_models[model_name] = random_search.best_estimator_

    print(f"Best parameters: {random_search.best_params_}")
    print(f"Best CV score: {random_search.best_score_:.4f}")

print("\nRandomizedSearchCV completed for all models!")


In [None]:
# 3. Compare GridSearchCV vs RandomizedSearchCV results
print("\n3. Hyperparameter Tuning Comparison:")
print("=" * 40)

# Score gaps at or below this are floating-point noise and are reported as a
# tie instead of being credited to one of the two methods.
TIE_TOLERANCE = 1e-9


def _winning_method(grid_score, random_score):
    """Name the search method with the higher CV score, or 'Tie' when equal."""
    if abs(grid_score - random_score) <= TIE_TOLERANCE:
        return 'Tie'
    return 'GridSearchCV' if grid_score > random_score else 'RandomizedSearchCV'


def _difference_color(diff):
    """Green when the grid search is ahead, red when behind, gray on a tie."""
    if abs(diff) <= TIE_TOLERANCE:
        return 'gray'
    return 'green' if diff > 0 else 'red'


# Pull the scores once; both the table and the plots are built from these lists.
model_names = list(models.keys())
grid_scores = [grid_results[name]['best_score'] for name in model_names]
random_scores = [random_results[name]['best_score'] for name in model_names]
differences = [grid - rand for grid, rand in zip(grid_scores, random_scores)]

# Create comparison DataFrame
comparison_data = [
    {
        'Model': name,
        'GridSearchCV Score': grid,
        'RandomizedSearchCV Score': rand,
        'Difference': diff,
        'Best Method': _winning_method(grid, rand)
    }
    for name, grid, rand, diff in zip(model_names, grid_scores, random_scores, differences)
]

comparison_df = pd.DataFrame(comparison_data)
print(comparison_df.round(4))

# Visualize comparison
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Scores comparison: one pair of side-by-side bars per model
x = np.arange(len(model_names))
width = 0.35

axes[0].bar(x - width/2, grid_scores, width, label='GridSearchCV', alpha=0.8)
axes[0].bar(x + width/2, random_scores, width, label='RandomizedSearchCV', alpha=0.8)
axes[0].set_xlabel('Models')
axes[0].set_ylabel('F1 Score')
axes[0].set_title('GridSearchCV vs RandomizedSearchCV')
axes[0].set_xticks(x)
axes[0].set_xticklabels(model_names, rotation=45)
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Difference plot: positive bars mean the exhaustive search scored higher
colors = [_difference_color(diff) for diff in differences]
axes[1].bar(model_names, differences, color=colors, alpha=0.7)
axes[1].set_xlabel('Models')
axes[1].set_ylabel('Score Difference (Grid - Random)')
axes[1].set_title('GridSearchCV Advantage')
axes[1].tick_params(axis='x', rotation=45)
axes[1].grid(True, alpha=0.3)
axes[1].axhline(y=0, color='black', linestyle='--', alpha=0.5)

plt.tight_layout()
plt.show()


In [None]:
# 4. Final Model Evaluation
# Imports are kept local so this cell runs on its own after the search cells.
import os

import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

print("\n4. Final Model Evaluation on Test Set:")
print("=" * 45)


def _test_metrics(fitted_model, X_eval, y_eval):
    """Score a fitted classifier on held-out data.

    Returns a dict with 'accuracy', 'precision', 'recall', 'f1' and 'auc'.
    'auc' is None when the estimator has no predict_proba method.
    NOTE(review): the metric calls use their defaults and the second
    predict_proba column, which presumes a binary target whose positive class
    is labelled 1 -- confirm against the data.
    """
    predictions = fitted_model.predict(X_eval)
    if hasattr(fitted_model, 'predict_proba'):
        auc = roc_auc_score(y_eval, fitted_model.predict_proba(X_eval)[:, 1])
    else:
        auc = None
    return {
        'accuracy': accuracy_score(y_eval, predictions),
        'precision': precision_score(y_eval, predictions),
        'recall': recall_score(y_eval, predictions),
        'f1': f1_score(y_eval, predictions),
        'auc': auc,
    }


def _format_auc(auc):
    """Render an AUC with four decimals, or 'N/A' when it could not be computed."""
    # The None check must happen outside the format spec: a conditional placed
    # after the ':' of an f-string field is treated as part of the spec and
    # raises ValueError. Testing `is None` also keeps a legitimate 0.0 printable.
    return 'N/A' if auc is None else f"{auc:.4f}"


# Evaluate both GridSearchCV and RandomizedSearchCV models
final_results = {}

for model_name in models.keys():
    print(f"\n{model_name}:")
    print("-" * 20)

    # Pair each search method with its refitted estimator and stored summary.
    candidates = {
        'GridSearchCV': (grid_models[model_name], grid_results[model_name]),
        'RandomizedSearchCV': (random_models[model_name], random_results[model_name]),
    }

    evaluated = {}
    for method, (fitted_model, _summary) in candidates.items():
        metrics = _test_metrics(fitted_model, X_test, y_test)
        evaluated[method] = metrics
        print(
            f"{method} - Accuracy: {metrics['accuracy']:.4f}, "
            f"F1: {metrics['f1']:.4f}, AUC: {_format_auc(metrics['auc'])}"
        )

    # Store best model
    # The winner is chosen on the cross-validated score from the training data.
    # Choosing on test-set F1 would turn the hold-out set into a selection set
    # and make the test metrics reported here optimistically biased.
    grid_cv_score = grid_results[model_name]['best_score']
    random_cv_score = random_results[model_name]['best_score']
    chosen_method = 'GridSearchCV' if grid_cv_score > random_cv_score else 'RandomizedSearchCV'
    chosen_model, chosen_summary = candidates[chosen_method]

    final_results[model_name] = {
        'model': chosen_model,
        'method': chosen_method,
        'cv_score': chosen_summary['best_score'],
        'params': chosen_summary['best_params'],
        # accuracy / precision / recall / f1 / auc measured on the test set
        **evaluated[chosen_method],
    }

# Find the overall best model (again by CV score, not by test-set performance)
best_model_name = max(final_results.keys(), key=lambda name: final_results[name]['cv_score'])
best_model = final_results[best_model_name]

print(f"\nOverall Best Model: {best_model_name}")
print(f"Method: {best_model['method']}")
print(f"CV F1-Score (selection criterion): {best_model['cv_score']:.4f}")
print(f"F1-Score: {best_model['f1']:.4f}")
print(f"Accuracy: {best_model['accuracy']:.4f}")
print(f"Best Parameters: {best_model['params']}")

# Save the best model
# Create the output directory first so a fresh checkout does not fail on dump.
os.makedirs('models', exist_ok=True)
joblib.dump(best_model['model'], 'models/best_model.pkl')
print("\nBest model saved to 'models/best_model.pkl'")
