# Machine Learning Model Training and Evaluation
This notebook extends the data loading template to train and evaluate multiple ML models using the pre-scaled dataset.

## 📁 Expected Project Structure
```plaintext
Your Project/
├── 01_project_management/
├── 02_data/
│   ├── Original_data/
│   └── Processed_data/    ← Pre-scaled data should be here
├── 03_notebooks/          ← Run notebooks from here
│   ├── src/               ← Custom modules live here
│   │   └── file_handler.py
│   └── ml_model_training.ipynb  ← This notebook
├── 04_analyses/
└── 05_results/
```

## 🎯 Objectives
1. Load pre-scaled dataset
2. Split data into training and test sets
3. Train multiple ML models with cross-validation
4. Optimize hyperparameters using GridSearchCV
5. Compare model performance and select the best model

---

## 📚 1. Import Libraries and Setup

In [None]:
# Core libraries
import sys
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime

# Suppress warnings for cleaner output
# NOTE: this silences every warning category for the whole kernel session.
warnings.filterwarnings('ignore')

# Add src folder to Python path (relative to the current working directory).
# Guarded so that re-running this cell does not append the same entry again.
src_dir = str(Path.cwd() / 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Import custom modules
from file_handler import setup_paths, load_data_with_detection_enhanced
from data_exporter import export_data_interactive, quick_export

# Machine Learning libraries
from sklearn.model_selection import (
    train_test_split, 
    cross_val_score, 
    GridSearchCV,
    StratifiedKFold
)
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix,
    roc_auc_score,
    roc_curve
)

# ML Models
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance: matplotlib dark-grid style first, then the seaborn 'husl'
# palette (order kept: the palette is applied on top of the style).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Confirmation banner with the current timestamp
print("‚úÖ All libraries imported successfully!")
print("üìÖ Analysis date: " + datetime.now().strftime('%Y-%m-%d %H:%M'))

## 📥 2. Load Pre-Scaled Dataset

In [None]:
# Set up project paths interactively
# You'll be prompted to select the folder containing your pre-scaled data
project_root, input_path, output_path = setup_paths()

# Load the pre-scaled dataset; only the first element returned by the loader is used
print("\nüìä Loading pre-scaled dataset...")
loaded = load_data_with_detection_enhanced(input_path)
df = loaded[0]

# Basic overview: dimensions, memory footprint and how many columns per dtype
n_rows, n_cols = df.shape
memory_mb = df.memory_usage().sum() / 1024**2
print("\nüìã Dataset Information:")
print(f"   Shape: {n_rows:,} rows √ó {n_cols} columns")
print(f"   Memory usage: {memory_mb:.2f} MB")
print("\n   Column types:")
print(df.dtypes.value_counts())

In [None]:
# Reduce the dataset to a single year: keep the rows whose DATE, rendered as
# text, contains the year string below.
year_mask = df['DATE'].astype(str).str.contains('1960')  #<-----INSERT YEAR HERE
dfyear = df.loc[year_mask]
dfyear

In [None]:
# Summary statistics (pandas `describe`) for the single-year subset
dfyear.describe()

In [None]:
# DATE and MONTH are not scaled with the rest of the data, so they are left out.
unscaled_columns = ['DATE', 'MONTH']
notempyear = dfyear.drop(columns=unscaled_columns)

## 🔄 3. Prepare Data for Machine Learning

### Define Target Variable
First, we need to identify or create a target variable for classification. Adjust this based on your project's specific use case.

In [None]:
# Interactive column selection and target creation.
#
# Inputs : `notempyear` (DataFrame from the previous cell); `pd` from the import cell.
# Prompts: column groups, optional keyword filter, target source column, target type.
# Outputs: `selected_columns`, `feature_columns`, `target_column`, `temp_columns`,
#          `config_summary`, and (when a target is requested) `notempyear['target']`.

# Comprehensive column analysis
print("üìä COLUMN ANALYSIS AND GROUPING")
print("=" * 80)

# Analyse every column except a 'target' column left over from an earlier run of
# this cell; otherwise a re-run would offer the old target as a feature/source.
candidate_df = notempyear.drop(columns=['target'], errors='ignore')

# Create a DataFrame to display column information
column_info = pd.DataFrame({
    'Column Name': candidate_df.columns,
    'Data Type': candidate_df.dtypes.values,
    'Non-Null Count': candidate_df.count().values,
    'Null %': (candidate_df.isnull().sum() / len(candidate_df) * 100).round(2).values
})

# Display all columns in a nice table format
print("\nüìã All Available Columns:")
print(column_info.to_string(index=False))

# Group columns by common patterns
print("\n\nüîç COLUMN GROUPING BY PATTERNS")
print("-" * 80)

# Define pattern groups (case-insensitive substring match on the column name)
patterns = {
    'Statistical Measures': ['mean', 'max', 'min', 'std', 'avg', 'median', 'sum', 'count'],
    'Temperature Related': ['temp', 'temperature', 'celsius', 'fahrenheit'],
    'Time Related': ['date', 'time', 'year', 'month', 'day', 'hour', 'minute'],
    'Percentage/Ratio': ['pct', 'percent', 'ratio', 'rate'],
    'Categorical Likely': ['id', 'name', 'type', 'category', 'class', 'group'],
    'Measurement Values': ['value', 'amount', 'quantity', 'level', 'size', 'volume']
}

# Group columns by patterns. A column may belong to several groups; it is
# "unmatched" only if no group claims it.
grouped_columns = {}
unmatched_columns = list(candidate_df.columns)

for group_name, keywords in patterns.items():
    matched = []
    for col in candidate_df.columns:
        col_lower = str(col).lower()
        if any(keyword in col_lower for keyword in keywords):
            matched.append(col)
            if col in unmatched_columns:
                unmatched_columns.remove(col)
    if matched:
        grouped_columns[group_name] = matched

# Display grouped columns with numbers for selection
print("\nColumn Groups Found:")
group_list = list(grouped_columns.keys())
for i, (group_name, cols) in enumerate(grouped_columns.items(), 1):
    print(f"\n{i}. üè∑Ô∏è {group_name} ({len(cols)} columns):")
    # Show first 5 columns as examples
    for col in cols[:5]:
        dtype = candidate_df[col].dtype
        print(f"   ‚Ä¢ {col} ({dtype})")
    if len(cols) > 5:
        print(f"   ... and {len(cols) - 5} more")

if unmatched_columns:
    group_list.append("Other Columns")
    print(f"\n{len(group_list)}. ‚ùì Other Columns ({len(unmatched_columns)}):")
    for col in unmatched_columns[:5]:
        print(f"   ‚Ä¢ {col} ({candidate_df[col].dtype})")
    if len(unmatched_columns) > 5:
        print(f"   ... and {len(unmatched_columns) - 5} more")
    grouped_columns["Other Columns"] = unmatched_columns

# INTERACTIVE SELECTION SECTION
print("\n\nüéØ SELECT COLUMNS FOR ANALYSIS")
print("=" * 80)

print("\nüìù Instructions:")
print("1. First, select which column groups to include")
print("2. Then, optionally filter by specific keywords within those groups")
print("3. Finally, choose a column for target variable creation")

# Prompt for group selection
print("\n" + "-"*60)
print("Which column groups do you want to include?")
print("Enter numbers separated by commas (e.g., 1,3,5) or 'all' for all groups:")
for i, group in enumerate(group_list, 1):
    num_cols = len(grouped_columns.get(group, []))
    print(f"  {i}. {group} ({num_cols} columns)")

# Get user input for groups
user_groups = input("\nüëâ Your selection: ").strip()

# Process group selection
if user_groups.lower() == 'all':
    selected_groups = list(group_list)
    selected_columns = list(candidate_df.columns)
else:
    try:
        group_indices = [int(x.strip()) - 1 for x in user_groups.split(',')]
        # dict.fromkeys removes repeats (e.g. "1,1") while keeping the entered order
        selected_groups = list(dict.fromkeys(
            group_list[i] for i in group_indices if 0 <= i < len(group_list)
        ))
        if not selected_groups:
            # Only out-of-range numbers were given: treat like any other invalid input
            raise ValueError("no valid group number entered")
        selected_columns = []
        for group in selected_groups:
            selected_columns.extend(grouped_columns.get(group, []))
        # A column can sit in several groups; keep each one once so the feature
        # matrix never contains duplicated columns.
        selected_columns = list(dict.fromkeys(selected_columns))
    except ValueError:
        print("‚ö†Ô∏è Invalid input. Using all columns.")
        selected_groups = list(group_list)
        selected_columns = list(candidate_df.columns)

print(f"\n‚úÖ Selected groups: {', '.join(selected_groups)}")
print(f"   Total columns selected: {len(selected_columns)}")

# Prompt for keyword filtering
print("\n" + "-"*60)
print("Do you want to filter columns by specific keywords?")
print("For example: 'mean' for only mean values, 'temp,mean' for temperature means")
print("Or press Enter to skip filtering")

keyword_filter = input("\nüëâ Enter keywords (comma-separated) or press Enter: ").strip()

# Apply keyword filtering if provided (a column must contain ALL keywords)
if keyword_filter:
    keywords = [k.strip().lower() for k in keyword_filter.split(',')]
    filtered_columns = [
        col for col in selected_columns
        if all(keyword in str(col).lower() for keyword in keywords)
    ]

    if filtered_columns:
        print(f"\n‚úÖ Filtered to {len(filtered_columns)} columns containing all keywords: {keywords}")
        selected_columns = filtered_columns
        # Show filtered columns
        print("\nFiltered columns:")
        for col in filtered_columns[:10]:
            print(f"   ‚Ä¢ {col}")
        if len(filtered_columns) > 10:
            print(f"   ... and {len(filtered_columns) - 10} more")
    else:
        print(f"\n‚ö†Ô∏è No columns found with all keywords: {keywords}. Using original selection.")

# TARGET VARIABLE CREATION
print("\n\nüéØ CREATE TARGET VARIABLE")
print("=" * 80)

# Filter for numerical columns only. select_dtypes('number') covers every
# integer/float width (not just int64/float64) and excludes booleans.
numeric_columns = set(candidate_df.select_dtypes(include='number').columns)
numerical_selected = [col for col in selected_columns if col in numeric_columns]

if numerical_selected:
    print(f"\nFound {len(numerical_selected)} numerical columns for potential targets:")
    for i, col in enumerate(numerical_selected[:15], 1):  # Show up to 15
        print(f"  {i}. {col}")
    if len(numerical_selected) > 15:
        print(f"  ... and {len(numerical_selected) - 15} more")

    print("\n" + "-"*60)
    print("Target variable creation options:")
    print("‚Ä¢ Press Enter to use ALL filtered columns as features (no target creation)")
    print("‚Ä¢ Enter a number to select a specific column for target creation")
    print("‚Ä¢ Enter 'skip' to proceed without creating a target")

    target_choice = input("\nüëâ Your choice: ").strip()

    # Process target selection
    if target_choice == '':  # Enter pressed - use all columns as features
        print(f"\n‚úÖ Using all {len(selected_columns)} filtered columns as features")
        print("   No target variable created - suitable for unsupervised learning")
        target_column = None
        feature_columns = selected_columns.copy()

    elif target_choice.lower() == 'skip':
        print("\n‚úÖ Skipping target creation")
        target_column = None
        feature_columns = selected_columns.copy()

    else:  # Number entered - create target from specific column
        try:
            target_idx = int(target_choice) - 1
            if 0 <= target_idx < len(numerical_selected):
                target_column = numerical_selected[target_idx]
            else:
                print(f"‚ö†Ô∏è Invalid number. Using first column.")
                target_column = numerical_selected[0]
        except ValueError:
            print(f"‚ö†Ô∏è Invalid input. Using first column.")
            target_column = numerical_selected[0]

        # Create target variable
        print(f"\n‚úÖ Using column for target: {target_column}")

        # Ask for target type
        print("\nHow should the target be created?")
        print("  1. Binary (above/below median)")
        print("  2. Binary (above/below mean)")
        print("  3. Multi-class (3 equal groups)")
        print("  4. Multi-class (5 equal groups)")

        target_type = input("\nüëâ Your choice (1-4, default=1): ").strip()

        if target_type == '2':
            threshold = notempyear[target_column].mean()
            notempyear['target'] = (notempyear[target_column] > threshold).astype(int)
            print(f"\n‚úÖ Created binary target (above/below mean: {threshold:.2f})")
        elif target_type == '3':
            # labels=False yields integer bin codes and stays valid when
            # duplicates='drop' merges bins (fixed label lists would raise then).
            notempyear['target'] = pd.qcut(notempyear[target_column], q=3, labels=False, duplicates='drop')
            print(f"\n‚úÖ Created 3-class target (equal groups)")
        elif target_type == '4':
            notempyear['target'] = pd.qcut(notempyear[target_column], q=5, labels=False, duplicates='drop')
            print(f"\n‚úÖ Created 5-class target (equal groups)")
        else:  # Default: binary median
            threshold = notempyear[target_column].median()
            notempyear['target'] = (notempyear[target_column] > threshold).astype(int)
            print(f"\n‚úÖ Created binary target (above/below median: {threshold:.2f})")

        print(f"   Class distribution: {notempyear['target'].value_counts().sort_index().to_dict()}")

        # Define feature columns (exclude target and source column)
        feature_columns = [col for col in selected_columns if col not in ['target', target_column]]

else:
    print("\n‚ö†Ô∏è No numerical columns found in selection.")
    target_column = None
    feature_columns = selected_columns.copy()

# No target requested in this run: continue with the frame that has no
# (possibly stale) 'target' column so the summary below reflects this run.
if target_column is None:
    notempyear = candidate_df

# Look for temperature columns (for compatibility)
temp_columns = [col for col in notempyear.columns if 'temp' in str(col).lower()]

# SUMMARY
print("\n\nüìå ANALYSIS CONFIGURATION SUMMARY")
print("=" * 80)
print(f"‚úì Selected column groups: {', '.join(selected_groups)}")
if keyword_filter:
    print(f"‚úì Keyword filters applied: {keyword_filter}")
print(f"‚úì Total columns for analysis: {len(feature_columns)} features")
if 'target' in notempyear.columns:
    print(f"‚úì Target variable created from: {target_column}")
    print(f"‚úì Target type: {notempyear['target'].nunique()} classes")
else:
    print(f"‚úì No target variable - ready for unsupervised learning")

print("\nüìä VARIABLES READY FOR NEXT CELLS:")
print("-" * 40)
print(f"‚Ä¢ notempyear - DataFrame {'with target column' if 'target' in notempyear.columns else '(no target)'}")
print(f"‚Ä¢ feature_columns - List of {len(feature_columns)} selected features")
if target_column:
    print(f"‚Ä¢ target_column = '{target_column}'")
else:
    print(f"‚Ä¢ target_column = None")
print(f"‚Ä¢ temp_columns - Temperature columns found: {len(temp_columns)}")

# Save configuration for reference
config_summary = {
    'selected_groups': selected_groups,
    'keyword_filter': keyword_filter if keyword_filter else None,
    'feature_count': len(feature_columns),
    'target_column': target_column,
    'target_classes': notempyear['target'].nunique() if 'target' in notempyear.columns else None,
    'analysis_type': 'supervised' if 'target' in notempyear.columns else 'unsupervised'
}

print("\nüíæ Configuration saved in 'config_summary' dictionary")
print(f"   Analysis type: {config_summary['analysis_type']}")

In [None]:
# NOTE: this rebinds `df` to the same object as `notempyear` (no copy is made),
# so columns added to `df` in later cells also appear on `notempyear`, and the
# originally loaded frame is no longer reachable through the name `df`.
df = notempyear

In [None]:
# Fallback: create a binary target from the first temperature column.
# This only runs when no target was created earlier; a target (and the matching
# feature list) chosen in the interactive cell above is left untouched instead
# of being overwritten.

# Check available columns
print("üìã Available columns:")
print(df.columns.tolist()[:20], "...")  # Show first 20 columns

# Look for temperature columns
temp_columns = [col for col in df.columns if 'temp' in col.lower()]
print(f"\nüå°Ô∏è Temperature columns found: {temp_columns}")

# globals().get avoids a NameError when the interactive selection cell was not run
existing_target_source = globals().get('target_column')

if 'target' in df.columns and existing_target_source is not None:
    # A target already exists (from the interactive cell or an earlier run of
    # this cell): keep it together with the current feature_columns.
    print(f"\nKeeping existing target created from {existing_target_source}")
    print(f"   Class distribution: {df['target'].value_counts().to_dict()}")
elif temp_columns:
    # Use the first temperature column found
    target_column = temp_columns[0]
    median_temp = df[target_column].median()
    df['target'] = (df[target_column] > median_temp).astype(int)
    print(f"\n‚úÖ Created binary target based on {target_column}")
    print(f"   Class distribution: {df['target'].value_counts().to_dict()}")

    # Remove the original temperature column from features
    feature_columns = [col for col in df.columns if col not in ['target', target_column]]
else:
    print("\n‚ö†Ô∏è No temperature columns found. Please create your target variable manually.")
    # Manual target creation example:
    # df['target'] = your_target_creation_logic_here
    # feature_columns = [col for col in df.columns if col != 'target']

In [None]:
# Feature matrix (selected columns) and label vector ('target' column)
X, y = df[feature_columns], df['target']

print(f"\nüìä Feature matrix shape: {X.shape}")
print(f"üéØ Target vector shape: {y.shape}")
print(f"\nüìà Target distribution:")
class_shares = y.value_counts(normalize=True).round(3)
print(class_shares)

# Hold out 20% of the rows for testing; stratify on y and fix the seed
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"\n‚úÇÔ∏è Data split completed:")
print(f"   Training set: {n_train:,} samples")
print(f"   Test set: {n_test:,} samples")

## 🤖 4. Define Models and Hyperparameter Grids

We'll use efficient parameter grids optimized for consumer-grade laptops.

### Models included:
- **Logistic Regression**: Linear model for baseline performance
- **Decision Tree**: Single tree for interpretability
- **Random Forest**: Ensemble of trees for better accuracy
- **Gradient Boosting**: Sequential ensemble that builds trees to correct previous errors
- **Support Vector Machine**: For complex non-linear boundaries

In [None]:
# Define models and their parameter grids
# Using smaller grids for efficiency on consumer laptops.
# Each entry maps a display name to an unfitted estimator ('model') and the
# GridSearchCV parameter grid ('params'). 'Gradient Boosting' is included so the
# model list matches the notebook text and the ensemble-comparison cell.

models = {
    'Logistic Regression': {
        'model': LogisticRegression(max_iter=1000, random_state=42),
        'params': {
            'C': [0.01, 0.1, 1, 10],
            'penalty': ['l2'],
            'solver': ['lbfgs', 'liblinear']
        }
    },
    'Decision Tree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'max_depth': [3, 5, 7, 10],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42, n_jobs=-1),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [5, 10, 15],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2]
        }
    },
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(random_state=42),
        # Deliberately small grid (8 combinations): boosting fits trees sequentially
        'params': {
            'n_estimators': [50, 100],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3, 5]
        }
    },
    'Support Vector Machine': {
        # probability=True is required for predict_proba on SVC
        'model': SVC(random_state=42, probability=True),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['rbf', 'linear'],
            'gamma': ['scale', 'auto']
        }
    }
}

print("‚úÖ Models and parameter grids defined!")
print(f"\nüìã Models to train: {list(models.keys())}")

## 🏃‍♂️ 5. Train Models with Cross-Validation and GridSearchCV

In [None]:
# Train every configured model with GridSearchCV and collect its results.
# Fills `best_models` (name -> refit best estimator) and `results`
# (name -> dict of parameters, scores, predictions and timing).

# Initialize results storage
results = {}
best_models = {}

# Define cross-validation strategy
cv_folds = 5
cv_strategy = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

# Positive-class probabilities (column 1 of predict_proba) only make sense for
# a two-class target; for multi-class targets no probability vector is stored,
# so the ROC/prediction-export cells fall back to their "not available" paths.
is_binary_target = y_train.nunique() == 2

print(f"üîÑ Starting model training with {cv_folds}-fold cross-validation...\n")

# Train each model
for model_name, model_info in models.items():
    print(f"{'='*60}")
    print(f"ü§ñ Training {model_name}...")
    start_time = datetime.now()

    # Perform GridSearchCV
    grid_search = GridSearchCV(
        estimator=model_info['model'],
        param_grid=model_info['params'],
        cv=cv_strategy,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )

    # Fit the model
    grid_search.fit(X_train, y_train)

    # Store the best model
    best_estimator = grid_search.best_estimator_
    best_models[model_name] = best_estimator

    # Make predictions
    y_pred = grid_search.predict(X_test)
    if is_binary_target and hasattr(best_estimator, 'predict_proba'):
        y_pred_proba = grid_search.predict_proba(X_test)[:, 1]
    else:
        y_pred_proba = None

    # Calculate metrics (per-fold accuracy of the best estimator on the training set)
    accuracy = accuracy_score(y_test, y_pred)
    cv_scores = cross_val_score(best_estimator, X_train, y_train, cv=cv_strategy, scoring='accuracy')

    # Store results
    results[model_name] = {
        'best_params': grid_search.best_params_,
        'best_cv_score': grid_search.best_score_,
        'cv_scores': cv_scores,
        'test_accuracy': accuracy,
        'predictions': y_pred,
        'pred_proba': y_pred_proba,
        'training_time': (datetime.now() - start_time).total_seconds()
    }

    print(f"\n‚úÖ {model_name} training completed!")
    print(f"   Best parameters: {grid_search.best_params_}")
    print(f"   Best CV score: {grid_search.best_score_:.4f}")
    print(f"   Test accuracy: {accuracy:.4f}")
    print(f"   Training time: {results[model_name]['training_time']:.2f} seconds")

print(f"\n{'='*60}")
print("‚úÖ All models trained successfully!")

## 📊 6. Compare Model Performance

In [None]:
# One summary record per trained model
comparison_data = [
    {
        'Model': model_name,
        'CV Mean Score': result['cv_scores'].mean(),
        'CV Std': result['cv_scores'].std(),
        'Test Accuracy': result['test_accuracy'],
        'Training Time (s)': result['training_time'],
    }
    for model_name, result in results.items()
]

# Rank the models by test accuracy, best first
comparison_df = pd.DataFrame(comparison_data).sort_values('Test Accuracy', ascending=False)

print("üìä Model Performance Comparison:")
print("=" * 80)
print(comparison_df.to_string(index=False))

# The first row after sorting is the winner.
# NOTE(review): the winner is picked on test accuracy, so its reported test
# score is an optimistic estimate.
top_row = comparison_df.iloc[0]
best_model_name = top_row['Model']
print(f"\nüèÜ Best Model: {best_model_name} (Test Accuracy: {top_row['Test Accuracy']:.4f})")

In [None]:
# Four-panel overview of the model comparison table
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ax1, ax2, ax3, ax4 = axes.ravel()  # row-major: top-left, top-right, bottom-left, bottom-right

model_names = comparison_df['Model']
test_accuracy = comparison_df['Test Accuracy']
train_seconds = comparison_df['Training Time (s)']

# Panel 1: test accuracy per model, value printed above each bar
bars1 = ax1.bar(model_names, test_accuracy)
ax1.set_title('Model Test Accuracy Comparison', fontsize=14, fontweight='bold')
ax1.set_ylabel('Test Accuracy')
ax1.set_ylim(0, 1.1)
for bar, acc in zip(bars1, test_accuracy):
    label_x = bar.get_x() + bar.get_width()/2
    ax1.text(label_x, bar.get_height() + 0.01, f'{acc:.3f}', ha='center', va='bottom')

# Panel 2: mean cross-validation score with one standard deviation as error bar
ax2.errorbar(
    model_names,
    comparison_df['CV Mean Score'],
    yerr=comparison_df['CV Std'],
    fmt='o-',
    capsize=5,
    capthick=2,
)
ax2.set_title('Cross-Validation Scores (with std)', fontsize=14, fontweight='bold')
ax2.set_ylabel('CV Score')
ax2.set_ylim(0, 1.1)

# Panel 3: training time per model, value printed above each bar
bars3 = ax3.bar(model_names, train_seconds)
ax3.set_title('Training Time Comparison', fontsize=14, fontweight='bold')
ax3.set_ylabel('Training Time (seconds)')
for bar, seconds in zip(bars3, train_seconds):
    label_x = bar.get_x() + bar.get_width()/2
    ax3.text(label_x, bar.get_height() + 0.5, f'{seconds:.1f}s', ha='center', va='bottom')

# Panel 4: accuracy against training time, each point labelled with its model
scatter = ax4.scatter(train_seconds, test_accuracy, s=200, alpha=0.6)
for name, seconds, acc in zip(model_names, train_seconds, test_accuracy):
    ax4.annotate(name, (seconds, acc), xytext=(5, 5), textcoords='offset points')
ax4.set_title('Accuracy vs Training Time Trade-off', fontsize=14, fontweight='bold')
ax4.set_xlabel('Training Time (seconds)')
ax4.set_ylabel('Test Accuracy')

plt.tight_layout()
plt.show()

### 📊 Ensemble Methods Comparison

Let's specifically compare Random Forest vs Gradient Boosting performance:

In [None]:
# Compare ensemble methods if both were trained.
# Nothing is printed or plotted unless `results` holds both models listed below.
ensemble_models = ['Random Forest', 'Gradient Boosting']
if all(model in results for model in ensemble_models):
    print("\nüå≤ ENSEMBLE METHODS COMPARISON")
    print("="*60)

    # Create comparison (rows keep comparison_df's order, i.e. best test accuracy first)
    ensemble_comparison = comparison_df[comparison_df['Model'].isin(ensemble_models)]

    for _, row in ensemble_comparison.iterrows():
        print(f"\n{row['Model']}:")
        print(f"  - Test Accuracy: {row['Test Accuracy']:.4f}")
        print(f"  - CV Score: {row['CV Mean Score']:.4f} (¬±{row['CV Std']:.4f})")
        print(f"  - Training Time: {row['Training Time (s)']:.2f}s")

    # Visualize ensemble comparison
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Reindex to the order of `ensemble_models` so every bar sits under its own
    # tick label (comparison_df is sorted by accuracy, which may differ).
    ensemble_data = ensemble_comparison.set_index('Model').reindex(ensemble_models)
    x = np.arange(len(ensemble_models))
    bar_width = 0.35

    # Side-by-side bars: test accuracy on the left, CV score on the right
    ax1.bar(x - bar_width / 2, ensemble_data['Test Accuracy'], width=bar_width,
            alpha=0.7, label='Test Accuracy')
    ax1.bar(x + bar_width / 2, ensemble_data['CV Mean Score'], width=bar_width,
            alpha=0.7, label='CV Score')
    ax1.set_xticks(x)
    ax1.set_xticklabels(ensemble_models)
    ax1.set_ylabel('Score')
    ax1.set_title('Ensemble Methods: Accuracy Comparison')
    ax1.legend()
    ax1.set_ylim(0, 1.1)

    # Training time vs accuracy trade-off
    ax2.scatter(ensemble_data['Training Time (s)'], ensemble_data['Test Accuracy'],
                s=300, alpha=0.7)
    for model, row in ensemble_data.iterrows():
        ax2.annotate(model, (row['Training Time (s)'], row['Test Accuracy']),
                     xytext=(5, 5), textcoords='offset points')
    ax2.set_xlabel('Training Time (seconds)')
    ax2.set_ylabel('Test Accuracy')
    ax2.set_title('Ensemble Methods: Efficiency vs Performance')

    plt.tight_layout()
    plt.show()

    # Explain the difference
    print("\nüìö Key Differences:")
    print("  ‚Ä¢ Random Forest: Builds trees in parallel (faster)")
    print("  ‚Ä¢ Gradient Boosting: Builds trees sequentially (often more accurate)")
    print("  ‚Ä¢ Random Forest: Less prone to overfitting")
    print("  ‚Ä¢ Gradient Boosting: Can achieve higher accuracy with tuning")

In [None]:
# Fitted estimator and stored results of the winning model
best_model, best_result = best_models[best_model_name], results[best_model_name]
best_predictions = best_result['predictions']

print(f"üèÜ Detailed Analysis of {best_model_name}")
print("=" * 60)

# Per-class precision / recall / F1 on the test set
print("\nüìã Classification Report:")
print(classification_report(y_test, best_predictions))

# Confusion matrix of test labels vs. predictions (reused by the plotting cell)
cm = confusion_matrix(y_test, best_predictions)
print("\nüìä Confusion Matrix:")
print(cm)

In [None]:
# Visualize confusion matrix and ROC curve for best model
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# 1. Confusion Matrix Heatmap (`cm` comes from the previous cell)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1)
ax1.set_title(f'Confusion Matrix - {best_model_name}', fontsize=14, fontweight='bold')
ax1.set_xlabel('Predicted')
ax1.set_ylabel('Actual')

# 2. ROC Curve. roc_curve / roc_auc_score are called here with a single score
# vector, which only works for a two-class target, so the curve is drawn only
# when probabilities exist AND the test labels are binary.
pred_proba = best_result['pred_proba']
is_binary_target = pd.Series(y_test).nunique() == 2

if pred_proba is not None and is_binary_target:
    fpr, tpr, _ = roc_curve(y_test, pred_proba)
    auc_score = roc_auc_score(y_test, pred_proba)

    ax2.plot(fpr, tpr, 'b-', linewidth=2, label=f'ROC Curve (AUC = {auc_score:.3f})')
    ax2.plot([0, 1], [0, 1], 'k--', alpha=0.5)  # chance-level diagonal
    ax2.set_xlabel('False Positive Rate')
    ax2.set_ylabel('True Positive Rate')
    ax2.set_title(f'ROC Curve - {best_model_name}', fontsize=14, fontweight='bold')
    ax2.legend(loc='lower right')
    ax2.grid(True, alpha=0.3)
else:
    if pred_proba is None:
        roc_note = 'ROC Curve not available\nfor this model'
    else:
        roc_note = 'ROC Curve not available\nfor multi-class targets'
    ax2.text(0.5, 0.5, roc_note,
             ha='center', va='center', transform=ax2.transAxes)

plt.tight_layout()
plt.show()

## 📈 8. Feature Importance Analysis (if applicable)

In [None]:
# Explain the winning model: impurity-based importances for the tree models,
# signed coefficients for logistic regression, a notice for anything else.
if best_model_name in ('Decision Tree', 'Random Forest'):
    feature_importance = (
        pd.DataFrame({
            'feature': X_train.columns,
            'importance': best_model.feature_importances_,
        })
        .sort_values('importance', ascending=False)
    )
    top_features = feature_importance.head(20)

    # Table of the 20 highest-ranked features
    print(f"\nüìä Top 20 Most Important Features ({best_model_name}):")
    print(top_features.to_string(index=False))

    # Horizontal bar chart, most important feature at the top
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(top_features['feature'], top_features['importance'])
    ax.set_xlabel('Feature Importance')
    ax.set_title(f'Top 20 Feature Importances - {best_model_name}', fontsize=14, fontweight='bold')
    ax.invert_yaxis()
    plt.tight_layout()
    plt.show()

elif best_model_name == 'Logistic Regression':
    # First row of coef_, ranked by absolute size
    coefficients = (
        pd.DataFrame({
            'feature': X_train.columns,
            'coefficient': best_model.coef_[0],
        })
        .assign(abs_coefficient=lambda frame: frame['coefficient'].abs())
        .sort_values('abs_coefficient', ascending=False)
    )
    top_coef = coefficients.head(20)

    print(f"\nüìä Top 20 Most Important Features ({best_model_name}):")
    print(top_coef[['feature', 'coefficient']].to_string(index=False))

    # Negative coefficients in red, all others in blue
    colors = []
    for value in top_coef['coefficient']:
        colors.append('red' if value < 0 else 'blue')

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(top_coef['feature'], top_coef['coefficient'], color=colors)
    ax.set_xlabel('Coefficient Value')
    ax.set_title(f'Top 20 Feature Coefficients - {best_model_name}', fontsize=14, fontweight='bold')
    ax.invert_yaxis()
    plt.tight_layout()
    plt.show()

else:
    print(f"\nüìä Feature importance not directly available for {best_model_name}")

## 💾 9. Save Results and Best Model

In [None]:
# Persist the run: JSON summary, pickled best model and test-set predictions.
import json
import joblib

# One timestamp for the whole cell so the three output files always share the
# same suffix (separate datetime.now() calls can straddle a minute boundary).
run_time = datetime.now()
run_stamp = run_time.strftime('%Y%m%d_%H%M')
model_slug = best_model_name.lower().replace(' ', '_')

# Accept a str or Path and make sure the folder exists before writing
output_dir = Path(output_path)
output_dir.mkdir(parents=True, exist_ok=True)

# Prepare results summary
results_summary = {
    'analysis_date': run_time.strftime('%Y-%m-%d %H:%M'),
    'dataset_shape': df.shape,
    'train_size': X_train.shape[0],
    'test_size': X_test.shape[0],
    'model_comparison': comparison_df.to_dict('records'),
    'best_model': {
        'name': best_model_name,
        'parameters': results[best_model_name]['best_params'],
        'test_accuracy': results[best_model_name]['test_accuracy'],
        'cv_mean_score': results[best_model_name]['cv_scores'].mean(),
        'cv_std_score': results[best_model_name]['cv_scores'].std()
    }
}

# Save results summary as JSON
results_filename = f"ml_results_{run_stamp}.json"
results_path = output_dir / results_filename

with open(results_path, 'w', encoding='utf-8') as f:
    json.dump(results_summary, f, indent=4)

print(f"‚úÖ Results summary saved to: {results_path}")

# Save the best model
model_filename = f"best_model_{model_slug}_{run_stamp}.pkl"
model_path = output_dir / model_filename

joblib.dump(best_model, model_path)
print(f"‚úÖ Best model saved to: {model_path}")

# Save predictions (probability column only when one was stored for the model)
predictions_df = pd.DataFrame({
    'actual': y_test,
    'predicted': best_result['predictions']
})
if best_result['pred_proba'] is not None:
    predictions_df['probability'] = best_result['pred_proba']

predictions_filename = f"predictions_{model_slug}_{run_stamp}.csv"
predictions_path = output_dir / predictions_filename
predictions_df.to_csv(predictions_path, index=False)
print(f"‚úÖ Predictions saved to: {predictions_path}")

## 🎯 10. Conclusions and Next Steps

### Summary of Results

In [None]:
# Final recap: best model, its tuned parameters, ranking of all models, next steps.
print("\n" + "="*60)
print("üìä FINAL ANALYSIS SUMMARY")
print("="*60)

best_summary = results[best_model_name]
best_cv_scores = best_summary['cv_scores']

print(f"\nüéØ Best Model: {best_model_name}")
print(f"   - Test Accuracy: {best_summary['test_accuracy']:.4f}")
print(f"   - CV Mean Score: {best_cv_scores.mean():.4f} (¬±{best_cv_scores.std():.4f})")
print(f"   - Training Time: {best_summary['training_time']:.2f} seconds")
print(f"\nüîß Optimal Parameters:")
for param, value in best_summary['best_params'].items():
    print(f"   - {param}: {value}")

print("\nüìà All Models Performance Ranking:")
# comparison_df is sorted by test accuracy but keeps its original index labels,
# so the rank is taken from the row position, not from the index.
for rank, (_, row) in enumerate(comparison_df.iterrows(), start=1):
    print(f"   {rank}. {row['Model']}: {row['Test Accuracy']:.4f}")

print("\nüí° Recommendations for Next Steps:")
print("   1. Consider ensemble methods combining top models")
print("   2. Perform feature engineering to improve performance")
print("   3. Collect more data if possible")
print("   4. Try deep learning approaches for complex patterns")
print("   5. Deploy the best model for real-world testing")

print("\n‚úÖ Analysis completed successfully!")
print(f"üìÅ All results saved to: {output_path}")