# Logistic Regression Model - Enhanced Version
This notebook includes overfitting/underfitting detection and resolution, cross-validation, and model persistence.

In [None]:
import json
import os
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, validation_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
warnings.filterwarnings('ignore')

In [None]:
# Read the feature-engineered dataset from the project's Data directory
data_path = Path('../Data/feature_engineered_data.csv')
df = pd.read_csv(data_path)

column_names = df.columns.tolist()
print(f"Dataset shape: {df.shape}")
print(f"Features: {column_names}")

# The encoded food name is the prediction target; every other column is a predictor
target_column = 'Food_Name_Encoded'
X = df.drop(columns=target_column)
y = df[target_column]

# Summarize the target: distinct labels and per-label row counts (ordered by label)
class_labels = sorted(y.unique())
class_counts = y.value_counts().sort_index()

print(f"\nFeatures shape: {X.shape}")
print(f"Target classes: {class_labels}")
print(f"Class distribution:\n{class_counts}")

In [None]:
# Hold out 20% of the rows as a test set. The split is seeded for
# reproducibility and stratified on the target labels.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the size of each partition
for split_label, split_features in (("Training set", X_train), ("Test set", X_test)):
    print(f"{split_label}: {split_features.shape}")

## Baseline Model Training

In [None]:
# Train baseline logistic regression (default regularization strength)
baseline_model = LogisticRegression(random_state=42, max_iter=1000)
baseline_model.fit(X_train, y_train)

# Predictions on both partitions
train_pred_baseline = baseline_model.predict(X_train)
test_pred_baseline = baseline_model.predict(X_test)

# Calculate accuracies and the train/test gap used for the diagnosis below
train_acc_baseline = accuracy_score(y_train, train_pred_baseline)
test_acc_baseline = accuracy_score(y_test, test_pred_baseline)
accuracy_gap = train_acc_baseline - test_acc_baseline

print("=== Baseline Logistic Regression ===")
print(f"Training Accuracy: {train_acc_baseline:.4f}")
print(f"Test Accuracy: {test_acc_baseline:.4f}")
print(f"Difference: {accuracy_gap:.4f}")

# Check for overfitting/underfitting.
# Both flags are assigned unconditionally so that later cells can always read
# them; previously each flag was only defined on some branches.
# Overfitting takes precedence: underfitting is only reported when the gap is
# within tolerance, mirroring the original if/elif ordering.
overfitting_issue = bool(accuracy_gap > 0.1)
underfitting_issue = bool(
    not overfitting_issue
    and train_acc_baseline < 0.8
    and test_acc_baseline < 0.8
)

if overfitting_issue:
    print("⚠️  OVERFITTING DETECTED: Training accuracy >> Test accuracy")
elif underfitting_issue:
    print("⚠️  UNDERFITTING DETECTED: Both accuracies are low")
else:
    print("✅ Model appears well-balanced")

## Cross-Validation Analysis

In [None]:
# 5-fold cross-validated accuracy of the baseline estimator on the training split
cv_scores = cross_val_score(baseline_model, X_train, y_train, cv=5, scoring='accuracy')

cv_mean_accuracy = cv_scores.mean()
cv_two_sigma = cv_scores.std() * 2  # spread reported as two standard deviations

print("=== Cross-Validation Results ===")
print(f"CV Scores: {cv_scores}")
print(f"Mean CV Score: {cv_mean_accuracy:.4f} (+/- {cv_two_sigma:.4f})")
print(f"CV vs Test Accuracy: {cv_mean_accuracy:.4f} vs {test_acc_baseline:.4f}")

## Regularization Analysis (Overfitting Solution)

In [None]:
# Sweep the regularization parameter C. For each candidate, record the
# train/test accuracy, the mean 5-fold CV score on the training split, and the
# train-minus-test accuracy gap.
C_values = [0.01, 0.1, 1.0, 10.0, 100.0]
results = []

for c_value in C_values:
    candidate = LogisticRegression(C=c_value, random_state=42, max_iter=1000).fit(X_train, y_train)

    acc_on_train = accuracy_score(y_train, candidate.predict(X_train))
    acc_on_test = accuracy_score(y_test, candidate.predict(X_test))
    mean_cv_accuracy = cross_val_score(candidate, X_train, y_train, cv=5).mean()

    results.append(dict(
        C=c_value,
        train_acc=acc_on_train,
        test_acc=acc_on_test,
        cv_score=mean_cv_accuracy,
        overfitting=acc_on_train - acc_on_test,
    ))

results_df = pd.DataFrame(results)
print("=== Regularization Analysis ===")
print(results_df.round(4))

# Select the C with the highest mean CV score (the selection uses CV score only)
best_idx = results_df['cv_score'].idxmax()
best_C = results_df.at[best_idx, 'C']
print(f"\nBest C value: {best_C}")

## Polynomial Features (Underfitting Solution)

In [None]:
# Choose the final model: a degree-2 polynomial pipeline (only tried when the
# baseline was flagged as underfitting) or the plain regularized model at best_C.
#
# The flag is looked up tolerantly so this cell still runs if an earlier cell
# left `underfitting_issue` undefined; a missing flag is treated as False.
underfitting_detected = bool(globals().get('underfitting_issue', False))
use_polynomial = False

if underfitting_detected:
    print("=== Testing Polynomial Features for Underfitting ===")

    # Create polynomial features (degree 2); more iterations are allowed here
    # because the expanded feature space has many more columns.
    poly_pipeline = Pipeline([
        ('poly', PolynomialFeatures(degree=2, include_bias=False)),
        ('logistic', LogisticRegression(C=best_C, random_state=42, max_iter=2000))
    ])
    poly_pipeline.fit(X_train, y_train)

    train_acc_poly = accuracy_score(y_train, poly_pipeline.predict(X_train))
    test_acc_poly = accuracy_score(y_test, poly_pipeline.predict(X_test))
    cv_score_poly = cross_val_score(poly_pipeline, X_train, y_train, cv=5).mean()

    print("Polynomial Features Results:")
    print(f"Training Accuracy: {train_acc_poly:.4f}")
    print(f"Test Accuracy: {test_acc_poly:.4f}")
    print(f"CV Score: {cv_score_poly:.4f}")
    print(f"Overfitting: {train_acc_poly - test_acc_poly:.4f}")

    # The alternative to the polynomial pipeline is the regularized model at
    # best_C, so compare against that model's CV score (the maximum of the
    # sweep, since best_C was picked by idxmax) rather than the default-C
    # baseline's CV score.
    regularized_cv_score = results_df['cv_score'].max()
    use_polynomial = bool(cv_score_poly > regularized_cv_score)

    if use_polynomial:
        print("✅ Polynomial features improved performance")
    else:
        print("❌ Polynomial features didn't help")

if use_polynomial:
    final_model = poly_pipeline
    final_train_acc = train_acc_poly
    final_test_acc = test_acc_poly
    model_type = "Polynomial Logistic Regression"
else:
    # Use regularized model (single code path for both "no underfitting" and
    # "polynomial features did not help")
    final_model = LogisticRegression(C=best_C, random_state=42, max_iter=1000)
    final_model.fit(X_train, y_train)
    final_train_acc = accuracy_score(y_train, final_model.predict(X_train))
    final_test_acc = accuracy_score(y_test, final_model.predict(X_test))
    model_type = "Regularized Logistic Regression"

## Final Model Evaluation

In [None]:
# Final predictions on the held-out test split
final_predictions = final_model.predict(X_test)

print(f"=== Final {model_type} Results ===")
print(f"Training Accuracy: {final_train_acc:.4f}")
print(f"Test Accuracy: {final_test_acc:.4f}")
print(f"Overfitting Score: {final_train_acc - final_test_acc:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, final_predictions))

# Confusion Matrix
cm = confusion_matrix(y_test, final_predictions)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title(f'{model_type} - Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

# Feature importance (for non-polynomial models only: inside the pipeline the
# coefficients refer to expanded polynomial terms, not to X's columns)
if not isinstance(final_model, Pipeline):
    # coef_ holds one row of coefficients per class (a single row in the binary
    # case). Averaging the absolute values over all rows summarizes every class
    # instead of reporting only the first one; with a single row the result is
    # identical to abs(coef_[0]).
    coefficient_rows = np.atleast_2d(final_model.coef_)
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': np.abs(coefficient_rows).mean(axis=0)
    }).sort_values('importance', ascending=False)

    top_features = feature_importance.head(10)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(top_features['feature'], top_features['importance'])
    ax.invert_yaxis()  # barh draws the first row at the bottom; put the largest on top
    ax.set_title('Top 10 Feature Importance (Absolute Coefficients)')
    ax.set_xlabel('Importance')
    fig.tight_layout()
    plt.show()

## Model Persistence

In [None]:
# Create models directory (including any missing parent directories)
models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)

# Save the model
model_filename = models_dir / 'logistic_regression_model.joblib'
joblib.dump(final_model, model_filename)

# Save model metadata.
# Values are converted to built-in Python types where needed: json.dump cannot
# serialize NumPy integer scalars such as those returned by y.unique().
metadata = {
    'model_type': model_type,
    'training_accuracy': float(final_train_acc),
    'test_accuracy': float(final_test_acc),
    'overfitting_score': float(final_train_acc - final_test_acc),
    'best_C_parameter': float(best_C),
    'feature_names': X.columns.tolist(),
    'target_classes': sorted(y.unique().tolist()),
    'dataset_shape': df.shape
}

metadata_filename = models_dir / 'logistic_regression_metadata.json'
with open(metadata_filename, 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)

print(f"✅ Model saved to: {model_filename}")
print(f"✅ Metadata saved to: {metadata_filename}")

# Test loading the model.
# NOTE: joblib.load unpickles arbitrary objects; only load files from a trusted
# source. Here the file was written by this notebook a few lines above.
loaded_model = joblib.load(model_filename)
sample_rows = X_test.iloc[:5]
test_prediction = loaded_model.predict(sample_rows)

# Verify the round trip instead of assuming it: the reloaded model must
# reproduce the in-memory model's predictions on the same rows.
assert np.array_equal(test_prediction, final_model.predict(sample_rows)), \
    "Reloaded model predictions differ from the in-memory model"

print("\n✅ Model loading test successful")
print(f"Sample predictions: {test_prediction}")

## Summary

This enhanced logistic regression workflow includes:
1. **Overfitting Detection**: Comparing training vs test accuracy
2. **Regularization**: Testing different C values to reduce overfitting
3. **Underfitting Solution**: Polynomial features if needed
4. **Cross-Validation**: 5-fold CV for robust evaluation
5. **Model Persistence**: Saving model and metadata for later use
6. **Comprehensive Evaluation**: Classification report, confusion matrix, feature importance