In [4]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Create directories if they don't exist
os.makedirs('models', exist_ok=True)
os.makedirs('figures', exist_ok=True)  # For saving plots

# Load the split data
try:
    with open('data/train_test_split.pkl', 'rb') as f:
        X_train, X_test, y_train, y_test = pickle.load(f)
    print("Successfully loaded training and testing data")
    print(f"Training set: {X_train.shape[0]} samples, {X_train.shape[1]} features")
    print(f"Testing set: {X_test.shape[0]} samples, {X_test.shape[1]} features")
except FileNotFoundError:
    print("Error: train_test_split.pkl file not found. Make sure you've run the data preprocessing script first.")
    raise

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

# Train and evaluate each model
results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Train
    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    
    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    # Store results
    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'report': report,
        'predictions': y_pred
    }
    
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Classification Report:\n{report}\n")
    
    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    
    # Save figure to figures directory
    plt_filename = f'figures/confusion_matrix_{name.replace(" ", "_")}.png'
    plt.savefig(plt_filename)
    print(f"Saved confusion matrix plot to {plt_filename}")
    plt.close()  # Close the figure to avoid displaying it in notebooks

# Save all models
print("\nSaving all models...")
for name, result in results.items():
    model_filename = f'models/{name.replace(" ", "_").lower()}_model.pkl'
    with open(model_filename, 'wb') as f:
        pickle.dump(result['model'], f)
    print(f"Saved {name} model to {model_filename}")

# Identify and save the best model
best_model_name = max(results, key=lambda k: results[k]['accuracy'])
best_model = results[best_model_name]['model']
best_model_filename = 'models/best_basic_model.pkl'
with open(best_model_filename, 'wb') as f:
    pickle.dump(best_model, f)

print(f"\nBest model: {best_model_name} with accuracy: {results[best_model_name]['accuracy']:.4f}")
print(f"Best model saved to {best_model_filename}")

# Create a comparison plot of model accuracies
plt.figure(figsize=(10, 6))
model_names = list(results.keys())
accuracies = [results[name]['accuracy'] for name in model_names]

bars = plt.bar(model_names, accuracies, color=['blue', 'green', 'purple'])
plt.title('Model Accuracy Comparison', fontsize=16)
plt.ylabel('Accuracy', fontsize=14)
plt.ylim(0, 1.0)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Add accuracy values on top of bars
for bar, accuracy in zip(bars, accuracies):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f'{accuracy:.4f}', ha='center', fontsize=12)

plt.savefig('figures/model_accuracy_comparison.png')
print("Saved model comparison plot to figures/model_accuracy_comparison.png")
plt.close()

print("\nModel training and evaluation completed successfully!")

Successfully loaded training and testing data
Training set: 242 samples, 13 features
Testing set: 61 samples, 13 features

Training Logistic Regression...
Model: Logistic Regression
Accuracy: 0.5574
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.93      0.87        29
           1       0.38      0.25      0.30        12
           2       0.25      0.22      0.24         9
           3       0.17      0.29      0.21         7
           4       0.00      0.00      0.00         4

    accuracy                           0.56        61
   macro avg       0.32      0.34      0.32        61
weighted avg       0.52      0.56      0.53        61


Saved confusion matrix plot to figures/confusion_matrix_Logistic_Regression.png

Training Decision Tree...
Model: Decision Tree
Accuracy: 0.4590
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.79      0.78        29
           

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved confusion matrix plot to figures/confusion_matrix_Decision_Tree.png

Training Random Forest...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: Random Forest
Accuracy: 0.5082
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.97      0.82        29
           1       0.11      0.08      0.10        12
           2       0.17      0.11      0.13         9
           3       0.14      0.14      0.14         7
           4       0.00      0.00      0.00         4

    accuracy                           0.51        61
   macro avg       0.23      0.26      0.24        61
weighted avg       0.40      0.51      0.45        61


Saved confusion matrix plot to figures/confusion_matrix_Random_Forest.png

Saving all models...
Saved Logistic Regression model to models/logistic_regression_model.pkl
Saved Decision Tree model to models/decision_tree_model.pkl
Saved Random Forest model to models/random_forest_model.pkl

Best model: Logistic Regression with accuracy: 0.5574
Best model saved to models/best_basic_model.pkl
Saved model comparison plot to figures/model_accuracy_comp