In [None]:
# %% [markdown]
# # Task 3: Predictive Analytics for Resource Allocation
# ## Using Machine Learning to Predict Issue Priority
# 
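# - **Dataset:** Breast Cancer Wisconsin (Diagnostic) Dataset, used here as a stand-in for issue data; the priority labels are derived synthetically from its diagnosis target
# - **Model:** Random Forest Classifier
# - **Objective:** Predict issue priority (High/Medium/Low) for resource allocation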

# %%
# Import required libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import (accuracy_score, auc, average_precision_score,
                             classification_report, confusion_matrix, f1_score,
                             precision_recall_curve, roc_curve)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Suppress every warning category globally; this affects all later cells.
warnings.simplefilter(action='ignore')

# Plot appearance: matplotlib's seaborn-v0_8 style sheet first, then seaborn's "husl" palette
plt.style.use(style='seaborn-v0_8')
sns.set_palette(palette="husl")

# %%
# Fetch the scikit-learn breast cancer data and wrap the feature matrix in a DataFrame
print("Loading Breast Cancer Dataset...")
data = load_breast_cancer()
X = pd.DataFrame(data=data["data"], columns=data["feature_names"])
y = data["target"]

# Quick look at what was loaded: dimensions, label balance, column names
print(f"Dataset shape: {X.shape}")
print(f"Target distribution: {pd.Series(y).value_counts()}")
print(f"Feature names: {data.feature_names}")

# %%
# Derive a synthetic three-level priority target (0=Low, 1=Medium, 2=High)
# from the binary target so the data can stand in for software issue priorities.
# NOTE(review): scikit-learn documents target 1 as "benign" for this dataset —
# confirm that mapping it to High priority is the intended simulation.
np.random.seed(42)
y_priority = np.where(y != 1, 0, 2)  # 1 -> High (2), anything else -> Low (0)

# Relabel roughly 30% of the High rows as Medium. The draw is independent of the
# features, so which High rows become Medium is random.
medium_mask = np.logical_and(
    y_priority == 2,
    np.random.random(y_priority.size) < 0.3,
)
y_priority[medium_mask] = 1

# Report how many samples ended up at each priority level
print("Target Priority Distribution:")
priority_counts = (
    pd.Series(y_priority)
    .value_counts()
    .sort_index()
)
print(priority_counts)
print(f"Low: {priority_counts[0]}, Medium: {priority_counts[1]}, High: {priority_counts[2]}")

# %%
# Data preprocessing
print("Preprocessing data...")

# Hold out 30% of the rows for testing; stratify so each priority level keeps its share
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_priority,
    stratify=y_priority,
    random_state=42,
    test_size=0.3,
)

# Standardise the features with statistics learned from the training rows only,
# then apply the same transform to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

print(f"Training set shape: {X_train_scaled.shape}")
print(f"Test set shape: {X_test_scaled.shape}")

# %%
# Fit a depth-limited random forest on the scaled training split
print("Training Random Forest Classifier...")
model = RandomForestClassifier(
    random_state=42,       # fixed seed so the forest is repeatable
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=2,
    min_samples_split=5,
)
model.fit(X=X_train_scaled, y=y_train)

# Score the held-out rows: per-class probabilities and hard labels
y_pred_proba = model.predict_proba(X=X_test_scaled)
y_pred = model.predict(X=X_test_scaled)

# %%
# Hold-out metrics on the test split
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

print("=" * 60, "MODEL PERFORMANCE METRICS", "=" * 60, sep="\n")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1-Score (weighted): {f1:.4f}")

# 5-fold cross-validated accuracy on the already-scaled training split;
# the spread is printed as two standard deviations
cv_scores = cross_val_score(
    estimator=model,
    X=X_train_scaled,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print(f"Cross-validation Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# %%
# Per-class precision / recall / F1 on the test split
print("\n" + "=" * 60, "DETAILED CLASSIFICATION REPORT", "=" * 60, sep="\n")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=['Low Priority', 'Medium Priority', 'High Priority'],
    )
)

# %%
# Confusion matrix of actual (rows) versus predicted (columns) priority
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Annotated heatmap of the raw counts
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Low', 'Medium', 'High'],
    yticklabels=['Low', 'Medium', 'High'],
)
plt.gca().set(
    title='Confusion Matrix - Issue Priority Prediction',
    xlabel='Predicted Priority',
    ylabel='Actual Priority',
)
plt.tight_layout()
plt.show()

# %%
# Rank the features by the forest's built-in importance scores, highest first
feature_importance = pd.DataFrame(
    data={
        'feature': data.feature_names,
        'importance': model.feature_importances_,
    }
).sort_values(by='importance', ascending=False)

# Horizontal bar chart of the 15 highest-ranked features
plt.figure(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=feature_importance.iloc[:15])
plt.gca().set(
    title='Top 15 Most Important Features for Priority Prediction',
    xlabel='Feature Importance',
)
plt.tight_layout()
plt.show()

# Same ranking as text, limited to the first ten rows
print("\nTop 10 Most Important Features:")
print(feature_importance.iloc[:10].to_string(index=False))

# %%
# Precision-Recall curve for each priority class (one-vs-rest)
plt.figure(figsize=(12, 8))
for i, priority in enumerate(['Low', 'Medium', 'High']):
    # Binarise the target for this class and pair it with the probability
    # column at the same position.
    is_class = (y_test == i)
    class_scores = y_pred_proba[:, i]

    precision, recall, _ = precision_recall_curve(is_class, class_scores)

    # The legend reports average precision (AP), so compute it with
    # average_precision_score. The trapezoidal auc(recall, precision) used
    # before interpolates linearly between points and is not AP.
    ap = average_precision_score(is_class, class_scores)

    plt.plot(recall, precision, lw=2,
             label=f'{priority} Priority (AP = {ap:.2f})')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve by Priority Class')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# %%
# Model Insights and Business Impact
print("=" * 60, "BUSINESS IMPACT ANALYSIS", "=" * 60, sep="\n")

# Test-set volume and the number of labels the model predicted correctly
total_issues = len(y_test)
correct_predictions = (y_test == y_pred).sum()

# Actual number of test issues at each priority level (0=Low, 1=Medium, 2=High)
low_issues, medium_issues, high_issues = (
    (y_test == level).sum() for level in (0, 1, 2)
)

print(f"Total issues in test set: {total_issues}")
print(f"Correctly prioritized: {correct_predictions} ({accuracy*100:.1f}%)")
print(f"\nPriority Distribution in Test Set:")
print(f"  Low Priority: {low_issues} issues")
print(f"  Medium Priority: {medium_issues} issues")
print(f"  High Priority: {high_issues} issues")

# Assumed handling effort in hours per issue, indexed by priority: Low, Medium, High
time_per_issue = [2, 8, 16]
total_time_without_ai = sum(
    count * hours
    for count, hours in zip((low_issues, medium_issues, high_issues), time_per_issue)
)

# Assumed saving: 30% of the High-vs-Medium effort gap for every actual High issue.
# NOTE(review): this figure is computed from the test labels only and never reads
# y_pred, so it does not vary with model quality — confirm that is intended.
time_savings = high_issues * (time_per_issue[2] - time_per_issue[1]) * 0.3

print(f"\nEstimated Resource Allocation Impact:")
print(f"Total processing time without AI: {total_time_without_ai} hours")
print(f"Estimated time savings with AI prioritization: {time_savings:.1f} hours")
print(f"Efficiency improvement: {(time_savings/total_time_without_ai)*100:.1f}%")

# %%
# Gather the headline results in one dictionary for the report
performance_metrics = dict(
    accuracy=accuracy,
    f1_score=f1,
    cv_accuracy_mean=cv_scores.mean(),
    cv_accuracy_std=cv_scores.std(),
    feature_importance=feature_importance,
    confusion_matrix=cm,
)

# Completion banner followed by a checklist of the finished stages
print("\n" + "=" * 60, "MODEL TRAINING COMPLETE", "=" * 60, sep="\n")
print(
    "✅ Data preprocessing completed",
    "✅ Random Forest model trained",
    "✅ Performance metrics calculated",
    "✅ Feature importance analyzed",
    "✅ Business impact assessed",
    sep="\n",
)

# Write the ten highest-ranked features to a CSV in the current working directory
feature_importance.iloc[:10].to_csv('top_features.csv', index=False)
print("✅ Top features saved to 'top_features.csv'")