<a href="https://colab.research.google.com/github/kimanirobbi/wk-4-ai/blob/main/Task_3_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Predictive Analytics for Resource Allocation**

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report
import warnings
# Silence every Python warning from this point on. NOTE(review): this also
# hides scikit-learn's metric warnings (e.g. for a class that is never
# predicted), so a zero score in the report will appear without any notice.
warnings.filterwarnings('ignore')

# Load and prepare dataset
def load_and_preprocess_data():
    """Load the breast cancer dataset and attach synthetic priority labels.

    The scikit-learn breast cancer data is placed in a DataFrame together
    with its binary ``target`` column (0 is treated as malignant, 1 as
    benign). A three-level ``priority`` column is then derived for
    demonstration purposes:

    * ``2`` (high): malignant and ``worst radius`` strictly above the
      median of the whole dataset.
    * ``1`` (medium): every other malignant sample, plus benign samples
      whose ``worst radius`` is strictly above the 75th percentile.
    * ``0`` (low): everything else.

    Returns:
        A ``(df, feature_names)`` tuple. ``df`` holds the feature columns
        followed by ``target`` and ``priority``; ``feature_names`` is the
        dataset's own array of feature column names.
    """
    data = load_breast_cancer()
    df = pd.DataFrame(data.data, columns=data.feature_names)
    df['target'] = data.target

    # Tumour size is the only feature that drives the synthetic labels, so
    # look the column up once and reuse it for both thresholds.
    worst_radius = df['worst radius']
    malignant_mask = df['target'] == 0
    benign_mask = df['target'] == 1

    high_priority_mask = malignant_mask & (worst_radius > worst_radius.median())
    medium_priority_mask = (malignant_mask & ~high_priority_mask) | (
        benign_mask & (worst_radius > worst_radius.quantile(0.75))
    )

    # The two masks are mutually exclusive, so assignment order does not
    # matter; rows matched by neither keep the low-priority default.
    df['priority'] = 0
    df.loc[high_priority_mask, 'priority'] = 2
    df.loc[medium_priority_mask, 'priority'] = 1

    return df, data.feature_names

def train_priority_model():
    """Train and evaluate a Random Forest that predicts the priority level.

    The data from ``load_and_preprocess_data`` is split 70/30 (stratified on
    the priority label), standardised with a scaler fitted on the training
    split only, and used to fit a class-weight-balanced Random Forest.
    Accuracy, weighted F1, a per-class report and the ten most important
    features are printed for the held-out split.

    Returns:
        A ``(model, scaler, accuracy, f1)`` tuple: the fitted classifier,
        the fitted ``StandardScaler``, and the test-set accuracy and
        weighted F1 score.
    """
    # Priority code -> display name, kept together so the report's labels
    # and names cannot drift apart.
    priority_names = {0: 'Low', 1: 'Medium', 2: 'High'}

    df, feature_names = load_and_preprocess_data()

    # Select features for training
    X = df[feature_names]
    y = df['priority']

    # Stratify so that the rare medium class is represented in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Fit the scaler on the training split only, then reuse it for the test
    # split so no test statistics leak into the transform.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        max_depth=10,
        class_weight='balanced',
    )
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)

    # zero_division=0 states explicitly how a never-predicted class is
    # scored, instead of depending on a warnings filter to keep it quiet.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    print("=== Model Performance ===")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print("\nClassification Report:")
    # Passing labels explicitly keeps the three names aligned with their
    # codes even if a class is missing from both y_test and y_pred; without
    # it the report raises on a label/name count mismatch.
    print(classification_report(
        y_test,
        y_pred,
        labels=list(priority_names),
        target_names=list(priority_names.values()),
        zero_division=0,
    ))

    # Rank features by the forest's impurity-based importance.
    feature_imp = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_,
    }).sort_values('importance', ascending=False)

    print("\nTop 10 Important Features:")
    print(feature_imp.head(10))

    return model, scaler, accuracy, f1

# Execute the training when the cell/module runs and keep the fitted model,
# the fitted scaler, and the test-set accuracy and weighted F1 score as
# module-level names.
model, scaler, accuracy, f1 = train_priority_model()

=== Model Performance ===
Accuracy: 0.9649
F1-Score: 0.9592

Classification Report:
              precision    recall  f1-score   support

         Low       0.95      1.00      0.97       107
      Medium       0.00      0.00      0.00         2
        High       1.00      0.94      0.97        62

    accuracy                           0.96       171
   macro avg       0.65      0.65      0.65       171
weighted avg       0.96      0.96      0.96       171


Top 10 Important Features:
                 feature  importance
23            worst area    0.098214
27  worst concave points    0.088455
7    mean concave points    0.081496
20          worst radius    0.077589
24      worst smoothness    0.069084
22       worst perimeter    0.064600
6         mean concavity    0.064247
2         mean perimeter    0.049880
26       worst concavity    0.049552
0            mean radius    0.044424
