<a href="https://colab.research.google.com/github/jidza1972/Procurement-Anomaly-Dashboard/blob/main/Public_Procurement_Anomaly_Detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, f1_score
import shap
import matplotlib.pyplot as plt

# --- App Configuration ---
# Page-level settings: browser-tab title and icon, wide layout, and the
# sidebar expanded when the page first loads.
_PAGE_SETTINGS = {
    "page_title": "Procurement Anomaly Detector",
    "page_icon": "🔎",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)

# Custom CSS for a better look and feel. It is injected as raw HTML, which is
# why `unsafe_allow_html=True` is required below.
_CUSTOM_CSS = """
<style>
    .reportview-container {
        background: #f0f2f6;
    }
    .sidebar .sidebar-content {
        background: #ffffff;
    }
    .stButton>button {
        color: #ffffff;
        background-color: #0068c9;
        border-radius: 8px;
        border: none;
        padding: 10px 20px;
    }
    .stButton>button:hover {
        background-color: #00509e;
        color: #ffffff;
    }
    .st-expander {
        border: 1px solid #e6e9ef;
        border-radius: 8px;
    }
    h1, h2, h3 {
        color: #1e293b;
    }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)


# --- Helper Functions ---

def generate_synthetic_data():
    """
    Generate a synthetic procurement dataset with labelled, injected anomalies.

    This lets the app be demonstrated without real data. The global NumPy
    random state is seeded with 42 at the start of every call (note that this
    reseeds ``np.random`` for the whole process).

    Four kinds of anomaly are injected into 25 of the 500 rows:

    1. Extremely high unit price (8 rows).
    2. Unusually large quantity (7 rows).
    3. Very short contract duration combined with an inflated total cost
       (5 rows).
    4. Total cost that does not match ``quantity * unit_price`` (5 rows).

    For every row outside types 3 and 4, ``total_cost`` equals
    ``quantity * unit_price``.

    Returns:
        pandas.DataFrame: Shuffled rows with columns ``supplier_id``,
        ``item_category``, ``quantity``, ``unit_price``,
        ``contract_duration_days``, ``total_cost`` and
        ``ground_truth_anomaly`` (1 for an injected anomaly, 0 otherwise).
    """
    np.random.seed(42)
    num_records = 500
    num_anomalies = 25

    # Normal data
    data = {
        'supplier_id': np.random.choice([f'SUP-{i:03}' for i in range(20)], num_records),
        'item_category': np.random.choice(
            ['Office Supplies', 'IT Hardware', 'Consulting', 'Construction'],
            num_records,
            p=[0.4, 0.3, 0.2, 0.1],
        ),
        'quantity': np.random.randint(1, 100, size=num_records),
        'unit_price': np.random.uniform(10, 500, size=num_records),
        'contract_duration_days': np.random.randint(30, 365, size=num_records),
    }
    df = pd.DataFrame(data)
    df['total_cost'] = df['quantity'] * df['unit_price']

    # --- Inject Anomalies ---
    anomaly_indices = np.random.choice(df.index, num_anomalies, replace=False)
    df['ground_truth_anomaly'] = 0
    df.loc[anomaly_indices, 'ground_truth_anomaly'] = 1  # 1 for anomaly, 0 for normal

    # Anomaly Type 1: Extremely high unit price
    price_idx = anomaly_indices[:8]
    df.loc[price_idx, 'unit_price'] *= np.random.uniform(10, 20, size=len(price_idx))

    # Anomaly Type 2: Unusually large quantity
    quantity_idx = anomaly_indices[8:15]
    df.loc[quantity_idx, 'quantity'] *= np.random.randint(15, 30, size=len(quantity_idx))

    # Bring total_cost back in line with the modified prices/quantities BEFORE
    # injecting the cost-based anomalies. Recomputing afterwards would silently
    # erase the multipliers applied for types 3 and 4.
    df['total_cost'] = df['quantity'] * df['unit_price']

    # Anomaly Type 3: Suspiciously low contract duration for high cost
    high_cost_idx = anomaly_indices[15:20]
    df.loc[high_cost_idx, 'total_cost'] *= np.random.uniform(5, 10, size=len(high_cost_idx))
    df.loc[high_cost_idx, 'contract_duration_days'] = np.random.randint(1, 7, size=len(high_cost_idx))

    # Anomaly Type 4: Mismatched total cost
    mismatch_idx = anomaly_indices[20:]
    df.loc[mismatch_idx, 'total_cost'] *= np.random.uniform(2, 5, size=len(mismatch_idx))

    return df.sample(frac=1).reset_index(drop=True)  # Shuffle data

def preprocess_data(df):
    """
    Build an unfitted ColumnTransformer for the columns of ``df``.

    Numeric columns are routed to a ``StandardScaler`` and object/category
    columns to a ``OneHotEncoder`` that ignores unknown categories. A
    ``ground_truth_anomaly`` column, if present among the numeric columns, is
    excluded so the label is never used as a feature.

    Returns:
        tuple: ``(preprocessor, numerical_features, categorical_features)``;
        the last two are the column indexes assigned to each transformer.
    """
    label_column = 'ground_truth_anomaly'

    # Split the columns by dtype.
    numeric_cols = df.select_dtypes(include=np.number).columns
    text_cols = df.select_dtypes(include=['object', 'category']).columns

    # Drop any ground truth label if it exists
    if label_column in numeric_cols:
        numeric_cols = numeric_cols.drop(label_column)

    # One (name, transformer, columns) entry per feature type.
    transformer_specs = [
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), text_cols),
    ]
    column_preprocessor = ColumnTransformer(transformers=transformer_specs)

    return column_preprocessor, numeric_cols, text_cols

# --- Main App UI ---
st.title("🔎 Public Procurement Anomaly Detector")
st.write("""
This application uses the **Isolation Forest** algorithm to identify potential anomalies in procurement data.
Upload your dataset or use the sample data to begin. Adjust the model parameters in the sidebar and view the
detected anomalies and their explanations powered by **SHAP**.
""")

# --- Sidebar for Controls ---
# The widgets below define the module-level names read by the analysis code:
# `uploaded_file`, `use_sample_data`, `contamination`, `n_estimators` and
# `random_seed`.
with st.sidebar:
    st.header("⚙️ Controls")

    st.subheader("1. Data Input")
    uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
    use_sample_data = st.button("Use Sample Data")

    st.subheader("2. Model Parameters")
    contamination = st.slider(
        "Contamination Level",
        min_value=0.01, max_value=0.5, value=0.05, step=0.01,
        help="The expected proportion of anomalies in the data. This is a key parameter for Isolation Forest."
    )
    n_estimators = st.slider(
        "Number of Estimators",
        min_value=50, max_value=500, value=100, step=10,
        help="The number of base trees in the ensemble."
    )
    # The seed is passed to IsolationForest(random_state=...), which only
    # accepts integers in [0, 2**32 - 1]; bound the widget so an invalid
    # (e.g. negative) seed cannot be entered.
    random_seed = st.number_input(
        "Random Seed",
        min_value=0,
        max_value=2**32 - 1,
        value=42,
        step=1,
        help="Seed for reproducibility. Change this to see how it affects the model's outcome."
    )

# --- Data Loading and Caching ---
@st.cache_data
def load_data(file):
    """Parse the given CSV ``file`` with pandas and return the DataFrame.

    The function is wrapped in ``st.cache_data``.
    """
    frame = pd.read_csv(file)
    return frame

# Columns that must never be used as model features: the optional validation
# label plus the two output columns this script adds to the DataFrame.
_NON_FEATURE_COLUMNS = ['ground_truth_anomaly', 'anomaly_score', 'predicted_anomaly']


def _to_dense(matrix):
    """Return ``matrix`` unchanged, or densified if it exposes ``toarray()``.

    The preprocessor may return a SciPy sparse matrix when one-hot encoding
    is involved; the SHAP plotting code below indexes single rows, which is
    simplest with a plain dense array.
    """
    return matrix.toarray() if hasattr(matrix, "toarray") else matrix


# Pick the data source for this run. The chosen DataFrame is kept in
# st.session_state so it survives reruns triggered by other widgets.
if use_sample_data:
    df = generate_synthetic_data()
    st.session_state['df'] = df
elif uploaded_file is not None:
    df = load_data(uploaded_file)
    st.session_state['df'] = df
else:
    df = None

if 'df' in st.session_state and st.session_state['df'] is not None:
    # Work on a copy: the result columns added below must not be written into
    # the stored DataFrame, otherwise the next rerun would train on the
    # previous run's `anomaly_score` / `predicted_anomaly`.
    df = st.session_state['df'].copy()

    st.header("📊 Data Preview")
    st.dataframe(df.head())

    # --- Preprocessing and Model Training ---
    st.header("🚀 Analysis Results")
    try:
        # Define features to be used in the model (label and derived columns excluded)
        features_df = df.drop(columns=_NON_FEATURE_COLUMNS, errors='ignore')

        preprocessor, num_features, cat_features = preprocess_data(features_df)

        # Create the model pipeline
        model = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('isolator', IsolationForest(
                n_estimators=n_estimators,
                contamination=contamination,
                random_state=int(random_seed),
                n_jobs=-1
            ))
        ])

        with st.spinner("Analyzing data and training model..."):
            # Fit the model, then transform the features once and reuse the
            # result for both the scores and the predictions.
            model.fit(features_df)
            fitted_preprocessor = model.named_steps['preprocessor']
            isolator = model.named_steps['isolator']
            data_transformed = fitted_preprocessor.transform(features_df)

            df['anomaly_score'] = isolator.decision_function(data_transformed)
            # Convert predictions from -1/1 to 1/0 (1 = anomaly)
            df['predicted_anomaly'] = (isolator.predict(data_transformed) == -1).astype(int)

        # Lowest decision-function score first
        anomalies = df[df['predicted_anomaly'] == 1].sort_values(by='anomaly_score', ascending=True)

        st.subheader(f"🚨 Detected Anomalies ({len(anomalies)} found)")
        if not anomalies.empty:
            st.dataframe(anomalies)
        else:
            st.success("No anomalies were detected with the current settings.")

        # --- Model Evaluation ---
        if 'ground_truth_anomaly' in df.columns:
            st.subheader("📈 Model Performance Metrics")
            col1, col2, col3 = st.columns(3)
            y_true = df['ground_truth_anomaly']
            y_pred = df['predicted_anomaly']
            # zero_division=0 reports 0 (without a warning) when a metric is undefined.
            precision = precision_score(y_true, y_pred, zero_division=0)
            recall = recall_score(y_true, y_pred, zero_division=0)
            f1 = f1_score(y_true, y_pred, zero_division=0)

            col1.metric("Precision", f"{precision:.2%}")
            col2.metric("Recall", f"{recall:.2%}")
            col3.metric("F1-Score", f"{f1:.2%}")
            st.info("These metrics are calculated because the sample data includes a 'ground_truth_anomaly' column for validation. If you use your own data, this section will not appear unless a column with this exact name exists.")

        # --- SHAP Validation and Explanation ---
        if not anomalies.empty:
            st.header("🔍 Anomaly Explanation with SHAP")
            st.write("""
            SHAP (SHapley Additive exPlanations) helps explain *why* a data point was flagged as an anomaly.
            Features that push the score higher (to the right) contribute to the data point being considered 'normal',
            while features that push the score lower (to the left) contribute to it being an 'anomaly'.
            """)

            with st.spinner("Calculating SHAP values... This may take a moment."):
                # SHAP is run on the transformed (model-input) data.
                dense_data = _to_dense(data_transformed)

                # Feature names in transformer order: numeric columns first,
                # then the one-hot columns. The encoder is only queried when
                # there are categorical columns to encode.
                if len(cat_features) > 0:
                    cat_feature_names = (
                        fitted_preprocessor.named_transformers_['cat']
                        .get_feature_names_out(cat_features)
                        .tolist()
                    )
                else:
                    cat_feature_names = []
                transformed_feature_names = list(num_features) + cat_feature_names

                # Explain the continuous decision_function score rather than
                # the hard -1/1 prediction.
                def decision_function_wrapper(X):
                    return isolator.decision_function(X)

                # Use a smaller background set for faster computation if data is large
                background_data = shap.sample(dense_data, 100) if dense_data.shape[0] > 100 else dense_data
                shap_explainer = shap.KernelExplainer(decision_function_wrapper, background_data)

                # Get SHAP values for the anomalous data points
                anomalies_transformed = _to_dense(
                    fitted_preprocessor.transform(anomalies[features_df.columns])
                )
                shap_values = shap_explainer.shap_values(anomalies_transformed)

            st.subheader("Summary of Anomaly Features")
            fig_summary, _ = plt.subplots()
            shap.summary_plot(shap_values, anomalies_transformed, feature_names=transformed_feature_names, show=False)
            st.pyplot(fig_summary)
            plt.close(fig_summary)
            st.write("The summary plot above shows the most important features driving anomaly scores across all detected anomalies. For example, a high `total_cost` (red dot on the left of the `total_cost` row) strongly contributes to a record being flagged as an anomaly.")

            st.subheader("Individual Anomaly Explanations")
            # Select an anomaly to inspect
            anomaly_to_inspect_index = st.selectbox(
                "Select an anomaly to inspect in detail:",
                options=anomalies.index,
                format_func=lambda x: f"Index {x} (Score: {anomalies.loc[x, 'anomaly_score']:.2f})"
            )

            # Compare against None explicitly: index label 0 is a valid selection.
            if anomaly_to_inspect_index is not None:
                anomaly_idx_in_anomalies_df = anomalies.index.get_loc(anomaly_to_inspect_index)

                # With matplotlib=True and show=False, force_plot draws on a
                # figure of its own and returns it, so render that figure
                # instead of a separately created (and therefore blank) one.
                fig_force = shap.force_plot(
                    shap_explainer.expected_value,
                    shap_values[anomaly_idx_in_anomalies_df, :],
                    anomalies_transformed[anomaly_idx_in_anomalies_df, :],
                    feature_names=transformed_feature_names,
                    matplotlib=True,
                    show=False,
                    figsize=(10, 3)
                )
                if fig_force is None:
                    # Fallback in case the installed SHAP version returns nothing.
                    fig_force = plt.gcf()
                fig_force.tight_layout()
                st.pyplot(fig_force)
                plt.close(fig_force)

                st.write("**Original Data for this Anomaly:**")
                # A one-row slice keeps each column's dtype intact.
                st.dataframe(anomalies.loc[[anomaly_to_inspect_index]])

    except Exception as e:
        # Top-level UI boundary: report the failure in the page instead of crashing.
        st.error(f"An error occurred during analysis: {e}")
        st.exception(e)

else:
    st.info("Please upload a file or use the sample data to get started.")

# --- Footer ---
st.markdown("---")
st.write("Built with ❤️ using Streamlit, Scikit-learn, and SHAP.")

ModuleNotFoundError: No module named 'streamlit'