In [0]:
# Project-local imports stay first: the wildcard import must run before the
# explicit imports below so that those explicit names take precedence over
# anything the wildcard brings in.
from dlt_utils import DLTWriter
from production_predictor import *
from src.data_loader import DataLoader
from src.sales_processor import SalesProcessor
from src.feature_engineer import FeatureEngineer
from src.data_aggregator import DataAggregator
from src.model_pipeline import ModelPipeline

# Standard library
import datetime
import json
import pickle
from pathlib import Path

# Third-party
import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from pytz import timezone
from tqdm import tqdm


In [0]:
# Load the two inputs the sales processor is constructed from.
data_loader = DataLoader()
customer_master = data_loader.load_customer_master()
sales_data = data_loader.load_sales_data()

# Run sales processing and pull the three result tables out of the returned mapping.
sales_processor = SalesProcessor(sales_data, customer_master)
sales_results = sales_processor.process_sales_data()
last_billed, sales_journey, days_between_purchase = (
    sales_results[key]
    for key in ('last_billed', 'sales_journey', 'days_between_purchase')
)

# Report the shape of each result table.
print(
    "âœ… Sales data processed successfully:",
    f"   - Last Billed: {last_billed.shape}",
    f"   - Sales Journey: {sales_journey.shape}",
    f"   - Days Between Purchase: {days_between_purchase.shape}",
    sep="\n",
)

In [0]:
print("\nðŸ”§ Step 3: Feature Engineering", "-" * 30, sep="\n")

feature_engineer = FeatureEngineer(
    sales_journey,
    customer_master,
    sales_processor.reference_date,
)

# customer_master is rebound to whatever add_dealership_age() returns.
customer_master = feature_engineer.add_dealership_age()

# Monthly sales features, plus the territory-level and dealer-club-level tables
# derived from them.
monthly_sales = feature_engineer.create_monthly_sales_features()
(
    terrwise_monthly_sales,
    dealerclubwise_monthly_sales,
) = feature_engineer.create_territory_and_club_features(monthly_sales)

# Left-join both derived tables back onto the monthly sales frame.
monthly_sales = pd.merge(
    left=monthly_sales,
    right=terrwise_monthly_sales,
    how='left',
    on=['territory_code', 'period'],
).merge(
    right=dealerclubwise_monthly_sales,
    how='left',
    on=['dealer_club_category', 'period'],
)

In [0]:
# Load the SAS monthly data and pass it through the feature engineer's processing step.
sas_monthly_data = feature_engineer.process_sas_data(
    data_loader.load_sas_monthly_data()
)

# Rotation features: returns the updated monthly sales frame together with the
# territory-level and dealer-club-level SAS tables.
(
    monthly_sales,
    terrwise_sas,
    dealerclub_wise_sas,
) = feature_engineer.create_rotation_features(monthly_sales, sas_monthly_data)

# Left-join both SAS tables onto the monthly sales frame.
monthly_sales = pd.merge(
    left=monthly_sales,
    right=terrwise_sas,
    how='left',
    on=['territory_code', 'period'],
).merge(
    right=dealerclub_wise_sas,
    how='left',
    on=['dealer_club_category', 'period'],
)

In [0]:
# Load the credit note data; it is turned into features in a later cell.
credit_note_df = data_loader.load_credit_note_data()

In [0]:
# Club movement features are derived from the processed SAS monthly data.
movement_counts = feature_engineer.create_club_movement_features(sas_monthly_data)
# Load the outstanding data; it is processed in the next cell.
outstanding_df = data_loader.load_outstanding_data()


In [0]:
# Process outstanding data.
# NOTE(review): outstanding_df is rebound to the processed result, so re-running
# this cell feeds already-processed data back in — confirm that is harmless.
outstanding_df = feature_engineer.process_outstanding_data(outstanding_df)
# Create credit note features from the credit note data loaded earlier.
monthly_credit_note = feature_engineer.create_credit_note_features(credit_note_df)


In [0]:
# Load the orders data used for the order type features.
orders_df = data_loader.load_orders_data()

In [0]:
# Load the claims data used for the claims features.
claims_data = data_loader.load_claims_data()

In [0]:
# Load the visits data used for the visit features.
visits_data = data_loader.load_visits_data()

In [0]:
# Order type, claims and visit features, each built from its own source table.
order_types = feature_engineer.create_order_type_features(sales_data, orders_df)
claim_count = feature_engineer.create_claims_features(claims_data)
visit_count = feature_engineer.create_visit_features(visits_data)

# CM labels are added on top of the monthly sales frame.
monthly_sales = feature_engineer.create_cm_labels(monthly_sales)

# Report the shape of every engineered table.
print(
    "âœ… Features engineered successfully:",
    f"   - Monthly Sales Features: {monthly_sales.shape}",
    f"   - Movement Counts: {movement_counts.shape}",
    f"   - Outstanding Data: {outstanding_df.shape}",
    f"   - Credit Note Features: {monthly_credit_note.shape}",
    f"   - Order Type Features: {order_types.shape}",
    f"   - Claims Features: {claim_count.shape}",
    f"   - Visit Features: {visit_count.shape}",
    sep="\n",
)


In [0]:
# Step 4: combine every engineered table into the final modelling dataset.
print("\nðŸ“ˆ Step 4: Data Aggregation", "-" * 30, sep="\n")

territory_master = data_loader.load_territory_master()
data_aggregator = DataAggregator()

# Arguments are positional, so their order must stay exactly as listed.
final_data = data_aggregator.aggregate_all_data(
    monthly_sales,
    outstanding_df,
    monthly_credit_note,
    order_types,
    claim_count,
    visit_count,
    days_between_purchase,
    movement_counts,
    customer_master,
    territory_master,
    last_billed,
    save_dataset=True,
)

print(
    "\nðŸŽ‰ Pipeline completed successfully!",
    f"Final dataset shape: {final_data.shape}",
    "Output saved to: offset_features.csv",
    sep="\n",
)


In [0]:
# ModelPipeline is imported with the other project modules in the first cell.

model_pipeline = ModelPipeline()

In [0]:
# Run the complete training pipeline. Later cells read
# results['explanation_results'] and results['preprocessed_data'].
# NOTE(review): data_path is "offset_features" with no extension, while the
# aggregation cell reports "offset_features.csv" — confirm the pipeline resolves it.
results = model_pipeline.run_complete_training_pipeline(data_path="offset_features")

In [0]:
# Persist the explanations produced by the training pipeline.
# to_csv does not create missing directories, so make sure output/ exists first.
explanations_csv_path = Path("output/explanations_df.csv")
explanations_csv_path.parent.mkdir(parents=True, exist_ok=True)
results['explanation_results']['explanations_df'].to_csv(explanations_csv_path)

In [0]:
# Path to the pickled churn model.
model_path = "models/xgb_churn_model.pkl"

# Load model.
# NOTE(security): pickle.load can execute arbitrary code; only load model files
# that come from a trusted location.
with open(model_path, "rb") as f:
    model = pickle.load(f)

# Full feature matrix: the train and test splits from the training pipeline, stacked.
X = pd.concat([results['preprocessed_data']['X_train'], results['preprocessed_data']['X_test']])

print(type(model))
# The booster's feature_names may be None; slicing None would raise, so print
# None in that case instead of crashing the cell.
booster_feature_names = model.get_booster().feature_names
print(booster_feature_names[:10] if booster_feature_names is not None else None)


In [0]:
# X is the combined train + test feature frame; compute SHAP values for every row.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# Plot the SHAP summary for X.
shap.summary_plot(shap_values, X)


In [0]:
def generate_explanations_df(X, y_pred, shap_values, explainer=None, top_n=10, threshold=0.5):
    """
    Generate a per-row SHAP explanations DataFrame.

    Rows are processed by position, so the function also works when
    ``X.index`` contains repeated labels, and it returns an empty frame
    (with the expected columns) when ``X`` has no rows.

    Parameters
    ----------
    X : pd.DataFrame
        Feature dataset. Its index values are emitted as ``dealer_code``.
    y_pred : np.ndarray or pd.Series
        Model predictions. If the maximum value is <= 1.0 the values are
        treated as probabilities and binarised with ``threshold``; otherwise
        they are used as-is. An ndarray is matched to ``X`` by position;
        a Series is aligned to ``X.index`` by label.
    shap_values : shap.Explanation or np.ndarray
        SHAP values for X. Any object exposing a 2-D ``.values`` attribute
        with one column per column of ``X`` is accepted.
    explainer : shap.Explainer, optional
        SHAP explainer; required only when ``shap_values`` is an ndarray
        (its ``expected_value`` is used as the base value).
    top_n : int
        Number of features, ranked by absolute SHAP value, kept in
        ``top_features``.
    threshold : float
        Probability threshold for converting to binary labels (default=0.5).

    Returns
    -------
    pd.DataFrame
        Indexed by ``dealer_code`` with columns ``predicted_churn``,
        ``positive_features``, ``negative_features``, ``top_features``
        (dicts of feature -> value; ``top_features`` holds absolute values),
        ``total_positive_impact``, ``total_negative_impact`` and ``net_impact``.

    Raises
    ------
    ValueError
        If ``shap_values`` is an ndarray and ``explainer`` is None, or if
        ``y_pred`` is neither an ndarray nor a Series.
    """

    # Wrap plain numpy arrays so the rest of the function can rely on `.values`.
    if isinstance(shap_values, np.ndarray):
        if explainer is None:
            raise ValueError("Explainer required when shap_values is numpy array.")
        shap_values = shap.Explanation(
            values=shap_values,
            base_values=explainer.expected_value,
            data=X,
            feature_names=X.columns
        )

    # One row of SHAP values per row of X, labelled like X.
    shap_df = pd.DataFrame(shap_values.values, columns=X.columns, index=X.index)

    # Convert predictions to binary labels if they look like probabilities.
    if isinstance(y_pred, (pd.Series, np.ndarray)):
        y_pred = pd.Series(y_pred, index=X.index)
        if y_pred.max() <= 1.0:  # assume probabilities
            y_pred = (y_pred >= threshold).astype(int)
    else:
        raise ValueError("y_pred must be array-like or pandas Series")

    output_columns = [
        "dealer_code",
        "predicted_churn",
        "positive_features",
        "negative_features",
        "top_features",
        "total_positive_impact",
        "total_negative_impact",
        "net_impact",
    ]

    results = []
    print(f"ðŸ§® Generating explanations for {len(X)} dealers...")

    # Iterate by position rather than by label: a label lookup returns a whole
    # DataFrame when the index has repeated labels, which would break the
    # per-row logic below.
    for position, dealer_code in enumerate(tqdm(X.index)):
        row_shap = shap_df.iloc[position]

        # Split into positive/negative contributions, strongest first.
        positive_features = row_shap[row_shap > 0].sort_values(ascending=False)
        negative_features = row_shap[row_shap < 0].sort_values(ascending=True)

        # Top features by absolute SHAP value.
        abs_features = row_shap.abs().sort_values(ascending=False).head(top_n)

        results.append({
            "dealer_code": dealer_code,
            "predicted_churn": int(y_pred.iloc[position]),
            "positive_features": positive_features.to_dict(),
            "negative_features": negative_features.to_dict(),
            "top_features": abs_features.to_dict(),
            "total_positive_impact": positive_features.sum(),
            "total_negative_impact": negative_features.sum(),
            "net_impact": row_shap.sum()
        })

    # Explicit columns keep set_index working when there are no rows at all.
    explanations_df = pd.DataFrame(results, columns=output_columns).set_index("dealer_code")
    print("âœ… Explanations DataFrame created successfully!")

    return explanations_df


In [0]:
# Load the pickled churn model. pickle, numpy and the other dependencies are
# imported once in the notebook's first cell rather than re-imported here.
# NOTE(security): pickle.load can execute arbitrary code; only load model files
# that come from a trusted location.
with open("models/xgb_churn_model.pkl", "rb") as f:
    model = pickle.load(f)

# Positive-class probability for every row of X, labelled with X's index.
y_pred = pd.Series(model.predict_proba(X)[:, 1], index=X.index)

# SHAP values for X; X is also passed to the explainer as its second argument.
explainer = shap.Explainer(model, X)
shap_values = explainer(X)

# Generate explanations
explanations_df = generate_explanations_df(X, y_pred, shap_values, explainer, top_n=10)

# Inspect the first rows
explanations_df.head()


In [0]:
# Serialise the dict-valued columns to JSON strings. Values that are already
# strings are left alone so that re-running this cell does not double-encode them.
for col in ["positive_features", "negative_features", "top_features"]:
    explanations_df[col] = explanations_df[col].apply(
        lambda value: value if isinstance(value, str) else json.dumps(value)
    )

# Move dealer_code from the index into a regular column. Skipped when the column
# already exists, so a re-run does not fail or add a spurious "index" column.
if "dealer_code" not in explanations_df.columns:
    explanations_df = explanations_df.reset_index()

In [0]:
# Score all dealers with the project's predictor.
churn_dealers_df = predict_all_dealers()

# Dealer -> territory lookup read from the gold-layer customer master table.
cm = (
    spark.read
    .table("`provisioned-tableau-data`.gold_layer.customer_master")
    .select("dealer_code", "territory_code")
    .toPandas()
)

# Attach territory_code to each prediction row; dealers missing from the lookup are kept.
churn_dealers_df = pd.merge(
    left=churn_dealers_df,
    right=cm,
    how="left",
    on="dealer_code",
)

## Writing data in DLT

In [0]:
# Stamp every explanation row with today's date in the Asia/Kolkata timezone.
explanations_df['created_at'] = datetime.datetime.now(timezone('Asia/Kolkata')).date()

In [0]:
# Target Delta table for the per-dealer explanations.
table_name = "`provisioned-tableau-data`.`data_science`.`explainations`"

# Append to the table, letting Delta merge in any new columns.
spark.createDataFrame(explanations_df).write.saveAsTable(
    table_name,
    format="delta",
    mode="append",
    mergeSchema="true",
)


In [0]:
table_name = "`provisioned-tableau-data`.`data_science`.`dealer_churn_predictions`"

(
    spark.createDataFrame(churn_dealers_df)
    .write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    # .option("overwriteSchema", "true")
    .saveAsTable(table_name)
)