#### **Import necessary libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import export_text

import joblib


#### **Load the data**

In [2]:
# Prepared customer-churn dataset, addressed relative to this notebook's folder.
file_path = "../resources/data/output/customer_churn_prepared.csv"
# Read the whole CSV into a DataFrame used by the cells below.
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,call_failure,complains,subscription_length,charge_amount,seconds_of_use,frequency_of_SMS,distinct_called_numbers,age_group,tariff_plan,status,customer_value,churn
0,8,0,38,0,4370,5,17,3,1,1,197.64,0
1,0,0,39,0,318,7,4,2,1,2,46.035,0
2,10,0,37,0,2453,359,24,3,1,1,1536.52,0
3,10,0,38,0,4198,1,35,1,1,1,240.02,0
4,3,0,38,0,2393,2,33,1,1,1,145.805,0


In [4]:
# Build the feature matrix by removing the `customer_value` and `churn` columns.
# The CSV also carries an "Unnamed: 0" column (a row counter left over from
# saving the DataFrame index); it is not among the model's features, so drop it
# too when it is present.
non_feature_cols = ["customer_value", "churn"]
if "Unnamed: 0" in df.columns:
    non_feature_cols.append("Unnamed: 0")
X = df.drop(columns=non_feature_cols)

#### **Load Decision Tree Regression pipeline**

In [5]:
# Serialized pipeline artifact, addressed relative to this notebook's folder.
model_path = "../resources/models/Decision_tree_regression_model_final.pkl"
# joblib.load unpickles the file, so only point this at a trusted model artifact.
loaded_pipeline = joblib.load(filename=model_path)

In [6]:
# Pull the two named steps out of the loaded pipeline in one pass:
# 'preprocessor' (the feature transformer) and 'dt' (the decision tree regressor).
preprocessor, decision_tree = (
    loaded_pipeline.named_steps[step_name] for step_name in ("preprocessor", "dt")
)

In [7]:
# Ask the preprocessor for the names of the columns it emits after
# transformation; the result is an array with one name per transformed feature.
feature_names = preprocessor.get_feature_names_out()

# Keep the array as the cell's final expression so the notebook renders it.
feature_names


array(['num__call_failure', 'num__complains', 'num__seconds_of_use',
       'num__frequency_of_SMS', 'num__distinct_called_numbers',
       'num__charge_amount', 'num__subscription_length',
       'cat__tariff_plan_2', 'cat__age_group_2', 'cat__age_group_3',
       'cat__age_group_4', 'cat__age_group_5', 'cat__status_2'],
      dtype=object)

#### **Feature Importance Extraction**

In [8]:
# Importance score the fitted tree assigns to each transformed feature.
importances = decision_tree.feature_importances_

# Pair every feature name with its score and rank the rows from the most to the
# least important; the original positional index is kept on purpose.
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Emit the heading and the ranked table as plain text, one after the other.
print("Feature Importances:", feature_importance_df, sep="\n")


Feature Importances:
                         Feature  Importance
3          num__frequency_of_SMS    0.663858
2            num__seconds_of_use    0.299373
11              cat__age_group_5    0.013366
10              cat__age_group_4    0.009707
4   num__distinct_called_numbers    0.009205
9               cat__age_group_3    0.001293
0              num__call_failure    0.001209
8               cat__age_group_2    0.000867
6       num__subscription_length    0.000536
5             num__charge_amount    0.000385
7             cat__tariff_plan_2    0.000183
12                 cat__status_2    0.000016
1                 num__complains    0.000003


- Most Important Features:
    - Frequency of SMS (66.4% importance)
    - Seconds of use (29.9% importance)
    - Together, these two usage metrics account for over 96% of the tree's total feature importance
- Moderately Important Features:
    - Age group 5 (1.3%)
    - Age group 4 (1.0%)
    - Distinct called numbers (0.9%)
- Minimal Impact Features:
    - Age group 3 and call failures each contribute only about 0.1% importance
    - Age group 2, subscription length, charge amount, and tariff plan each have less than 0.1% importance
    - Status and complaints have negligible impact (below 0.01%)

This suggests that usage patterns, particularly SMS frequency, are the strongest predictors of Customer Lifetime Value.

In [18]:
# Extract rules from the decision tree as text
# NOTE(review): this export is left disabled — presumably because the full rule
# listing is very long; confirm before re-enabling. `export_text` accepts a
# `max_depth` argument if only the top levels of the tree are needed.
# tree_rules = export_text(decision_tree, feature_names=list(feature_names))
# print("\nDecision Tree Rules:")
# print(tree_rules)