In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, roc_auc_score
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

: 

In [None]:
# Load the raw customer table from disk.
df = pd.read_csv("../data/customer_raw.csv")

# The "churn" column is the prediction target.
y = df["churn"]

# Everything else is a candidate feature, except the columns listed below.
# errors="ignore" lets the drop succeed even if some of them are absent.
non_feature_cols = [
    "churn",
    "product_id",
    "sale_id",
    "customer_id",
    "price",
    "total_value",
    "feedback_text",
]
X = df.drop(columns=non_feature_cols, errors="ignore")

: 

In [None]:
# One-hot encode the categorical columns and rebuild X as
# [remaining columns | indicator columns].
# NOTE(review): the encoder is fit on the full dataset, before the train/test
# split further down; consider fitting it on the training rows only.
categorical_cols = ['gender', 'region', 'segment', 'category', 'product_name', 'sentiment']

# drop='first' omits one indicator per categorical column;
# sparse_output=False makes fit_transform return a dense array.
ohe = OneHotEncoder(sparse_output=False, drop='first')
X_categorical = ohe.fit_transform(X[categorical_cols])
X_numerical = X.drop(columns=categorical_cols, errors="ignore")

# Wrap the encoded array in a DataFrame that carries X's own index.
# pd.concat(axis=1) aligns on index labels, so a default RangeIndex here would
# silently misalign rows (and introduce NaNs) whenever X is not indexed 0..n-1.
X_encoded = pd.DataFrame(
    X_categorical,
    columns=ohe.get_feature_names_out(categorical_cols),
    index=X.index,
)

# Combine the untouched columns with the encoded indicators.
X = pd.concat([X_numerical, X_encoded], axis=1)
X.head()

: 

In [None]:
# Parse the date columns; values that cannot be parsed become NaT
# (errors="coerce") instead of raising.
df["sale_date"] = pd.to_datetime(df["sale_date"], errors="coerce")
df["last_purchase_date"] = pd.to_datetime(df["last_purchase_date"], errors="coerce")

# Reference date for the recency feature: the latest date present in the data.
# Using the wall clock (pd.Timestamp("today")) would make the feature values
# depend on when the notebook is run, so results would not be reproducible.
reference_date = df[["sale_date", "last_purchase_date"]].max().max()

# Derived features: whole days between the reference date and the last
# purchase, plus the calendar year and month of the sale.
df["days_since_last_purchase"] = (reference_date - df["last_purchase_date"]).dt.days
df["sale_year"] = df["sale_date"].dt.year
df["sale_month"] = df["sale_date"].dt.month

: 

In [None]:
# Copy the date-derived columns from df into the feature frame (aligned on the
# row index), then remove the raw date columns if they are still in X.
X = (
    X.assign(
        days_since_last_purchase=df["days_since_last_purchase"],
        sale_year=df["sale_year"],
        sale_month=df["sale_month"],
    )
    .drop(columns=["sale_date", "last_purchase_date"], errors="ignore")
)

: 

In [None]:
# Hold out 20% of the rows for evaluation. stratify=y keeps the class
# proportions of y the same in both splits, so the test set contains both
# classes (needed for ROC AUC). random_state fixes the shuffle so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:


# Uses X_train, X_test, y_train, y_test produced by the train/test split cell.

# Decision tree with class re-weighting.
dt = DecisionTreeClassifier(
    max_depth=5,              # limit depth to avoid overfitting
    min_samples_split=20,     # minimum samples required to split a node
    class_weight="balanced",  # weight classes inversely to their frequency
    random_state=42,
)

# Fit model
dt.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = dt.predict(X_test)
# Column 1 of predict_proba is the probability of dt.classes_[1].
# NOTE(review): assumes a binary target whose second class is "churn" — confirm.
y_prob = dt.predict_proba(X_test)[:, 1]

print("ROC AUC:", roc_auc_score(y_test, y_prob))
print(classification_report(y_test, y_pred))

# Visualize the fitted tree.
fig, ax = plt.subplots(figsize=(20, 8))
plot_tree(
    dt,
    # Pass a plain list: scikit-learn releases that validate this parameter
    # strictly reject a pandas Index.
    feature_names=list(X_train.columns),
    # Labels are applied in the order of dt.classes_.
    # NOTE(review): assumes that order is (no churn, churn) — confirm.
    class_names=["No Churn", "Churn"],
    filled=True,
    ax=ax,
)
plt.show()
