In [109]:
import numpy as np
import pandas as pd
from pyspark.sql.functions import count, col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, DoubleType, LongType, FloatType, BooleanType, NumericType
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.classification import LinearSVC, DecisionTreeClassifier, NaiveBayes

spark = SparkSession.builder \
.appName("Olist Customer Churn") \
.config("spark.driver.memory", "4g") \
.config("spark.executor.memory", "4g") \
.getOrCreate()

spark.sparkContext.setLogLevel("WARN")

In [110]:
# ==================
# Load Dataset
# ===================
df_path = r"data\finalData\customer_training.csv"

df = spark.read.csv(
    df_path,
    header= True,
    inferSchema= True,
    multiLine=True,
    escape='"',
    quote='"'
)
df.show(5)


+--------------------+-------+---------+--------+--------+--------------------+-----+------------------+------------------+------------+-----------+--------------------+--------------------+----------------+----------------+----------------+-------------------+-----+
|  customer_unique_id|recency|frequency|monetary|zip_code|                city|state|total_items_volume|max_delivery_delay|is_delivered|is_approved|payment_method_count|primary_payment_type|max_installments|avg_satisfaction|min_review_score|total_reviews_given|label|
+--------------------+-------+---------+--------+--------+--------------------+-----+------------------+------------------+------------+-----------+--------------------+--------------------+----------------+----------------+----------------+-------------------+-----+
|0006fdc98a402fceb...|    232|        1|    29.0|   29400|       mimoso do sul|   ES|                 1|               -12|           1|          1|                   1|         credit_card|      

## Handle Irrelevant Features

In [111]:
# ============================================
# Check ID-like columns (all values unique)
# ============================================
from pyspark.sql import functions as F

row_count = df.count()

id_like_cols = [
    c for c in df.columns
    if df.select(F.countDistinct(c)).collect()[0][0] == row_count
]

print("\nID-like columns to drop:")
print(id_like_cols)


ID-like columns to drop:
['customer_unique_id']


In [112]:
# ==================================
# Check constant columns
# ===================================

distinct_counts = df.agg(*[
    F.countDistinct(c).alias(c) for c in df.columns
]).collect()[0].asDict()

constant_cols = [c for c, v in distinct_counts.items() if v == 1]

print("\nConstant columns to drop:")
print(constant_cols)


Constant columns to drop:
['is_approved']


In [113]:
# ==========================================
# Check High Cardinality
# ==========================================
total_rows = df.count()
high_card_cols = []

print(f"{'Column Name':<20} | {'Unique Count':<12} | {'Ratio %':<10}")
print("-" * 50)

for col_name in df.columns:
    # Skip the target label
    if col_name == 'label': continue
    
    unique_count = df.select(col_name).distinct().count()
    ratio = (unique_count / total_rows) * 100
    
    if unique_count > 50 and ratio > 1.0:
        high_card_cols.append(col_name)
        print(f"{col_name:<20} | {unique_count:<12} | {ratio:<10.2f}% (DROPPED)")
    else:
        print(f"{col_name:<20} | {unique_count:<12} | {ratio:<10.2f}% (KEEP)")

print("\nFinal list of columns to drop:", high_card_cols)

Column Name          | Unique Count | Ratio %   
--------------------------------------------------
customer_unique_id   | 55245        | 100.00    % (DROPPED)
recency              | 366          | 0.66      % (KEEP)
frequency            | 8            | 0.01      % (KEEP)
monetary             | 18539        | 33.56     % (DROPPED)
zip_code             | 13131        | 23.77     % (DROPPED)
city                 | 3542         | 6.41      % (DROPPED)
state                | 27           | 0.05      % (KEEP)
total_items_volume   | 17           | 0.03      % (KEEP)
max_delivery_delay   | 62           | 0.11      % (KEEP)
is_delivered         | 2            | 0.00      % (KEEP)
is_approved          | 1            | 0.00      % (KEEP)
payment_method_count | 19           | 0.03      % (KEEP)
primary_payment_type | 4            | 0.01      % (KEEP)
max_installments     | 22           | 0.04      % (KEEP)
avg_satisfaction     | 32           | 0.06      % (KEEP)
min_review_score     | 5         

In [114]:
# =========================================
# Check  Multicollinearity
# =========================================

# STEP 1: Identify Numeric Columns - EXCLUDING the label
numeric_types = (IntegerType, DoubleType, LongType, FloatType)
feature_cols = [f.name for f in df.schema.fields 
                if isinstance(f.dataType, numeric_types) and f.name != 'label']

# STEP 2: Compute Correlation Matrix
assembler = VectorAssembler(inputCols=feature_cols, outputCol="corr_features", handleInvalid="skip")
df_vector = assembler.transform(df).select("corr_features")

matrix = Correlation.corr(df_vector, "corr_features").head()[0]
corr_matrix = matrix.toArray()

# STEP 3: Identify Highly Correlated Pairs
high_corr_cols = set()
n = len(feature_cols)

print(f"{'Feature A':<20} | {'Feature B':<20} | {'Correlation'}")
print("-" * 55)

for i in range(n):
    for j in range(i + 1, n):
        corr_val = abs(float(corr_matrix[i, j]))
        if corr_val > 0.80:
            print(f"{feature_cols[i]:<20} | {feature_cols[j]:<20} | {corr_val:.4f}")
            high_corr_cols.add(feature_cols[j])

print("\nFinal suggested columns to drop (Redundant):", list(high_corr_cols))

Feature A            | Feature B            | Correlation
-------------------------------------------------------
frequency            | total_reviews_given  | 0.8642
avg_satisfaction     | min_review_score     | 0.9957

Final suggested columns to drop (Redundant): ['min_review_score', 'total_reviews_given']


In [115]:
# ==========================================
# Check Low-Variance
# ==========================================

# STEP 1: Identify Numeric Columns - EXCLUDING the label
numeric_types = (IntegerType, DoubleType, LongType, FloatType)
# Safety: Don't check the label for variance thresholding
cols_to_test = [f.name for f in df.schema.fields 
                if isinstance(f.dataType, numeric_types) and f.name != 'label']

# STEP 2: Calculate Variance
variance_df = df.select([F.variance(c).alias(c) for c in cols_to_test])
variance_dict = variance_df.first().asDict()

# STEP 3: Identify Low-Variance Columns (threshold = 0.01)
threshold = 0.01
low_variance_cols = []

print(f"{'Column Name':<25} | {'Variance':<12} | {'Status'}")
print("-" * 50)

for col, var in variance_dict.items():
    if var is not None and var <= threshold:
        low_variance_cols.append(col)
        print(f"{col:<25} | {var:<12.6f} | DROP")
    else:
        print(f"{col:<25} | {var:<12.6f} | OK")

print(f"\nFinal list of redundant low-variance columns: {low_variance_cols}")

Column Name               | Variance     | Status
--------------------------------------------------
recency                   | 10214.420772 | OK
frequency                 | 0.040364     | OK
monetary                  | 48468.659007 | OK
zip_code                  | 892344667.024462 | OK
total_items_volume        | 0.368729     | OK
max_delivery_delay        | 115.807551   | OK
is_delivered              | 0.022443     | OK
is_approved               | 0.000000     | DROP
payment_method_count      | 0.175390     | OK
max_installments          | 7.714447     | OK
avg_satisfaction          | 1.818769     | OK
min_review_score          | 1.851334     | OK
total_reviews_given       | 0.075351     | OK

Final list of redundant low-variance columns: ['is_approved']


In [116]:

cols_to_drop_set = set(id_like_cols) | set(high_card_cols) | set(high_corr_cols) | set(low_variance_cols)
cols_to_drop_set.discard('monetary')

cols_to_drop = list(cols_to_drop_set)

print(f"Total columns to drop: {len(cols_to_drop)}")
print(f"Columns to be removed: {cols_to_drop}")

df_final = df.drop(*cols_to_drop)
df_final.show(5)

Total columns to drop: 6
Columns to be removed: ['customer_unique_id', 'zip_code', 'city', 'is_approved', 'min_review_score', 'total_reviews_given']
+-------+---------+--------+-----+------------------+------------------+------------+--------------------+--------------------+----------------+----------------+-----+
|recency|frequency|monetary|state|total_items_volume|max_delivery_delay|is_delivered|payment_method_count|primary_payment_type|max_installments|avg_satisfaction|label|
+-------+---------+--------+-----+------------------+------------------+------------+--------------------+--------------------+----------------+----------------+-----+
|    232|        1|    29.0|   ES|                 1|               -12|           1|                   1|         credit_card|               2|             3.0|    1|
|     85|        1|    29.0|   MG|                 1|                -9|           1|                   1|         credit_card|               2|             5.0|    1|
|    353|  

In [117]:
df_final.printSchema()

root
 |-- recency: integer (nullable = true)
 |-- frequency: integer (nullable = true)
 |-- monetary: double (nullable = true)
 |-- state: string (nullable = true)
 |-- total_items_volume: integer (nullable = true)
 |-- max_delivery_delay: integer (nullable = true)
 |-- is_delivered: integer (nullable = true)
 |-- payment_method_count: integer (nullable = true)
 |-- primary_payment_type: string (nullable = true)
 |-- max_installments: integer (nullable = true)
 |-- avg_satisfaction: double (nullable = true)
 |-- label: integer (nullable = true)



In [118]:
# =========================================
# Analyse Columns to Normalize and Encode
# =========================================

def analyze_encoding(df_final, target=None):
    normalize_cols = []
    label_encode_cols = []
    one_hot_encode_cols = []
    
    
    # 2. Define numeric type prefixes for PySpark
    numeric_prefixes = ('int', 'double', 'float', 'long', 'decimal', 'short')
    
    # 3. Iterate through columns
    for col_name, dtype in df_final.dtypes:
        if col_name == target:
            continue  # Skip target column
            
        # Get unique count (using distinct().count())
        unique_vals = df_final.select(col_name).distinct().count()
            
        if dtype.startswith(numeric_prefixes):
            # Numeric columns
            if unique_vals > 50:
                normalize_cols.append(col_name)
        else:
            # Categorical columns
            if unique_vals == 2:
                label_encode_cols.append(col_name)
            elif unique_vals <= 30:
                one_hot_encode_cols.append(col_name)
            else:
                print(f" Warning: High-cardinality categorical column: {col_name} ({unique_vals} unique values)")
                
    return normalize_cols, label_encode_cols, one_hot_encode_cols

target_col = 'label'
norm_cols, label_cols, onehot_cols = analyze_encoding(df_final, target=target_col)

print("\n--- Strategy Results ---")
print(" Normalize (Standard/Robust/MinMax):", norm_cols)
print(" Label Encode (StringIndexer):", label_cols)
print(" One-Hot Encode (StringIndexer + OneHotEncoder):", onehot_cols)


--- Strategy Results ---
 Normalize (Standard/Robust/MinMax): ['recency', 'monetary', 'max_delivery_delay']
 Label Encode (StringIndexer): []
 One-Hot Encode (StringIndexer + OneHotEncoder): ['state', 'primary_payment_type']


In [119]:
def suggest_scaler_optimized(df, numeric_cols):
    total_count = df.count()
    if total_count == 0: return "standard"

    # STEP 1: CALCULATE ALL SKEWNESS IN ONE PASS
    skew_expressions = [F.skewness(c).alias(c) for c in numeric_cols]
    skews = df.select(skew_expressions).first().asDict()
    
    # STEP 2: CALCULATE ALL QUANTILES IN ONE PASS
    all_quantiles = df.approxQuantile(numeric_cols, [0.25, 0.5, 0.75], 0.01)
    
    quantile_map = {col: all_quantiles[i] for i, col in enumerate(numeric_cols)}

    # STEP 3: CALCULATE ALL OUTLIER COUNTS IN ONE PASS
    outlier_expressions = []
    for col in numeric_cols:
        q1, median, q3 = quantile_map[col]
        iqr = q3 - q1
        if iqr > 0:
            # Standard Tukey's Fences: x < Q1 - 1.5*IQR or x > Q3 + 1.5*IQR
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            outlier_expressions.append(
                F.sum(F.when((F.col(col) < lower_bound) | (F.col(col) > upper_bound), 1).otherwise(0)).alias(col)
            )
        else:
            outlier_expressions.append(F.lit(0).alias(col))

    outlier_counts = df.select(outlier_expressions).first().asDict()

    # STEP 4: DECISION LOGIC
    avg_skew = np.mean([abs(v) for v in skews.values() if v is not None])
    avg_outlier_ratio = np.mean([v / total_count for v in outlier_counts.values()])

    print(f"--- Optimized Scaler Analysis ---")
    print(f"Avg Skewness: {avg_skew:.2f}")
    print(f"Avg Outlier Ratio: {avg_outlier_ratio:.4f}")

    if avg_outlier_ratio > 0.15:
        return "robust"
    elif avg_skew > 1.0:
        return "minmax"
    else:
        return "standard"

# --- Execution ---
scaler_name = suggest_scaler_optimized(df_final, norm_cols)
print(f"Final Decision: {scaler_name.upper()}")

--- Optimized Scaler Analysis ---
Avg Skewness: 4.25
Avg Outlier Ratio: 0.0521
Final Decision: MINMAX


In [120]:
label_col = 'label'

numeric_cols = [
    f.name for f in df_final.schema.fields
    if isinstance(f.dataType, NumericType)
]

other_numeric_cols = [
    col for col in numeric_cols
    if col not in norm_cols and col != label_col
]
print(f"Ohter Features: {other_numeric_cols}")

Ohter Features: ['frequency', 'total_items_volume', 'is_delivered', 'payment_method_count', 'max_installments', 'avg_satisfaction']


In [121]:
# ==============================
# Split data into train and test
# ==============================
train_df, test_df = df_final.randomSplit([0.8, 0.2], seed=42)

### Feature Engineering Pipeline

In [122]:
#===========================================
# One-Hot Encode
# ==========================================
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, MinMaxScaler
from pyspark.ml import Pipeline

indexers = [
    StringIndexer(inputCol=col, outputCol=f"{col}_index", handleInvalid="keep")
    for col in onehot_cols
]

encoders = [
    OneHotEncoder(inputCol=f"{col}_index", outputCol=f"{col}_ohe")
    for col in onehot_cols
]

#===========================================
# Normilization
# ==========================================

norm_assembler = VectorAssembler(
    inputCols=norm_cols,
    outputCol="norm_vector"
)

scaler = MinMaxScaler(
    inputCol="norm_vector",
    outputCol="norm_features"
)

#===========================================
# Assemble ALL features into final vector
# ==========================================
final_assembler = VectorAssembler(
    inputCols=
        ['norm_features'] +
        other_numeric_cols +
        [f"{col}_ohe" for col in onehot_cols],
    outputCol="features"
)

# --------- Execution -------------
pipeline = Pipeline(stages=[
    # Categorical
    *indexers,
    *encoders,

    # Normalize selected numeric features
    norm_assembler,
    scaler,

    # Assemble EVERYTHING
    final_assembler
])

pipeline_model = pipeline.fit(train_df)

In [123]:

train_transformed = pipeline_model.transform(train_df)
test_transformed  = pipeline_model.transform(test_df)

**Check Results**

In [124]:
train_transformed.printSchema()

root
 |-- recency: integer (nullable = true)
 |-- frequency: integer (nullable = true)
 |-- monetary: double (nullable = true)
 |-- state: string (nullable = true)
 |-- total_items_volume: integer (nullable = true)
 |-- max_delivery_delay: integer (nullable = true)
 |-- is_delivered: integer (nullable = true)
 |-- payment_method_count: integer (nullable = true)
 |-- primary_payment_type: string (nullable = true)
 |-- max_installments: integer (nullable = true)
 |-- avg_satisfaction: double (nullable = true)
 |-- label: integer (nullable = true)
 |-- state_index: double (nullable = false)
 |-- primary_payment_type_index: double (nullable = false)
 |-- state_ohe: vector (nullable = true)
 |-- primary_payment_type_ohe: vector (nullable = true)
 |-- norm_vector: vector (nullable = true)
 |-- norm_features: vector (nullable = true)
 |-- features: vector (nullable = true)



In [125]:
train_transformed.select("primary_payment_type", "primary_payment_type_index", "primary_payment_type_ohe").show(10, truncate=False)

+--------------------+--------------------------+------------------------+
|primary_payment_type|primary_payment_type_index|primary_payment_type_ohe|
+--------------------+--------------------------+------------------------+
|boleto              |1.0                       |(4,[1],[1.0])           |
|credit_card         |0.0                       |(4,[0],[1.0])           |
|credit_card         |0.0                       |(4,[0],[1.0])           |
|credit_card         |0.0                       |(4,[0],[1.0])           |
|credit_card         |0.0                       |(4,[0],[1.0])           |
|credit_card         |0.0                       |(4,[0],[1.0])           |
|credit_card         |0.0                       |(4,[0],[1.0])           |
|boleto              |1.0                       |(4,[1],[1.0])           |
|credit_card         |0.0                       |(4,[0],[1.0])           |
|boleto              |1.0                       |(4,[1],[1.0])           |
+--------------------+---

In [126]:
train_transformed.select("state", "state_index", "state_ohe").show(10, truncate=False)

+-----+-----------+--------------+
|state|state_index|state_ohe     |
+-----+-----------+--------------+
|MG   |2.0        |(27,[2],[1.0])|
|SP   |0.0        |(27,[0],[1.0])|
|SP   |0.0        |(27,[0],[1.0])|
|SP   |0.0        |(27,[0],[1.0])|
|SP   |0.0        |(27,[0],[1.0])|
|MG   |2.0        |(27,[2],[1.0])|
|RJ   |1.0        |(27,[1],[1.0])|
|MG   |2.0        |(27,[2],[1.0])|
|PR   |4.0        |(27,[4],[1.0])|
|DF   |8.0        |(27,[8],[1.0])|
+-----+-----------+--------------+
only showing top 10 rows


In [127]:
train_transformed.select("norm_vector", "norm_features").show(5, truncate=False)

+----------------+-----------------------------------------------+
|norm_vector     |norm_features                                  |
+----------------+-----------------------------------------------+
|[0.0,30.88,-4.0]|[0.0,0.00141000333273515,0.42622950819672134]  |
|[0.0,33.39,-5.0]|[0.0,0.0015938531179385382,0.4098360655737705] |
|[0.0,40.23,-2.0]|[0.0,0.0020948620943493654,0.45901639344262296]|
|[0.0,45.29,-7.0]|[0.0,0.0024654915418111764,0.3770491803278689] |
|[0.0,47.51,-6.0]|[0.0,0.0026280997183655676,0.39344262295081966]|
+----------------+-----------------------------------------------+
only showing top 5 rows


In [128]:
train_transformed.select("features", "label").show(5, truncate=False)

+-------------------------------------------------------------------------------------------------------+-----+
|features                                                                                               |label|
+-------------------------------------------------------------------------------------------------------+-----+
|(40,[1,2,3,4,5,6,7,8,11,37],[0.00141000333273515,0.42622950819672134,1.0,1.0,1.0,1.0,1.0,5.0,1.0,1.0]) |1    |
|(40,[1,2,3,4,5,6,7,8,9,36],[0.0015938531179385382,0.4098360655737705,1.0,1.0,1.0,1.0,1.0,5.0,1.0,1.0]) |1    |
|(40,[1,2,3,4,5,6,7,8,9,36],[0.0020948620943493654,0.45901639344262296,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0])|1    |
|(40,[1,2,3,4,5,6,7,8,9,36],[0.0024654915418111764,0.3770491803278689,1.0,1.0,1.0,1.0,1.0,5.0,1.0,1.0]) |1    |
|(40,[1,2,3,4,5,6,7,8,9,36],[0.0026280997183655676,0.39344262295081966,1.0,1.0,1.0,1.0,4.0,3.0,1.0,1.0])|1    |
+-------------------------------------------------------------------------------------------------------

In [129]:
# Pick the first row
row = train_transformed.select("features").first()[0]

print("Vector size:", row.size)
print("Non-zero indices:", row.indices)
print("Non-zero values:", row.values)

Vector size: 40
Non-zero indices: [ 1  2  3  4  5  6  7  8 11 37]
Non-zero values: [1.41000333e-03 4.26229508e-01 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 5.00000000e+00
 1.00000000e+00 1.00000000e+00]


**Why I choose "Class Weights" instaed of "Downsampling3 or "Oversampling"**

Instead of deleting data (Downsampling) or making up fake data (Oversampling), the most professional way to handle this in PySpark is to use **Class Weights**. This allows me to **keep all ~44,000 rows**.

In [130]:
# ==================================
# Calculate the weight
# ===================================

total_count = train_transformed.count()
loyal_count = train_transformed.filter(F.col("label") == 0).count()
churn_count = train_transformed.filter(F.col("label") == 1).count()

weight_for_loyal = total_count / (2.0 * loyal_count)
weight_for_churn = total_count / (2.0 * churn_count)

# 2. Add a weight column to your dataframe
train_with_weights = train_transformed.withColumn("weight", 
    F.when(F.col("label") == 0, weight_for_loyal).otherwise(weight_for_churn)
)
train_with_weights.show(5)

+-------+---------+--------+-----+------------------+------------------+------------+--------------------+--------------------+----------------+----------------+-----+-----------+--------------------------+--------------+------------------------+----------------+--------------------+--------------------+-----------------+
|recency|frequency|monetary|state|total_items_volume|max_delivery_delay|is_delivered|payment_method_count|primary_payment_type|max_installments|avg_satisfaction|label|state_index|primary_payment_type_index|     state_ohe|primary_payment_type_ohe|     norm_vector|       norm_features|            features|           weight|
+-------+---------+--------+-----+------------------+------------------+------------+--------------------+--------------------+----------------+----------------+-----+-----------+--------------------------+--------------+------------------------+----------------+--------------------+--------------------+-----------------+
|      0|        1|   30.88|

### Need to work from here

In [131]:
# =========================================
# Evaluates a churn model's predictions
# =========================================
'''
def evaluate_churn_model(predictions, model_name="Model"):
    # STEP 1: SETUP EVALUATORS
    # binary_evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="probability")
    binary_evaluator = BinaryClassificationEvaluator(labelCol="label",rawPredictionCol="rawPrediction", metricName="areaUnderROC") # For SVM

    multi_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

    # STEP 2: CALCULATE GLOBAL METRICS
    accuracy = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "accuracy"})
    f1_score = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "f1"})
    roc_auc  = binary_evaluator.evaluate(predictions, {binary_evaluator.metricName: "areaUnderROC"})
    pr_auc   = binary_evaluator.evaluate(predictions, {binary_evaluator.metricName: "areaUnderPR"})

    # STEP 3: CALCULATE CLASS-SPECIFIC METRICS (Label 0 = Loyal)
    loyalty_precision = multi_evaluator.evaluate(predictions, {
        multi_evaluator.metricName: "precisionByLabel", multi_evaluator.metricLabel: 0.0})
    loyalty_recall = multi_evaluator.evaluate(predictions, {
        multi_evaluator.metricName: "recallByLabel", multi_evaluator.metricLabel: 0.0})

    # STEP 4. DISPLAY RESULTS
    print("\n" + "="*50)
    print(f"      PERFORMANCE REPORT: {model_name.upper()}      ")
    print("="*50)
    print(f"{'Overall Accuracy:':<25} {accuracy:.4f}")
    print(f"{'F1-Score:':<25} {f1_score:.4f}")
    print("-" * 50)
    print(f"{'ROC AUC:':<25} {roc_auc:.4f} (Separation Power)")
    print(f"{'PR AUC:':<25} {pr_auc:.4f} (Loyalty Focus)")
    print("-" * 50)
    print(f"{'LOYALTY Precision:':<25} {loyalty_precision:.4f} (Accuracy of 'Loyal' flags)")
    print(f"{'LOYALTY Recall:':<25} {loyalty_recall:.4f} (Coverage of actual Loyalists)")
    print("-" * 50)

    # 5. CONFUSION MATRIX
    print("\nConfusion Matrix (Label 0=Loyal, 1=Churn):")
    predictions.groupBy("label", "prediction").count().orderBy("label", "prediction").show()

    # 6. BUSINESS SUMMARY
    tp = predictions.filter("label = 0 AND prediction = 0.0").count()
    fn = predictions.filter("label = 0 AND prediction = 1.0").count()
    total_loyal = tp + fn
    catch_rate = (tp / total_loyal * 100) if total_loyal > 0 else 0
    
    print("Business Summary:")
    print(f">> Model caught {tp} out of {total_loyal} loyal customers ({catch_rate:.1f}%).")
    print(f">> Model missed {fn} loyal customers (high-risk errors).")
    print("="*50 + "\n")
    
    # Return metrics as a dict in case you want to store them for a final table
    return {"model": model_name, "auc": roc_auc, "pr_auc": pr_auc, "recall_0": loyalty_recall}
'''

'\ndef evaluate_churn_model(predictions, model_name="Model"):\n    # STEP 1: SETUP EVALUATORS\n    # binary_evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="probability")\n    binary_evaluator = BinaryClassificationEvaluator(labelCol="label",rawPredictionCol="rawPrediction", metricName="areaUnderROC") # For SVM\n\n    multi_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")\n\n    # STEP 2: CALCULATE GLOBAL METRICS\n    accuracy = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "accuracy"})\n    f1_score = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "f1"})\n    roc_auc  = binary_evaluator.evaluate(predictions, {binary_evaluator.metricName: "areaUnderROC"})\n    pr_auc   = binary_evaluator.evaluate(predictions, {binary_evaluator.metricName: "areaUnderPR"})\n\n    # STEP 3: CALCULATE CLASS-SPECIFIC METRICS (Label 0 = Loyal)\n    loyalty_precision = multi_evaluator.evaluate(pred

In [132]:
# =========================================
# Evaluates a churn model's predictions
# =========================================
def evaluate_churn_model(predictions, model_name="Model"):
   
    # 1. SETUP EVALUATORS
    # We use rawPrediction for AUC/PR calculations (works for both Probabilistic models and SVM)
    binary_evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction")
    multi_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

    # 2. CALCULATE ALL METRICS
    accuracy = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "accuracy"})
    f1_score = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "f1"})
    roc_auc  = binary_evaluator.evaluate(predictions, {binary_evaluator.metricName: "areaUnderROC"})
    pr_auc   = binary_evaluator.evaluate(predictions, {binary_evaluator.metricName: "areaUnderPR"})

    # Class-specific metrics for Loyalty (Label 0)
    loyalty_precision = multi_evaluator.evaluate(predictions, {
        multi_evaluator.metricName: "precisionByLabel", multi_evaluator.metricLabel: 0.0})
    loyalty_recall = multi_evaluator.evaluate(predictions, {
        multi_evaluator.metricName: "recallByLabel", multi_evaluator.metricLabel: 0.0})

    # 3. CALCULATE BUSINESS SUMMARY DATA
    # tp = True Positives (Predicted Loyal, Actually Loyal)
    # fn = False Negatives (Predicted Churn, Actually Loyal)
    tp = predictions.filter("label = 0 AND prediction = 0.0").count()
    fn = predictions.filter("label = 0 AND prediction = 1.0").count()
    
    total_loyal = tp + fn
    catch_rate = (tp / total_loyal * 100) if total_loyal > 0 else 0
    
    # 4. DISPLAY ONLY BUSINESS SUMMARY
    print(f"\n>>> BUSINESS SUMMARY: {model_name.upper()}")
    print(f">> Model caught {tp} out of {total_loyal} loyal customers ({catch_rate:.1f}%).")
    print(f">> Model missed {fn} loyal customers (high-risk errors).")
    print("="*50)
    print("\n")
    
    # 5. RETURN COMPLETE DICTIONARY
    # This dictionary will be used to build your final leaderboard DataFrame
    return {
        "model": model_name,
        "accuracy": accuracy,
        "f1": f1_score,
        "auc": roc_auc,
        "pr_auc": pr_auc,
        "loyalty_precision": loyalty_precision,
        "loyalty_recall": loyalty_recall,
        "caught_loyal": tp,
        "missed_loyal": fn
    }

### Train Model
**Train Model using all of the Selected Features**

In [133]:
# Check and delete Negative Values
def has_negative(vector):
    if vector is not None:
        return any(x < 0 for x in vector.toArray())
    return False

In [134]:
# Define the "Model Zoo"
classifiers = [
    ("Logistic Regression", LogisticRegression(featuresCol="features", labelCol="label", weightCol="weight")),
    ("Random Forest", RandomForestClassifier(featuresCol="features", labelCol="label", weightCol="weight")),
    ("GBT", GBTClassifier(featuresCol="features", labelCol="label", weightCol="weight")),
    ("Linear SVC", LinearSVC(featuresCol="features", labelCol="label", weightCol="weight")),
    ("Decision Tree", DecisionTreeClassifier(featuresCol="features", labelCol="label", weightCol="weight")),
    ("Naive Bayes", NaiveBayes(featuresCol="features", labelCol="label", weightCol="weight"))    
]

results_leaderboard = []

for name, clf in classifiers:
    print(f"Training {name}...")
    model = clf.fit(train_with_weights)

    if name == "Naive Bayes":
        check_neg_udf = F.udf(has_negative, BooleanType())
        test_transformed_cleaned = test_transformed.filter(~check_neg_udf(F.col("features")))
        predictions = model.transform(test_transformed_cleaned)
    else:
        predictions = model.transform(test_transformed)
    
    metrics = evaluate_churn_model(predictions, name)
    results_leaderboard.append(metrics)

# SHOW FINAL COMPARISON TABLE
leaderboard_df = pd.DataFrame(results_leaderboard)
print("\n--- FINAL BEST EVALUATION LEADERBOARD ---")
print(leaderboard_df.sort_values(by="pr_auc", ascending=False))

Training Logistic Regression...

>>> BUSINESS SUMMARY: LOGISTIC REGRESSION
>> Model caught 71 out of 134 loyal customers (53.0%).
>> Model missed 63 loyal customers (high-risk errors).


Training Random Forest...

>>> BUSINESS SUMMARY: RANDOM FOREST
>> Model caught 51 out of 134 loyal customers (38.1%).
>> Model missed 83 loyal customers (high-risk errors).


Training GBT...

>>> BUSINESS SUMMARY: GBT
>> Model caught 54 out of 134 loyal customers (40.3%).
>> Model missed 80 loyal customers (high-risk errors).


Training Linear SVC...

>>> BUSINESS SUMMARY: LINEAR SVC
>> Model caught 67 out of 134 loyal customers (50.0%).
>> Model missed 67 loyal customers (high-risk errors).


Training Decision Tree...

>>> BUSINESS SUMMARY: DECISION TREE
>> Model caught 87 out of 134 loyal customers (64.9%).
>> Model missed 47 loyal customers (high-risk errors).


Training Naive Bayes...

>>> BUSINESS SUMMARY: NAIVE BAYES
>> Model caught 77 out of 134 loyal customers (57.5%).
>> Model missed 57 loyal 

In [135]:
final_leaderboard = pd.DataFrame(leaderboard_df)

final_leaderboard = final_leaderboard.sort_values(by="pr_auc", ascending=False)

# 3. Save to CSV
final_leaderboard.to_csv("Result_Data/1.Default_para_All_features.csv", index=False)

print("Success! Your experiment results have been saved to 'churn_model_experiments_final.csv'")

display(final_leaderboard.head())

Success! Your experiment results have been saved to 'churn_model_experiments_final.csv'


Unnamed: 0,model,accuracy,f1,auc,pr_auc,loyalty_precision,loyalty_recall,caught_loyal,missed_loyal
0,Logistic Regression,0.590223,0.73168,0.603106,0.991028,0.015543,0.529851,71,63
3,Linear SVC,0.552211,0.701034,0.581361,0.990621,0.013446,0.5,67,67
1,Random Forest,0.674515,0.795196,0.572766,0.989668,0.014206,0.380597,51,83
2,GBT,0.668045,0.790537,0.561163,0.989318,0.014722,0.402985,54,80
5,Naive Bayes,0.490249,0.647169,0.530949,0.988361,0.013528,0.574627,77,57


**Feature Importance Ranking Based  Model Training**

In [136]:
# =====================================
# Feature Importance Ranking
# =====================================

# STEP 1: GET ALL FEATURE NAMES IN ORDER
all_feature_names = []
for col in norm_cols: all_feature_names.append(col)
for col in other_numeric_cols: all_feature_names.append(col)
for col in onehot_cols:
    num_cats = train_with_weights.select(f"{col}_ohe").first()[0].size
    for i in range(num_cats):
        all_feature_names.append(f"{col}_{i}")

# STEP 2. GET RANKINGS
rf_baseline = RandomForestClassifier(featuresCol="features", labelCol="label", weightCol="weight", seed=42)
rf_model_baseline = rf_baseline.fit(train_with_weights)

# 3. MAP AND SORT IMPORTANCE
importances = rf_model_baseline.featureImportances.toArray()
feature_importance_map = sorted(zip(all_feature_names, importances), key=lambda x: x[1], reverse=True)

print("--- TOP RANKED FEATURES ---")
for i, (name, score) in enumerate(feature_importance_map[:]):
    print(f"{i+1}. {name:<20}: {score:.4f}")

--- TOP RANKED FEATURES ---
1. recency             : 0.1843
2. total_items_volume  : 0.1699
3. frequency           : 0.1342
4. monetary            : 0.0951
5. avg_satisfaction    : 0.0851
6. max_delivery_delay  : 0.0811
7. state_0             : 0.0474
8. max_installments    : 0.0433
9. state_1             : 0.0174
10. state_11            : 0.0169
11. primary_payment_type_0: 0.0113
12. state_5             : 0.0098
13. state_13            : 0.0098
14. primary_payment_type_1: 0.0094
15. state_9             : 0.0091
16. state_7             : 0.0083
17. payment_method_count: 0.0078
18. is_delivered        : 0.0072
19. primary_payment_type_3: 0.0068
20. state_10            : 0.0068
21. state_6             : 0.0067
22. state_15            : 0.0044
23. primary_payment_type_2: 0.0044
24. state_4             : 0.0043
25. state_8             : 0.0039
26. state_14            : 0.0028
27. state_2             : 0.0024
28. state_19            : 0.0021
29. state_12            : 0.0019
30. state_3     

**Train The Model using top 3 to 15 rank Features**

In [137]:
experiment_results = []

# Check and delete Negative Values
def has_negative(vector):
    if vector is not None:
        return any(x < 0 for x in vector.toArray())
    return False


for k in range(3,15):
    print(50 * "=")
    print(f"\n>>> Running Experiment: Top {k} Features")
    print(50 * "=")
    
    top_features = [f[0] for f in feature_importance_map[:k]] # top K features names
    top_indices = [all_feature_names.index(name) for name in top_features]
    
    # Note: For strict "Top K", we would slice the existing 'features' vector
    from pyspark.ml.feature import VectorSlicer
    slicer = VectorSlicer(inputCol="features", outputCol="top_k_features", indices=top_indices)
    
    train_subset = slicer.transform(train_with_weights)
    test_subset = slicer.transform(test_transformed)

    # "Model Zoo"
    classifiers = [
        ("Logistic Regression", LogisticRegression(featuresCol="top_k_features", labelCol="label", weightCol="weight")),
        ("Random Forest", RandomForestClassifier(featuresCol="top_k_features", labelCol="label", weightCol="weight")),
        ("GBT", GBTClassifier(featuresCol="top_k_features", labelCol="label", weightCol="weight")),
        ("Linear SVC", LinearSVC(featuresCol="top_k_features", labelCol="label", weightCol="weight")),
        ("Decision Tree", DecisionTreeClassifier(featuresCol="top_k_features", labelCol="label", weightCol="weight")),
        ("Naive Bayes", NaiveBayes(featuresCol="top_k_features", labelCol="label", weightCol="weight"))
    ]

    results_leaderboard = []

    for name, clf in classifiers:
        print(f"Training {name} Top-{k}...")
        model = clf.fit(train_subset)

        if name == "Naive Bayes":
            check_neg_udf = F.udf(has_negative, BooleanType())
            test_transformed_cleaned = test_subset.filter(~check_neg_udf(F.col("top_k_features")))
            predictions = model.transform(test_transformed_cleaned)
        else:
            predictions = model.transform(test_subset)
        
        metrics = evaluate_churn_model(predictions, f"{name} Top-{k}")
        experiment_results.append(metrics)
    
# 3. FINAL COMPARISON TABLE
comparison_df = pd.DataFrame(experiment_results)
print("\n--- FEATURE IMPORTANCE EXPERIMENT SUMMARY ---")
print(comparison_df.sort_values(by="pr_auc", ascending=False))


>>> Running Experiment: Top 3 Features
Training Logistic Regression Top-3...

>>> BUSINESS SUMMARY: LOGISTIC REGRESSION TOP-3
>> Model caught 67 out of 134 loyal customers (50.0%).
>> Model missed 67 loyal customers (high-risk errors).


Training Random Forest Top-3...

>>> BUSINESS SUMMARY: RANDOM FOREST TOP-3
>> Model caught 62 out of 134 loyal customers (46.3%).
>> Model missed 72 loyal customers (high-risk errors).


Training GBT Top-3...

>>> BUSINESS SUMMARY: GBT TOP-3
>> Model caught 61 out of 134 loyal customers (45.5%).
>> Model missed 73 loyal customers (high-risk errors).


Training Linear SVC Top-3...

>>> BUSINESS SUMMARY: LINEAR SVC TOP-3
>> Model caught 19 out of 134 loyal customers (14.2%).
>> Model missed 115 loyal customers (high-risk errors).


Training Decision Tree Top-3...

>>> BUSINESS SUMMARY: DECISION TREE TOP-3
>> Model caught 62 out of 134 loyal customers (46.3%).
>> Model missed 72 loyal customers (high-risk errors).


Training Naive Bayes Top-3...

>>> BUS

In [138]:
final_leaderboard = pd.DataFrame(experiment_results)
final_leaderboard = final_leaderboard.sort_values(by="pr_auc", ascending=False)

# 3. Save to CSV
final_leaderboard.to_csv("Result_Data/2.Default_para_top_k_features.csv", index=False)

print("Success! Your experiment results have been saved to 'churn_model_experiments_final.csv'")

display(final_leaderboard.head(10))

Success! Your experiment results have been saved to 'churn_model_experiments_final.csv'


Unnamed: 0,model,accuracy,f1,auc,pr_auc,loyalty_precision,loyalty_recall,caught_loyal,missed_loyal
6,Logistic Regression Top-4,0.663731,0.787251,0.627279,0.992633,0.017647,0.492537,66,68
9,Linear SVC Top-4,0.937994,0.956922,0.629204,0.992489,0.031987,0.141791,19,115
7,Random Forest Top-4,0.743979,0.842508,0.629268,0.992376,0.021164,0.447761,60,74
3,Linear SVC Top-3,0.937994,0.956922,0.627644,0.992104,0.031987,0.141791,19,115
0,Logistic Regression Top-3,0.681165,0.799677,0.627857,0.992093,0.018884,0.5,67,67
12,Logistic Regression Top-5,0.626348,0.759518,0.621696,0.991993,0.017737,0.552239,74,60
18,Logistic Regression Top-6,0.628145,0.760894,0.616994,0.991902,0.01759,0.544776,73,61
21,Linear SVC Top-6,0.677301,0.796927,0.623519,0.991887,0.018926,0.507463,68,66
13,Random Forest Top-5,0.757189,0.851163,0.615763,0.991858,0.019101,0.380597,51,83
54,Logistic Regression Top-12,0.620417,0.755084,0.613511,0.991829,0.016777,0.529851,71,63


**Best Features Combination[3, 4, 5, 6, 12]**

In [139]:
experiment_results = []
top_k_list = [3, 4, 5, 6, 12]

# Check and delete Negative Values
def has_negative(vector):
    if vector is not None:
        return any(x < 0 for x in vector.toArray())
    return False


for k in top_k_list:
    print(50 * "=")
    print(f"\n>>> Running Experiment: Top {k} Features")
    print(50 * "=")
    
    top_features = [f[0] for f in feature_importance_map[:k]] # top K features names
    top_indices = [all_feature_names.index(name) for name in top_features]
    
    # Note: For strict "Top K", we would slice the existing 'features' vector
    from pyspark.ml.feature import VectorSlicer
    slicer = VectorSlicer(inputCol="features", outputCol="top_k_features", indices=top_indices)
    
    train_subset = slicer.transform(train_with_weights)
    test_subset = slicer.transform(test_transformed)

    # Define the "Model Zoo"
    classifiers = [
        ("Logistic Regression", LogisticRegression(featuresCol="top_k_features", labelCol="label", weightCol="weight")),
        ("Random Forest", RandomForestClassifier(featuresCol="top_k_features", labelCol="label", weightCol="weight")),
        ("Linear SVC", LinearSVC(featuresCol="top_k_features", labelCol="label", weightCol="weight"))
    ]

    results_leaderboard = []

    for name, clf in classifiers:
        print(f"Training {name} Top-{k}...")
        model = clf.fit(train_subset)

        if name == "Naive Bayes":
            check_neg_udf = F.udf(has_negative, BooleanType())
            test_transformed_cleaned = test_subset.filter(~check_neg_udf(F.col("top_k_features")))
            predictions = model.transform(test_transformed_cleaned)
        else:
            predictions = model.transform(test_subset)
        
        metrics = evaluate_churn_model(predictions, f"{name} Top-{k}")
        experiment_results.append(metrics)
    
# 3. FINAL COMPARISON TABLE
comparison_df = pd.DataFrame(experiment_results)
print("\n--- FEATURE IMPORTANCE EXPERIMENT SUMMARY ---")
print(comparison_df.sort_values(by="pr_auc", ascending=False))


>>> Running Experiment: Top 3 Features
Training Logistic Regression Top-3...

>>> BUSINESS SUMMARY: LOGISTIC REGRESSION TOP-3
>> Model caught 67 out of 134 loyal customers (50.0%).
>> Model missed 67 loyal customers (high-risk errors).


Training Random Forest Top-3...

>>> BUSINESS SUMMARY: RANDOM FOREST TOP-3
>> Model caught 62 out of 134 loyal customers (46.3%).
>> Model missed 72 loyal customers (high-risk errors).


Training Linear SVC Top-3...

>>> BUSINESS SUMMARY: LINEAR SVC TOP-3
>> Model caught 19 out of 134 loyal customers (14.2%).
>> Model missed 115 loyal customers (high-risk errors).



>>> Running Experiment: Top 4 Features
Training Logistic Regression Top-4...

>>> BUSINESS SUMMARY: LOGISTIC REGRESSION TOP-4
>> Model caught 66 out of 134 loyal customers (49.3%).
>> Model missed 68 loyal customers (high-risk errors).


Training Random Forest Top-4...

>>> BUSINESS SUMMARY: RANDOM FOREST TOP-4
>> Model caught 60 out of 134 loyal customers (44.8%).
>> Model missed 74 loya

**Hyper Tuning**

In [140]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import pandas as pd

# 1. SETUP TUNING EVALUATOR (PR AUC is critical for 98:2 imbalance)
tuning_evaluator = BinaryClassificationEvaluator(
    labelCol="label", 
    rawPredictionCol="rawPrediction", 
    metricName="areaUnderPR"
)

# 2. DEFINE PARAMETER GRIDS (Keeping them small for performance)
lr_param_grid = (ParamGridBuilder()
    .addGrid(LogisticRegression.regParam, [0.01, 0.1, 1.0])          # Controls the strength of regularization
    .addGrid(LogisticRegression.elasticNetParam, [0.0, 0.5, 1.0])   # 0=L2 (Ridge), 1=L1 (Lasso), 0.5=Both
    .addGrid(LogisticRegression.maxIter, [10, 50, 100])             # Ensures the algorithm has time to converge
    .addGrid(LogisticRegression.threshold, [0.3, 0.4, 0.5])         # Essential for imbalanced churn data
    .build())

rf_param_grid = (ParamGridBuilder()
    .addGrid(RandomForestClassifier.numTrees, [50, 100, 200])       # More trees reduce variance
    .addGrid(RandomForestClassifier.maxDepth, [5, 10, 15])          # Deeper trees capture more complex patterns
    .addGrid(RandomForestClassifier.featureSubsetStrategy, ["auto", "sqrt", "log2"]) # Variety of feature looks
    .addGrid(RandomForestClassifier.minInstancesPerNode, [1, 5, 10]) # Prevents overfitting on noise
    .build())

lsvc_param_grid = (ParamGridBuilder()
    .addGrid(LinearSVC.regParam, [0.01, 0.1, 1.0])                  
    .addGrid(LinearSVC.maxIter, [10, 50, 100])                      
    .addGrid(LinearSVC.standardization, [True, False])              # Whether to scale the data internally
    .addGrid(LinearSVC.tol, [1e-6, 1e-4])                           # Convergence tolerance
    .build())

# Mapping names to grids
grid_map = {
    "Logistic Regression": lr_param_grid,
    "Random Forest": rf_param_grid,
    "Linear SVC": lsvc_param_grid
}

# 3. THE TUNING LOOP
experiment_results = []
top_k_list = [3, 4, 5, 6, 12]

for k in top_k_list:
    print("\n" + 60 * "=")
    print(f" EXPERIMENT: TOP {k} FEATURES (TUNING ENABLED) ")
    print(60 * "=")
    
    # Slice features based on your ranking
    top_features = [f[0] for f in feature_importance_map[:k]]
    top_indices = [all_feature_names.index(name) for name in top_features]
    
    from pyspark.ml.feature import VectorSlicer
    slicer = VectorSlicer(inputCol="features", outputCol="top_k_features", indices=top_indices)
    
    train_subset = slicer.transform(train_with_weights).cache()
    test_subset = slicer.transform(test_transformed).cache()

    classifiers = [
        ("Logistic Regression", LogisticRegression(featuresCol="top_k_features", labelCol="label", weightCol="weight")),
        ("Random Forest", RandomForestClassifier(featuresCol="top_k_features", labelCol="label", weightCol="weight")),
        ("Linear SVC", LinearSVC(featuresCol="top_k_features", labelCol="label", weightCol="weight"))
    ]

    for name, clf in classifiers:
        print(f"--- Tuning {name} (Top-{k} features) ---")
        
        # Initialize CrossValidator
        cv = CrossValidator(
            estimator=clf,
            estimatorParamMaps=grid_map[name],
            evaluator=tuning_evaluator,
            numFolds=3,
            seed=42,
            parallelism=1  # <--- parallelism=4 will train 4 models at the same time if your PC allows it
        )

        # FIT: This runs all grid combinations and picks the best one
        cv_model = cv.fit(train_subset)
        best_model = cv_model.bestModel
        
        # TRANSFORM & EVALUATE
        predictions = best_model.transform(test_subset)
        metrics = evaluate_churn_model(predictions, f"{name} (Tuned) Top-{k}")
        
        # Save the best params to the results for your project documentation
        best_params = best_model.extractParamMap()
        #metrics["best_params"] = str({p.name: v for p, v in best_params.items() if p.name in ['regParam', 'elasticNetParam', 'numTrees', 'maxDepth']})
        metrics["best_params"] = str({p.name: v for p, v in best_params.items() if p.name in 
                                    ['regParam', 'elasticNetParam', 'numTrees', 'maxDepth', 
                                    'threshold', 'maxIter', 'featureSubsetStrategy', 'minInstancesPerNode']})

        experiment_results.append(metrics)
    train_subset.unpersist()
    test_subset.unpersist()

# 4. FINAL RESULTS LEADERBOARD
comparison_df = pd.DataFrame(experiment_results)
print("\n--- TUNED EXPERIMENT SUMMARY ---")
# Showing top results by PR AUC
print(comparison_df.sort_values(by="pr_auc", ascending=False))


 EXPERIMENT: TOP 3 FEATURES (TUNING ENABLED) 
--- Tuning Logistic Regression (Top-3 features) ---

>>> BUSINESS SUMMARY: LOGISTIC REGRESSION (TUNED) TOP-3
>> Model caught 67 out of 134 loyal customers (50.0%).
>> Model missed 67 loyal customers (high-risk errors).


--- Tuning Random Forest (Top-3 features) ---

>>> BUSINESS SUMMARY: RANDOM FOREST (TUNED) TOP-3
>> Model caught 62 out of 134 loyal customers (46.3%).
>> Model missed 72 loyal customers (high-risk errors).


--- Tuning Linear SVC (Top-3 features) ---

>>> BUSINESS SUMMARY: LINEAR SVC (TUNED) TOP-3
>> Model caught 19 out of 134 loyal customers (14.2%).
>> Model missed 115 loyal customers (high-risk errors).



 EXPERIMENT: TOP 4 FEATURES (TUNING ENABLED) 
--- Tuning Logistic Regression (Top-4 features) ---

>>> BUSINESS SUMMARY: LOGISTIC REGRESSION (TUNED) TOP-4
>> Model caught 66 out of 134 loyal customers (49.3%).
>> Model missed 68 loyal customers (high-risk errors).


--- Tuning Random Forest (Top-4 features) ---

>>> 

In [141]:
final_leaderboard = pd.DataFrame(experiment_results)
final_leaderboard = final_leaderboard.sort_values(by="pr_auc", ascending=False)

# 3. Save to CSV
final_leaderboard.to_csv("Result_Data/3.hyperparameters_top_k_features.csv", index=False)

print("Success! Your experiment results have been saved to 'churn_model_experiments_final.csv'")

display(final_leaderboard.head(10))

Success! Your experiment results have been saved to 'churn_model_experiments_final.csv'


Unnamed: 0,model,accuracy,f1,auc,pr_auc,loyalty_precision,loyalty_recall,caught_loyal,missed_loyal,best_params
3,Logistic Regression (Tuned) Top-4,0.663731,0.787251,0.627279,0.992633,0.017647,0.492537,66,68,"{'elasticNetParam': 0.0, 'maxIter': 100, 'regP..."
5,Linear SVC (Tuned) Top-4,0.937994,0.956922,0.629204,0.992489,0.031987,0.141791,19,115,"{'maxIter': 100, 'regParam': 0.0, 'threshold':..."
4,Random Forest (Tuned) Top-4,0.743979,0.842508,0.629268,0.992376,0.021164,0.447761,60,74,"{'featureSubsetStrategy': 'auto', 'maxDepth': ..."
2,Linear SVC (Tuned) Top-3,0.937994,0.956922,0.627644,0.992104,0.031987,0.141791,19,115,"{'maxIter': 100, 'regParam': 0.0, 'threshold':..."
0,Logistic Regression (Tuned) Top-3,0.681165,0.799677,0.627857,0.992093,0.018884,0.5,67,67,"{'elasticNetParam': 0.0, 'maxIter': 100, 'regP..."
6,Logistic Regression (Tuned) Top-5,0.626348,0.759518,0.621696,0.991993,0.017737,0.552239,74,60,"{'elasticNetParam': 0.0, 'maxIter': 100, 'regP..."
9,Logistic Regression (Tuned) Top-6,0.628145,0.760894,0.616994,0.991902,0.01759,0.544776,73,61,"{'elasticNetParam': 0.0, 'maxIter': 100, 'regP..."
11,Linear SVC (Tuned) Top-6,0.677301,0.796927,0.623519,0.991887,0.018926,0.507463,68,66,"{'maxIter': 100, 'regParam': 0.0, 'threshold':..."
7,Random Forest (Tuned) Top-5,0.757189,0.851163,0.615763,0.991858,0.019101,0.380597,51,83,"{'featureSubsetStrategy': 'auto', 'maxDepth': ..."
12,Logistic Regression (Tuned) Top-12,0.620417,0.755084,0.613511,0.991829,0.016777,0.529851,71,63,"{'elasticNetParam': 0.0, 'maxIter': 100, 'regP..."
