In [22]:
import findspark
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StringType
import pyspark.sql.functions as f

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import warnings 

warnings.filterwarnings("ignore")

In [23]:
# Let findspark set up the environment before a Spark session is requested.
findspark.init()

# Single session for the whole notebook: 2 GB for the driver and for each executor,
# with the SQL Server JDBC jar added to the driver class path.
# NOTE(review): the jar location is an absolute, machine-specific Windows path and no
# JDBC read is visible in this part of the notebook - confirm it is still needed.
spark = (
    SparkSession.builder
    .appName("Risk Budget")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .config(
        "spark.driver.extraClassPath",
        r"C:\Drivers\sqljdbc_12.10.0.0_enu\sqljdbc_12.10\enu\jars\mssql-jdbc-12.10.0.jre11.jar",
    )
    .getOrCreate()
)

# Load the two CSV extracts: the first row supplies the column names and the
# column types are inferred from the data.
customers = spark.read.options(header=True, inferSchema=True).csv(
    "../final data/customer_features.csv"
)
orders = spark.read.options(header=True, inferSchema=True).csv(
    "../final data/orders_facts.csv"
)

In [24]:
# Split order_date into calendar year and month columns; later cells filter on
# order_year.
orders = (
    orders
    .withColumn("order_year", f.year("order_date"))
    .withColumn("order_month", f.month("order_date"))
)

# RISK BUDGET 
Not all churn is created equal: losing a high-value customer is much more costly than losing a low-value customer. Therefore, when allocating your retention budget, think about the different value-based segments and how likely each one is to buy from you again.

In [25]:
# One row per customer: start_year (the year part of customer_first_date) and
# order_value (the mean total_price over that customer's orders, 2 decimals).
risk_budget = (
    orders
    .join(customers, on="customer_id", how="inner")
    .withColumn("start_year", f.year(f.to_date(f.col("customer_first_date"))))
    .select("customer_id", "total_price", "start_year", "order_year")
    .groupBy("customer_id", "start_year")
    .agg(f.round(f.avg("total_price"), 2).alias("order_value"))
    .orderBy("customer_id")
)

risk_budget.show()


+-----------+----------+-----------+
|customer_id|start_year|order_value|
+-----------+----------+-----------+
|          2|      2012|        5.7|
|          3|      2010|     824.58|
|          5|      2010|     2999.4|
|          6|      2011|     848.09|
|          7|      2012|     221.73|
|          8|      2012|      648.5|
|          9|      2012|    2065.36|
|         10|      2011|     204.96|
|         11|      2010|     196.02|
|         12|      2012|      84.32|
|         14|      2010|     243.73|
|         15|      2010|       81.0|
|         16|      2010|     386.15|
|         17|      2011|      20.32|
|         18|      2010|     625.24|
|         19|      2010|     217.44|
|         20|      2012|     278.74|
|         21|      2010|     1604.8|
|         23|      2011|      807.8|
|         24|      2010|     546.35|
+-----------+----------+-----------+
only showing top 20 rows



In [26]:
# Value-tier cut-offs: the 33rd and 66th percentiles of order_value.
# Both probabilities are requested in ONE approxQuantile call, so the exact
# (relativeError=0) quantile computation runs once instead of once per cut-off.
# The result list follows the order of the probabilities passed in.
lower_quantile, upper_quantile = risk_budget.approxQuantile(
    "order_value", [0.33, 0.66], 0
)

# Branch order matters: customers whose start_year is 2013 are labelled "New"
# regardless of their order value. Everyone else is tiered by the cut-offs above;
# a value exactly equal to a cut-off falls through to "Med".
risk_budget = risk_budget.withColumn(
    "customer_group",
    f.when(f.col("start_year") == 2013, "New")
     .when(f.col("order_value") > upper_quantile, "High")
     .when(f.col("order_value") < lower_quantile, "Low")
     .otherwise("Med")
)

risk_budget.show()

+-----------+----------+-----------+--------------+
|customer_id|start_year|order_value|customer_group|
+-----------+----------+-----------+--------------+
|          2|      2012|        5.7|           Low|
|          3|      2010|     824.58|          High|
|          5|      2010|     2999.4|          High|
|          6|      2011|     848.09|          High|
|          7|      2012|     221.73|           Med|
|          8|      2012|      648.5|           Med|
|          9|      2012|    2065.36|          High|
|         10|      2011|     204.96|           Med|
|         11|      2010|     196.02|           Low|
|         12|      2012|      84.32|           Low|
|         14|      2010|     243.73|           Med|
|         15|      2010|       81.0|           Low|
|         16|      2010|     386.15|           Med|
|         17|      2011|      20.32|           Low|
|         18|      2010|     625.24|           Med|
|         19|      2010|     217.44|           Med|
|         20

In [40]:
def _customer_ids_in_year(orders_df, year):
    """Return the distinct customer_id values that have an order in ``year``.

    Parameters
    ----------
    orders_df : pyspark.sql.DataFrame
        Orders carrying ``customer_id`` and ``order_year`` columns.
    year : int
        Calendar year compared against ``order_year``.

    Returns
    -------
    list
        Distinct customer ids collected to the driver, in no particular order.
    """
    # Filter first, then project: order_year must still be part of the frame when
    # the filter runs, instead of relying on Spark to resolve a column that
    # select() has already dropped.
    matching_ids = (
        orders_df
        .filter(f.col("order_year") == year)
        .select("customer_id")
        .distinct()
    )
    return [row.customer_id for row in matching_ids.collect()]


customers_2012 = _customer_ids_in_year(orders, 2012)
customers_2013 = _customer_ids_in_year(orders, 2013)

In [None]:
# Ids present in both customers_2012 and customers_2013.
returning_customers = list(set(customers_2012).intersection(customers_2013))

# Keep customers whose start_year is 2012 or later and flag each one with
# is_returning = 1 when its id is in returning_customers, otherwise 0.
risk_budget_2014 = (
    risk_budget
    .where(f.col("start_year") >= 2012)
    .withColumn(
        "is_returning",
        f.when(f.col("customer_id").isin(returning_customers), 1).otherwise(0),
    )
)

# List the flagged (returning) customers first.
risk_budget_2014.orderBy("is_returning", ascending=False).show()

[2048, 2050, 2053, 2055, 2058, 2059, 2062, 15, 2063, 18, 2066, 20, 21, 23, 24, 25, 26, 27, 2075, 33, 2081, 35, 2082, 2087, 42, 2090, 2092, 48, 2100, 2101, 2103, 2107, 60, 61, 2108, 2110, 2111, 66, 68, 2118, 2119, 2123, 81, 83, 2131, 2132, 2133, 2139, 93, 94, 95, 2148, 102, 119, 122, 125, 130, 133, 2183, 2185, 2188, 2189, 2190, 2192, 145, 2196, 2197, 152, 153, 154, 2202, 2203, 2204, 2209, 164, 2216, 2218, 2224, 177, 2225, 180, 181, 2243, 200, 2248, 2249, 2258, 212, 217, 2265, 2266, 2268, 221, 2270, 2277, 2279, 236, 238, 2287, 241, 248, 251, 2300, 255, 2303, 258, 2309, 262, 2313, 270, 272, 274, 2329, 2332, 287, 2337, 290, 2341, 2342, 2343, 2347, 2351, 2352, 305, 306, 307, 308, 310, 2360, 2366, 324, 2373, 2374, 2380, 2382, 2385, 2395, 2397, 2398, 2400, 354, 2403, 357, 360, 2409, 2410, 2417, 373, 375, 378, 2426, 2431, 388, 393, 394, 2442, 397, 398, 399, 2447, 402, 2450, 2453, 2454, 408, 2457, 2463, 417, 2466, 2469, 2472, 427, 430, 2480, 2484, 2487, 2490, 2491, 448, 2498, 2499, 453, 454, 25

In [57]:
# Per-group summary:
#   Number of Customers - distinct customer ids in the group
#   Customer Value      - mean order_value, 2 decimals
#   Retention           - mean of the 0/1 is_returning flag, as a percentage
#   $ at Risk           - customers x customer value x retention rate
risk_budget_final = risk_budget_2014.groupBy("customer_group").agg(
    f.countDistinct("customer_id").alias("Number of Customers"),
    f.round(f.avg("order_value"), 2).alias("Customer Value"),
    f.round(f.avg("is_returning") * 100, 2).alias("Retention"),
).withColumn(
    "$ at Risk",
    f.round(
        f.col("Number of Customers") * f.col("Customer Value") * (f.col("Retention") / 100),
        2,
    ),
)

# Grand total over all groups, pulled to the driver; None when there are no rows.
total_at_risk = risk_budget_final.agg(
    f.sum(f.col("$ at Risk")).alias("total")
).collect()[0]["total"]

if total_at_risk:
    # Scale to percent first and round last. Rounding the fraction and then
    # multiplying by 100 reintroduces floating-point noise
    # (e.g. 0.29 * 100 -> 28.999999999999996). Rounding to 0 decimals keeps the
    # same whole-percent precision as rounding the fraction to 2 decimals.
    risk_share = f.round(f.col("$ at Risk") / f.lit(total_at_risk) * 100, 0)
else:
    # Zero or missing total: there is no meaningful share, so emit nulls rather
    # than building a division by zero.
    risk_share = f.lit(None).cast("double")

risk_budget_final = risk_budget_final.withColumn("Risk", risk_share)

risk_budget_final.show()


+--------------+-------------------+--------------+---------+---------+----+
|customer_group|Number of Customers|Customer Value|Retention|$ at Risk|Risk|
+--------------+-------------------+--------------+---------+---------+----+
|          High|                130|       1893.87|    52.31|128788.84|80.0|
|           Low|                203|         80.14|    21.67|  3525.37| 2.0|
|           Med|                129|        417.09|    54.26| 29194.38|18.0|
|           New|                397|        774.87|      0.0|      0.0| 0.0|
+--------------+-------------------+--------------+---------+---------+----+



#### Key Insights 

* **High-value customers**, while fewer in number, represent 80% of the total retention risk. Losing just one of these customers costs ~24x more than losing a low-value customer.
* **Medium-value customers** contribute 18% of total risk and have the highest retention rate (54.26%), indicating good loyalty with potential for even more upside.
* **Low-value customers** have a low retention rate (21.67%) but pose minimal financial risk (2%), so retention efforts here are the least efficient.