In [22]:
import findspark
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StringType
import pyspark.sql.functions as f

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import warnings 

warnings.filterwarnings("ignore")

In [23]:
# Locate the local Spark installation so pyspark can start a session.
findspark.init()

# Session for this notebook: 2g each for driver and executors, with the
# mssql-jdbc jar placed on the driver classpath.
spark = (
    SparkSession.builder
    .appName("Risk Budget")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .config(
        "spark.driver.extraClassPath",
        r"C:\Drivers\sqljdbc_12.10.0.0_enu\sqljdbc_12.10\enu\jars\mssql-jdbc-12.10.0.jre11.jar",
    )
    .getOrCreate()
)

# Load both CSV extracts: first row is the header, column types are inferred.
customers = spark.read.options(header=True, inferSchema=True).csv(
    "../final data/customer_features.csv"
)
orders = spark.read.options(header=True, inferSchema=True).csv(
    "../final data/orders_facts.csv"
)

In [24]:
# Add the calendar year and month of each order, derived from `order_date`.
orders = (
    orders
    .withColumn("order_year", f.year("order_date"))
    .withColumn("order_month", f.month("order_date"))
)

# RISK BUDGET 
Not all churn is created equal. Losing a high-value customer is much more costly than losing a low-value customer. Therefore, when allocating your retention budget, think about the different value-based segments and how likely each one is to buy from you again.

In [None]:
# Distinct order years, ascending. De-duplicating and sorting inside Spark
# avoids pulling every order row to the driver and gives a stable order.
# Null years (orders without a usable `order_date`) are dropped because the
# per-year analysis computes `year - 1`, which would fail on None.
years = [
    row["order_year"]
    for row in (
        orders
        .select("order_year")
        .where(f.col("order_year").isNotNull())
        .distinct()
        .orderBy("order_year")
        .collect()
    )
]

[2010, 2011, 2012, 2013]

In [64]:
# Build one risk-budget table per order year.
#
# Reads `orders`, `customers` and `years` from earlier cells and fills
# `yearly_budget` with one lazy Spark DataFrame per entry of `years`, in the
# same order. Each table has one row per customer group (New/High/Med/Low).
yearly_budget = []

# Average order value per customer. Nothing here depends on the year being
# analysed, so it is computed once and cached instead of rebuilt per year.
customer_value = orders.join(
    customers,
    "customer_id"
).withColumn(
    "start_year", f.year(f.to_date("customer_first_date"))
).groupBy(
    "customer_id",
    "start_year"
).agg(
    f.round(f.avg("total_price"), 2).alias("order_value")
).cache()

# Cut-offs for the Low/Med/High split, taken over all customers in a single
# pass. relativeError=0 asks Spark for exact quantiles.
lower_quantile, upper_quantile = customer_value.approxQuantile(
    "order_value", [0.33, 0.66], 0
)

for year in years:
    # Customers whose first date falls in `year` are "New"; everyone else is
    # bucketed by average order value against the two cut-offs.
    segmented = customer_value.withColumn(
        "customer_group",
        f.when(f.col("start_year") == year, "New")
        .when(f.col("order_value") > upper_quantile, "High")
        .when(f.col("order_value") < lower_quantile, "Low")
        .otherwise("Med")
    )

    # A returning customer ordered in both `year - 1` and `year`. The overlap
    # is computed in Spark (intersect de-duplicates) rather than collecting ID
    # lists to the driver and embedding them in a large isin(...) expression.
    previous_buyers = orders.filter(f.col("order_year") == year - 1).select("customer_id")
    current_buyers = orders.filter(f.col("order_year") == year).select("customer_id")
    returning_customers = previous_buyers.intersect(current_buyers).withColumn(
        "is_returning", f.lit(1)
    )

    # NOTE(review): this keeps customers whose start_year is `year - 1` or
    # later, which for early years also includes customers who start after
    # `year`. Kept as in the original; confirm this is the intended cohort.
    cohort = segmented.join(
        returning_customers,
        "customer_id",
        "left"
    ).fillna(
        0, subset=["is_returning"]
    ).filter(
        f.col("start_year") >= year - 1
    )

    # NOTE(review): "$ at Risk" multiplies by the retention rate (not the churn
    # rate), exactly as the original did; confirm against the intended metric.
    group_summary = cohort.groupBy("customer_group").agg(
        f.countDistinct("customer_id").alias("Number of Customers"),
        f.round(f.avg("order_value"), 2).alias("Customer Value"),
        f.round(f.avg("is_returning") * 100, 2).alias("Retention")
    ).withColumn(
        "$ at Risk",
        f.round(f.col("Number of Customers") * f.col("Customer Value") * (f.col("Retention") / 100), 2)
    )

    total_at_risk = group_summary.agg(f.sum(f.col("$ at Risk")).alias("total")).collect()[0]["total"]

    if total_at_risk:
        # Scale to a percentage before rounding so the multiplication cannot
        # reintroduce floating-point noise, and cast to string explicitly so
        # the "--" fallback does not depend on implicit type coercion.
        risk_share = f.round(f.col("$ at Risk") / f.lit(total_at_risk) * 100).cast("string")
        risk_column = f.coalesce(risk_share, f.lit("--"))
    else:
        # Total is zero or missing: there is no meaningful share to report, and
        # dividing by zero raises an error when ANSI SQL mode is enabled.
        risk_column = f.lit("--")

    yearly_budget.append(group_summary.withColumn("Risk", risk_column))

In [None]:
# Print each year's risk-budget table under a banner naming the year.
for position, year in enumerate(years):
    print(f"========{year}=========")
    yearly_budget[position].show()

+--------------+-------------------+--------------+---------+---------+----+
|customer_group|Number of Customers|Customer Value|Retention|$ at Risk|Risk|
+--------------+-------------------+--------------+---------+---------+----+
|          High|                464|       1971.53|      0.0|      0.0|  --|
|           Low|                627|         82.56|      0.0|      0.0|  --|
|           Med|                482|        411.53|      0.0|      0.0|  --|
|           New|               1130|        846.93|      0.0|      0.0|  --|
+--------------+-------------------+--------------+---------+---------+----+



#### Key Insights (2013)

* **High-value customers**, while fewer in number, represent 80% of the total retention risk. Losing just one of these customers costs ~24x more than losing a low-value customer.
* **Medium-value customers** contribute 18% of total risk and have the highest retention rate (54.26%), indicating good loyalty with potential for even more upside.
* **Low-value customers** have a low retention rate (21.67%) but pose minimal financial risk (2%); retention efforts here are the least efficient.