# Make a Churn simulation

Make a simulated population, treat none (no marketing to anyone), learn to predict who-churns-most, output a pickle for use in `make_plots`.

Churn models don't know anything about marketing - they're built before we test a person's reaction to marketing.

* `mkt_neg` (_sleeping dogs_) - marketing is a negative, upon receipt the churn risk increases
* `bad_exp` (_lost causes_) - person had a bad experience, they're more likely to churn (regardless of marketing) 
* `mkt_pos` (_persuadable_) - marketing is a positive, upon receipt the churn risk _decreases_
* `brand_loyal` (_sure things_) - person likes the brand, they're less likely to churn (regardless of marketing)

Terms taken from "Identifying who can be saved and who will be driven away by retention activity", Radcliffe 2007: https://www.stochasticsolutions.com/pdf/SavedAndDrivenAway.pdf

In [11]:
%load_ext autoreload
%autoreload 2

from functools import partial
import numpy as np
import pandas as pd
import pandera as pa
from sklearn.ensemble import RandomForestClassifier

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from simpler_mpl import set_common_mpl_styles, set_commas
from utility import summarise_groups_pretty, make_ppl, determine_churners
from utility import marketing_props, BASE_CHURN

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Population: number of simulated people used to train the models
TRAIN_SIZE = 100_000
# ML: number of simulated people in the test population (a row count, not a fraction)
TEST_SIZE = 50_000

# Larger alternative sizes, kept for reference:
# SIZE = 500_000
# TEST_SIZE = 100_000
# The validation population is the same size as the test population
VAL_SIZE = TEST_SIZE

# Columns passed to the models as inputs
features = ["mkt_neg", "bad_exp", "mkt_pos", "brand_loyal"]

In [13]:
# TODO uplift_test should be 1.0, not 0.999

# Generate three separate simulated populations (train / test / val). The
# marketing proportion for each is looked up in `marketing_props` under the key
# "<model_type>_<split>" (presumably the share of people who get marketing —
# confirm in utility.determine_churners).
model_type = "churn"
ppl_train = determine_churners(
    make_ppl(TRAIN_SIZE, BASE_CHURN),
    marketing_prop=marketing_props[f"{model_type}_train"],
)
ppl_test = determine_churners(
    make_ppl(TEST_SIZE, BASE_CHURN),
    marketing_prop=marketing_props[f"{model_type}_test"],
)
ppl_val = determine_churners(
    make_ppl(VAL_SIZE, BASE_CHURN), marketing_prop=marketing_props[f"{model_type}_val"]
)

# Model inputs (feature columns) and target (`will_churn`) for each split
X_train = ppl_train[features]
X_test = ppl_test[features]
X_val = ppl_val[features]
y_train = ppl_train["will_churn"]
y_test = ppl_test["will_churn"]
y_val = ppl_val["will_churn"]

# Train and test stacked together for exploration; val is kept separate.
# NOTE(review): the index is not reset here, so index labels from train and
# test may repeat in `ppl` — confirm if anything relies on a unique index.
ppl = pd.concat((ppl_train, ppl_test))
assert ppl.shape[1] == ppl_train.shape[1]

ppl.sample(5)  # sample from whole population

determine_churners on 100000 rows with marketing_prop 0.00
determine_churners on 50000 rows with marketing_prop 0.00
determine_churners on 50000 rows with marketing_prop 1.00


Unnamed: 0,brand_loyal,bad_exp,mkt_neg,mkt_pos,prob_churn,gets_mkting,will_churn
19984,0,0,0,0,0.16,0,0
27979,0,0,0,0,0.16,0,0
78548,1,0,0,0,0.11,0,0
84125,0,0,0,0,0.16,0,0
42823,0,0,0,0,0.16,0,1


In [15]:
def ppl_summary(ppl, title):
    """Display a title followed by two grouped churn summaries of a population.

    The first table groups by (`brand_loyal`, `bad_exp`), the second by
    (`mkt_pos`, `mkt_neg`). Each reports the group size, the mean simulated
    churn probability, and the sum and mean of the realised `will_churn`
    outcome; the first table also reports the variance of `prob_churn`.

    Args:
        ppl: DataFrame with the columns `brand_loyal`, `bad_exp`, `mkt_pos`,
            `mkt_neg`, `prob_churn` and `will_churn`.
        title: Label shown before the two tables.
    """
    display(title)

    # Outcome aggregations shared by both tables (kept last in column order)
    outcome_aggs = {
        "will_churn_sum": pd.NamedAgg("will_churn", "sum"),
        "will_churn_mean": pd.NamedAgg("will_churn", "mean"),
    }

    by_experience = ppl.groupby(["brand_loyal", "bad_exp"]).agg(
        count=pd.NamedAgg("prob_churn", "size"),
        prob_churn_mean=pd.NamedAgg("prob_churn", "mean"),
        prob_churn_var=pd.NamedAgg("prob_churn", "var"),
        **outcome_aggs,
    )
    display(by_experience)

    by_marketing = ppl.groupby(["mkt_pos", "mkt_neg"]).agg(
        count=pd.NamedAgg("prob_churn", "size"),
        prob=pd.NamedAgg("prob_churn", "mean"),
        **outcome_aggs,
    )
    display(by_marketing)

# Show the same grouped summaries for each simulated population in turn
for split_title, split_ppl in (("train", ppl_train), ("test", ppl_test), ("val", ppl_val)):
    ppl_summary(split_ppl, split_title)

'train'

Unnamed: 0_level_0,Unnamed: 1_level_0,count,prob_churn_mean,prob_churn_var,will_churn_sum,will_churn_mean
brand_loyal,bad_exp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,90095,0.16,0.0,14422,0.160075
1,0,9905,0.11,0.0,1113,0.112367


Unnamed: 0_level_0,Unnamed: 1_level_0,count,prob,will_churn_sum,will_churn_mean
mkt_pos,mkt_neg,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,100000,0.155048,15535,0.15535


'test'

Unnamed: 0_level_0,Unnamed: 1_level_0,count,prob_churn_mean,prob_churn_var,will_churn_sum,will_churn_mean
brand_loyal,bad_exp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,45088,0.16,0.0,7160,0.158801
1,0,4912,0.11,0.0,576,0.117264


Unnamed: 0_level_0,Unnamed: 1_level_0,count,prob,will_churn_sum,will_churn_mean
mkt_pos,mkt_neg,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,50000,0.155088,7736,0.15472


'val'

Unnamed: 0_level_0,Unnamed: 1_level_0,count,prob_churn_mean,prob_churn_var,will_churn_sum,will_churn_mean
brand_loyal,bad_exp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,45088,0.16,0.0,7234,0.160442
1,0,4912,0.11,0.0,552,0.112378


Unnamed: 0_level_0,Unnamed: 1_level_0,count,prob,will_churn_sum,will_churn_mean
mkt_pos,mkt_neg,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,50000,0.155088,7786,0.15572


In [None]:
# XXX scratch: a single Bernoulli draw with p=0.11 (the prob_churn shown for
# brand_loyal people in the summaries above).
# The generator is created here so the cell also runs on a top-to-bottom
# execution; the only other definition of `rng` is in the next cell.
rng = np.random.default_rng(seed=0)
rng.binomial(1, 0.11, 1)[0]

In [None]:
# XXX scratch: redraw `will_churn` for the training population — one Bernoulli
# draw per row at that row's `prob_churn`, from a generator seeded with 0 —
# then show the grouped summaries again.
rng = np.random.default_rng(seed=0)
# NOTE(review): this replaces the column on ppl_train after y_train was taken
# from it, so y_train may no longer match ppl_train["will_churn"] — confirm
# this is intended before relying on it.
ppl_train['will_churn'] = ppl_train['prob_churn'].apply(lambda p: rng.binomial(1, p))
ppl_summary(ppl_train, "train")


In [None]:
ppl.groupby('brand_loyal')['will_churn'].mean()

In [None]:
ppl.groupby('bad_exp')['will_churn'].mean()

In [None]:
ppl.groupby(['brand_loyal', 'bad_exp']).agg(
    count=pd.NamedAgg("will_churn", "size"),
    prob=pd.NamedAgg("will_churn", "mean"),)

In [None]:
percentiles = [0.01, 0.05, 0.5, 0.95, 0.99]
ppl_train.describe(percentiles=percentiles)

In [None]:
ppl_test.describe(percentiles=percentiles)

In [None]:
ppl_val.describe(percentiles=percentiles)

In [None]:
ppl.describe(percentiles=percentiles)

In [None]:
# calc nbr and percentage ppl who churn given probability estimates
# ppl["prob_churn_bin"] = pd.cut(ppl["prob_churn"], bins=10)
# display(ppl.groupby("prob_churn_bin")["will_churn"].sum())
# display(
#    ppl.groupby("prob_churn_bin")["will_churn"].sum()
#    / ppl.groupby("prob_churn_bin")["will_churn"].size()
# )
# ppl = ppl.drop(columns="prob_churn_bin")

In [None]:
ppl.head()

In [None]:
ppl

In [None]:
# Validate the simulated populations: expected columns, dtypes and value ranges.
# The two-sample t-test (mkt_neg people having a greater prob_churn than
# non-mkt_neg people) is currently commented out inside the prob_churn checks.

# COULD DO
# prob_churn and will_churn should be reasonably similar (to 2dp?)

schema = pa.DataFrameSchema(
    {
        "bad_exp": pa.Column(int, pa.Check.isin([0, 1])),
        "brand_loyal": pa.Column(int, pa.Check.isin([0, 1])),
        "mkt_neg": pa.Column(int, pa.Check.isin([0, 1])),
        "mkt_pos": pa.Column(int, pa.Check.isin([0, 1])),
        # gets_mkting is only checked to be 0/1; the checks on its mean
        # (circa 50%) are disabled below
        "gets_mkting": pa.Column(
            int,
            [
                pa.Check.isin([0, 1]),
                # TODO should check no mkting for train, mkting for test
                # pa.Check(lambda s: s.mean() > 0.45),
                # pa.Check(lambda s: s.mean() < 0.55),
            ],
        ),
        "will_churn": pa.Column(int, pa.Check.isin([0, 1])),
        # prob_churn is bounded to [0, 1]; the disabled hypothesis test would
        # additionally require prob_churn to be greater when mkt_neg is 1 than when it is 0
        "prob_churn": pa.Column(
            float,
            [
                pa.Check.le(1.0),
                pa.Check.ge(0),
                # pa.Hypothesis.two_sample_ttest(
                #    sample1=1,
                #    sample2=0,
                #    groupby="mkt_neg",
                #    relationship="greater_than",
                #    alpha=0.05,
                #    equal_var=True,
                # ),
            ],
        ),
    },
    # presumably: reject columns not listed above, ignore column order — confirm against the pandera docs
    strict=True,
    ordered=False,
)
# Validate the combined train+test population, then the validation population;
# lazy=True presumably collects all failures before raising — confirm against the pandera docs
schema.validate(
    ppl,
    lazy=True,
)
schema.validate(
    ppl_val,
    lazy=True,
);

# Look at some examples of those who do or don't churn

In [None]:
ppl.query("will_churn==True and prob_churn > @BASE_CHURN")[:2]

In [None]:
ppl.query("will_churn==False")[:2]

# Start to prepare for ML

In [None]:
ppl.columns

In [None]:
# Guard against the same column being listed twice as a feature
assert len(set(features)) == len(features), "Not expecting duplicates"
print(f"Using: {features}")

# Every column must be either a model feature or one of the known non-feature
# columns; anything left over means a column was added to the population
# without deciding whether it should be a feature.
non_features = (
    set(ppl.columns)
    .difference(set(features))
    .difference({"will_churn", "prob_churn", "gets_mkting"})
)

if non_features:
    # Fail with an explicit, descriptive error rather than a deliberate 1 / 0
    raise ValueError(
        f"Columns are neither features nor known non-features: {sorted(non_features)}"
    )

In [None]:
# Alternative kept for reference: split one combined population with sklearn
# ppl_train, ppl_test, X_train, X_test, y_train, y_test = train_test_split(
#    ppl, ppl[features], ppl["will_churn"], test_size=TEST_SIZE, shuffle=True
# )
# Report the shapes of the train/test frames, feature matrices and targets
print([frame.shape for frame in (ppl_train, ppl_test, X_train, X_test, y_train, y_test)])

In [None]:
# Baseline estimator using the "prior" strategy, fitted on the training split
est_dummy = DummyClassifier(strategy="prior").fit(X_train, y_train)

# Log loss of the baseline's positive-class probability on the test split
dummy_proba_pos = est_dummy.predict_proba(X_test)[:, 1]
log_loss(y_test, dummy_proba_pos)

In [None]:
# Estimator class used to build the churn model; the commented line below is
# an alternative that uses a small random forest instead
base_model = LogisticRegression
# base_model = partial(RandomForestClassifier, n_estimators=5)

In [None]:
# Build the churn estimator with default arguments and fit it on the training
# split. Alternatives that were tried:
# churn_model = LogisticRegression
# churn_model = partial(RandomForestClassifier, n_estimators=10)
churn_model = base_model
est_churn = churn_model().fit(X_train, y_train)
print(f"Fitting churn model with {churn_model}")

# Second predict_proba column is taken as the positive (will churn) class;
# report its log loss on the test split
y_pred = est_churn.predict_proba(X_test)
y_pred_proba_pos = y_pred[:, 1]
log_loss(y_test, y_pred_proba_pos)

# Gains chart

Note: if the treatment (T) probability is estimated less accurately than the control (C) probability, the gains line can decrease.

In [None]:
# Drop every train/test object; the cells below only use the validation data
# (presumably so that train/test cannot be used by accident from here on)
del ppl_test, ppl, ppl_train, X_train, X_test, y_train, y_test

In [None]:
# Names of the score columns (one per model) that are added to `result` and
# drawn on the gains chart
keys = ["churn_proba_pos", "dummy_proba_pos"]

In [None]:
# Validation targets alongside each model's positive-class probability
result = pd.DataFrame({"y_true": y_val}).assign(
    churn_proba_pos=est_churn.predict_proba(X_val)[:, 1],
    dummy_proba_pos=est_dummy.predict_proba(X_val)[:, 1],
)

# Every key plotted later must exist as a column
for key in keys:
    assert key in result.columns

In [None]:
fig, ax = plt.subplots(figsize=(8, 8), constrained_layout=True)
fig.suptitle("Gains curve - Positive divergence\nmeans better ordering")

# Keys in this set are drawn thin and dashed; every other key is solid and thicker
dashed_keys = {"up_proba_c", "up_proba_t"}
for key in keys:
    linestyle, linewidth = ("--", 1) if key in dashed_keys else ("-", 2)

    # Order people from highest to lowest score and accumulate the true churners
    result_sorted = result.sort_values(key, ascending=False).reset_index()
    result_sorted["y_true_cum"] = result_sorted["y_true"].cumsum()
    result_sorted.plot(
        kind="line",
        y="y_true_cum",
        ax=ax,
        label=key,
        linestyle=linestyle,
        linewidth=linewidth,
    )

ax.set_xlabel("Validation population")
ax.set_ylabel("True Positives (faster climb better)")

set_common_mpl_styles(ax, grid_axis="both")
# set_commas(ax, True, True)

# # zoom on x axis
# ax.set_xlim((45_000, result.shape[0]-1));
# ax.set_ylim(ymin=6000);

In [None]:
# Join each validation prediction to that person's simulated attributes.
# Both frames are joined on their index; validate="one_to_one" makes pandas
# raise if either side has duplicate index labels.
test_merged = pd.merge(
    left=result,
    right=ppl_val,
    left_index=True,
    right_index=True,
    validate="one_to_one",
)
# The merged frame is the validation population, so its size is VAL_SIZE
# (not TEST_SIZE, which only matches because the two are currently equal)
assert test_merged.shape[0] == VAL_SIZE
# Peek at a few rows, ordered by simulated churn probability
test_merged.sample(10).sort_values("prob_churn")

In [None]:
# Bucket the churn model's predicted probability into 3 bins with pd.cut
# (the commented line is a quantile-based alternative using pd.qcut)
test_merged["churn_proba_pos_bins"] = pd.cut(test_merged["churn_proba_pos"], 3)
#test_merged["churn_proba_pos_bins"] = pd.qcut(test_merged["churn_proba_pos"], 2)
# Last expression of the cell: shows the feature list as the cell output
features

# For each row in the test set, group and explain the features

If we order the data by predicted use of marketing to increase lift, we can count the underlying features - these should reflect the original distributions.

* sure things - have `brand_loyal` and don't respond to marketing, they're likelier to _stay_
* lost causes - have `bad_exp` and don't respond to marketing, they're likelier to _leave_
* persuadables - have `mkt_pos` as they respond well if marketed to, they're likelier to _stay_ conditional on marketing
* sleeping dogs - have `mkt_neg` as they respond negatively if marketed to, they're likelier to _leave_ conditional on marketing

In [None]:
# Summarise the validation population per distinct dummy-model score: group
# size, realised churn rate and how many people carry each simulated trait.
res = test_merged.groupby("dummy_proba_pos").agg(
    count=pd.NamedAgg("mkt_neg", "size"),
    prob_churn_if_mktd=pd.NamedAgg("will_churn", "mean"),
    mkt_pos_sum=pd.NamedAgg("mkt_pos", "sum"),
    bad_exp_sum=pd.NamedAgg("bad_exp", "sum"),
    brand_loyal_sum=pd.NamedAgg("brand_loyal", "sum"),
    mkt_neg_sum=pd.NamedAgg("mkt_neg", "sum"),
)
display(f"Base churn rate for all is circa {BASE_CHURN*100:0.1f}%")
# The message and title name the dummy score, since that is what this cell groups by
display(
    "dummy_proba_pos is the dummy (baseline) model's prob(churn), used as the reference for the churn model"
)
res.style.pipe(summarise_groups_pretty, "Feature contributions to Dummy (baseline) prediction")

In [None]:
# show bins in equal blocks
test_merged2 = test_merged.sort_values('churn_proba_pos_bins').copy()
test_merged2 = test_merged2.reset_index(drop=True)
test_merged2["binned_index"] = pd.cut(test_merged2.index, 10)

res = test_merged2.groupby("binned_index").agg(
    count=pd.NamedAgg("mkt_neg", "size"),
    prob_churn_if_mktd=pd.NamedAgg("will_churn", "mean"),
    mkt_pos_sum=pd.NamedAgg("mkt_pos", "sum"),
    bad_exp_sum=pd.NamedAgg("bad_exp", "sum"),
    brand_loyal_sum=pd.NamedAgg("brand_loyal", "sum"),
    mkt_neg_sum=pd.NamedAgg("mkt_neg", "sum"),
    #up_proba_c_mean=pd.NamedAgg("up_proba_c", "mean"),
    #up_proba_t_mean=pd.NamedAgg("up_proba_t", "mean"),
    #uplift_diff_t_c=pd.NamedAgg("uplift_diff_t_c", "mean")
)
#res

res[::].style.pipe(summarise_groups_pretty, "Feature contributions to Uplift prediction")

In [None]:
# Summarise the validation population per churn-probability bin: group size,
# realised churn rate and how many people carry each simulated trait.
# observed=False keeps bins that contain no rows (shown with count 0) and pins
# that behaviour across pandas versions, where the default for categorical
# groupers is changing.
res = test_merged.groupby("churn_proba_pos_bins", observed=False).agg(
    count=pd.NamedAgg("mkt_neg", "size"),
    prob_churn_if_mktd=pd.NamedAgg("will_churn", "mean"),
    mkt_pos_sum=pd.NamedAgg("mkt_pos", "sum"),
    bad_exp_sum=pd.NamedAgg("bad_exp", "sum"),
    brand_loyal_sum=pd.NamedAgg("brand_loyal", "sum"),
    mkt_neg_sum=pd.NamedAgg("mkt_neg", "sum"),
)
display(f"Base churn rate for all is circa {BASE_CHURN*100:0.1f}%")
display(
    "churn_proba_pos_bins is prob(churn), typically we market at the people with highest chance of churn"
)
res.style.pipe(summarise_groups_pretty, "Feature contributions to Churn prediction")

# Estimate value to business

## Churn comparison result

In [None]:
# Rank the validation population from highest to lowest churn-model score
# (most likely needing marketing first), keep only the score and the simulated
# outcome, and renumber the rows so the index reflects the ranking.
df_comparison = (
    test_merged.sort_values("churn_proba_pos", ascending=False)[
        ["churn_proba_pos", "will_churn"]
    ]
    .rename(columns={"will_churn": "churn_will_churn"})
    .reset_index(drop=True)
)

# Persist the ranking as a pickle
OUTFILE = "df_comparison_churn.pickle"
print(f"Writing to {OUTFILE} with {df_comparison.columns}")
df_comparison.to_pickle(OUTFILE)

In [None]:
# Same ranking as for the churn model, but ordered by the dummy model's score:
# keep only the score and the simulated outcome, and renumber the rows so the
# index reflects the ranking.
df_comparison = (
    test_merged.sort_values("dummy_proba_pos", ascending=False)[
        ["dummy_proba_pos", "will_churn"]
    ]
    .rename(columns={"will_churn": "dummy_will_churn"})
    .reset_index(drop=True)
)

# Persist the ranking as a pickle
OUTFILE = "df_comparison_dummy.pickle"
print(f"Writing to {OUTFILE} with {df_comparison.columns}")
df_comparison.to_pickle(OUTFILE)