In [1]:
import pandas as pd
import numpy as np

rng = np.random.default_rng(seed=0)

In [17]:
SIZE = 10_000
# rng.binomial(nbr events e.g. 1 means 0 or 1, p is probability of True, size is nbr of items to generate)

# a bad_exp means they had a problem (e.g. bad insurance claim, hard time with mobile phone tech support),
# this increases their likelihood of churn
# mkt_neg means they really don't like getting marketing and this will increase their likelihood of churn
# gets_mkting is a 50/50 split for Treatment (True) or Control (False)

BASE_CHURN = 0.15  # expected base churn rate

ppl = pd.DataFrame(
    {
        # "bad_exp": rng.binomial(
        #    1, 0.1, SIZE
        # ),  # True if they had a bad experience with company
        "mkt_neg": rng.binomial(
            1, 0.1, SIZE
        ),  # True if receiving marketing will increase churn probability for them
    }
)
ppl["gets_mkting"] = rng.binomial(1, 0.5, SIZE)

ppl["prob_churn"] = BASE_CHURN  # a reasonably standard churn rate

In [22]:
# people who hate marketing and who get marketing have a higher chance of churning
mask = (ppl["mkt_neg"] & ppl["gets_mkting"]) == 1 # trues are 1s (ints)
ppl.loc[mask, "prob_churn"] += 0.25 # TODO HUGE CHANGE !!!!
assert (
    mask.sum() / mask.shape[0] < 0.2
), "Not expecting a high rate of churn even with random variation"
ppl[mask].sample(5)

Unnamed: 0,mkt_neg,gets_mkting,prob_churn
3967,1,1,1.15
1989,1,1,1.15
9597,1,1,1.15
4809,1,1,1.15
4583,1,1,1.15


In [4]:
ppl["will_churn"] = rng.binomial(1, ppl["prob_churn"], ppl.shape[0])

In [5]:
ppl.head()

Unnamed: 0,mkt_neg,gets_mkting,prob_churn,will_churn
0,0,1,0.15,1
1,0,1,0.15,0
2,0,1,0.15,0
3,0,1,0.15,0
4,0,1,0.15,0


In [6]:
ppl.describe()

Unnamed: 0,mkt_neg,gets_mkting,prob_churn,will_churn
count,10000.0,10000.0,10000.0,10000.0
mean,0.098,0.5057,0.16235,0.1595
std,0.29733,0.499993,0.054178,0.36616
min,0.0,0.0,0.15,0.0
25%,0.0,0.0,0.15,0.0
50%,0.0,1.0,0.15,0.0
75%,0.0,1.0,0.15,0.0
max,1.0,1.0,0.4,1.0


In [7]:
import pandera as pa

# check that mkt_neg people have a greater prob_churn than non-mkt_neg ppl for a 2 sample ttest

schema = pa.DataFrameSchema(
    {
        "mkt_neg": pa.Column(int, pa.Check.isin([0, 1])),
        "gets_mkting": pa.Column(int, pa.Check.isin([0, 1])),
        #"prob_churn": pa.Column(float, [pa.Check.le(1.0), pa.Check.ge(0)]),
        "will_churn": pa.Column(int, pa.Check.isin([0, 1])),
        "prob_churn": pa.Column(float,    [pa.Check.le(1.0), pa.Check.ge(0), pa.Hypothesis.two_sample_ttest(
                sample1=1,
                sample2=0,
                groupby="mkt_neg",
                relationship="greater_than",
                alpha=0.05,
                equal_var=True)])
    }
)
schema.validate(ppl, lazy=True)

Unnamed: 0,mkt_neg,gets_mkting,prob_churn,will_churn
0,0,1,0.15,1
1,0,1,0.15,0
2,0,1,0.15,0
3,0,1,0.15,0
4,0,1,0.15,0
...,...,...,...,...
9995,0,0,0.15,0
9996,0,1,0.15,0
9997,0,1,0.15,0
9998,1,1,0.40,1


# Look at some examples of those who do or don't churn

In [8]:
ppl.query("will_churn==True and prob_churn > @BASE_CHURN").iloc[[0]]

Unnamed: 0,mkt_neg,gets_mkting,prob_churn,will_churn
5,1,1,0.4,1


In [9]:
ppl.query("will_churn==True and prob_churn == @BASE_CHURN").iloc[[0]]

Unnamed: 0,mkt_neg,gets_mkting,prob_churn,will_churn
0,0,1,0.15,1


In [10]:
ppl.query("will_churn==False").iloc[[0]]

Unnamed: 0,mkt_neg,gets_mkting,prob_churn,will_churn
1,0,1,0.15,0


# Start to prepare for ML

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
features = ['mkt_neg']
ppl_train, ppl_test, X_train, X_test, y_train, y_test = train_test_split(ppl, ppl[features], ppl['will_churn'], test_size=0.3, shuffle=True)
print(list(x.shape for x in [ppl_train, ppl_test, X_train, X_test, y_train, y_test]))

[(7000, 4), (3000, 4), (7000, 1), (3000, 1), (7000,), (3000,)]


In [13]:
from sklearn.dummy import DummyClassifier
est_dummy = DummyClassifier(strategy='uniform')
est_dummy.fit(X_train, y_train)

from sklearn.metrics import log_loss
#y_true

dummy_proba_pos = est_dummy.predict_proba(X_test)[:, 1]
log_loss(y_test, dummy_proba_pos)

0.6931471805599454

In [14]:
from sklearn.linear_model import LogisticRegression
est = LogisticRegression()
est.fit(X_train, y_train)

y_pred = est.predict_proba(X_test)
y_pred_proba_pos = y_pred[:, 1]
log_loss(y_test, y_pred_proba_pos)

0.4403364393392923

# diagnostics might not be right yet!

In [15]:

result = pd.DataFrame({'lr_proba_pos': y_pred_pos, 'y_true': y_test})
result['dummy_proba_pos'] = est_dummy.predict_proba(X_test)[:, 1]


result_sorted = result.sort_values('lr_proba_pos').reset_index(drop=True)
result_sorted['y_true_cum'] = result_sorted['y_true'].cumsum()

#result_sorted['random_guess'] = rng.binomial(1, 0.5, result_sorted.shape[0])
#result_sorted['random_guess_cum'] = result_sorted['random_guess'].cumsum()
result_sorted

NameError: name 'y_pred_pos' is not defined

In [None]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(4, 4), constrained_layout=True)
result_sorted.plot(kind='line', y='y_true_cum', ax=ax)
#result_sorted.plot(kind='line', y='random', ax=ax)

In [None]:
result = pd.DataFrame({'y_true': y_test})
result['dummy_proba_pos'] = dummy_proba_pos


result_sorted = result.sort_values('dummy_proba_pos').reset_index(drop=True)
result_sorted['y_true_cum'] = result_sorted['y_true'].cumsum()
result_sorted.head(5)
result_sorted.tail(2)

In [None]:
result_sorted.describe()

In [None]:
fig, ax = plt.subplots(figsize=(4, 4), constrained_layout=True)
result_sorted.plot(kind='line', y='y_true_cum', ax=ax)
result_sorted.tail(2)