# Churn modeling 

Churn modeling is the problem of identifying customers who are at risk of **leaving** or discontinuing their relationship with the company. It involves analyzing customer data to recognize patterns and behaviors that suggest customer dissatisfaction or a lack of engagement. The primary goal of churn modeling is to help businesses **retain customers** by understanding the reasons for their potential departure and taking appropriate actions to address their concerns.


Assumptions:
1. You have historical data about customers who
    churned and customers who did not.
2. You have a predictive model which, for every
    customer, predicts whether they will churn next month.
3. You know that if you give a discount d=x% (e.g. 30%) to a customer
    who is about to churn, then with probability r=y% (e.g. 80%) they will
    not churn (these numbers can be estimated from historical data).

Data from `https://www.kaggle.com/code/bandiatindra/telecom-churn-prediction/data` with some marginal preprocessing

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the telecom churn dataset from a CSV file in the working directory.
telecom_cust = pd.read_csv('telecom_churn.csv')

In [None]:
# Preview the first rows of the raw data.
telecom_cust.head()

In [None]:
# Dataset size as (rows, columns).
telecom_cust.shape

In [None]:
# Number of non-missing values in every column.
telecom_cust.count()

In [None]:
# Column labels of the dataset.
telecom_cust.columns

In [None]:
# Column dtypes, sorted so that columns of the same type are listed together.
telecom_cust.dtypes.sort_values()

In [None]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN.
telecom_cust['TotalCharges'] = pd.to_numeric(telecom_cust['TotalCharges'], errors='coerce')

# Missing values per column after the conversion.
telecom_cust.isna().sum()

In [None]:
# Show the rows whose TotalCharges value is missing.
missing_total_charges = telecom_cust['TotalCharges'].astype(float).isna()
telecom_cust[missing_total_charges]

In [None]:
# Drop every row that contains at least one missing value, modifying the
# DataFrame in place.
telecom_cust.dropna(inplace=True)

In [None]:
# Column dtypes after the cleaning steps above.
telecom_cust.dtypes

In [None]:
# Number of rows for each value of the Churn column.
telecom_cust.Churn.value_counts()

In [None]:
# Encode the target variable (Churn) as a binary numeric column: Yes -> 1, No -> 0.
# The result is assigned back to the column instead of calling an in-place
# method on a column selection: that is chained assignment, which pandas
# deprecates because it may not update the parent DataFrame under
# Copy-on-Write. The explicit cast guarantees an integer column regardless of
# how `replace` infers the result dtype, and keeps the cell safe to re-run.
telecom_cust['Churn'] = telecom_cust['Churn'].replace({'Yes': 1, 'No': 0}).astype(int)

# One-hot encode all the categorical variables. `dtype=int` keeps the dummy
# columns numeric (recent pandas versions default to bool), so that they are
# included by `describe()` and `corr()` in the cells below.
df_dummies = pd.get_dummies(telecom_cust.drop(columns='customerID'), dtype=int)
df_dummies.head()

In [None]:
# Summary statistics of the one-hot encoded dataset.
df_dummies.describe()

In [None]:
# Let pandas display up to 50 columns so wide tables are not truncated.
pd.set_option('display.max_columns', 50)

In [None]:
# Summary statistics again, now rendered with the wider column limit set above.
df_dummies.describe()

In [None]:
# Plot `Churn` correlation with features

# Correlation of every column with Churn, strongest positive first.
churn_corr = df_dummies.corr()['Churn'].sort_values(ascending=False)

plt.figure(figsize=(6, 8))
churn_corr.plot(kind='barh');
plt.title('Churn column correlation with every feature');

In [None]:
# Same correlations as a table, sorted from most positive to most negative.
df_dummies.corr().loc[:, 'Churn'].sort_values(ascending=False)

### `Monthly contracts`, an `absence of online security` and an absence of `technical support` are positively correlated with Churn. In 

### contrast, a long `tenure`, having a `2-year contract` and `not having an internet service` are negatively

### correlated with Churn.

# Quick EDA

## 1. Demography


### The split between female and male customers is roughly 50-50

In [None]:
# Bar chart of the gender split, with every bar labelled by its share of
# customers (in percent).
colors = ['#4D3425', '#E4512B']
gender_share = telecom_cust['gender'].value_counts() * 100.0 / len(telecom_cust)
ax = gender_share.plot(kind='bar', rot=0, color=colors)
ax.set_xlabel('Gender')
ax.set_ylabel('% Customers')
ax.set_title('Gender Distribution')

# The bar heights are already percentages, so they are used as labels
# directly. The earlier version divided each height by the sum of the bar
# *widths*, which is only correct when those widths happen to sum to 1.
for bar in ax.patches:
    # get_x positions the label horizontally; the height offset places it
    # just below the top of the bar.
    ax.text(bar.get_x() + .15, bar.get_height() - 3.5,
            f'{bar.get_height():.1f}%',
            fontsize=12,
            color='white',
            weight='bold')

### On average, female customers spend about 1 dollar more on services per month; there is no difference in median spending, though

In [None]:
# Monthly charges by gender. The median is included because the heading of
# this section compares mean and median spending, and to match the other
# per-group summaries in this notebook.
telecom_cust.groupby('gender')['MonthlyCharges'].agg(['mean', 'max', 'median'])

### Only 16% of customers are senior citizens

In [None]:
# Pie chart with the percentage of senior citizens among all customers.
senior_share = telecom_cust['SeniorCitizen'].value_counts() * 100.0 / len(telecom_cust)

# NOTE(review): the labels are positional, so this assumes the first (most
# frequent) value is the non-senior group — TODO confirm against the data.
ax = senior_share.plot.pie(autopct='%.1f%%', labels=['No', 'Yes'],
                           figsize=(5, 5), fontsize=12)
ax.set_ylabel('Senior Citizens', fontsize=12)
ax.set_title('% of Senior Citizens', fontsize=12);

### Senior citizens spend significantly more money on services (why?)

In [None]:
# Mean, maximum and median monthly charges for seniors vs. non-seniors.
charges_by_senior = telecom_cust.groupby('SeniorCitizen')['MonthlyCharges']
charges_by_senior.agg(['mean', 'max', 'median'])

### 50% of customers have a partner, 30% of customers have a dependent

In [None]:
# Stacked bar chart: share of customers with / without dependents and partners.

# Long format: one row per (customer, variable) pair for the two columns.
df2 = pd.melt(telecom_cust, id_vars=['customerID'], value_vars=['Dependents', 'Partner'])
# Count customers per (variable, value) and convert the counts to percentages.
df3 = df2.groupby(['variable', 'value']).count().unstack()
df3 = df3 * 100 / len(telecom_cust)

colors = ['#4D3425', '#E4512B']
ax = df3.loc[:, 'customerID'].plot.bar(stacked=True, color=colors,
                                       figsize=(8, 6), rot=0,
                                       width=0.2)

ax.set_ylabel('% Customers', size=14)
ax.set_xlabel('')
ax.set_title('% Customers with dependents and partners', size=14)
ax.legend(loc='center', prop={'size': 14})

# Write the percentage inside every bar segment.
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    ax.annotate(f'{height:.0f}%',
                (p.get_x() + .25 * width, p.get_y() + .4 * height),
                color='white',
                weight='bold',
                size=14)


### Customers with partners spend more compared to customers without partners

In [None]:
# Mean, maximum and median monthly charges for customers with / without a partner.
charges_by_partner = telecom_cust.groupby('Partner')['MonthlyCharges']
charges_by_partner.agg(['mean', 'max', 'median'])

## 2. Tenure and type of a contract

### There are two large groups: long-standing customers (high tenure) and customers who have used the service for only 1-2 months

In [None]:
# Histogram of customer tenure (counts only, no density estimate).
# `sns.distplot` is deprecated in seaborn, so `sns.histplot` is used instead;
# `edgecolor` is forwarded to the underlying matplotlib bars.
ax = sns.histplot(telecom_cust['tenure'],
                  bins=int(180 / 5), color='darkblue',
                  edgecolor='black')
ax.set_ylabel('# of Customers')
ax.set_xlabel('Tenure (months)')
ax.set_title('# of Customers by their tenure')


### Most of the customers are on 1-month contract

In [None]:
# Bar chart with the number of customers on each contract type.
contract_counts = telecom_cust['Contract'].value_counts()
ax = contract_counts.plot(kind='bar', rot=0, width=0.3)
ax.set_ylabel('# of Customers')
ax.set_title('# of Customers by Contract Type');

### How does tenure depend on the type of contract?

In [None]:
# Tenure histograms for the three contract types, side by side on a shared
# y-axis so that the customer counts are directly comparable.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(20, 6))

# (value in the Contract column, panel title, bar colour, target axes)
panels = [
    ('Month-to-month', 'Month to Month Contract', 'turquoise', ax1),
    ('One year', 'One Year Contract', 'steelblue', ax2),
    ('Two year', 'Two Year Contract', 'darkblue', ax3),
]

for contract, title, color, panel_ax in panels:
    tenure = telecom_cust.loc[telecom_cust['Contract'] == contract, 'tenure']
    # `sns.distplot` is deprecated in seaborn; `sns.histplot` draws the same
    # count histogram (no density estimate was drawn before either).
    ax = sns.histplot(tenure, bins=int(180 / 5), color=color,
                      edgecolor='black', ax=panel_ax)
    ax.set_xlabel('Tenure (months)')
    ax.set_title(title)
    # The y-axis is shared, so only the left-most panel carries a label.
    ax.set_ylabel('')

ax1.set_ylabel('# of Customers');


2-year contract clients are more loyal: most of the month-to-month clients use services for 1-2 months in total, while most 2-year contract clients use services for >50 months (>4 years).

## 3. What about additional services?

In [None]:
# Value counts of every additional service, drawn on a 3x3 grid of bar charts.
services = ['PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
            'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 12))
for i, item in enumerate(services):
    # The grid is filled column by column: the first three services go into
    # column 0 (top to bottom), the next three into column 1, and so on.
    col, row = divmod(i, 3)
    ax = telecom_cust[item].value_counts().plot(kind='bar', ax=axes[row, col], rot=0)
    ax.set_title(item)


### Among Senior Citizens 95% do use an InternetService, while for others it is only 75%

In [None]:
# Share of each InternetService option within the senior / non-senior groups.
# `normalize=True` returns fractions per group instead of raw counts, so the
# output can be compared directly with the percentages quoted in the heading.
telecom_cust.groupby('SeniorCitizen')['InternetService'].value_counts(normalize=True)

### We observe the same trend for Tech Support

In [None]:
# Share of each TechSupport option within the senior / non-senior groups.
# `normalize=True` returns fractions per group instead of raw counts, which
# makes groups of different size comparable.
telecom_cust.groupby('SeniorCitizen')['TechSupport'].value_counts(normalize=True)

# 4. What about Churn (target column)?

### 73% of customers do not churn, and 27% do

In [None]:
# Percentage of customers in each Churn class.
churn_counts = telecom_cust['Churn'].value_counts()
churn_counts / len(telecom_cust) * 100

### On average customers who do not churn use services for 38 months

In [None]:
# Mean, median and standard deviation of tenure for each Churn class.
tenure_by_churn = telecom_cust.groupby('Churn')['tenure']
tenure_by_churn.agg(['mean', 'median', 'std'])

### 43% of month-to-month customers churned, compared with 11% of customers on a 1-year contract and only 3% of those on a 2-year contract

In [None]:
# Churn rate per contract type: the mean of the Churn column within each group.
telecom_cust.groupby('Contract')['Churn'].mean()

# Churn prediction

In [None]:
from sklearn.model_selection import train_test_split

# Target: the Churn column. Features: every column except the customer
# identifier and the target itself.
y = telecom_cust['Churn']
X = telecom_cust.drop(columns=['customerID', 'Churn'])

# Hold out 30% of the rows for testing; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix

In [None]:
from catboost import CatBoostClassifier, Pool

In [None]:
# Treat every non-numeric column as a categorical feature. Selecting by dtype
# replaces the earlier "skip the first 4 columns of the sorted dtypes" rule,
# which silently breaks as soon as the number of numeric columns changes.
cat_features = X_train.select_dtypes(exclude='number').columns.tolist()

print('Categorical features:', cat_features)

# CatBoost pools bundle the features, the labels and the categorical column list.
dataset_train = Pool(X_train, label=y_train, cat_features=cat_features)
dataset_test = Pool(X_test, label=y_test, cat_features=cat_features)

In [None]:
# Train a CatBoost classifier with default hyper-parameters.
# NOTE(review): the test pool is also passed as `eval_set`; if CatBoost picks
# the best iteration on `eval_set`, the test metrics reported below are
# optimistic — consider a separate validation split. TODO confirm.
model = CatBoostClassifier()

# verbose=50: presumably prints training progress every 50 iterations.
model.fit(dataset_train, eval_set=dataset_test,
          verbose=50)

In [None]:
# Predicted class probabilities for both splits; column 1 is used below as
# the churn score.
y_pred_train = model.predict_proba(X_train)
y_pred_test = model.predict_proba(X_test)

In [None]:
# y(Churn) = 0.3

# threshold = 0.10

In [None]:
# Inspect the predicted probabilities for the training set.
y_pred_train

In [None]:
# Row sums of the predicted probabilities — presumably a sanity check that
# the class probabilities of every customer add up to 1.
y_pred_train.sum(axis=1)

In [None]:
# ROC AUC on the training set, scored with the column-1 probabilities.
train_auc = roc_auc_score(y_train, y_pred_train[:, 1])
print(train_auc)

In [None]:
# ROC AUC on the held-out test set, scored with the column-1 probabilities.
test_auc = roc_auc_score(y_test, y_pred_test[:, 1])
print(test_auc)

In [None]:
# ROC curve points (false positive rate, true positive rate) and the matching
# probability thresholds, computed on the training set.
fpr, tpr, thresholds = roc_curve(y_train, y_pred_train[:, 1])

In [None]:
# ROC curve of the training set, with the rounded AUC shown in the legend.
roc_auc_label = np.round(roc_auc_score(y_train, y_pred_train[:, 1]), 2)

plt.plot(fpr, tpr, label=f'ROC AUC = {roc_auc_label}');
plt.xlabel('False Positive Rate = FP / (FP + TN)')
plt.ylabel('True Positive Rate = Recall \n= TP / (TP + FN)')
plt.title('ROC AUC curve')
plt.legend();

In [None]:
# ROC curve of the training set with three operating points highlighted, each
# labelled with the probability threshold that produces it.
plt.plot(fpr, tpr, label=f'ROC AUC = {np.round(roc_auc_score(y_train, y_pred_train[:, 1]), 2)}');

# (position along the ROC curve, marker colour)
for idx, color in ((500, 'r'), (900, 'g'), (1200, 'k')):
    # The number of ROC points depends on the data and the model, so the
    # index is clamped to the last point instead of raising an IndexError.
    idx = min(idx, len(thresholds) - 1)
    plt.scatter(fpr[idx], tpr[idx], c=color, s=80,
                label=f'probability threshold = {np.round(thresholds[idx], 2)}')

plt.legend()
plt.xlabel('False Positive Rate = FP / (FP + TN)')
plt.ylabel('True Positive Rate = Recall \n= TP / (TP + FN)');

In [None]:
# Training accuracy when a customer is flagged as churn above a 0.25 probability.
threshold = 0.25

y_hat_train = (y_pred_train[:, 1] > threshold).astype(int)
accuracy_score(y_train, y_hat_train)

In [None]:
# Training accuracy when a customer is flagged as churn above a 0.5 probability.
threshold = 0.5

y_hat_train = (y_pred_train[:, 1] > threshold).astype(int)
accuracy_score(y_train, y_hat_train)

In [None]:
# Training accuracy when a customer is flagged as churn above a 0.75 probability.
threshold = 0.75

y_hat_train = (y_pred_train[:, 1] > threshold).astype(int)
accuracy_score(y_train, y_hat_train)

In [None]:
# Confusion-matrix counts on the training set at a 0.25 probability threshold.
threshold = 0.25

y_hat_train = (y_pred_train[:, 1] > threshold).astype(int)
tn, fp, fn, tp = confusion_matrix(y_train, y_hat_train).ravel()

print(f'TN: {tn}', f'FP: {fp}', f'FN: {fn}', f'TP: {tp}')

In [None]:
# Confusion-matrix counts on the training set at a 0.5 probability threshold.
threshold = 0.5

y_hat_train = (y_pred_train[:, 1] > threshold).astype(int)
tn, fp, fn, tp = confusion_matrix(y_train, y_hat_train).ravel()

print(f'TN: {tn}', f'FP: {fp}', f'FN: {fn}', f'TP: {tp}')

In [None]:
# Confusion-matrix counts on the training set at a 0.75 probability threshold.
threshold = 0.75

y_hat_train = (y_pred_train[:, 1] > threshold).astype(int)
tn, fp, fn, tp = confusion_matrix(y_train, y_hat_train).ravel()

print(f'TN: {tn}', f'FP: {fp}', f'FN: {fn}', f'TP: {tp}')