# Business Problem

Telco churn data includes information about a fictitious telecom company that, in the third quarter, provided home phone and Internet services to 7043 customers in California. Shows which customers have left, stayed or signed up for their service.

Variables
* CustomerId
* Gender
* SeniorCitizen: Whether the customer is a senior citizen (1, 0)
* Partner: Whether the Customer Has a Partner (Yes, No)
* Dependents: Whether the customer has dependents(Yes, No)
* tenure: The number of months the customer stayed with the company
* PhoneService: Whether the customer has telephone service(Yes, No)
* MultipleLines: Whether the customer has more than one line (Yes, No, No Telephone service)
* InternetService: Customer's internet service provider (DSL, Fiber optic, No)
* OnlineSecurity: Whether Customer Has Online Security(Yes, No, No Internet service)
* OnlineBackup: Whether the customer has an online backup (Yes, No, no Internet service)
* DeviceProtection: Whether the customer has device protection(Yes, No, no Internet service)
* TechSupport: Whether the customer has technical support (Yes, No, no internet service)
* StreamingTV: Whether the customer has a TV broadcast (Yes, No, no Internet service)
* StreamingMovies: Whether the customer is streaming Movies(Yes, No, No Internet service)
* Contract: Customer's contract duration (Month to month, One year, Two years)
* PaperlessBilling: Whether the customer has a paperless invoice(Yes, No)
* PaymentMethod: Customer's Payment Method(Electronic Check, Postal Check, Bank Transfer(automatic), Credit Card(automatic))
* MonthlyCharges: Amount charged monthly from customer
* TotalCharges: Total amount Collected from Customer
* Churn: Whether the Customer churn (Yes, No)

In [None]:
# pip install pivottablejs
# from pivottablejs import pivot_ui

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import missingno as msno
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
from pandas_profiling import ProfileReport
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import OneHotEncoder
%matplotlib inline

In [None]:
df = pd.read_csv('/kaggle/input/telcochurndata/Telco-Customer-Churn.csv')
df.head()

In [None]:
def check_df(dataframe, head=10):
    """Print a quick structural overview of ``dataframe``.

    The report consists of seven titled sections, in this order: first rows,
    column index, transposed descriptive statistics, null counts per column,
    dtypes, shape and ``info()``.

    Args:
        dataframe: The pandas DataFrame to inspect.
        head: Number of leading rows shown in the first section.

    Returns:
        None. Everything is written to standard output.
    """
    sections = (
        ("##################### Head #####################", lambda: dataframe.head(head)),
        ("##################### Variables #####################", lambda: dataframe.columns),
        ("##################### Descriptive Stats #####################", lambda: dataframe.describe().T),
        ("##################### Null Values #####################", lambda: dataframe.isnull().sum()),
        ("##################### Types #####################", lambda: dataframe.dtypes),
        ("##################### Shape #####################", lambda: dataframe.shape),
        # info() writes its own report and returns None, so "None" follows it
        ("##################### Info #####################", dataframe.info),
    )
    for banner, build in sections:
        print(banner)
        print(build())
check_df(df)

Total charges variable should have float numbers only. 

In [None]:
# Whitespace-only (or empty) cells are treated as missing. The pattern is anchored
# so that a value which merely *contains* whitespace is not silently turned into NaN;
# such a value now either parses (surrounding spaces) or raises in astype.
df['TotalCharges'] = df['TotalCharges'].replace(r'^\s*$', np.nan, regex=True)
df['TotalCharges'] = df['TotalCharges'].astype('float')

Convert variables that have yes or no values into 0 and 1's without using label encoder.

In [None]:
columns_to_convert = ['Partner',
                      'Dependents',
                      'PhoneService',
                      'PaperlessBilling',
                      'Churn']

# Explicit mapping: anything that is neither 'Yes' nor 'No' becomes NaN and shows up
# in the null checks, instead of being silently coded as 1.
yes_no_codes = {'Yes': 1, 'No': 0}

for item in columns_to_convert:
    # Already-encoded columns are skipped so that re-running this cell is a no-op
    # (previously a second run turned every 0 into 1, because 0 != 'No').
    if pd.api.types.is_numeric_dtype(df[item]):
        continue
    df[item] = df[item].map(yes_no_codes)

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical, numeric and cardinal groups.

    Args:
        dataframe: The pandas DataFrame whose columns are classified.
        cat_th: A non-object column with fewer than ``cat_th`` distinct values
            is treated as categorical.
        car_th: An object column with more than ``car_th`` distinct values is
            treated as cardinal (categorical in type but too many levels).

    Returns:
        A tuple ``(cat_cols, num_cols, cat_but_car)`` of column-name lists.
        ``cat_cols`` holds the object columns first, then the low-cardinality
        non-object columns, with cardinal columns removed.
    """
    object_cols = []
    num_but_cat = []
    cat_but_car = []
    num_cols = []

    # One pass over the columns; each column lands in exactly one dtype branch
    for name in dataframe.columns:
        series = dataframe[name]
        distinct = series.nunique()
        if series.dtypes == "O":
            object_cols.append(name)
            if distinct > car_th:
                cat_but_car.append(name)
        elif distinct < cat_th:
            num_but_cat.append(name)
        else:
            num_cols.append(name)

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]

    n_rows, n_cols = dataframe.shape[0], dataframe.shape[1]
    print(f"Observations: {n_rows}")
    print(f"Variables: {n_cols}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car
cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [None]:
df.isnull().sum()

## Categorical variable analysis

In the rare encoding phase, we can combine the categories that have the smallest ratios.

In [None]:
def cat_summary(dataframe, col_name):
    """Print the frequency and percentage share of each level of ``col_name``.

    Args:
        dataframe: The pandas DataFrame holding the column.
        col_name: Name of the column to summarise.

    Returns:
        None. The two-column table (counts and "Ratio" in percent of all rows)
        is printed.
    """
    level_counts = dataframe[col_name].value_counts()
    share = 100 * level_counts / len(dataframe)
    summary = pd.DataFrame({col_name: level_counts, "Ratio": share})
    print(summary)
for cat_col in cat_cols:
    cat_summary(df, cat_col)

The dataset is imbalanced (the churn ratio is skewed). We will use the SMOTETomek technique, which combines a synthetic oversampling step (SMOTE) with a subsequent undersampling step (Tomek links). Step 1: synthetically oversample the minority class. Step 2: undersample by cleaning the noise generated by the SMOTE technique.

In [None]:
# Tenure distribution of the customers on a month-to-month contract
monthly_tenure = df.loc[df["Contract"] == "Month-to-month", "tenure"]
monthly_tenure.hist(bins=20)
plt.xlabel("tenure")
plt.title("Month-to-month")

In [None]:
# Tenure distribution of the customers on a two-year contract
two_year_tenure = df.loc[df["Contract"] == "Two year", "tenure"]
two_year_tenure.hist(bins=20)
plt.xlabel("tenure")
plt.title("Two year")

In [None]:
# Columns shown in the 4x4 grid, in row-major order (left to right, top to bottom)
count_plot_columns = ["gender", "Partner", "Dependents", "PhoneService",
                      "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup",
                      "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
                      "Contract", "PaperlessBilling", "PaymentMethod", "SeniorCitizen"]
grid_width = 4

# Titles capitalise the first letter only, so "gender" reads "Gender - Count"
fig = make_subplots(rows=4, cols=grid_width,
                    subplot_titles=[f"{name[0].upper()}{name[1:]} - Count" for name in count_plot_columns])

# Same dark outline for every bar
bar_outline = dict(line=dict(color='rgba(58, 71, 80, 1.0)', width=3))

for position, name in enumerate(count_plot_columns):
    level_counts = df[name].value_counts()
    fig.add_trace(go.Bar(x=level_counts.index, y=level_counts.values, marker=bar_outline),
                  row=position // grid_width + 1, col=position % grid_width + 1)

# Restrict the x-axis of the last subplot (SeniorCitizen) to the 0-1 range
fig['layout']['xaxis16'].update(range=[0, 1])

fig.update_layout(height=1200, width=1200, paper_bgcolor='rgb(233,233,233)', title="Bar Plots", showlegend=False)

In [None]:
## Numeric variable
def num_summary(dataframe, numerical_col):
    """Print descriptive statistics of ``numerical_col`` on a fine percentile grid.

    Args:
        dataframe: The pandas DataFrame holding the column.
        numerical_col: Name of the numeric column to describe.

    Returns:
        None. The ``describe`` output is printed, followed by a short separator line.
    """
    percentile_grid = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    description = dataframe[numerical_col].describe(percentile_grid)
    print(description.T)
    print("#" * 9)

In [None]:
for num_col in num_cols:
    num_summary(df, num_col)

In [None]:
def target_analyser(dataframe, target, num_cols, cat_cols):
    """Print how ``target`` relates to the numeric and categorical columns.

    For every numeric column, the mean of that column per target level is
    printed. For every categorical column, the number of levels is printed,
    then a table with the count, share of rows and mean target per level.

    Args:
        dataframe: The pandas DataFrame to analyse.
        target: Name of the target column.
        num_cols: Names of the numeric columns.
        cat_cols: Names of the categorical columns.

    Returns:
        None. All tables are printed.
    """
    n_rows = len(dataframe)

    print("#"*9,"target_numeric_analysis", "#"*9)
    for numeric_name in num_cols:
        mean_by_target = dataframe.groupby(target)[numeric_name].mean()
        print(pd.DataFrame({f"{numeric_name}_TARGET_MEAN": mean_by_target}), end="\n\n\n")

    print("#"*9,"target_categoric_analysis", "#"*9)
    for level_name in cat_cols:
        level_counts = dataframe[level_name].value_counts()
        print(level_name, ":", len(level_counts))
        table = pd.DataFrame({"COUNT": level_counts,
                              "RATIO": level_counts / n_rows,
                              "TARGET_MEAN": dataframe.groupby(level_name)[target].mean()})
        print(table, end="\n\n\n")

In [None]:
from statsmodels.stats.proportion import proportions_ztest

In [None]:
# Two-sample proportion z-test: churn rate of customers without (0) vs with (1) a partner
churn_by_partner = [df.loc[df["Partner"] == flag, "Churn"] for flag in (0, 1)]

test_stat, pvalue = proportions_ztest(count=[group.sum() for group in churn_by_partner],
                                      nobs=[group.shape[0] for group in churn_by_partner])

print(f'Test Stat = {test_stat:.4f}, p-value = {pvalue:.4f}')

In [None]:
# Two-sample proportion z-test: churn rate of customers without (0) vs with (1) dependents
churn_by_dependents = [df.loc[df["Dependents"] == flag, "Churn"] for flag in (0, 1)]

test_stat, pvalue = proportions_ztest(count=[group.sum() for group in churn_by_dependents],
                                      nobs=[group.shape[0] for group in churn_by_dependents])

print(f'Test Stat = {test_stat:.4f}, p-value = {pvalue:.4f}')

In [None]:
target_analyser(df, "Churn", num_cols, cat_cols)

To illustrate, by contract type, customers on a month-to-month contract have a higher mean churn than customers on a two-year contract.

**RARE ENCODING**

In [None]:
# def rare_encoder(dataframe, rare_perc):
#     temp_df = dataframe.copy()

#     rare_columns = [col for col in temp_df.columns if temp_df[col].dtypes == 'O'
#                     and (temp_df[col].value_counts() / len(temp_df) < rare_perc).any(axis=None)]

#     for var in rare_columns:
#         tmp = temp_df[var].value_counts() / len(temp_df)
#         rare_labels = tmp[tmp < rare_perc].index
#         temp_df[var] = np.where(temp_df[var].isin(rare_labels), 'Rare', temp_df[var])

#     return temp_df

# new_df = rare_encoder(df, 0.01)

# rare_analyser(new_df, "TARGET", cat_cols)

In [None]:
# One grid row per feature; three panels per row: group size, churned customers, churn rate
churn_plot_features = ["Partner", "Dependents", "PhoneService"]
churn_panel_labels = ["Count", "Non-zero Churn Count", "Mean Churn"]

fig = make_subplots(rows=3, cols=3,
                    subplot_titles=[f"{feature} - {label}"
                                    for feature in churn_plot_features
                                    for label in churn_panel_labels])

# Same dark outline for every bar
bar_outline = dict(line=dict(color='rgba(58, 71, 80, 1.0)', width=3))

for grid_row, feature in enumerate(churn_plot_features, start=1):
    # 'sum' of the 0/1 Churn flag is the number of churned customers in the group.
    # The previous 'count' aggregation only counted non-null rows, so the
    # "Non-zero Churn Count" panel simply repeated the group size.
    prt_scm = df.groupby(feature)['Churn'].agg(['size', 'sum', 'mean'])
    prt_scm.columns = ["count", "count of non-zero churn", "mean"]
    prt_scm = prt_scm.sort_values(by="count", ascending=False)

    # At most the ten largest groups are drawn
    top_groups = prt_scm.head(10)
    for grid_col, stat_name in enumerate(prt_scm.columns, start=1):
        fig.add_trace(go.Bar(x=top_groups.index, y=top_groups[stat_name].values, marker=bar_outline),
                      row=grid_row, col=grid_col)

fig.update_layout(height=1200, width=1200, paper_bgcolor='rgb(233,233,233)', title="Partner - Dependents - Phone Service", showlegend=False)

In [None]:
# One grid row per feature; three panels per row: group size, churned customers, churn rate
churn_plot_features = ["MultipleLines", "InternetService", "OnlineSecurity"]
churn_panel_labels = ["Count", "Non-zero Churn Count", "Mean Churn"]

fig = make_subplots(rows=3, cols=3,
                    subplot_titles=[f"{feature} - {label}"
                                    for feature in churn_plot_features
                                    for label in churn_panel_labels])

# Same dark outline for every bar
bar_outline = dict(line=dict(color='rgba(58, 71, 80, 1.0)', width=3))

for grid_row, feature in enumerate(churn_plot_features, start=1):
    # 'sum' of the 0/1 Churn flag is the number of churned customers in the group.
    # The previous 'count' aggregation only counted non-null rows, so the
    # "Non-zero Churn Count" panel simply repeated the group size.
    prt_scm = df.groupby(feature)['Churn'].agg(['size', 'sum', 'mean'])
    prt_scm.columns = ["count", "count of non-zero churn", "mean"]
    prt_scm = prt_scm.sort_values(by="count", ascending=False)

    # At most the ten largest groups are drawn
    top_groups = prt_scm.head(10)
    for grid_col, stat_name in enumerate(prt_scm.columns, start=1):
        fig.add_trace(go.Bar(x=top_groups.index, y=top_groups[stat_name].values, marker=bar_outline),
                      row=grid_row, col=grid_col)

fig.update_layout(height=1200, width=1200, paper_bgcolor='rgb(233,233,233)', title="MultipleLines - InternetService - OnlineSecurity", showlegend=False)

In [None]:
# One grid row per feature; three panels per row: group size, churned customers, churn rate
churn_plot_features = ["OnlineBackup", "DeviceProtection", "TechSupport"]
churn_panel_labels = ["Count", "Non-zero Churn Count", "Mean Churn"]

fig = make_subplots(rows=3, cols=3,
                    subplot_titles=[f"{feature} - {label}"
                                    for feature in churn_plot_features
                                    for label in churn_panel_labels])

# Same dark outline for every bar
bar_outline = dict(line=dict(color='rgba(58, 71, 80, 1.0)', width=3))

for grid_row, feature in enumerate(churn_plot_features, start=1):
    # 'sum' of the 0/1 Churn flag is the number of churned customers in the group.
    # The previous 'count' aggregation only counted non-null rows, so the
    # "Non-zero Churn Count" panel simply repeated the group size.
    prt_scm = df.groupby(feature)['Churn'].agg(['size', 'sum', 'mean'])
    prt_scm.columns = ["count", "count of non-zero churn", "mean"]
    prt_scm = prt_scm.sort_values(by="count", ascending=False)

    # At most the ten largest groups are drawn
    top_groups = prt_scm.head(10)
    for grid_col, stat_name in enumerate(prt_scm.columns, start=1):
        fig.add_trace(go.Bar(x=top_groups.index, y=top_groups[stat_name].values, marker=bar_outline),
                      row=grid_row, col=grid_col)

fig.update_layout(height=1200, width=1200, paper_bgcolor='rgb(233,233,233)', title="OnlineBackup - DeviceProtection - TechSupport", showlegend=False)

In [None]:
# pivot_ui(df)

**OUTLIERS**

In [None]:
def outlier_thresholds(dataframe, variable, q1 = 0.10, q2= 0.90):
    """Return the lower and upper outlier limits of ``variable``.

    The limits sit 1.5 times the inter-quantile range outside the ``q1`` and
    ``q2`` quantiles of the column.

    Args:
        dataframe: The pandas DataFrame holding the column.
        variable: Name of the numeric column.
        q1: Lower quantile (fraction between 0 and 1).
        q2: Upper quantile (fraction between 0 and 1).

    Returns:
        A tuple ``(low_limit, up_limit)``; the values are not rounded.
    """
    column = dataframe[variable]
    lower_q = column.quantile(q1)
    upper_q = column.quantile(q2)
    spread = upper_q - lower_q
    margin = 1.5 * spread
    return lower_q - margin, upper_q + margin

def check_outlier(dataframe, col_name):
    """Report whether ``col_name`` has any value outside its outlier limits.

    Args:
        dataframe: The pandas DataFrame holding the column.
        col_name: Name of the numeric column to check.

    Returns:
        True if at least one value lies above the upper or below the lower
        limit returned by ``outlier_thresholds``, otherwise False.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    values = dataframe[col_name]
    # Ask the mask itself whether any row is flagged. The earlier version asked the
    # *filtered rows* for truthiness, which misses an outlier row whose cells are
    # all falsy (e.g. an outlying 0 alongside zero-valued columns).
    outside_limits = (values > up_limit) | (values < low_limit)
    return bool(outside_limits.any())
    
def grab_outliers(dataframe, col_name, index=False):
    """Print the rows whose ``col_name`` value is outside the outlier limits.

    Only the first five rows are printed when more than ten rows are flagged;
    otherwise all flagged rows are printed.

    Args:
        dataframe: The pandas DataFrame holding the column.
        col_name: Name of the numeric column to check.
        index: When truthy, the index of the flagged rows is returned.

    Returns:
        The index of the outlier rows if ``index`` is truthy, otherwise None.
    """
    low, up = outlier_thresholds(dataframe, col_name)

    column = dataframe[col_name]
    outlier_rows = dataframe[((column < low) | (column > up))]

    print(outlier_rows.head() if outlier_rows.shape[0] > 10 else outlier_rows)

    if index:
        return outlier_rows.index

def remove_outlier(dataframe, col_name):
    """Return ``dataframe`` without the rows that are outliers in ``col_name``.

    Args:
        dataframe: The pandas DataFrame holding the column.
        col_name: Name of the numeric column to check.

    Returns:
        A DataFrame containing only the rows whose value is not below the
        lower and not above the upper limit from ``outlier_thresholds``.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    column = dataframe[col_name]
    is_outlier = (column < low_limit) | (column > up_limit)
    return dataframe[~is_outlier]

def replace_with_thresholds(dataframe, variable):
    """Cap ``variable`` at its outlier limits, modifying ``dataframe`` in place.

    Values below the lower limit are set to the lower limit, then values above
    the upper limit are set to the upper limit.

    Args:
        dataframe: The pandas DataFrame to modify.
        variable: Name of the numeric column to cap.

    Returns:
        None.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable)

    below = dataframe[variable] < low_limit
    dataframe.loc[below, variable] = low_limit

    above = dataframe[variable] > up_limit
    dataframe.loc[above, variable] = up_limit

In [None]:
check_outlier(df, "MonthlyCharges")

In [None]:
check_outlier(df, "tenure")

In [None]:
# MonthlyCharges is already checked two cells above; TotalCharges is the numeric
# column that was never checked, so check it here instead of repeating the same call.
check_outlier(df, "TotalCharges")

After examining the distribution of the data across its quantiles, we used the 10% and 90% quantiles in place of the usual first and third quartiles (25% and 75%). With these thresholds, no outliers are observed in the data.

**Multivariate analysis**<br/>
Before we apply LOF method, we will handle missing values.

In [None]:
df['TotalCharges'] = df['TotalCharges'].fillna(0)

In [None]:
# LOF (Local Outlier Factor) computed on the float64/int64 columns, using 20 neighbours per row.
# NOTE(review): at this point the numeric selection also contains the 0/1-encoded flag columns
# and the Churn target, and the features are not scaled — confirm this is intended.
df_num = df.select_dtypes(include = ['float64', 'int64'])
clf = LocalOutlierFactor(n_neighbors = 20, contamination = 0.1)
# The predicted labels are discarded; only the fitted scores are used below
clf.fit_predict(df_num)
# One score per row; rows with the lowest scores are the ones removed further down
df_scores = clf.negative_outlier_factor_

In [None]:
pd.DataFrame(np.sort(df_scores)).plot(stacked = True, xlim =[0,50], style=".-")

In [None]:
# Cut-off = the 12th lowest LOF score (index 11 of the ascending sort);
# rows scoring strictly below it are treated as outliers in the following cells
th = np.sort(df_scores)[11]

In [None]:
df[df_scores < th].shape

In [None]:
df[df_scores < th].index

In [None]:
# Remove the rows whose LOF score falls below the threshold
lof_outlier_rows = df.index[df_scores < th]
df = df.drop(index=lof_outlier_rows)

**MISSING VALUES**

In [None]:
def missing_values_table(dataframe, na_name=False):
    """Print the number and percentage of missing values per affected column.

    Args:
        dataframe: The pandas DataFrame to inspect.
        na_name: When truthy, the names of the columns that contain missing
            values are returned.

    Returns:
        The list of column names with at least one missing value if
        ``na_name`` is truthy, otherwise None.
    """
    null_totals = dataframe.isnull().sum()
    na_columns = [name for name in dataframe.columns if null_totals[name] > 0]

    # Counts restricted to the affected columns, reused for both table columns
    affected = dataframe[na_columns].isnull().sum()
    n_miss = affected.sort_values(ascending=False)
    ratio = (affected / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")

    if na_name:
        return na_columns


na_columns = missing_values_table(df, na_name=True)

In [None]:
def missing_vs_target(dataframe, target, na_columns):
    """Print the mean and count of ``target`` for missing vs non-missing rows.

    A 0/1 ``<column>_NA_FLAG`` column is added to a copy of the data for every
    column in ``na_columns``; the input frame itself is not modified. One table
    is printed for every column whose name contains "_NA_".

    Args:
        dataframe: The pandas DataFrame to analyse.
        target: Name of the target column.
        na_columns: Names of the columns that contain missing values.

    Returns:
        None. The tables are printed.
    """
    flagged = dataframe.copy()

    for name in na_columns:
        flagged[name + '_NA_FLAG'] = np.where(flagged[name].isnull(), 1, 0)

    flag_names = flagged.columns[flagged.columns.str.contains("_NA_")]

    for flag in flag_names:
        target_by_flag = flagged.groupby(flag)[target]
        table = pd.DataFrame({"TARGET_MEAN": target_by_flag.mean(),
                              "Count": target_by_flag.count()})
        print(table, end="\n\n\n")


missing_vs_target(df, "Churn", na_columns)

In [None]:
# NOTE(review): TotalCharges was already zero-filled before the LOF step, so this
# repeat should be a no-op unless the cells are run out of order
df['TotalCharges'] = df['TotalCharges'].fillna(0)

CORRELATION

Multicollinearity: when predictors are strongly correlated with each other, linear regression gives unreliable results and the performance of the model decreases. We need to handle this type of situation.

In [None]:
# Correlation matrix of the numeric columns, drawn as an annotated heatmap
corr_matrix = df[num_cols].corr()

f, ax = plt.subplots(figsize=[18, 13])
sns.heatmap(corr_matrix, annot=True, fmt=".2f", ax=ax, cmap="magma")
ax.set_title("Correlation Matrix", fontsize=20)
plt.show()

If there is multicollinearity between variables, we can analyze it with the **VIF** (Variance Inflation Factor) method. It measures how strongly each independent variable is correlated with the other independent variables. In order to avoid ***inaccurate parameter estimations***, multiple correlation analysis should be performed and related variables should be removed from the data set if needed. We set our maximum threshold at 10.

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor 

In [None]:
# Keep only the float64/int64 predictors: categoricals and the target are excluded
X = df.select_dtypes(include=['float64', 'int64']).drop('Churn', axis=1)
VIF = X

# One VIF value per feature, highest first.
# NOTE(review): the VIF is computed on the raw columns with no intercept column added —
# confirm this matches the intended VIF definition.
vif_data = pd.DataFrame({
    "Feature": VIF.columns,
    "VIF": [variance_inflation_factor(VIF.values, position) for position in range(VIF.shape[1])],
})
vif_data = vif_data.sort_values(by='VIF', ascending=False)
vif_data.style.background_gradient(cmap='Reds', axis=0)

We can see that MonthlyCharges, TotalCharges and tenure have high VIF values. As presumed, the high correlation coefficient of MonthlyCharges results in a high VIF. We will remove the variable MonthlyCharges and recheck whether the VIF of the other variables drops.

In [None]:
# Drop MonthlyCharges and recompute the VIF of the remaining numeric features.
VIF_filter = X.drop(columns=['MonthlyCharges'])
vif_filtered_data = pd.DataFrame({
    "Feature": VIF_filter.columns,
    "VIF": [
        variance_inflation_factor(VIF_filter.values, idx)
        for idx in range(VIF_filter.shape[1])
    ],
})
# vif_filtered_data=vif_filtered_data[vif_filtered_data["VIF"] > 5]
vif_filtered_data = vif_filtered_data.sort_values(by='VIF', ascending=False)
vif_filtered_data.style.background_gradient(cmap='Reds', axis=0)

Since all remaining variables are below our threshold of 10, we only need to drop the MonthlyCharges variable.

In [None]:
# Remove the MonthlyCharges column from the working frame.
df = df.drop(columns=['MonthlyCharges'])

## Feature Extraction

In [None]:
# Bucket tenure (in months) into one-year bands, "0-1 Year" .. "5-6 Year".
# The first band includes 0; every later band is (lower, upper].
df.loc[df["tenure"].between(0, 12), "NEW_TENURE_YEAR"] = "0-1 Year"
for tenure_year in range(1, 6):
    band_start = 12 * tenure_year
    band_end = band_start + 12
    in_band = (df["tenure"] > band_start) & (df["tenure"] <= band_end)
    df.loc[in_band, "NEW_TENURE_YEAR"] = f"{tenure_year}-{tenure_year + 1} Year"

In [None]:
# Number of customers in each tenure band.
df["NEW_TENURE_YEAR"].value_counts()

In [None]:
# Customers on a one- or two-year contract are flagged as engaged.
df["NEW_Engaged"] = df["Contract"].isin(["One year", "Two year"]).astype("int64")

# 1 when at least one of OnlineBackup / DeviceProtection / TechSupport is
# anything other than "No".
# NOTE(review): "No internet service" also satisfies != "No", and the column
# name ("noProt") suggests the opposite meaning -- confirm the intended rule.
df["NEW_noProt"] = (
    (df["OnlineBackup"] != "No")
    | (df["DeviceProtection"] != "No")
    | (df["TechSupport"] != "No")
).astype("int64")

# Non-senior customers who are not on a one/two-year contract.
df["NEW_Young_Not_Engaged"] = (
    (df["NEW_Engaged"] == 0) & (df["SeniorCitizen"] == 0)
).astype("int64")

# The total number of services received by the person.
# InternetService holds the provider type (DSL / Fiber optic / No) rather than
# Yes/No, so it is counted separately: any non-missing value other than "No"
# is one service. Comparing it with 'Yes' would never match.
yes_no_services = ['PhoneService', 'OnlineSecurity', 'OnlineBackup',
                   'DeviceProtection', 'TechSupport',
                   'StreamingTV', 'StreamingMovies']
has_internet = df['InternetService'].notna() & (df['InternetService'] != 'No')
df['NEW_TotalServices'] = (
    (df[yes_no_services] == 'Yes').sum(axis=1) + has_internet.astype("int64")
)

In [None]:
# Customers who use at least one streaming service.
df["NEW_FLAG_ANY_STREAMING"] = (
    (df["StreamingTV"] == "Yes") | (df["StreamingMovies"] == "Yes")
).astype("int64")

# Does the person make automatic payments?
automatic_methods = ["Bank transfer (automatic)", "Credit card (automatic)"]
df["NEW_FLAG_AutoPayment"] = df["PaymentMethod"].isin(automatic_methods).astype("int64")

# Average monthly payment; the 0.1 offset avoids dividing by zero when tenure is 0.
df["NEW_AVG_Charges"] = df["TotalCharges"].div(df["tenure"] + 0.1)

# # Current Price increase relative to average price
# df["NEW_Increase"] = df["NEW_AVG_Charges"] / (df["MonthlyCharges"] + 1)

# # fee per service
# df["NEW_AVG_Service_Fee"] = df["MonthlyCharges"] / (df['NEW_TotalServices'] + 1)

# Combined gender / seniority category.
sex_age_labels = {
    ('Male', 0): 'youngmale',
    ('Male', 1): 'oldmale',
    ('Female', 0): 'youngfemale',
    ('Female', 1): 'oldfemale',
}
for (gender_value, senior_flag), category in sex_age_labels.items():
    matches = (df['gender'] == gender_value) & (df['SeniorCitizen'] == senior_flag)
    df.loc[matches, 'new_sex_cat'] = category

## ENCODING

In [None]:
# Recompute the column groups now that the engineered features exist.
# NOTE(review): grab_col_names is defined outside this section; presumably it
# returns (categorical, numeric, high-cardinality categorical) name lists -- confirm.
cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [None]:
# LABEL ENCODING
def label_encoder(dataframe, binary_col):
    """Label-encode one column of ``dataframe`` and return the frame.

    The column ``binary_col`` is overwritten on the frame that was passed in
    with the integer codes produced by a fresh ``LabelEncoder``.
    """
    encoded = LabelEncoder().fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded
    return dataframe


# Object-typed columns with exactly two distinct values are label-encoded.
binary_cols = [
    name for name in df.columns
    if df[name].dtypes == "O" and df[name].nunique() == 2
]

for col in binary_cols:
    df = label_encoder(df, col)

In [None]:
cat_cols = [col for col in cat_cols if col not in binary_cols and col not in ["Churn", "NEW_TotalServices"]]
cat_cols


def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    """Return a copy of ``dataframe`` with ``categorical_cols`` one-hot encoded.

    ``drop_first`` is forwarded to ``pandas.get_dummies``; when True the first
    level of every encoded column is omitted.
    """
    return pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)


# One-hot encode the remaining categorical columns, dropping the first level
# of each so the dummy columns are not redundant.
df = one_hot_encoder(df, cat_cols, drop_first=True)

df.head()
df.shape

In [None]:
# MonthlyCharges is excluded from the numeric column list.
num_cols = [name for name in num_cols if name != "MonthlyCharges"]

## Modelling

In [None]:
# Target and predictors: Churn is the label, customerID is left out of the features.
y = df["Churn"]
X = df.drop(columns=["Churn", "customerID"])
# 80/20 split, stratified on y so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, shuffle=True, stratify=y
)

In [None]:
from collections import Counter

In [None]:
# Class balance of the target: count and percentage of each Churn label.
# The label series `y` is used here; `target` is not a module-level name in
# this part of the notebook (it is only a parameter of missing_vs_target).
counter = Counter(y)
n_samples = len(y)
for k, v in counter.items():
    per = v / n_samples * 100
    print('Class=%s, Count=%d, Percentage=%.2f%%' % (k, v, per))

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [None]:
# Baseline CatBoost model evaluated on the held-out test split.
catboost_model = CatBoostClassifier(verbose=False, random_state=12345).fit(X_train, y_train)
y_pred = catboost_model.predict(X_test)
# Probability of the positive class; ROC AUC is computed from scores, not
# from hard 0/1 predictions.
y_proba = catboost_model.predict_proba(X_test)[:, 1]

# scikit-learn metrics take (y_true, y_pred) in that order; passing them the
# other way round swaps precision and recall.
print(f"Accuracy: {round(accuracy_score(y_test, y_pred), 3)}")
print(f"Recall: {round(recall_score(y_test, y_pred), 2)}")
print(f"Precision: {round(precision_score(y_test, y_pred), 2)}")
print(f"F1: {round(f1_score(y_test, y_pred), 2)}")
print(f"Auc: {round(roc_auc_score(y_test, y_proba), 2)}")

In [None]:
def plot_feature_importance(importance, names, model_type):
    """Plot feature importances as a bar chart, most important first.

    importance: importance value of each feature.
    names: feature names, in the same order as ``importance``.
    model_type: label prepended to the chart title.
    """
    # Pair names with importances and order them from largest to smallest.
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    }).sort_values(by=['feature_importance'], ascending=False)

    plt.figure(figsize=(15, 10))
    # Importance on the x axis, one bar per feature name on the y axis.
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
    plt.show()


# Chart the importances of the fitted CatBoost model against the feature columns.
plot_feature_importance(catboost_model.get_feature_importance(), X.columns, 'CATBOOST')

In [None]:
## LOGISTIC REGRESSION
# from sklearn.linear_model import LogisticRegression
# def high_correlated_cols(dataframe, plot=False, corr_th=0.90):
#     corr = dataframe.corr()
#     cor_matrix = corr.abs()
#     upper_triangle_matrix = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
#     drop_list = [col for col in upper_triangle_matrix.columns if any(upper_triangle_matrix[col] > corr_th)]
#     if plot:
#         import seaborn as sns
#         import matplotlib.pyplot as plt
#         sns.set(rc={'figure.figsize': (15, 15)})
#         sns.heatmap(corr, cmap="RdBu")
#         plt.show(block=True)
#     return drop_list


# drop_list = high_correlated_cols(df)

# df.drop(drop_list, axis=1, inplace=True)

# y = df["Churn"]
# X = df.drop(["Churn", "customerID"], axis=1)

# LR = LogisticRegression().fit(X, y)

# cvr = cross_validate(LR, X, y, cv=3, scoring=["accuracy", "f1", "roc_auc", "precision", "recall"])

# for k, v in cvr.items():
#     print(f"{k} : {v.mean()}")

# Pipeline

In [None]:
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import FeatureUnion
from imblearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest,f_classif
from sklearn.svm import SVC
# import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
# Target and predictors: Churn is the label, customerID is left out of the features.
y = df["Churn"]
X = df.drop(columns=["Churn", "customerID"])
# 80/20 split, stratified on y so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, shuffle=True, stratify=y
)

In [None]:
#Feature type selection
class feat_sel(BaseEstimator, TransformerMixin):
    """Select the columns of a DataFrame by dtype family.

    Parameters
    ----------
    dtype : {'numeric', 'category'}, default='numeric'
        'numeric' keeps every column whose dtype is not ``object``;
        'category' keeps only the ``object`` columns.
    """

    def __init__(self, dtype='numeric'):
        self.dtype = dtype

    def _selected_columns(self, X):
        """Return the names of the columns of ``X`` matching ``self.dtype``."""
        if self.dtype == 'numeric':
            return X.columns[X.dtypes != object].tolist()
        if self.dtype == 'category':
            return X.columns[X.dtypes == object].tolist()
        raise ValueError(
            "dtype must be 'numeric' or 'category', got %r" % (self.dtype,)
        )

    def fit(self, X, y=None):
        """Nothing is learned here; return self."""
        return self

    def transform(self, X, y=None):
        """Return the selected columns of ``X`` and remember their names."""
        # Remember what was selected so get_feature_names() does not depend
        # on any variable outside this object.
        self.columns_ = self._selected_columns(X)
        return X[self.columns_]

    def get_feature_names(self):
        """Return the column names selected by the most recent ``transform``.

        Raises
        ------
        AttributeError
            If ``transform`` has not been called yet.
        """
        if not hasattr(self, "columns_"):
            raise AttributeError(
                "feat_sel.get_feature_names() called before transform()"
            )
        return list(self.columns_)

In [None]:
#Scale
class df_scaler(BaseEstimator, TransformerMixin):
    """Apply a scaler to a DataFrame and return a DataFrame.

    Parameters
    ----------
    method : scaler object exposing ``fit`` and ``transform``
        NOTE: the default instance is created once, when the class is
        defined, and is shared by every ``df_scaler()`` built without an
        explicit ``method``.
    """

    def __init__(self, method=StandardScaler()):
        super().__init__()
        self.method = method

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X`` and return this transformer.

        Returning ``self`` (not the wrapped scaler) keeps ``fit_transform``
        and method chaining on the wrapper, so the DataFrame-preserving
        ``transform`` below is the one that runs.
        """
        self.method.fit(X)
        return self

    def transform(self, X, y=None):
        """Scale ``X`` and return it with its original index and columns."""
        Xscl = self.method.transform(X)
        Xscaled = pd.DataFrame(Xscl, index=X.index, columns=X.columns)
        # Remembered for get_feature_names().
        self.columns = X.columns
        return Xscaled

    def get_feature_names(self):
        """Return the column names seen by the most recent ``transform``."""
        return list(self.columns)

In [None]:
class FeatureUnion_df(TransformerMixin, BaseEstimator):
    """FeatureUnion wrapper that returns a labelled DataFrame.

    Each entry of ``transformer_list`` is a ``(name, pipeline)`` pair whose
    last pipeline step must expose ``get_feature_names()``; those names become
    the output columns, in transformer order.
    """

    def __init__(self, transformer_list, n_jobs=None, transformer_weights=None, verbose=False):
        self.transformer_list = transformer_list
        self.n_jobs = n_jobs
        self.transformer_weights = transformer_weights
        self.verbose = verbose
        # Forward every option to the wrapped union so none is silently ignored.
        self.feat_un = FeatureUnion(
            self.transformer_list,
            n_jobs=self.n_jobs,
            transformer_weights=self.transformer_weights,
            verbose=self.verbose,
        )

    def fit(self, X, y=None):
        """Fit every transformer of the wrapped union and return self."""
        self.feat_un.fit(X, y)
        return self

    def transform(self, X, y=None):
        """Transform ``X`` and return a DataFrame indexed like ``X``."""
        X_tr = self.feat_un.transform(X)

        # Column names come from the last step of each sub-pipeline, in order.
        columns = []
        for _, sub_pipeline in self.transformer_list:
            columns += list(sub_pipeline.steps[-1][1].get_feature_names())

        # Stored so get_feature_names() has something to return.
        self.columns = columns
        return pd.DataFrame(X_tr, index=X.index, columns=columns)

    def get_params(self, deep=True):
        """Expose the wrapped union's parameters.

        This is what makes nested names of the sub-pipelines addressable
        from a parameter grid.
        """
        return self.feat_un.get_params(deep=deep)

    def get_feature_names(self):
        """Return the output column names of the most recent ``transform``."""
        return self.columns

In [None]:
#Model Selection
class Model_selection(BaseEstimator):
    """Thin wrapper that lets the estimator itself be a tunable parameter.

    Parameters
    ----------
    estimator : estimator object
        The model every call is delegated to.
    """

    def __init__(self, estimator = CatBoostClassifier()):
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped estimator and return self.

        Extra keyword arguments are forwarded to the estimator's ``fit``
        instead of being silently dropped.
        """
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped estimator's predictions for ``X``."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's class probabilities for ``X``."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped estimator's own score on ``(X, y)``."""
        return self.estimator.score(X, y)

In [None]:
## Pipeline
# Numeric branch: keep the non-object columns, then scale them.
numeric_pipe = Pipeline(steps=[
    ('fs', feat_sel(dtype='numeric')),
    ('scl', df_scaler()),
])
# Categorical branch: keep the object columns unchanged.
categorical_pipeline = Pipeline(steps=[
    ('fs', feat_sel(dtype='category')),
])
# Both branches side by side, categorical columns first.
processing_pipe = FeatureUnion_df(transformer_list=[
    ('cat_pipe', categorical_pipeline),
    ('num_pipe', numeric_pipe),
])

In [None]:
%%time

# Nested cross-validation on the training split.
# Outer loop: 5 stratified folds; each fold's validation part is used only to
# score the model chosen by the inner grid search.
cv_outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
outer_results = list()

for train_ix, valid_ix in cv_outer.split(X_train,y_train):
    # Positional split of the training data for this outer fold.
    Xtrain, X_valid = X_train.iloc[train_ix, :], X_train.iloc[valid_ix, :]
    ytrain, y_valid = y_train.iloc[train_ix], y_train.iloc[valid_ix]

    # Pipeline: preprocessing -> univariate feature selection -> resampling -> model.
    # The 'est' step wraps the model so the grid below can swap the estimator.
    pipe = Pipeline([
    ('prep',processing_pipe),
    ('selector', SelectKBest(f_classif, k=10)),
    ('resample' ,SMOTETomek(sampling_strategy=0.7,random_state=1)),
    ('est', Model_selection())])

    # Search space containing feature selection, preprocessing steps and model
    # hyperparameters. One dict per candidate model; keys follow the
    # step__parameter naming of the pipeline above.
    search_space = [
                {'est__estimator':[SVC()],
                 'selector__k': [10,15, 20],
                 'est__estimator__C': [0.1,1,10],
                 'est__estimator__kernel': ['rbf', 'poly'],
                 'est__estimator__random_state' : [1],
                 'prep__num_pipe__scl__method': [StandardScaler(),RobustScaler()],
                 'est__estimator__verbose' : [False],},

                {'est__estimator': [CatBoostClassifier()],
                 'selector__k': [10,15, 20],
                 'est__estimator__random_state' : [1],
                 'est__estimator__silent' : [True],
                 'prep__num_pipe__scl__method': [StandardScaler(),RobustScaler()],
                 'est__estimator__early_stopping_rounds':[100],}]
    # Inner loop: 3 stratified folds used by the grid search to pick the best
    # configuration by weighted F1.
    cv_inner=StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
    clf = GridSearchCV(pipe, search_space,cv=cv_inner, scoring='f1_weighted',refit=True)
    clf.fit(Xtrain, ytrain)

    # refit=True makes the winning configuration available as best_estimator_.
    best_model = clf.best_estimator_
    # evaluate model on the hold out dataset
    yhat = best_model.predict(X_valid)
    # F1 score
    F1 = f1_score(y_valid, yhat,average='weighted')
    # store the result
    outer_results.append(F1)
    # report best model for each fold of the outer loop
    print('>F1=%.3f, best score=%.3f, model=%s' % (F1, clf.best_score_, clf.best_params_))
# Summary over the outer folds: mean and standard deviation of the weighted F1.
# Note: `clf` still refers to the search of the last outer fold after the loop.
print('Weighted F1-score: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))


In [None]:
def format_cv_results(search):
    """Tabulate the results of a fitted hyper-parameter search.

    Parameters
    ----------
    search : object with a ``cv_results_`` mapping
        Must provide the ``"params"`` and ``"mean_test_score"`` entries
        (e.g. a fitted ``GridSearchCV``).

    Returns
    -------
    pandas.DataFrame
        One row per candidate: its parameters plus a ``Score`` column, sorted
        by score in descending order. Parameters a candidate does not use are
        shown as empty strings.
    """
    # Read from the argument rather than from a notebook-level variable, so
    # the function reports on the search it was actually given.
    results = search.cv_results_
    table = pd.concat(
        [pd.DataFrame(results["params"]),
         pd.DataFrame(results["mean_test_score"], columns=["Score"])],
        axis=1,
    )
    table = table.sort_values("Score", ascending=False)
    return table.fillna(value="")
# Ten best-scoring candidates of the most recent grid search.
df_res = format_cv_results(clf)
df_res.head(10)

# Results

According to the result of hyperparameter optimization, the most suitable model is selected and fitting is performed.

In [None]:
# Standardise each numeric column of df in turn; the scaler is re-fitted
# for every column.
std_scaler = StandardScaler()

for col in num_cols:
    scaled_column = std_scaler.fit_transform(df[[col]])
    df[col] = scaled_column

df.describe().T

In [None]:
# Best model: 20 selected features feeding a CatBoost classifier,
# scored with 3-fold cross-validation on X, y.
pipe = Pipeline(steps=[
    ('selector', SelectKBest(f_classif, k=20)),
    ('est', CatBoostClassifier(early_stopping_rounds=100, verbose=False)),
])
# CB = CatBoostClassifier().fit(X, y)

cv_metrics = ["accuracy", "f1", "roc_auc", "precision", "recall"]
cvr = cross_validate(pipe, X, y, cv=3, scoring=cv_metrics)

# Mean of every array returned by cross_validate (timings and test scores).
for k, v in cvr.items():
    print(f"{k} : {v.mean()}")

## LIGHTGBM

In [None]:
# features = []
# features.append(('scaler', StandardScaler()))
# # features.append(('select_best', SelectKBest(k=6)))
# feature_union = FeatureUnion(features)
# scores = []

In [None]:
# %%time

# #Outer loop
# cv_outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
# outer_results = list()

# for train_ix, valid_ix in cv_outer.split(X_train,y_train):
#     Xtrain, X_valid = X_train.iloc[train_ix, :], X_train.iloc[valid_ix, :]
#     ytrain, y_valid = y_train.iloc[train_ix], y_train.iloc[valid_ix]
       
#     #our pipeline    
#     pipe = Pipeline([
#     ('feature_union',feature_union),
#     ('selector', SelectKBest(f_classif, k=15)),
#     ('resample' ,SMOTETomek(sampling_strategy=0.7,random_state=1)),
#     ('cbc', CatBoostClassifier())])
    
#     #search space containing feature selection, preprocessing steps and model hyperparameters
#     search_space = {
#                     'cbc__iterations': [5, 10, 15, 20, 25, 50, 100],
#                     'cbc__learning_rate': [0.01, 0.05, 0.1],
#                     'cbc__depth': [3, 5, 7, 9, 11, 13],
#                   }
#     #inner loop
#     cv_inner=StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
#     clf = GridSearchCV(pipe, search_space,cv=cv_inner, scoring='f1_weighted',refit=True)
#     clf.fit(Xtrain, ytrain)

#     best_model = clf.best_estimator_
#     # evaluate model on the hold out dataset
#     yhat = best_model.predict(X_valid)
#     # F1 score
#     F1 = f1_score(y_valid, yhat,average='weighted')
#     # store the result
#     outer_results.append(F1)
#     # report best model for each fold of the outer loop
#     print('>F1=%.3f, best score=%.3f, model=%s' % (F1, clf.best_score_, clf.best_params_))
# # Get a summarized result
# print('Weighted F1-score: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))


In [None]:
# df_res = format_cv_results(clf)
# df_res.head(10)