In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import inflection
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

SEED = 0

def load_dataset(path):
    """
    Return the dataset as a pandas dataframe whose column names are
    lowercase with words separated by underscores.
    
    Args:
    * path, str: file path or file link of the dataset
    
    Output:
    * pd.DataFrame
    
    Notes:
    This function reads a CSV file (pd.read_csv).
    If the data is stored in another format, change the first line of code,
    e.g. pd.read_parquet for a parquet file.
    
    """
    df = pd.read_csv(path)
    # inflection.underscore converts e.g. "DeviceType" into "device_type"
    df.columns = [inflection.underscore(var) for var in df.columns]
    return df

def parse_to_numeric(df, var='total_charges'):
    """
    Parse numerical variables that are still stored in object dtype.
    
    Values that cannot be parsed as numbers become NaN (errors='coerce').
    Missing values are not filled by this function.
    
    Args:
    * df, pd.DataFrame: the dataset
    * var (default: 'total_charges'), str or list of str: the variable,
      or a list of variables, to be parsed
    
    Return:
    * pd.DataFrame: a copy of df with the parsed variable(s);
      the input dataframe is left unchanged
    
    """
    # a single column label is wrapped so both call styles share one code path
    variables = var if isinstance(var, list) else [var]

    df = df.copy()
    for name in variables:
        df[name] = pd.to_numeric(df[name], errors='coerce')
    return df

def split_dataset(df, target='churn', test_size=0.2, seed=SEED):
    """
    Split a dataframe into two stratified parts.
    
    Args:
    * df, pd.DataFrame: the dataset to be split
    * target (default: 'churn'), str: the target variable, used for stratification
    * test_size (default: 0.2), float: the share of rows placed in the second part
    * seed (default: SEED), int: random state for reproducibility
    
    Output:
    * the result of train_test_split: the training part followed by
      the validation/test part
    
    Notes:
    * The split creates a hold-out dataset for testing the model performance
      and helps to avoid data leakage.
    * Stratified sampling on the target keeps its distribution similar
      in both parts.
    
    """
    strata = df[target]
    split_options = {
        'test_size': test_size,
        'random_state': seed,
        'stratify': strata,
    }
    return train_test_split(df, **split_options)

def dataset_summary(df):
    """
    Return the following information for each variable of the dataset:
    variable name, number of unique values, pandas dtype,
    number of missing values, percentage of missing values,
    and list of unique values.
    
    Args:
    * df, pd.DataFrame: the dataset
    
    Output:
    * table, pd.DataFrame: one row per variable of df, in column order.
      For a dataset without rows the percentage of missing values is NaN.
    """
    columns = [
        'variable',
        'no_unique',
        'pandas_dtype',
        'missing_value',
        '%_missing_values',
        'unique_value',
    ]
    n_rows = df.shape[0]

    # Collect one record per variable and build the table once,
    # instead of enlarging a dataframe row by row.
    records = []
    for var, values in df.items():
        n_missing = int(values.isnull().sum())
        records.append({
            'variable': var,
            'no_unique': values.nunique(),
            'pandas_dtype': values.dtypes,
            'missing_value': n_missing,
            # guard against a division by zero on an empty dataset
            '%_missing_values': n_missing * 100 / n_rows if n_rows else float('nan'),
            'unique_value': values.unique().tolist(),
        })

    # passing columns keeps the header even when df has no variables
    table = pd.DataFrame(records, columns=columns)
    return table

def main(path="../dataset/telco-customer-churn.csv"):
    """
    Run the data preparation and the dataset split end to end, then
    display the summary table of the full training set.
    
    Args:
    * path (default: "../dataset/telco-customer-churn.csv"), str:
      file path or file link of the dataset, passed to load_dataset
    
    """
    # data preparation
    df = load_dataset(path)
    df = parse_to_numeric(df, var='total_charges')
    
    # set up a validation framework:
    # 20% of the rows are held out as the test set, then 25% of the
    # remaining 80% (i.e. 20% of all rows) become the validation set
    full_train, test = split_dataset(df, target='churn', test_size=0.2)
    train, val = split_dataset(full_train, target='churn', test_size=0.25)
    
    # only the full training set (train + val) is summarized;
    # display is not imported in this file -- presumably the IPython builtin
    display(
        dataset_summary(full_train)
    )

# run the preparation steps and show the summary of the full training set
main()

Unnamed: 0,variable,no_unique,pandas_dtype,missing_value,%_missing_values,unique_value
0,customer_id,5634,object,0,0.0,"[6061-GWWAV, 8464-EETCQ, 7621-VPNET, 6432-TWQL..."
1,gender,2,object,0,0.0,"[Male, Female]"
2,senior_citizen,2,int64,0,0.0,"[0, 1]"
3,partner,2,object,0,0.0,"[No, Yes]"
4,dependents,2,object,0,0.0,"[Yes, No]"
5,tenure,73,int64,0,0.0,"[41, 57, 42, 5, 67, 1, 54, 21, 63, 62, 2, 13, ..."
6,phone_service,2,object,0,0.0,"[Yes, No]"
7,multiple_lines,3,object,0,0.0,"[No, Yes, No phone service]"
8,internet_service,3,object,0,0.0,"[DSL, No, Fiber optic]"
9,online_security,3,object,0,0.0,"[Yes, No internet service, No]"


### Load the dataset

In [2]:
# read the CSV file; load_dataset also rewrites the column names with inflection.underscore
df = load_dataset("../dataset/telco-customer-churn.csv")
df.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,online_security,...,device_protection,tech_support,streaming_tv,streaming_movies,contract,paperless_billing,payment_method,monthly_charges,total_charges,churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Data Preparation

The total charges variable (`total_charges`) is numerical, but its data type is neither integer nor float. We need to parse it into a numeric type.

In [3]:
# convert total_charges to a numeric dtype; entries that cannot be parsed become NaN
df = parse_to_numeric(df, var='total_charges')
df['total_charges'].dtypes

dtype('float64')

### Set the validation framework

The assumption for this notebook:
* The dataset is split into three sets (training, validation, and test) with a ratio of 60:20:20, respectively.
* For EDA and model building, I use only the full training set (training and validation sets combined) to avoid leaking information from the test set into the analysis and the model.

In [4]:
# hold out 20% of all rows as the test set
full_train, test = split_dataset(df, target='churn', test_size=0.2)
# 25% of the remaining 80% is 20% of all rows, which gives the 60:20:20 split
train, val = split_dataset(full_train, target='churn', test_size=0.25)

In [5]:
# note: full_train still contains both the train and the val rows
full_train.shape, val.shape, test.shape

((5634, 21), (1409, 21), (1409, 21))

### Target Variable Distribution

In [6]:
# share of each target class within the full training set
full_train['churn'].value_counts(normalize=True)

No     0.734647
Yes    0.265353
Name: churn, dtype: float64

### Dataset Summary

In [7]:
# per-variable overview: unique counts, dtype, missing values and unique values
dataset_summary(full_train)

Unnamed: 0,variable,no_unique,pandas_dtype,missing_value,%_missing_values,unique_value
0,customer_id,5634,object,0,0.0,"[6061-GWWAV, 8464-EETCQ, 7621-VPNET, 6432-TWQL..."
1,gender,2,object,0,0.0,"[Male, Female]"
2,senior_citizen,2,int64,0,0.0,"[0, 1]"
3,partner,2,object,0,0.0,"[No, Yes]"
4,dependents,2,object,0,0.0,"[Yes, No]"
5,tenure,73,int64,0,0.0,"[41, 57, 42, 5, 67, 1, 54, 21, 63, 62, 2, 13, ..."
6,phone_service,2,object,0,0.0,"[Yes, No]"
7,multiple_lines,3,object,0,0.0,"[No, Yes, No phone service]"
8,internet_service,3,object,0,0.0,"[DSL, No, Fiber optic]"
9,online_security,3,object,0,0.0,"[Yes, No internet service, No]"


### Modelling