# Import Libraries

In [1]:
# Data Loading
import pandas as pd

# Data Preprocessing
from sklearn.model_selection import train_test_split
from pandas.api.types import CategoricalDtype

# GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score, classification_report

# Data Loading

In [2]:
# Load the Telco customer-churn CSV; the path is relative to the notebook's
# working directory.
churn_csv_path = "./WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(churn_csv_path)

In [3]:
# TotalCharges contains ' ' (blank) entries. Parse the column as numeric;
# errors='coerce' turns anything that cannot be parsed into NaN.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric

# Data Preprocessing

In [4]:
# Separate the label column ('Churn') from the feature columns.
y = df['Churn']
X = df.drop(columns='Churn')

# Both splits hold out 15% and use the same seed so they are reproducible.
split_opts = dict(test_size=0.15, random_state=0)

# First split: hold out a test set, stratified on the label.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, stratify=y, **split_opts)

# Second split: carve a validation set out of the remaining rows, again
# stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, stratify=y_train_val, **split_opts)

In [5]:
# Column groups used to route features to the matching preprocessing pipeline.

# Nominal (unordered categorical) columns. The line grouping below follows the
# column names only; the order of the list is what matters downstream.
nom_cols = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
    'PaperlessBilling', 'PaymentMethod',
]

# Ordinal (ordered categorical) columns.
ord_cols = [
    'Contract',
]

# Numeric columns.
num_cols = [
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
]

In [6]:
# Change dtypes so the category lists can be read back for OrdinalEncoder /
# OneHotEncoder.

# X_train is produced by train_test_split as a row-subset of X_train_val.
# Assigning columns on such a subset can trigger pandas' SettingWithCopyWarning,
# so take an explicit, independent copy before modifying it. This also keeps
# the cell safe to re-run.
X_train = X_train.copy()

# 'Contract' is stored as an ordered categorical with the order
# Month-to-month < One year < Two year.
X_train['Contract'] = X_train['Contract'].astype(
    CategoricalDtype(
        categories=['Month-to-month', 'One year', 'Two year'], ordered=True)
)

# Nominal columns become (unordered) categoricals; their categories are the
# values observed in X_train.
X_train[nom_cols] = X_train[nom_cols].astype('category')

# GridSearchCV

[reference 1](https://stackoverflow.com/questions/43366561/use-sklearns-gridsearchcv-with-a-pipeline-preprocessing-just-once)

[reference 2](https://stackoverflow.com/questions/38555650/try-multiple-estimator-in-one-grid-search)

In [7]:
# Numeric features: impute missing values with the median. The 'scaler' step is
# a placeholder (None) that the grid searches fill in through the
# 'composer__num_pipe__scaler' parameter.
num_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', None)
    ]
)

# One category list per nominal column, built directly from nom_cols so the
# two always stay aligned (the encoders match category lists to columns by
# position). Requires the nom_cols columns of X_train to have 'category' dtype.
nom_cats = [X_train[col].cat.categories.tolist() for col in nom_cols]

# Nominal features: impute with the most frequent value. Both the 'encoder' and
# the 'scaler' steps are placeholders (None) chosen by the grid searches via
# 'composer__nom_pipe__encoder' and 'composer__nom_pipe__scaler'.
nom_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', None),
        ('scaler', None)
    ]
)

# Ordinal features: impute with the most frequent value, then encode using the
# category order stored on X_train['Contract']. The 'scaler' step is a
# placeholder (None) chosen via 'composer__ord_pipe__scaler'.
ord_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(
            categories=[
                X_train['Contract'].cat.categories.tolist()
            ]
        )),
        ('scaler', None)
    ]
)

# Route each column group to its pipeline. The transformer names given here
# ('num_pipe', 'nom_pipe', 'ord_pipe') are the ones used in the param_grid keys.
composer = ColumnTransformer(
    transformers=[
        ('num_pipe', num_pipe, num_cols),
        ('nom_pipe', nom_pipe, nom_cols),
        ('ord_pipe', ord_pipe, ord_cols)
    ]
)

Logistic Regression is OK with OneHotEncoder.

In [8]:
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the new keyword first and fall back to the
# old one on releases that do not know it, so the cell runs on either.
try:
    onehot_dense = OneHotEncoder(categories=nom_cats, sparse_output=False)
except TypeError:
    onehot_dense = OneHotEncoder(categories=nom_cats, sparse=False)

# Grid-search the preprocessing choices for logistic regression:
# 3 numeric scalers x 2 nominal encoders x 4 nominal scalers x 4 ordinal
# scalers = 96 candidates. The classifier's own settings stay fixed.
clf_lr = GridSearchCV(
    estimator=Pipeline(
        steps=[
            ('composer', composer),
            ('clf', LogisticRegression(max_iter=3000, random_state=0))
        ]
    ),
    param_grid={
        'composer__num_pipe__scaler': [
            MinMaxScaler(), StandardScaler(), RobustScaler()
        ],
        'composer__nom_pipe__encoder': [
            onehot_dense,
            OrdinalEncoder(categories=nom_cats)
        ],
        'composer__nom_pipe__scaler': [
            None, MinMaxScaler(), StandardScaler(), RobustScaler()
        ],
        'composer__ord_pipe__scaler': [
            None, MinMaxScaler(), StandardScaler(), RobustScaler()
        ],
    },
    # Rank candidates by the F1 score of the 'Yes' class.
    scoring=make_scorer(
        score_func=f1_score,
        average='binary',
        pos_label='Yes'
    ),
    n_jobs=-1
)

# Cross-validated search on the combined train+validation rows.
clf_lr.fit(X=X_train_val, y=y_train_val)
print("best score", clf_lr.best_score_)

# Score the best pipeline found on the held-out test set and show it.
y_test_pred = clf_lr.best_estimator_.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))
display(clf_lr.best_estimator_)

best score 0.5984947237400835
              precision    recall  f1-score   support

          No       0.85      0.89      0.87       777
         Yes       0.64      0.56      0.60       280

    accuracy                           0.80      1057
   macro avg       0.75      0.73      0.74      1057
weighted avg       0.80      0.80      0.80      1057



SVM is OK with OneHotEncoder.

In [9]:
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the new keyword first and fall back to the
# old one on releases that do not know it, so the cell runs on either.
try:
    onehot_dense = OneHotEncoder(categories=nom_cats, sparse_output=False)
except TypeError:
    onehot_dense = OneHotEncoder(categories=nom_cats, sparse=False)

# Grid-search the preprocessing choices for a support vector classifier:
# 3 numeric scalers x 2 nominal encoders x 4 nominal scalers x 4 ordinal
# scalers = 96 candidates. The classifier's own settings stay fixed.
clf_svm = GridSearchCV(
    estimator=Pipeline(
        steps=[
            ('composer', composer),
            ('clf', SVC(random_state=0))
        ]
    ),
    param_grid={
        'composer__num_pipe__scaler': [
            MinMaxScaler(), StandardScaler(), RobustScaler()
        ],
        'composer__nom_pipe__encoder': [
            onehot_dense,
            OrdinalEncoder(categories=nom_cats)
        ],
        'composer__nom_pipe__scaler': [
            None, MinMaxScaler(), StandardScaler(), RobustScaler()
        ],
        'composer__ord_pipe__scaler': [
            None, MinMaxScaler(), StandardScaler(), RobustScaler()
        ],
    },
    # Rank candidates by the F1 score of the 'Yes' class.
    scoring=make_scorer(
        score_func=f1_score,
        average='binary',
        pos_label='Yes'
    ),
    n_jobs=-1
)

# Cross-validated search on the combined train+validation rows.
clf_svm.fit(X=X_train_val, y=y_train_val)
print("best score", clf_svm.best_score_)

# Score the best pipeline found on the held-out test set and show it.
y_test_pred = clf_svm.best_estimator_.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))
display(clf_svm.best_estimator_)

best score 0.5631963411178951
              precision    recall  f1-score   support

          No       0.84      0.91      0.87       777
         Yes       0.67      0.50      0.58       280

    accuracy                           0.80      1057
   macro avg       0.76      0.71      0.72      1057
weighted avg       0.79      0.80      0.79      1057



Decision Tree is OK with OneHotEncoder.

In [10]:
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the new keyword first and fall back to the
# old one on releases that do not know it, so the cell runs on either.
try:
    onehot_dense = OneHotEncoder(categories=nom_cats, sparse_output=False)
except TypeError:
    onehot_dense = OneHotEncoder(categories=nom_cats, sparse=False)

# Grid-search the preprocessing choices for a decision tree:
# 3 numeric scalers x 2 nominal encoders x 4 nominal scalers x 4 ordinal
# scalers = 96 candidates. The classifier's own settings stay fixed.
clf_tree = GridSearchCV(
    estimator=Pipeline(
        steps=[
            ('composer', composer),
            ('clf', DecisionTreeClassifier(random_state=0))
        ]
    ),
    param_grid={
        'composer__num_pipe__scaler': [
            MinMaxScaler(), StandardScaler(), RobustScaler()
        ],
        'composer__nom_pipe__encoder': [
            onehot_dense,
            OrdinalEncoder(categories=nom_cats)
        ],
        'composer__nom_pipe__scaler': [
            None, MinMaxScaler(), StandardScaler(), RobustScaler()
        ],
        'composer__ord_pipe__scaler': [
            None, MinMaxScaler(), StandardScaler(), RobustScaler()
        ],
    },
    # Rank candidates by the F1 score of the 'Yes' class.
    scoring=make_scorer(
        score_func=f1_score,
        average='binary',
        pos_label='Yes'
    ),
    n_jobs=-1
)

# Cross-validated search on the combined train+validation rows.
clf_tree.fit(X=X_train_val, y=y_train_val)
print("best score", clf_tree.best_score_)

# Score the best pipeline found on the held-out test set and show it.
y_test_pred = clf_tree.best_estimator_.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))
display(clf_tree.best_estimator_)

best score 0.4950271558941518
              precision    recall  f1-score   support

          No       0.82      0.82      0.82       777
         Yes       0.51      0.51      0.51       280

    accuracy                           0.74      1057
   macro avg       0.67      0.67      0.67      1057
weighted avg       0.74      0.74      0.74      1057



RandomForest is NOT OK with OneHotEncoder

In [11]:
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the new keyword first and fall back to the
# old one on releases that do not know it, so the cell runs on either.
try:
    onehot_dense = OneHotEncoder(categories=nom_cats, sparse_output=False)
except TypeError:
    onehot_dense = OneHotEncoder(categories=nom_cats, sparse=False)

# Grid-search the preprocessing choices for a random forest:
# 3 numeric scalers x 2 nominal encoders x 4 nominal scalers x 4 ordinal
# scalers = 96 candidates. The classifier's own settings stay fixed.
clf_rfc = GridSearchCV(
    estimator=Pipeline(
        steps=[
            ('composer', composer),
            ('clf', RandomForestClassifier(random_state=0))
        ]
    ),
    param_grid={
        'composer__num_pipe__scaler': [
            MinMaxScaler(), StandardScaler(), RobustScaler()
        ],
        'composer__nom_pipe__encoder': [
            onehot_dense,
            OrdinalEncoder(categories=nom_cats)
        ],
        'composer__nom_pipe__scaler': [
            None, MinMaxScaler(), StandardScaler(), RobustScaler()
        ],
        'composer__ord_pipe__scaler': [
            None, MinMaxScaler(), StandardScaler(), RobustScaler()
        ],
    },
    # Rank candidates by the F1 score of the 'Yes' class.
    scoring=make_scorer(
        score_func=f1_score,
        average='binary',
        pos_label='Yes'
    ),
    n_jobs=-1
)

# Cross-validated search on the combined train+validation rows.
clf_rfc.fit(X=X_train_val, y=y_train_val)
print("best score", clf_rfc.best_score_)

# Score the best pipeline found on the held-out test set and show it.
y_test_pred = clf_rfc.best_estimator_.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))
display(clf_rfc.best_estimator_)

best score 0.5503627362876713
              precision    recall  f1-score   support

          No       0.83      0.90      0.87       777
         Yes       0.65      0.49      0.56       280

    accuracy                           0.79      1057
   macro avg       0.74      0.70      0.71      1057
weighted avg       0.78      0.79      0.78      1057

