# Import Libraries

In [1]:
# Data Loading
import pandas as pd

# Data Preprocessing
from sklearn.model_selection import train_test_split
from pandas.api.types import CategoricalDtype

# Threshold Tuning
import matplotlib.pyplot as plt
import numpy as np

# Pipeline
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import RandomOverSampler, SMOTENC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.metrics import make_scorer, f1_score, classification_report, precision_recall_curve
from sklearn.linear_model import LogisticRegression

# Data Loading

In [2]:
# Load the raw Telco customer-churn table from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer="./WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [3]:
# TotalCharges contains ' ' (blank) entries; errors='coerce' turns every value
# that cannot be parsed as a number into NaN.
df = df.assign(
    TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce')
)

# Data Preprocessing

In [4]:
# Separate the target from the features, then hold out 15% of the rows as a
# test set, stratified on the target so both splits keep the same churn ratio.
y = df['Churn']
X = df.drop(columns='Churn')
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=0,
    stratify=y,
)

In [5]:
# Feature groups. The order inside each list matters: later cells build
# per-column category lists by walking these lists in order.
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
ord_cols = ['Contract']
nom_cols = [
    # customer attributes
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    # phone service
    'PhoneService', 'MultipleLines',
    # internet service and add-ons
    'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    # billing
    'PaperlessBilling', 'PaymentMethod',
]

In [6]:
# Give the training frame categorical dtypes so the encoders defined later can
# read each column's category list from it. 'Contract' is an ordered
# categorical with an explicit level order; every nominal column becomes an
# unordered categorical whose categories are inferred from the data.
contract_dtype = CategoricalDtype(
    categories=['Month-to-month', 'One year', 'Two year'],
    ordered=True,
)
X_train_val = X_train_val.astype(
    {'Contract': contract_dtype, **{col: 'category' for col in nom_cols}}
)

# Pipeline

[reference](https://stackoverflow.com/questions/58815016/cross-validating-with-imblearn-pipeline-and-gridsearchcv)

In [7]:
# Numeric columns: fill missing values with the median, then standardize.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# One category list per nominal column, in nom_cols order, read from the
# categorical dtypes of the training frame.
nom_cats = [X_train_val[col].cat.categories.tolist() for col in nom_cols]

# Nominal columns: fill missing values with the mode, then one-hot encode
# against the fixed category lists.
nom_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(categories=nom_cats)),
])

# Ordinal column: fill missing values with the mode, then encode using the
# category order stored on the 'Contract' dtype.
contract_levels = X_train_val['Contract'].cat.categories.tolist()
ord_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=[contract_levels])),
])

# Route each feature group to its own sub-pipeline. The transformer names
# become the prefixes of the output feature names.
composer = ColumnTransformer(transformers=[
    ('num_pipe', num_pipe, num_cols),
    ('nom_pipe', nom_pipe, nom_cols),
    ('ord_pipe', ord_pipe, ord_cols),
])

# Threshold Tuning

Code for listing which transformed features come from the nominal/ordinal encoders:

`composer.fit(X=X_train_val, y=y_train_val)`

`print([feature.startswith(("nom", "ord")) for feature in composer.get_feature_names_out()])`

In [8]:
def bool_to_str(items):
    """Map an iterable of flags to 'No'/'Yes' labels.

    An item that compares equal to False becomes 'No'; anything else
    becomes 'Yes'. Returns a new list in the same order as ``items``.
    """
    labels = []
    for flag in items:
        # Equality (not truthiness) is intentional: only values equal to
        # False map to 'No'.
        labels.append('No' if flag == False else 'Yes')  # noqa: E712
    return labels


bool_to_str([False, True, False])

['No', 'Yes', 'No']

In [9]:

# Baseline model: preprocessing followed by logistic regression, no resampling.
pipe = Pipeline(
    steps=[
        ('composer', composer),
        ('clf', LogisticRegression(max_iter=3000, random_state=0))
    ]
)

pipe.fit(X=X_train_val, y=y_train_val)

# Column 1 of predict_proba is the 'Yes' class: scikit-learn stores class
# labels sorted, i.e. ['No', 'Yes'].
y_test_pred_proba = pipe.predict_proba(X=X_test)[:, 1]

# NOTE(review): the threshold is selected on the same test split that is used
# for the report below, so the reported scores are optimistic. Tuning on a
# validation split (or with cross-validation) would give an unbiased estimate.
# The scores are passed positionally because the keyword was renamed from
# `probas_pred` to `y_score` in newer scikit-learn releases.
precision, recall, thresholds = precision_recall_curve(
    y_test, y_test_pred_proba, pos_label='Yes')

# F1 at every point of the curve; points where precision + recall == 0 get a
# score of 0 instead of NaN so they can never win the argmax.
pr_sum = precision + recall
fscore = np.divide(
    2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)

# precision/recall have one more entry than thresholds (the final
# precision=1, recall=0 point has no threshold), so exclude it from the search.
ix = np.argmax(fscore[:-1])
best_threshold = thresholds[ix]
print('Best Threshold=%f, F-Score=%.3f' % (best_threshold, fscore[ix]))
# no_skill = len(y_test[y_test=='Yes']) / len(y_test)
# plt.plot([0,1], [no_skill,no_skill], linestyle='--', label='No Skill')
# plt.plot(recall, precision, marker='.', label='Logistic')
# plt.scatter(recall[ix], precision[ix], marker='o', color='black', label='Best')
# plt.xlabel('Recall')
# plt.ylabel('Precision')
# plt.legend()
# plt.show()

# Apply the selected threshold directly (not a rounded, hand-copied value) and
# reuse the probabilities computed above.
y_test_pred = bool_to_str(y_test_pred_proba >= best_threshold)
print(classification_report(y_true=y_test, y_pred=y_test_pred))
display(pipe)

Best Threshold=0.332262, F-Score=0.656
              precision    recall  f1-score   support

          No       0.90      0.79      0.85       777
         Yes       0.57      0.77      0.66       280

    accuracy                           0.79      1057
   macro avg       0.74      0.78      0.75      1057
weighted avg       0.82      0.79      0.80      1057



# GridSearchCV Oversampling

[reference](https://stackoverflow.com/questions/68567722/how-to-use-precision-recall-curve-in-gridsearchcv)

In [10]:
# Fit the preprocessing step on its own and show, for every output feature,
# whether its name starts with "nom" or "ord" (i.e. whether it was produced by
# the nominal or ordinal sub-pipeline).
composer.fit(X=X_train_val, y=y_train_val)
feature_names = composer.get_feature_names_out()
is_encoded_feature = [name.startswith(("nom", "ord")) for name in feature_names]
print(is_encoded_feature)

[False, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]


In [11]:
# Boolean mask over the transformed feature matrix: True for columns produced
# by the nominal/ordinal encoders, False for the scaled numeric columns.
# Derived from the fitted `composer` (fitted in the preceding cells) rather
# than pasted from a printed output, so it stays correct if the column lists
# or category sets change.
categorical_mask = [
    name.startswith(("nom", "ord"))
    for name in composer.get_feature_names_out()
]

# Compare two oversampling strategies placed between preprocessing and the
# classifier. The 'sm' step is a placeholder that the grid fills in.
clf_lr = GridSearchCV(
    estimator=Pipeline(
        steps=[
            ('composer', composer),
            ('sm', None),
            ('clf', LogisticRegression(max_iter=3000))
        ]
    ),
    param_grid={
        'sm': [
            RandomOverSampler(random_state=0),
            SMOTENC(random_state=0, categorical_features=categorical_mask)
        ]
    },
    # Score candidates by F1 on the positive ('Yes') class.
    scoring=make_scorer(
        score_func=f1_score,
        average='binary',
        pos_label='Yes'
    ),
    n_jobs=-1
)
clf_lr.fit(X=X_train_val, y=y_train_val)
print("best score", clf_lr.best_score_)
y_test_pred = clf_lr.best_estimator_.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))
display(clf_lr.best_estimator_)

best score 0.6251278799042049
              precision    recall  f1-score   support

          No       0.91      0.76      0.83       777
         Yes       0.55      0.80      0.65       280

    accuracy                           0.77      1057
   macro avg       0.73      0.78      0.74      1057
weighted avg       0.82      0.77      0.78      1057

