# NOT DONE

GridSearchCV + KerasClassifier tidak bagus.
1. GridSearchCV melakukan split data set menjadi train set + validation set. 
   
   Fungsinya adalah untuk mendapatkan average score dan average score tertinggi akan dianggap sebagai `GridSearchCV.best_score_` dan `GridSearchCV.best_params_`.
2. Ini berarti jika `GridSearchCV(cv=3)` maka KerasClassifier hanya mendapatkan 2/3 train set. Sedangkan 1/3 train set digunakan oleh GridSearchCV sebagai validaiton set.
3. Hal ini diperparah oleh cara kerja `tf.keras.models.Sequential` yang membutuhkan validation set agar tidak overfit.

# Import Libraries

In [1]:
# Data Loading
import pandas as pd

# Data Preprocessing
from sklearn.model_selection import train_test_split
from pandas.api.types import CategoricalDtype

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
import numpy as np

# GridSearchCV
from sklearn.metrics import make_scorer, f1_score, classification_report
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
import tensorflow_addons as tfa
from sklearn.feature_selection import mutual_info_classif, SelectKBest

# reference: https://stackoverflow.com/questions/35911252/disable-tensorflow-debugging-information
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf

# Data Loading

In [2]:
df = pd.read_csv('./WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [3]:
# Convert TotalCharges type from string to number
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(0, inplace=True)

# Data Preprocessing

In [4]:
# Split data set
X = df.drop('Churn', axis=1)
y = df['Churn']
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=0)
X_train, X_val, y_train, y_val= train_test_split(
    X_train_val, y_train_val, test_size=0.15, stratify=y_train_val, random_state=0)

In [5]:
# Define data type
nom_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 
            'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 
            'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 
            'PaperlessBilling', 'PaymentMethod']
ord_cols = ['Contract']
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

In [6]:
# Change dtypes for OrdinalEncoder or OneHotEncoder
X_train_val['Contract'] = X_train_val['Contract'].astype(
    CategoricalDtype(
        categories=['Month-to-month', 'One year', 'Two year'], ordered=True)
)

X_train_val[nom_cols] = X_train_val[nom_cols].astype('category')

# Pipeline

[reference](https://stackoverflow.com/questions/58815016/cross-validating-with-imblearn-pipeline-and-gridsearchcv)

In [7]:
num_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

nom_cats = [
    X_train_val['gender'].cat.categories.tolist(),
    X_train_val['SeniorCitizen'].cat.categories.tolist(),
    X_train_val['Partner'].cat.categories.tolist(),
    X_train_val['Dependents'].cat.categories.tolist(),
    X_train_val['PhoneService'].cat.categories.tolist(),
    X_train_val['MultipleLines'].cat.categories.tolist(),
    X_train_val['InternetService'].cat.categories.tolist(),
    X_train_val['OnlineSecurity'].cat.categories.tolist(),
    X_train_val['OnlineBackup'].cat.categories.tolist(),
    X_train_val['DeviceProtection'].cat.categories.tolist(),
    X_train_val['TechSupport'].cat.categories.tolist(),
    X_train_val['StreamingTV'].cat.categories.tolist(),
    X_train_val['StreamingMovies'].cat.categories.tolist(),
    X_train_val['PaperlessBilling'].cat.categories.tolist(),
    X_train_val['PaymentMethod'].cat.categories.tolist()
]

nom_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(categories=nom_cats)),
    ]
)

ord_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(
            categories=[
                X_train_val['Contract'].cat.categories.tolist()
            ]
        )),
    ]
)

composer = ColumnTransformer(
    transformers=[
        ('num_pipe', num_pipe, num_cols),
        ('nom_pipe', nom_pipe, nom_cols),
        ('ord_pipe', ord_pipe, ord_cols)
    ]
)

In [8]:
# Get discrete features bool
composer.fit(X=X_train_val, y=y_train_val)
print([feature.startswith(("nom", "ord")) for feature in composer.get_feature_names_out()])

[False, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]


In [9]:
def isolate():
    X_train_val_final = composer.transform(X=X_train_val)
    
    mi = mutual_info_classif(
            X=X_train_val_final, y=y_train_val,
            discrete_features=[False, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True],
            random_state=0
    )
    
    df_c = pd.DataFrame()
    df_c["mi"] = mi / np.max(mi)
    
    df_c["feature"] = composer.get_feature_names_out()
    display(df_c.sort_values(by="mi", ascending=False)
                .reset_index()
                .style.background_gradient(cmap="coolwarm"))

isolate()

Unnamed: 0,index,mi,feature
0,43,1.0,ord_pipe__Contract
1,0,0.676495,num_pipe__tenure
2,19,0.627687,nom_pipe__OnlineSecurity_No
3,28,0.581319,nom_pipe__TechSupport_No
4,2,0.503695,num_pipe__TotalCharges
5,17,0.472594,nom_pipe__InternetService_Fiber optic
6,41,0.441467,nom_pipe__PaymentMethod_Electronic check
7,1,0.4037,num_pipe__MonthlyCharges
8,22,0.376432,nom_pipe__OnlineBackup_No
9,25,0.330697,nom_pipe__DeviceProtection_No


# SciKeras KerasClassifier

[reference](https://www.adriangb.com/scikeras/stable/quickstart.html#in-an-sklearn-pipeline)

[reference 2](https://stackoverflow.com/questions/59225328/how-to-use-kerasclassifier-validation-split-and-using-scitkit-learn-gridsearchcv)

In [10]:
def get_model(hidden_layer_dim, meta):
    n_features_in_ = meta["n_features_in_"]
    X_shape_ = meta["X_shape_"]
    n_classes_ = meta["n_classes_"]
    
    model = tf.keras.models.Sequential()
    model.add(
        tf.keras.layers.Dense(
            units=4, input_shape=X_shape_[1:],
            kernel_initializer=tf.keras.initializers.HeNormal(seed=0)
        )
    )
    model.add(
        tf.keras.layers.BatchNormalization()
    )
    model.add(
        tf.keras.layers.LeakyReLU(alpha=0.3)
    )
    model.add(
        tf.keras.layers.Dense(units=1, activation="sigmoid")
    )
    
    return model

callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_f1_score",
    mode="max",
    patience=10,
    min_delta=0.0001,
    restore_best_weights=True
)

clf = KerasClassifier(
    model=get_model,
    loss="binary_crossentropy",
    hidden_layer_dim=100,
    metrics=tfa.metrics.F1Score,
    metrics__num_classes=1,
    metrics__average='macro',
    metrics__threshold=0.3,
    epochs=100,
    callbacks=[callback],
    validation_split=0.1,
    verbose=False
)

def sel_score(X, y):
    return mutual_info_classif(
        X=X, y=y,
        discrete_features=[False, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True],
        random_state=0
    )

pipe = Pipeline(
    steps=[
        ('composer', composer),
        ('sel', SelectKBest(
            score_func=sel_score,
            k=17,  
        )),
        ('clf', clf)
    ]
)

In [11]:
# convert [False, True, False] to ['No', 'Yes', 'No']
bool_to_str = lambda items: ["No" if item == False else "Yes" for item in items]
bool_to_str([False, True, True])

['No', 'Yes', 'Yes']

In [12]:
# Default
pipe.fit(X=X_train_val, y=y_train_val)
y_pred_proba = pipe.predict_proba(X_test)
print(classification_report(y_true=y_test, y_pred=bool_to_str(y_pred_proba[:,1] >= 0.3)))

Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

              precision    recall  f1-score   support

          No       0.91      0.78      0.84       777
         Yes       0.56      0.78      0.65       280

    accuracy                           0.78      1057
   macro avg       0.73      0.78      0.74      1057
weighted avg       0.82      0.78      0.79      1057



In [13]:
params = {
    "clf__hidden_layer_dim": [8],
    "clf__loss": ["binary_crossentropy"],
    "clf__optimizer": ["sgd", "adam"],
    "clf__metrics__threshold": [0.3]
}
gs = GridSearchCV(
    pipe, params, refit=False, cv=3, 
    scoring=make_scorer(
        score_func=f1_score,
        average='macro',
        labels=['Yes']
    ),
    n_jobs=-1
)

gs.fit(X_train_val, y_train_val)

INFO:tensorflow:Assets written to: ram:///var/folders/w9/5lrd8m1x4f586v9ynn8xrryw0000gn/T/tmp0om8t0bd/assets
INFO:tensorflow:Assets written to: ram:///var/folders/w9/5lrd8m1x4f586v9ynn8xrryw0000gn/T/tmpnx8u2kxj/assets
INFO:tensorflow:Assets written to: ram:///var/folders/w9/5lrd8m1x4f586v9ynn8xrryw0000gn/T/tmp2ed2rl9p/assets
INFO:tensorflow:Assets written to: ram:///var/folders/w9/5lrd8m1x4f586v9ynn8xrryw0000gn/T/tmp2q95tozi/assets
INFO:tensorflow:Assets written to: ram:///var/folders/w9/5lrd8m1x4f586v9ynn8xrryw0000gn/T/tmpd166qekn/assets
INFO:tensorflow:Assets written to: ram:///var/folders/w9/5lrd8m1x4f586v9ynn8xrryw0000gn/T/tmpx0yzlyw8/assets
INFO:tensorflow:Assets written to: ram:///var/folders/w9/5lrd8m1x4f586v9ynn8xrryw0000gn/T/tmp5p14p5wr/assets
INFO:tensorflow:Assets written to: ram:///var/folders/w9/5lrd8m1x4f586v9ynn8xrryw0000gn/T/tmpgv7u3e89/assets
INFO:tensorflow:Assets written to: ram:///var/folders/w9/5lrd8m1x4f586v9ynn8xrryw0000gn/T/tmpp9xnqie3/assets
INFO:tensorflow:Ass

In [14]:
# Get best params
print(gs.best_score_, gs.best_params_)

0.55886498148867 {'clf__hidden_layer_dim': 8, 'clf__loss': 'binary_crossentropy', 'clf__metrics__threshold': 0.3, 'clf__optimizer': 'adam'}
