In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tqdm import tqdm
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
import os,gc,copy
import warnings
warnings.filterwarnings("ignore")

In [2]:
X_train = pd.read_csv("X_tr_encoded.csv")
Y_train = pd.read_csv("y_tr.csv")

X_train_resampled = copy.deepcopy(X_train.drop(["customerID"],axis = 1))
y_train_resampled = copy.deepcopy(Y_train)

In [3]:
categorical_features = X_train.columns[X_train.dtypes==object].tolist()
categorical_features.remove('customerID')
numeric_features = X_train.columns[X_train.dtypes!=object].tolist()

In [4]:
X_train_resampled, y_train_resampled = SMOTE(random_state=100).fit_resample(X_train_resampled, Y_train["Churn"].ravel()) 

In [5]:
## logistic

sfs_log = SFS(LogisticRegression(random_state=0),
          k_features = 9,
          forward= True,
          floating = False,
          verbose= 2,
          scoring= 'accuracy',
          cv = StratifiedKFold(n_splits=5, random_state=100, shuffle=True),
          n_jobs= -1).fit(X_train_resampled, y_train_resampled)
print(sfs_log.k_feature_names_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  19 | elapsed:    5.0s remaining:   10.9s
[Parallel(n_jobs=-1)]: Done  16 out of  19 | elapsed:    5.1s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  19 out of  19 | elapsed:    5.2s finished

[2023-04-30 17:41:44] Features: 1/9 -- score: 0.7476317174586612[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  18 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  15 out of  18 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    0.1s finished

[2023-04-30 17:41:44] Features: 2/9 -- score: 0.7714175532176691[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  17 | elapsed:    0.0s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  12 out of  17 | elapsed:    0.1s remaining:    0.0s
[Parallel(n

('Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'StreamingTV', 'Contract', 'PaymentMethod', 'MonthlyCharges')


[Parallel(n_jobs=-1)]: Done   6 out of  11 | elapsed:    0.6s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    0.6s finished

[2023-04-30 17:41:50] Features: 9/9 -- score: 0.7875596623298867

In [6]:
## decision tree

sfs_dt = SFS(DecisionTreeClassifier(random_state=0),
          k_features = 9,
          forward= True,
          floating = False,
          verbose= 2,
          scoring= 'accuracy',
          cv = StratifiedKFold(n_splits=5, random_state=100, shuffle=True),
          n_jobs= -1).fit(X_train_resampled, y_train_resampled)
print(sfs_dt.k_feature_names_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  19 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  16 out of  19 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  19 out of  19 | elapsed:    0.2s finished

[2023-04-30 17:41:50] Features: 1/9 -- score: 0.7476317174586612[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  18 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  15 out of  18 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    0.1s finished

[2023-04-30 17:41:51] Features: 2/9 -- score: 0.7714175532176691[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  17 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  12 out of  17 | elapsed:    0.0s remaining:    0.0s
[Parallel(n

('PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'TechSupport', 'StreamingTV', 'Contract', 'PaperlessBilling')



[2023-04-30 17:41:52] Features: 9/9 -- score: 0.8131739172477767

In [7]:
## RF

sfs_rf = SFS(RandomForestClassifier(random_state=0),
          k_features = 9,
          forward= True,
          floating = False,
          verbose= 2,
          scoring= 'accuracy',
          cv = StratifiedKFold(n_splits=5, random_state=100, shuffle=True),
          n_jobs= -1).fit(X_train_resampled, y_train_resampled)
print(sfs_rf.k_feature_names_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  19 | elapsed:    2.1s remaining:    4.7s
[Parallel(n_jobs=-1)]: Done  16 out of  19 | elapsed:    3.7s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  19 out of  19 | elapsed:    9.0s finished

[2023-04-30 17:42:02] Features: 1/9 -- score: 0.7476317174586612[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  18 | elapsed:    2.1s remaining:    5.6s
[Parallel(n_jobs=-1)]: Done  15 out of  18 | elapsed:    3.8s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    7.9s finished

[2023-04-30 17:42:10] Features: 2/9 -- score: 0.7714175532176691[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  17 | elapsed:    2.1s remaining:   10.4s
[Parallel(n_jobs=-1)]: Done  12 out of  17 | elapsed:    3.2s remaining:    1.3s
[Parallel(n

('InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'Contract', 'MonthlyCharges', 'TotalCharges')


[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    6.7s finished

[2023-04-30 17:42:59] Features: 9/9 -- score: 0.8386794293724822

In [8]:
## GBM

sfs_gbm = SFS(GradientBoostingClassifier(random_state=0),
          k_features = 9,
          forward= True,
          floating = False,
          verbose= 2,
          scoring= 'accuracy',
          cv = StratifiedKFold(n_splits=5, random_state=100, shuffle=True),
          n_jobs= -1).fit(X_train_resampled, y_train_resampled)
print(sfs_gbm.k_feature_names_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  19 | elapsed:    1.4s remaining:    3.1s
[Parallel(n_jobs=-1)]: Done  16 out of  19 | elapsed:    2.3s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  19 out of  19 | elapsed:    4.4s finished

[2023-04-30 17:43:04] Features: 1/9 -- score: 0.7476317174586612[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  18 | elapsed:    1.7s remaining:    4.5s
[Parallel(n_jobs=-1)]: Done  15 out of  18 | elapsed:    3.0s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    4.4s finished

[2023-04-30 17:43:08] Features: 2/9 -- score: 0.7714175532176691[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  17 | elapsed:    1.9s remaining:    9.4s
[Parallel(n_jobs=-1)]: Done  12 out of  17 | elapsed:    2.5s remaining:    1.0s
[Parallel(n

('tenure', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'TechSupport', 'StreamingTV', 'Contract', 'PaymentMethod')


[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    5.6s finished

[2023-04-30 17:43:49] Features: 9/9 -- score: 0.8273810068378257

In [9]:
## XGB

sfs_xgb = SFS(XGBClassifier(random_state=0),
          k_features = 9,
          forward= True,
          floating = False,
          verbose= 2,
          scoring= 'accuracy',
          cv = StratifiedKFold(n_splits=5, random_state=100, shuffle=True),
          n_jobs= -1).fit(X_train_resampled, y_train_resampled)
print(sfs_xgb.k_feature_names_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  19 | elapsed:    1.5s remaining:    3.4s
[Parallel(n_jobs=-1)]: Done  16 out of  19 | elapsed:    2.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  19 out of  19 | elapsed:    4.1s finished

[2023-04-30 17:43:53] Features: 1/9 -- score: 0.7476317174586612[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  18 | elapsed:    1.2s remaining:    3.3s
[Parallel(n_jobs=-1)]: Done  15 out of  18 | elapsed:    2.7s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    4.4s finished

[2023-04-30 17:43:58] Features: 2/9 -- score: 0.7714175532176691[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  17 | elapsed:    1.2s remaining:    6.1s
[Parallel(n_jobs=-1)]: Done  12 out of  17 | elapsed:    3.3s remaining:    1.3s
[Parallel(n

('MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'Contract', 'MonthlyCharges', 'TotalCharges')


[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    6.8s finished

[2023-04-30 17:44:48] Features: 9/9 -- score: 0.8472884075434377