In [100]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


In [101]:
df = pd.read_csv("../data/processed/churn_fe_data.csv")


In [102]:
X = df.drop("Churn", axis=1)
y = df["Churn"]

In [103]:
X.shape

(7032, 28)

In [104]:
X.columns.duplicated().sum()


np.int64(0)

In [105]:
X.nunique().sort_values()

gender                                      2
high_spender                                2
early_customer                              2
PaymentMethod_Mailed check                  2
PaymentMethod_Electronic check              2
PaymentMethod_Credit card (automatic)       2
StreamingMovies_Yes                         2
StreamingTV_Yes                             2
TechSupport_Yes                             2
DeviceProtection_Yes                        2
OnlineSecurity_Yes                          2
OnlineBackup_Yes                            2
InternetService_Fiber optic                 2
MultipleLines_Yes                           2
MultipleLines_No phone service              2
PaperlessBilling                            2
PhoneService                                2
Dependents                                  2
Partner                                     2
SeniorCitizen                               2
InternetService_No                          2
contract_risk                     

In [106]:
corr = df.corr()["Churn"].sort_values(ascending=False)
corr


Churn                                    1.000000
contract_risk                            0.396150
contract_tenure_risk                     0.364451
tenure_risk                              0.347133
early_customer                           0.319628
InternetService_Fiber optic              0.307463
PaymentMethod_Electronic check           0.301455
MonthlyCharges                           0.192922
PaperlessBilling                         0.191454
SeniorCitizen                            0.150541
avg_monthly_charge                       0.070157
StreamingTV_Yes                          0.063254
high_spender                             0.062342
StreamingMovies_Yes                      0.060860
MultipleLines_Yes                        0.040033
PhoneService                             0.011691
gender                                  -0.008545
MultipleLines_No phone service          -0.011691
DeviceProtection_Yes                    -0.066193
service_count                           -0.067459


In [107]:
low_corr_features = corr[abs(corr) < 0.01].index
low_corr_features


Index(['gender'], dtype='object')

In [108]:
X.shape

(7032, 28)

In [109]:
X.drop(columns=low_corr_features, inplace=True)


In [110]:
X.shape

(7032, 27)

In [111]:
df[["contract_risk", "contract_tenure_risk"]].corr()

Unnamed: 0,contract_risk,contract_tenure_risk
contract_risk,1.0,0.488425
contract_tenure_risk,0.488425,1.0


In [112]:
corr_matrix = X.corr().abs()
corr_matrix

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,...,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,service_count,tenure_risk,early_customer,avg_monthly_charge,high_spender,contract_risk,contract_tenure_risk
SeniorCitizen,1.0,0.016957,0.21055,0.015683,0.008392,0.156258,0.219829,0.008392,0.142996,0.254923,...,0.024359,0.171322,0.152987,0.095943,0.023317,0.027713,0.201471,0.177991,0.14182,0.0041
Partner,0.016957,1.0,0.452269,0.381912,0.018397,0.013957,0.09789,0.018397,0.142561,0.001235,...,0.082327,0.083207,0.096948,0.220032,0.363275,0.305061,0.170835,0.150782,0.294094,0.31022
Dependents,0.21055,0.452269,1.0,0.163386,0.001078,0.110131,0.11236,0.001078,0.024307,0.164101,...,0.061134,0.149274,0.056448,0.021304,0.15688,0.145379,0.065052,0.048485,0.240556,0.164075
tenure,0.015683,0.381912,0.163386,1.0,0.007877,0.004823,0.246876,0.007877,0.332399,0.01793,...,0.2328,0.210197,0.232181,0.524441,0.961134,0.75433,0.413232,0.346186,0.676734,0.673437
PhoneService,0.008392,0.018397,0.001078,0.007877,1.0,0.016696,0.247918,1.0,0.27953,0.290183,...,0.006916,0.002747,0.004463,0.128129,0.005827,0.00695,0.221429,0.297855,0.003019,0.00147
PaperlessBilling,0.156258,0.013957,0.110131,0.004823,0.016696,1.0,0.351923,0.016696,0.163746,0.32647,...,0.013726,0.208427,0.203981,0.205167,0.004291,0.00386,0.315956,0.257576,0.175475,0.012919
MonthlyCharges,0.219829,0.09789,0.11236,0.246876,0.247918,0.351923,1.0,0.247918,0.490986,0.787196,...,0.030277,0.271238,0.376615,0.80234,0.237278,0.191927,0.95683,0.81874,0.072685,0.165759
MultipleLines_No phone service,0.008392,0.018397,0.001078,0.007877,1.0,0.016696,0.247918,1.0,0.27953,0.290183,...,0.006916,0.002747,0.004463,0.128129,0.005827,0.00695,0.221429,0.297855,0.003019,0.00147
MultipleLines_Yes,0.142996,0.142561,0.024307,0.332399,0.27953,0.163746,0.490986,0.27953,1.0,0.36642,...,0.060319,0.083583,0.227672,0.538484,0.315664,0.263331,0.52048,0.486404,0.107529,0.249967
InternetService_Fiber optic,0.254923,0.001235,0.164101,0.01793,0.290183,0.32647,0.787196,0.290183,0.36642,1.0,...,0.050552,0.335763,0.305984,0.351094,0.023197,0.021441,0.708313,0.642944,0.252733,0.002568


In [113]:
upper = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
)

upper

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,...,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,service_count,tenure_risk,early_customer,avg_monthly_charge,high_spender,contract_risk,contract_tenure_risk
SeniorCitizen,,0.016957,0.21055,0.015683,0.008392,0.156258,0.219829,0.008392,0.142996,0.254923,...,0.024359,0.171322,0.152987,0.095943,0.023317,0.027713,0.201471,0.177991,0.14182,0.0041
Partner,,,0.452269,0.381912,0.018397,0.013957,0.09789,0.018397,0.142561,0.001235,...,0.082327,0.083207,0.096948,0.220032,0.363275,0.305061,0.170835,0.150782,0.294094,0.31022
Dependents,,,,0.163386,0.001078,0.110131,0.11236,0.001078,0.024307,0.164101,...,0.061134,0.149274,0.056448,0.021304,0.15688,0.145379,0.065052,0.048485,0.240556,0.164075
tenure,,,,,0.007877,0.004823,0.246876,0.007877,0.332399,0.01793,...,0.2328,0.210197,0.232181,0.524441,0.961134,0.75433,0.413232,0.346186,0.676734,0.673437
PhoneService,,,,,,0.016696,0.247918,1.0,0.27953,0.290183,...,0.006916,0.002747,0.004463,0.128129,0.005827,0.00695,0.221429,0.297855,0.003019,0.00147
PaperlessBilling,,,,,,,0.351923,0.016696,0.163746,0.32647,...,0.013726,0.208427,0.203981,0.205167,0.004291,0.00386,0.315956,0.257576,0.175475,0.012919
MonthlyCharges,,,,,,,,0.247918,0.490986,0.787196,...,0.030277,0.271238,0.376615,0.80234,0.237278,0.191927,0.95683,0.81874,0.072685,0.165759
MultipleLines_No phone service,,,,,,,,,0.27953,0.290183,...,0.006916,0.002747,0.004463,0.128129,0.005827,0.00695,0.221429,0.297855,0.003019,0.00147
MultipleLines_Yes,,,,,,,,,,0.36642,...,0.060319,0.083583,0.227672,0.538484,0.315664,0.263331,0.52048,0.486404,0.107529,0.249967
InternetService_Fiber optic,,,,,,,,,,,...,0.050552,0.335763,0.305984,0.351094,0.023197,0.021441,0.708313,0.642944,0.252733,0.002568


In [114]:
to_drop = [col for col in upper.columns if any(upper[col] > 0.85)]
to_drop


['MultipleLines_No phone service',
 'tenure_risk',
 'avg_monthly_charge',
 'high_spender']

In [115]:
X.drop(columns=to_drop, inplace=True)

In [116]:
X.shape

(7032, 23)

In [117]:
lr = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    max_iter=1000
)

lr.fit(X, y)


0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,1000


In [118]:
lr_selected = X.columns[lr.coef_[0] != 0]
lr_selected

Index(['SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService',
       'PaperlessBilling', 'MonthlyCharges', 'MultipleLines_Yes',
       'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_Yes', 'OnlineBackup_Yes', 'DeviceProtection_Yes',
       'TechSupport_Yes', 'StreamingTV_Yes', 'StreamingMovies_Yes',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',
       'early_customer', 'contract_risk', 'contract_tenure_risk'],
      dtype='object')

In [119]:
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)

rf.fit(X, y)

importances = pd.Series(
    rf.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

importances.head(15)


contract_tenure_risk              0.162096
MonthlyCharges                    0.154460
tenure                            0.154294
contract_risk                     0.063775
service_count                     0.047984
InternetService_Fiber optic       0.043952
PaymentMethod_Electronic check    0.033144
PaperlessBilling                  0.031718
Partner                           0.030663
early_customer                    0.028682
SeniorCitizen                     0.026192
Dependents                        0.024254
OnlineBackup_Yes                  0.022355
OnlineSecurity_Yes                0.022288
TechSupport_Yes                   0.020535
dtype: float64

In [120]:
final_features = list(
    set(lr_selected) |
    set(importances.head(15).index)
)

final_features


['early_customer',
 'SeniorCitizen',
 'TechSupport_Yes',
 'DeviceProtection_Yes',
 'StreamingTV_Yes',
 'StreamingMovies_Yes',
 'tenure',
 'contract_tenure_risk',
 'MonthlyCharges',
 'Dependents',
 'OnlineBackup_Yes',
 'PaymentMethod_Credit card (automatic)',
 'InternetService_No',
 'PaperlessBilling',
 'contract_risk',
 'OnlineSecurity_Yes',
 'Partner',
 'MultipleLines_Yes',
 'InternetService_Fiber optic',
 'PhoneService',
 'PaymentMethod_Electronic check',
 'service_count',
 'PaymentMethod_Mailed check']

In [121]:
pd.DataFrame({"feature": final_features}).to_csv(
    "../data/processed/selected_features.csv",
    index=False
)