In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Load the raw Telco customer-churn export; the path is relative to the
# notebook's working directory.
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
# Only the last expression of a cell is displayed, so a single shape check suffices.
df.shape

(7043, 21)

In [79]:
# Preview the first rows to sanity-check column names and value formats.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [80]:
# TotalCharges is parsed to numeric here; with errors="coerce" any value that
# cannot be parsed as a number becomes NaN instead of raising.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Impute the NaNs with the column median. The result is assigned back to the
# frame: the previous `df.TotalCharges.fillna(..., inplace=True)` mutated a
# Series obtained by attribute access, which recent pandas versions warn about
# and which does not update `df` when Copy-on-Write is enabled.
df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].median())

# Encode the churn flag as 1 for "Yes" and 0 for anything else (vectorized
# comparison instead of a row-wise lambda).
# NOTE: re-running this cell after the column is already numeric sets every
# value to 0, so run it once per fresh load.
df["Churn"] = df["Churn"].eq("Yes").astype("int64")
df.shape

(7043, 21)

In [81]:
# Remove the customerID column before encoding and modelling.
df = df.drop(columns=["customerID"])


In [82]:
# One-hot encode every remaining non-numeric column.
# The dummy dtype is pinned to uint8 (the default of older pandas releases).
# Newer pandas defaults to bool dummies; mixed with the integer columns this
# can yield an object-typed design matrix that statsmodels' OLS rejects.
df  = pd.get_dummies(df, dtype="uint8")
df.shape

(7043, 46)

In [83]:
# Target variable: TotalCharges
labels = np.array(df["TotalCharges"])

# Remove the target and MonthlyCharges from the feature frame.
# NOTE(review): MonthlyCharges is presumably dropped because it is closely
# tied to the target - confirm the intent.
df = df.drop(columns=["TotalCharges", "MonthlyCharges"])

# shuffle=False keeps the file order: the first 75% of rows train the model and
# the last 25% are held out, so the split is deterministic without a seed.
train_x, test_x, train_y, test_y = train_test_split(df, labels, test_size=0.25, shuffle=False)
print("Tranin Data : ", train_x.shape)
print("Test Data : ", test_x.shape)

# Always append the intercept column. The default has_constant="skip" silently
# omits it when a frame already holds a constant non-zero column (e.g. a dummy
# that is 1 for every row of one split), which would give the train and test
# design matrices different widths.
train_xc = sm.add_constant(train_x, has_constant="add")
test_xc = sm.add_constant(test_x, has_constant="add")

# Fit ordinary least squares on the training rows and report the fit.
reg = sm.OLS(train_y, train_xc)
results = reg.fit()

print(results.summary())

# Score the held-out rows.
predict = results.predict(test_xc)

print("")
print("")

print("OLS MSE:", metrics.mean_squared_error(test_y, predict))

Tranin Data :  (5282, 44)
Test Data :  (1761, 44)
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.906
Model:                            OLS   Adj. R-squared:                  0.905
Method:                 Least Squares   F-statistic:                     2295.
Date:                Sun, 31 Jan 2021   Prob (F-statistic):               0.00
Time:                        23:23:13   Log-Likelihood:                -42073.
No. Observations:                5282   AIC:                         8.419e+04
Df Residuals:                    5259   BIC:                         8.434e+04
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                                              coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------

In [84]:
# RFE: automated backward selection - repeatedly fit the estimator and discard
# features until 29 remain.
# NOTE(review): RFE ranks features from the fitted linear model's coefficients,
# which depend on feature scale; no standardization is visible in this
# notebook - confirm that this is intended.
ols = LinearRegression()
rfe = RFE(estimator=ols, n_features_to_select=29)
rfe.fit(train_x, train_y)
train_x_rfe = rfe.transform(train_x)

# Boolean mask over the columns of train_x: True marks a retained feature.
print(rfe.support_)

[False False  True False False False False False False  True  True False
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True False False False False False False]


In [86]:
# Keep only the columns flagged by the 29-feature RFE mask, so the reduced
# frames retain their column names for the statsmodels summary.
train_x_RFE = train_x.loc[:, rfe.support_]
test_x_RFE = test_x.loc[:, rfe.support_]

print(train_x_RFE.shape)
print(test_x_RFE.shape)

(5282, 29)
(1761, 29)


In [89]:
# Always append the intercept column. With the default has_constant="skip" the
# intercept is silently omitted when a frame already holds a constant non-zero
# column (e.g. a dummy that is 1 for every row of one split), leaving the train
# and test design matrices with different widths.
train_x_cRFE = sm.add_constant(train_x_RFE, has_constant="add")
test_x_cRFE = sm.add_constant(test_x_RFE, has_constant="add")

# Refit OLS on the 29 RFE-selected features.
reg = sm.OLS(train_y, train_x_cRFE)
results = reg.fit()

print(results.summary())

# Score the held-out rows.
pred = results.predict(test_x_cRFE)

print("")
print("")

print("OLS MSE:", metrics.mean_squared_error(test_y, pred))


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.750
Method:                 Least Squares   F-statistic:                     1222.
Date:                Sun, 31 Jan 2021   Prob (F-statistic):               0.00
Time:                        23:35:33   Log-Likelihood:                -44636.
No. Observations:                5282   AIC:                         8.930e+04
Df Residuals:                    5268   BIC:                         8.939e+04
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                                           coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------


In [90]:
# Second backward-selection pass, this time retaining 32 features.
ols2 = LinearRegression()
rfe2 = RFE(estimator=ols2, n_features_to_select=32)
rfe2.fit(train_x, train_y)
train_x_rfe2 = rfe2.transform(train_x)

# Boolean mask over the columns of train_x: True marks a retained feature.
print(rfe2.support_)

[False  True  True False False False False False False  True  True False
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True False False False False  True  True]


In [91]:
# Keep only the columns flagged by the 32-feature RFE mask, preserving the
# column names for the statsmodels summary.
train_x_RFE2 = train_x.loc[:, rfe2.support_]
test_x_RFE2 = test_x.loc[:, rfe2.support_]

print(train_x_RFE2.shape)
print(test_x_RFE2.shape)

(5282, 32)
(1761, 32)


In [94]:
# Always append the intercept column. With the default has_constant="skip" the
# intercept is silently omitted when a frame already holds a constant non-zero
# column (e.g. a dummy that is 1 for every row of one split), leaving the train
# and test design matrices with different widths.
train_x_cRFE2 = sm.add_constant(train_x_RFE2, has_constant="add")
test_x_cRFE2 = sm.add_constant(test_x_RFE2, has_constant="add")

# Refit OLS on the 32 RFE-selected features.
reg = sm.OLS(train_y, train_x_cRFE2)
results = reg.fit()

print(results.summary())

# Score the held-out rows.
predict2 = results.predict(test_x_cRFE2)

print("")
print("")

print("OLS MSE:", metrics.mean_squared_error(test_y, predict2))

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.906
Model:                            OLS   Adj. R-squared:                  0.905
Method:                 Least Squares   F-statistic:                     3156.
Date:                Sun, 31 Jan 2021   Prob (F-statistic):               0.00
Time:                        23:43:04   Log-Likelihood:                -42075.
No. Observations:                5282   AIC:                         8.418e+04
Df Residuals:                    5265   BIC:                         8.430e+04
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                                           coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------


In [95]:
# Cross-validated RFE: the number of retained features is chosen by
# cross-validated negative MSE, removing one feature per step. `ols` is the
# LinearRegression instance created in an earlier cell; no `cv` is passed, so
# the library's default splitter applies.
rfecv = RFECV(ols, step=1, scoring="neg_mean_squared_error")
rfecv.fit(train_x, train_y)

train_x_rfecv = rfecv.transform(train_x)

# Boolean mask over the columns of train_x: True marks a retained feature.
print(rfecv.support_)

[False  True  True False False False False False False  True  True False
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True False False  True False  True  True]


In [96]:
# Number of features retained by the cross-validated selection.
rfecv.n_features_

33

In [98]:
# Keep only the columns flagged by the RFECV mask. Note this rebinds
# train_x_rfecv from the transformed array of the earlier cell to a DataFrame
# that keeps the column names.
train_x_rfecv = train_x.loc[:, rfecv.support_]
test_x_rfecv = test_x.loc[:, rfecv.support_]

print(train_x_rfecv.shape)
print(test_x_rfecv.shape)

# Always append the intercept column. With the default has_constant="skip" the
# intercept is silently omitted when a frame already holds a constant non-zero
# column (e.g. a dummy that is 1 for every row of one split), leaving the train
# and test design matrices with different widths.
train_x_Crfecv = sm.add_constant(train_x_rfecv, has_constant="add")
test_x_Crfecv = sm.add_constant(test_x_rfecv, has_constant="add")

# Refit OLS on the RFECV-selected features.
reg = sm.OLS(train_y, train_x_Crfecv)
results = reg.fit()

print(results.summary())

# Score the held-out rows.
predict3 = results.predict(test_x_Crfecv)

print("")
print("")

print("OLS MSE:", metrics.mean_squared_error(test_y, predict3))

(5282, 33)
(1761, 33)
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.906
Model:                            OLS   Adj. R-squared:                  0.905
Method:                 Least Squares   F-statistic:                     2971.
Date:                Sun, 31 Jan 2021   Prob (F-statistic):               0.00
Time:                        23:51:12   Log-Likelihood:                -42074.
No. Observations:                5282   AIC:                         8.418e+04
Df Residuals:                    5264   BIC:                         8.430e+04
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                                              coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------