In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter(action="ignore")
# Lift pandas' display limits so frames are shown without column/row truncation.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# Load the churn CSV from the current working directory into `df`.
df = pd.read_csv(filepath_or_buffer="Telco-Customer-Churn.csv")

## Data Exploration

In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

In [None]:
# Summary statistics; include=None is the pandas default column selection.
df.describe(include=None)

In [None]:
# Per-column dtype and non-null count overview.
df.info(verbose=None)

In [None]:
# Box plots (one subplot per plotted column), written to boxplots.png.
# customerID is an identifier, so it is excluded before plotting.
df_to_plot = df.drop('customerID', axis=1)

df_to_plot.plot.box(subplots=True, patch_artist=True)
plt.savefig('boxplots.png', bbox_inches='tight')

## Correlations

In [None]:
# select columns to plot
df_to_plot = df.drop(columns=['customerID'])

# Correlate numeric columns only: the frame still holds string columns, and
# DataFrame.corr() raises on those under pandas >= 2.0 (older versions
# dropped them silently), so restrict the input explicitly.
numeric_corr = df_to_plot.select_dtypes(include='number').corr()

# create heatmap
plt.figure(figsize = (12, 9))
s = sb.heatmap(numeric_corr, cmap = 'RdBu', vmin = -1, vmax = 1, center = 0)
s.set_yticklabels(s.get_yticklabels(), rotation = 0, fontsize = 12)
s.set_xticklabels(s.get_xticklabels(), rotation = 90, fontsize = 12)
# Widen the y-limits by half a cell at each end.
# NOTE(review): presumably a workaround for the matplotlib release that
# clipped the first/last heatmap rows; on unaffected versions it only adds
# padding -- TODO confirm and remove if no longer needed.
bottom, top = s.get_ylim()
s.set_ylim(bottom + 0.5, top - 0.5)
plt.title("Correlation Heatmap")
plt.savefig('heatmap.png', bbox_inches='tight')
plt.show()

In [None]:
# Pairwise correlations of the numeric columns. The explicit selection is
# needed because df still contains string columns (e.g. customerID), which
# make DataFrame.corr() raise under pandas >= 2.0.
df.select_dtypes(include='number').corr()

In [None]:
## No correlations between the numerical variables

In [None]:
# Histogram of each numeric column, using 10 bins (the pandas default).
df.hist(bins=10)

In [None]:
# Build the feature matrix X and the target y for the scikit-learn model.
#
# errors='ignore' keeps this cell re-runnable: df is overwritten below, so on
# a second execution customerID is already gone and a plain drop would raise
# KeyError.
df = df.drop(columns=['customerID'], errors='ignore')
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN and
# are removed together with any other incomplete rows.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df = df.dropna()
# Target stays as the raw Churn column; every other column is
# one-hot encoded into the feature matrix.
y = df['Churn']
X = pd.get_dummies(df.drop(columns='Churn'))

In [None]:
# Encode the target as integers: 'No' -> 0, any other label -> 1.
# NOTE(review): no later cell in view reads y_dummy; the model below is
# trained on y itself -- confirm whether this encoding is still needed.
y_dummy = [0 if label == 'No' else 1 for label in y]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation.
# random_state pins the shuffle so the split (and every score derived from
# it) is identical on each run; stratify=y keeps the churn class ratio the
# same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic regression on the training split.
# max_iter is raised above scikit-learn's default of 100: the features are
# unscaled, so the solver may stop before converging, and the resulting
# ConvergenceWarning would be hidden by the global warnings filter set in
# the first cell.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

In [None]:
# Predicted class labels for the held-out rows.
y_pred = log_reg.predict(X=X_test)

In [None]:
# Mean accuracy of the fitted classifier on the held-out split.
log_reg.score(X=X_test, y=y_test)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of actual vs. predicted labels on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
import seaborn as sns

In [None]:
# Annotated heatmap of the confusion matrix.
# fmt="d" prints the cell counts as plain integers; seaborn's default
# annotation format (".2g") would render larger counts in scientific
# notation (e.g. 1.4e+03).
sns.heatmap(
    cm,
    cmap="Greens",
    annot=True,
    fmt="d",
    cbar_kws={"orientation": "vertical"},
    xticklabels=[0, 1],
    yticklabels=[0, 1],
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
## Logistic Regression using statsmodels

In [2]:
# Reload the raw CSV so the statsmodels section starts from untouched data.
df = pd.read_csv(filepath_or_buffer='Telco-Customer-Churn.csv')

In [3]:
# Parse TotalCharges as numbers (entries that fail to parse become NaN),
# discard every row containing a missing value, then renumber the index
# from zero.
df = (
    df.assign(TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce'))
    .dropna()
    .reset_index(drop=True)
)

In [4]:
# Features: every column except the identifier and the target.
X = df.drop(columns=['customerID', 'Churn'])
# Target: Churn encoded as 1 for 'Yes' and 0 for 'No'.
churn_codes = {'Yes': 1, 'No': 0}
y = df['Churn'].map(churn_codes)

In [5]:
# Convert categorical variables to dummy variables, dropping the first level
# of each to avoid redundant columns.
# dtype=float keeps the whole design matrix numeric: pandas >= 2.0 emits
# bool dummies by default, and a frame mixing bool and numeric columns
# converts to an object-dtype array, which statsmodels' Logit rejects.
X = pd.get_dummies(X, drop_first=True, dtype=float)

In [7]:
# Keep only columns that take more than one distinct value.
varies = X.nunique() > 1
X = X.loc[:, varies]

# Drop columns that are (almost) perfectly correlated with an earlier column.
# Only the strict upper triangle of the absolute correlation matrix is
# inspected, so each pair is examined once and the later column of the pair
# is the one removed.
corr = X.corr().abs()
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
upper = corr.where(upper_mask)
to_drop = upper.columns[(upper > 0.999).any()].tolist()
X = X.drop(columns=to_drop)

In [8]:
# Prepend the intercept column expected by the statsmodels model below.
X = sm.add_constant(X, prepend=True)

In [9]:
# Fit the logit model by maximum likelihood (y is the response, X the design
# matrix) and print the regression table.
logit_model = sm.Logit(endog=y, exog=X)
result = logit_model.fit()

model_summary = result.summary()
print(model_summary)

Optimization terminated successfully.
         Current function value: 0.414269
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                  Churn   No. Observations:                 7032
Model:                          Logit   Df Residuals:                     7008
Method:                           MLE   Df Model:                           23
Date:                Sat, 25 Oct 2025   Pseudo R-squ.:                  0.2845
Time:                        14:06:42   Log-Likelihood:                -2913.1
converged:                       True   LL-Null:                       -4071.7
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                            coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------
const                                     1.1653      

In [10]:
# Coefficient table ordered from smallest to largest p-value.
fitted_params = result.params
coef_df = pd.DataFrame(
    {
        'Variable': fitted_params.index,
        'Coefficient': fitted_params.to_numpy(),
        'p-value': result.pvalues,
    }
).sort_values(by='p-value')

# Print the ten rows with the smallest p-values
print(coef_df.head(10))

                                                      Variable  Coefficient  \
tenure                                                  tenure    -0.060588   
Contract_Two year                            Contract_Two year    -1.357106   
Contract_One year                            Contract_One year    -0.660795   
TotalCharges                                      TotalCharges     0.000329   
PaperlessBilling_Yes                      PaperlessBilling_Yes     0.342354   
PaymentMethod_Electronic check  PaymentMethod_Electronic check     0.304467   
SeniorCitizen                                    SeniorCitizen     0.216775   
MultipleLines_Yes                            MultipleLines_Yes     0.448395   
InternetService_No                          InternetService_No    -1.786295   
InternetService_Fiber optic        InternetService_Fiber optic     1.747475   

                                     p-value  
tenure                          2.585486e-22  
Contract_Two year               1.46

In [12]:
# Exponentiate the coefficients and their confidence bounds to express them
# as odds ratios.
ci_bounds = result.conf_int()
odds_ratios = np.exp(result.params)
conf = np.exp(ci_bounds)

# Column 0 of the interval frame is the lower bound, column 1 the upper bound.
odds_df = pd.DataFrame(
    {
        'Variable': result.params.index,
        'Odds Ratio': odds_ratios,
        'CI Lower': conf.loc[:, 0],
        'CI Upper': conf.loc[:, 1],
        'p-value': result.pvalues,
    }
).sort_values(by='p-value')

# Print the ten rows with the smallest p-values
print(odds_df.head(10))

                                                      Variable  Odds Ratio  \
tenure                                                  tenure    0.941211   
Contract_Two year                            Contract_Two year    0.257405   
Contract_One year                            Contract_One year    0.516440   
TotalCharges                                      TotalCharges    1.000329   
PaperlessBilling_Yes                      PaperlessBilling_Yes    1.408258   
PaymentMethod_Electronic check  PaymentMethod_Electronic check    1.355902   
SeniorCitizen                                    SeniorCitizen    1.242065   
MultipleLines_Yes                            MultipleLines_Yes    1.565798   
InternetService_No                          InternetService_No    0.167580   
InternetService_Fiber optic        InternetService_Fiber optic    5.740090   

                                CI Lower   CI Upper       p-value  
tenure                          0.929777   0.952786  2.585486e-22  
Contr