In [122]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw Telco customer-churn CSV (path is relative to the notebook's
# working directory).
DATA_PATH = "../data/WA_Fn-UseC_-Telco-Customer-Churn.csv"
data = pd.read_csv(DATA_PATH)

In [123]:
# Normalise every column label to lower case so later cells can refer to
# columns with one consistent spelling, then preview the first rows.
data = data.rename(columns=str.lower)
data.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [124]:
# Remove the customer ID column before modelling.
# The result is assigned (rather than using inplace=True) and a missing column
# is tolerated, so re-running this cell no longer raises KeyError once the
# column has already been dropped.
data = data.drop(columns=["customerid"], errors="ignore")

In [125]:
# Sanity check: (rows, columns) after dropping the customer ID column.
data.shape

(7043, 20)

In [126]:
# Convert totalcharges to float and discard rows whose value is not a number.
# errors="coerce" turns every unparseable entry (the single-space placeholder
# as well as "", multiple spaces, or other stray text) into NaN instead of
# letting the float conversion raise; those rows are then filtered out.
# Note: rows whose totalcharges is already missing are dropped as well.
# The cell is safe to re-run because to_numeric is a no-op on a float column.
totalcharges_numeric = pd.to_numeric(data["totalcharges"], errors="coerce")
data = data[totalcharges_numeric.notna()].assign(totalcharges=totalcharges_numeric)

In [127]:
# Sanity check: (rows, columns) after removing rows with blank totalcharges.
data.shape

(7032, 20)

In [128]:
# Names of six service-related columns, spelled in lower case to match the
# lower-cased data.columns.
# NOTE(review): not referenced by any other cell visible here — presumably
# used further down; confirm or remove.
service_cols = [
    "onlinesecurity",
    "onlinebackup",
    "deviceprotection",
    "techsupport",
    "streamingtv",
    "streamingmovies",
]

In [129]:
# List the object-dtype columns; these are the ones the label-encoding cell
# below converts to integers.
data.select_dtypes(include=["object"]).columns.to_list()

['gender',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod',
 'churn']

In [130]:
# Replace every object-dtype column with integer codes.
# Each fitted encoder is kept in `label_encoders` (keyed by column name) so the
# codes can be mapped back to the original categories later via
# `label_encoders[col].classes_` or `.inverse_transform(...)`; previously the
# encoder was overwritten on every iteration and the mapping was lost.
# NOTE(review): LabelEncoder numbers categories in sorted order, so
# multi-category columns (e.g. contract, paymentmethod) acquire an arbitrary
# ordering — consider one-hot encoding for models sensitive to that.
# Re-running this cell after encoding finds no object columns, leaves `data`
# unchanged and resets `label_encoders` to empty; re-run from the load cell.
categorical_cols = data.select_dtypes(include=["object"]).columns.to_list()
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le
data.head()

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85,0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.5,0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15,1
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.3,1840.75,0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.7,151.65,1


In [131]:
# Standardise the three continuous features to zero mean and unit variance.
# NOTE(review): the scaler is fitted on every row before the train/test split
# performed in a later cell, so test-set statistics influence the scaling;
# consider fitting on the training rows only.
numerical_cols = ["tenure", "monthlycharges", "totalcharges"]
scaler = StandardScaler()
scaler.fit(data[numerical_cols])
data[numerical_cols] = scaler.transform(data[numerical_cols])
data.head()

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,0,0,1,0,-1.280248,0,1,0,0,2,0,0,0,0,0,1,2,-1.161694,-0.994194,0
1,1,0,0,0,0.064303,1,0,0,2,0,2,0,0,0,1,0,3,-0.260878,-0.17374,0
2,1,0,0,0,-1.239504,1,0,0,2,2,0,0,0,0,0,1,3,-0.363923,-0.959649,1
3,1,0,0,0,0.512486,0,1,0,2,0,2,2,0,0,1,0,0,-0.74785,-0.195248,0
4,0,0,0,0,-1.239504,1,0,1,0,0,0,0,0,0,0,1,2,0.196178,-0.940457,1


In [132]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [137]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA


# Separate the churn target from the predictor columns.
y = data["churn"]
X = data.drop(columns=["churn"])

# Count how many principal components are needed for the cumulative
# explained-variance ratio to reach 90 %.
# NOTE(review): n_components is only printed in the cells visible here; the
# split below still uses the untransformed X.
pca = PCA()
pca.fit(X)
explained_variance = pca.explained_variance_ratio_.cumsum()
n_components = (explained_variance >= 0.90).argmax() + 1
print(n_components)

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

12


In [134]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier


# Benchmark several off-the-shelf classifiers on the same hold-out split and
# collect their test accuracy keyed by class name.
# The estimators that accept a seed for their internal randomness are given
# the same seed as the train/test split (42) so the reported accuracies are
# reproducible from one run to the next.
models = [
    LogisticRegression(),
    RandomForestClassifier(random_state=42),
    SVC(),
    SGDClassifier(random_state=42),
    XGBClassifier(random_state=42),
]
results = {}
for model in models:
    model.fit(X_train, y_train)
    churn_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, churn_pred)
    results[model.__class__.__name__] = accuracy
print(results)

{'LogisticRegression': 0.7931769722814499, 'RandomForestClassifier': 0.7818052594171997, 'SVC': 0.7860696517412935, 'SGDClassifier': 0.7775408670931059, 'XGBClassifier': 0.7647476901208244}


In [135]:
# Fit a standalone logistic-regression baseline and report its accuracy on the
# hold-out set. (A stray mid-cell `data.head()` was removed: it was not the
# last expression, so it displayed nothing and had no effect.)
lin_reg = LogisticRegression()  # a classifier, despite the `lin_reg` name
lin_reg.fit(X_train, y_train)
churn_pred = lin_reg.predict(X_test)
accuracy = accuracy_score(y_test, churn_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7932


In [136]:
# Feature importance from the logistic-regression coefficients (currently disabled; uncomment to plot)
# importance = lin_reg.coef_[0]
# feature_names = X.columns
# feature_importance = pd.Series(importance, index=feature_names).sort_values(
#     ascending=False
# )
# plt.figure(figsize=(10, 6))
# sns.barplot(x=feature_importance.values, y=feature_importance.index)
# plt.title("Feature Importance from Logistic Regression")
# plt.xlabel("Importance")
# plt.ylabel("Feature")
# plt.show()