In [1832]:
# pip install scikit-learn
# pip install pandas

In [1833]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG. Both the train/test shuffle (train_test_split is
# called without random_state) and the np.random.randn weight initialisation
# in MyLogisticRegression draw from this global state.
np.random.seed(0)

In [1834]:
# dataset 1: https://www.kaggle.com/datasets/blastchar/telco-customer-churn/
# NOTE: the path is relative, so the notebook must be run from the directory
# that contains the `datasets/` folder.
csv_path = "datasets/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)
# preview the first rows of the raw frame
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [1835]:
# column dtypes and non-null counts of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [1836]:
# drop customerID: it is a unique customer id, not a feature.
# Rebinding `df` with errors='ignore' (instead of inplace=True) keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises a KeyError.
df = df.drop(columns=['customerID'], errors='ignore')

In [1837]:
# https://community.ibm.com/community/user/businessanalytics/blogs/steven-macko/2019/07/11/telco-customer-churn-1113
# customerID -> unique customer id
# gender -> [Male, Female]
# SeniorCitizen -> [0, 1]
# Partner -> [Yes, No]
# Dependents -> [Yes, No]
# tenure -> Number of months the customer has stayed with the company
# PhoneService -> [Yes, No]
# MultipleLines -> [No phone service, No, Yes]
# InternetService -> [DSL, Fiber optic, No]
# OnlineSecurity -> [No, Yes, No internet service]
# OnlineBackup -> [No, Yes, No internet service]
# DeviceProtection -> [No, Yes, No internet service]
# TechSupport -> [No, Yes, No internet service]
# StreamingTV -> [No, Yes, No internet service]
# StreamingMovies -> [No, Yes, No internet service]
# Contract -> [Month-to-month, One year, Two year]    
# PaperlessBilling -> [Yes, No]
# PaymentMethod -> [Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic)]
# MonthlyCharges -> Monthly charge
# TotalCharges -> Total charge
# Churn -> [Yes, No]

# print the unique values of every categorical column: gender, senior citizen,
# partner, dependents, phone service, multiple lines, internet service,
# online security, online backup, device protection, tech support,
# streaming tv, streaming movies, contract, paperless billing, payment method, churn
columns = df.columns
# exclude the non-categorical columns (tenure, monthly charges, total charges);
# customerID was already removed from df in the previous cell.
# NOTE: `columns` is reused by the preprocessing cell further down.
columns = columns.drop(['tenure', 'MonthlyCharges', 'TotalCharges'])
for column in columns:
    print(column, df[column].unique())


gender ['Female' 'Male']
SeniorCitizen [0 1]
Partner ['Yes' 'No']
Dependents ['No' 'Yes']
PhoneService ['No' 'Yes']
MultipleLines ['No phone service' 'No' 'Yes']
InternetService ['DSL' 'Fiber optic' 'No']
OnlineSecurity ['No' 'Yes' 'No internet service']
OnlineBackup ['Yes' 'No' 'No internet service']
DeviceProtection ['No' 'Yes' 'No internet service']
TechSupport ['No' 'Yes' 'No internet service']
StreamingTV ['No' 'Yes' 'No internet service']
StreamingMovies ['No' 'Yes' 'No internet service']
Contract ['Month-to-month' 'One year' 'Two year']
PaperlessBilling ['Yes' 'No']
PaymentMethod ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
Churn ['No' 'Yes']


In [1838]:
# preprocess data: label-encode every string-typed column listed in `columns`
object_columns = [name for name in columns if df[name].dtype == 'object']
for name in object_columns:
    # converting to 'category' and taking cat.codes maps each distinct value
    # of the column to an integer code
    df[name] = df[name].astype('category').cat.codes

In [1839]:
# preview the frame after label encoding
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85,0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.5,0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15,1
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.3,1840.75,0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.7,151.65,1


In [1840]:
# re-check dtypes: the label-encoded columns should now hold integer codes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   int8   
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   int8   
 3   Dependents        7043 non-null   int8   
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   int8   
 6   MultipleLines     7043 non-null   int8   
 7   InternetService   7043 non-null   int8   
 8   OnlineSecurity    7043 non-null   int8   
 9   OnlineBackup      7043 non-null   int8   
 10  DeviceProtection  7043 non-null   int8   
 11  TechSupport       7043 non-null   int8   
 12  StreamingTV       7043 non-null   int8   
 13  StreamingMovies   7043 non-null   int8   
 14  Contract          7043 non-null   int8   
 15  PaperlessBilling  7043 non-null   int8   
 16  PaymentMethod     7043 non-null   int8   


In [1841]:
# convert total charges to float
# errors='coerce' turns every value that cannot be parsed as a number into NaN
# instead of raising; rows containing NaN are removed by the dropna() cell below.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   int8   
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   int8   
 3   Dependents        7043 non-null   int8   
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   int8   
 6   MultipleLines     7043 non-null   int8   
 7   InternetService   7043 non-null   int8   
 8   OnlineSecurity    7043 non-null   int8   
 9   OnlineBackup      7043 non-null   int8   
 10  DeviceProtection  7043 non-null   int8   
 11  TechSupport       7043 non-null   int8   
 12  StreamingTV       7043 non-null   int8   
 13  StreamingMovies   7043 non-null   int8   
 14  Contract          7043 non-null   int8   
 15  PaperlessBilling  7043 non-null   int8   
 16  PaymentMethod     7043 non-null   int8   


In [1842]:
# save as csv
# df.to_csv('datasets/telco_customer_churn_preprocessed.csv', index=False)

In [1843]:
# discard every row that contains at least one missing value
df = df.dropna(axis=0, how='any')

In [1844]:
# split the data into 80% training and 20% testing using sklearn
from sklearn.model_selection import train_test_split

# churn is the target; every remaining column is a feature
X = df.drop(['Churn'], axis=1).values
y = df['Churn'].values

# Pin random_state so that re-running this cell on its own reproduces the same
# split instead of depending on how far the global NumPy RNG has advanced.
# NOTE(review): the two classes are imbalanced; consider adding stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [1845]:
# standardise the feature columns
from sklearn.preprocessing import StandardScaler

# learn the scaling statistics from the training split only ...
sc = StandardScaler().fit(X_train)

# ... then apply that same transformation to both splits
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)


In [1846]:
# sanity check: (rows, columns) of the training and testing splits
print(X_train.shape)
print(X_test.shape)

(5625, 19)
(1407, 19)


In [1847]:
# count the number of NaN in X_train (expected to be 0, since rows with
# missing values were dropped before the split)
np.isnan(X_train).sum()

0

In [1848]:
# replace NaN with the column mean (disabled: rows with NaN are dropped before the split instead)
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# # fit to the training data
# X_train = imputer.fit_transform(X_train)
# # transform the testing data
# X_test = imputer.transform(X_test)

In [1849]:
# use sklearn logistic regression
from sklearn.linear_model import LogisticRegression

# create the model with sklearn's default hyper-parameters
model = LogisticRegression()

# train the model on the scaled training split
model.fit(X_train, y_train)


In [1850]:
# evaluate the model
from sklearn.metrics import classification_report, confusion_matrix

# predict the class labels of the test split (reported in the next cells)
predictions = model.predict(X_test)

In [1851]:
# accuracy on the test split
print(f"accuracy: {model.score(X_test, y_test)}")
# per-class precision, recall, f1-score and support
print(classification_report(y_test, predictions))


accuracy: 0.8052594171997157
              precision    recall  f1-score   support

           0       0.85      0.90      0.87      1038
           1       0.66      0.54      0.59       369

    accuracy                           0.81      1407
   macro avg       0.75      0.72      0.73      1407
weighted avg       0.80      0.81      0.80      1407



In [1852]:
# confusion matrix (sklearn convention: rows = true class, columns = predicted class)
print(confusion_matrix(y_test, predictions))

[[934 104]
 [170 199]]


In [1853]:
# create custom metrics functions
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or 0.0 if the denominator is 0."""
    return float(numerator) / float(denominator) if denominator else 0.0


def accuracy(y_true, y_pred):
    """Fraction of samples whose predicted label equals the true label.

    Both arguments are array-likes of the same length. Returns 0.0 for empty
    input instead of NaN.
    """
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return _safe_ratio(np.sum(y_true == y_pred), y_true.size)


def precision(y_true, y_pred):
    """Share of predicted positives that are truly positive.

    Labels must be encoded as 0 (negative) / 1 (positive), because true
    positives are counted as ``sum(y_true * y_pred)``. Returns 0.0 when
    nothing was predicted positive, instead of dividing by zero.
    """
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return _safe_ratio(np.sum(y_true * y_pred), np.sum(y_pred))


def recall(y_true, y_pred):
    """Share of true positives that were predicted positive.

    Labels must be encoded as 0 / 1 (see ``precision``). Returns 0.0 when
    ``y_true`` contains no positive sample, instead of dividing by zero.
    """
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return _safe_ratio(np.sum(y_true * y_pred), np.sum(y_true))


def f1(y_true, y_pred):
    """Harmonic mean of precision and recall; 0.0 when both are 0."""
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return _safe_ratio(2 * p * r, p + r)

In [1854]:
# custom logistic regression class
class MyLogisticRegression:
    """Binary logistic regression trained with batch gradient descent.

    ``fit`` first ranks the columns of ``X`` by information gain with respect
    to ``y`` and keeps only the ``n_features`` best ones; ``predict`` uses the
    same columns again.

    Parameters
    ----------
    n_features : int
        Number of columns to keep after the information-gain ranking.
    lr : float, default 0.1
        Gradient-descent step size.
    n_iters : int, default 1000
        Maximum number of gradient-descent iterations.
    threshold : float, default 0
        Training stops early once the mean squared error on the training set
        drops below this value.
    show_loss : bool, default False
        Print the training MSE after every iteration.
    """

    def __init__(self, n_features,
                        lr=0.1,
                        n_iters=1000,
                        threshold=0,
                        show_loss=False):
        self.n_features = n_features
        self.lr = lr
        self.n_iters = n_iters
        # one weight per selected feature plus a trailing bias weight
        self.weights = np.random.randn(n_features+1)
        # Early terminate Gradient Descent if error in the training set becomes less than threshold
        self.threshold = threshold
        self.show_loss = show_loss
        # Filled in by fit(). Initialised to None so that predict() on an
        # untrained model raises its intended error instead of AttributeError.
        self.selected_features = None

    def _entropy(self, y):
        """Shannon entropy (in bits) of the label distribution in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / counts.sum()
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return entropy

    def _information_gain(self, X, y, feature):
        """Information gain of ``y`` when splitting on column ``feature`` of ``X``.

        Every distinct value of the column is treated as its own category.
        NOTE(review): a column with many distinct values (e.g. a continuous
        one) therefore splits the data into tiny groups and tends to receive a
        high gain; consider binning such columns before ranking.
        """
        original_entropy = self._entropy(y)

        # Get the values and counts for the feature
        column = X[:, feature]
        values, counts = np.unique(column, return_counts=True)
        total = counts.sum()

        # Calculate the remainder: entropy of each group weighted by its size
        remainder = 0
        for value, count in zip(values, counts):
            remainder += count / total * self._entropy(y[column == value])

        # Calculate the information gain
        info_gain = original_entropy - remainder
        return info_gain

    def _sigmoid(self, x):
        """Logistic function 1 / (1 + exp(-x)).

        The input is clipped to [-500, 500] so that ``exp`` cannot overflow;
        inside that range the result is identical to the unclipped formula.
        """
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def _cost(self, X, y):
        """Mean binary cross-entropy of the current weights, as a scalar.

        ``X`` must already contain the trailing bias column. Probabilities are
        clipped away from 0 and 1 so that ``log`` never receives 0.
        """
        y_pred = self._sigmoid(X @ self.weights)
        eps = np.finfo(float).eps
        y_pred = np.clip(y_pred, eps, 1 - eps)
        cost = -np.mean(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))
        return cost

    def _mse(self, X, y_true):
        """Mean squared error between predicted probabilities and ``y_true``."""
        y_pred = self._sigmoid(X @ self.weights)
        return np.mean((y_true - y_pred) ** 2)

    def _gradient(self, X, y):
        """Gradient of the mean cross-entropy loss with respect to the weights."""
        y_pred = self._sigmoid(X @ self.weights)
        gradient = X.T @ (y_pred - y) / len(y)
        return gradient

    def fit(self, X, y):
        """Select the best ``n_features`` columns and train the weights.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_columns)
        y : array-like of shape (n_samples,), labels encoded as 0 / 1

        Raises
        ------
        ValueError
            If ``X`` and ``y`` differ in length, ``X`` is not 2-dimensional,
            or ``n_features`` exceeds the number of columns of ``X``.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # check shape
        if X.shape[0] != y.shape[0]:
            raise ValueError("shape of X and y do not match")

        # check shape len
        if len(X.shape) != 2:
            raise ValueError("X must be 2 dimensional")

        # The weight vector was sized for n_features in __init__, so asking for
        # more columns than X has can never work; fail with a clear message
        # instead of a matmul shape error further down.
        if self.n_features > X.shape[1]:
            raise ValueError(
                f"n_features={self.n_features} exceeds the {X.shape[1]} columns of X"
            )

        print(X.shape, y.shape)

        # Calculate information gain for each feature
        info_gains = [self._information_gain(X, y, feature) for feature in range(X.shape[1])]

        # Get the indices of the features sorted by information gain (best first)
        indices = np.argsort(info_gains)[::-1]

        # Select the top n features
        self.selected_features = indices[:self.n_features]

        # create a new array with only the selected features
        X = X[:, self.selected_features]

        print(X.shape, y.shape)

        # Add column for bias
        X = np.concatenate([X, np.ones((X.shape[0], 1))], axis=1)

        # apply gradient descent
        for i in range(self.n_iters):
            self.weights -= self.lr * self._gradient(X, y)
            mse = self._mse(X, y)
            if self.show_loss:
                print(f"epoch {i+1}, loss: {mse}")
            # early terminate if mse is less than threshold
            if mse < self.threshold:
                break


    def predict(self, X):
        """Predict 0 / 1 labels for the rows of ``X``.

        ``X`` must have the same column layout as the array passed to ``fit``.

        Raises
        ------
        Exception
            If the model has not been fitted yet.
        """
        if self.selected_features is None:
            raise Exception("model must be trained before prediction")

        # select the features
        X = np.asarray(X)[:, self.selected_features]

        # Add column for bias
        X = np.concatenate([X, np.ones((X.shape[0], 1))], axis=1)

        # predict
        y_pred = self._sigmoid(X @ self.weights)

        # convert probabilities to 0 or 1
        y_pred = np.round(y_pred).astype(int)
        return y_pred
        

In [1855]:
# create the model
# model = MyLogisticRegression(n_features=X_train.shape[1])
# keep only the 13 columns with the highest information gain
# NOTE: this rebinds `model`, replacing the sklearn estimator fitted earlier
model = MyLogisticRegression(n_features=13, show_loss=False, threshold=0)

# train the model (fit prints the array shapes before and after feature selection)
model.fit(X_train, y_train)


(5625, 19) (5625,)


(5625, 13) (5625,)


In [1856]:
# evaluate the custom model on the test split with the same sklearn reports
# that were used for the sklearn model
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.90      0.87      1038
           1       0.65      0.54      0.59       369

    accuracy                           0.80      1407
   macro avg       0.75      0.72      0.73      1407
weighted avg       0.79      0.80      0.80      1407

[[930 108]
 [170 199]]


In [1857]:
def report(y_true, y_pred):
    """Print accuracy, precision, recall and f1 of ``y_pred`` against ``y_true``.

    One metric is printed per line as ``<name>: <value>``, with the value
    formatted to four decimal places.
    """
    scorers = (
        ("accuracy", accuracy),
        ("precision", precision),
        ("recall", recall),
        ("f1", f1),
    )
    for label, scorer in scorers:
        value = scorer(y_true, y_pred)
        print(f"{label}: {value:.4f}")

In [1858]:
# custom metrics for the custom model on the test split
report(y_test, y_pred)

accuracy: 0.8024
precision: 0.6482
recall: 0.5393
f1: 0.5888


In [1859]:
# same metrics on the training split, for comparison with the test scores
# NOTE(review): this overwrites the test-set `y_pred`; re-running the cell
# above afterwards would pair y_test with predictions for X_train.
y_pred = model.predict(X_train)
report(y_train, y_pred)

accuracy: 0.7984
precision: 0.6478
recall: 0.5347
f1: 0.5858
