In [27]:
import pandas as pd

In [28]:

# Load the raw churn dataset from the relative path datasets/telco_churn.csv
df = pd.read_csv('datasets/telco_churn.csv')

***PRE PROCESSING***

In [None]:
# Summarize the raw frame: column names, dtypes and non-null counts
df.info()

In [None]:
# Preview the first rows of the raw frame
df.head()

In [None]:
# Count the rows that are exact duplicates of an earlier row
print(df.duplicated().sum())

In [None]:
# Keep only the object-dtype (text) columns; the rest of the notebook treats these as categorical
categorical_df = df.select_dtypes(include=['object'])
categorical_df.info()

In [None]:
# Keep every column that is NOT object dtype; the rest of the notebook treats these as numerical
numerical_df = df.select_dtypes(exclude=['object'])
numerical_df.info()

In [34]:
# Name of the label column: it is excluded from imputation and later split off as y
target_column_name='Churn'

In [None]:
# Missing-value count per column, first for the categorical frame and then for the numerical one
for _frame in (categorical_df, numerical_df):
    print(_frame.isna().sum())

In [36]:
# Handle missing values in the categorical and numerical frames.
#
# Step 1: rows whose target is missing cannot be used for supervised learning.
# They are removed from BOTH frames, so that the two frames keep describing
# exactly the same rows and can later be concatenated side by side.
if target_column_name in categorical_df.columns:
    has_target = categorical_df[target_column_name].notna()
elif target_column_name in numerical_df.columns:
    has_target = numerical_df[target_column_name].notna()
else:
    # no target column in either frame: nothing to drop
    has_target = pd.Series(True, index=categorical_df.index)

labelled_rows = has_target.index[has_target]
# .copy() gives independent frames, so the column assignments below never write into a view of df
categorical_df = categorical_df.loc[categorical_df.index.intersection(labelled_rows)].copy()
numerical_df = numerical_df.loc[numerical_df.index.intersection(labelled_rows)].copy()

# Step 2: replace missing values in categorical columns with the most frequent value
# (the target column itself is never imputed)
for column in categorical_df.columns:
    if column == target_column_name:
        continue
    frequencies = categorical_df[column].value_counts()
    if frequencies.empty:
        # the column holds no observed value at all, so there is nothing to impute with
        continue
    categorical_df[column] = categorical_df[column].fillna(frequencies.index[0])

# Step 3: replace missing values in numerical columns with the column mean
# (the target column itself is never imputed)
for column in numerical_df.columns:
    if column == target_column_name:
        continue
    numerical_df[column] = numerical_df[column].fillna(numerical_df[column].mean())

In [None]:
# Re-check the non-null counts of both frames after imputation
categorical_df.info()
numerical_df.info()

***ENCODING AND SCALING***

In [None]:
# Remove the two text columns that are not encoded as features:
#   - customerID: presumably a per-row identifier with no predictive value (confirm against the data)
#   - TotalCharges: sits in the object-dtype frame, so it would be one-hot encoded value by value
#     NOTE(review): it looks numeric by name; consider converting it to a number instead of dropping it
categorical_df = categorical_df.drop(columns=['customerID', 'TotalCharges'])

# Report how many distinct categories each remaining column has
for column, n_categories in categorical_df.nunique().items():
    print(f'{column}: {n_categories} categories')

In [None]:
# Number of columns left in categorical_df, followed by a preview of its first rows
print(categorical_df.shape[1])
categorical_df.head()

In [40]:
# Encode the categorical columns:
#   - columns with exactly 2 categories are label encoded in place
#   - every remaining text column is one-hot encoded
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


# Step 1: label encoding for the binary columns
le = LabelEncoder()
binary_columns = [name for name in categorical_df.columns if categorical_df[name].nunique() == 2]
for column in binary_columns:
    categorical_df[column] = le.fit_transform(categorical_df[column])

# Step 2: one-hot encoding for whatever is still non-numeric, with every column stored as int64
one_hot_encoded = pd.get_dummies(categorical_df)
categorical_df = one_hot_encoded.astype('int64')

In [None]:
# Number of columns after encoding, followed by a preview of the encoded frame
print(categorical_df.shape[1])
categorical_df.head()

In [None]:
# Preview the numerical columns before scaling
numerical_df.head()

In [43]:
def standard_scaling(numerical_df,categorical_df):
    """Standardize the numerical columns and join them with the categorical columns.

    Parameters
    ----------
    numerical_df : pandas.DataFrame
        Numerical columns; every column is passed through ``StandardScaler``.
    categorical_df : pandas.DataFrame
        Already-encoded categorical columns; joined unchanged.

    Returns
    -------
    pandas.DataFrame
        Scaled numerical columns followed by the categorical columns,
        restricted to the row labels present in both inputs.
    """
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    # fit_transform returns a bare array; keep the original row labels so the
    # scaled values stay attached to the rows they were computed from
    scaled_numerical = pd.DataFrame(
        scaler.fit_transform(numerical_df),
        columns=numerical_df.columns,
        index=numerical_df.index,
    )
    # join='inner' keeps only rows present in both frames, so a row dropped from
    # one of them cannot come back as a row full of NaN
    return pd.concat([scaled_numerical, categorical_df], axis=1, join='inner')
    

In [44]:
def minmax_scaling(numerical_df,categorical_df):
    """Min-max scale the numerical columns and join them with the categorical columns.

    Parameters
    ----------
    numerical_df : pandas.DataFrame
        Numerical columns; every column is passed through ``MinMaxScaler``.
    categorical_df : pandas.DataFrame
        Already-encoded categorical columns; joined unchanged.

    Returns
    -------
    pandas.DataFrame
        Scaled numerical columns followed by the categorical columns,
        restricted to the row labels present in both inputs.
    """
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    # fit_transform returns a bare array; keep the original row labels so the
    # scaled values stay attached to the rows they were computed from
    scaled_numerical = pd.DataFrame(
        scaler.fit_transform(numerical_df),
        columns=numerical_df.columns,
        index=numerical_df.index,
    )
    # join='inner' keeps only rows present in both frames, so a row dropped from
    # one of them cannot come back as a row full of NaN
    return pd.concat([scaled_numerical, categorical_df], axis=1, join='inner')

In [None]:
# Min-max scale the numerical columns and join them with the encoded categorical columns
scaled_and_encoded_df = minmax_scaling(numerical_df,categorical_df)
scaled_and_encoded_df.head()

PERFORMANCE METRICS IMPLEMENTATION

In [46]:
from sklearn.metrics import confusion_matrix, roc_auc_score, average_precision_score
def evaluate_model(model_name,y_true, y_pred, y_pred_prob=None,save_scores=True):
    """Print binary-classification metrics for one model and optionally log them.

    Parameters
    ----------
    model_name : str
        Label used in the printed banner and in the ``Model`` column of the log.
    y_true : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels.
    y_pred_prob : array-like, optional
        Predicted scores/probabilities of the positive class. When given,
        AUROC and AUPR are computed as well.
    save_scores : bool, default True
        When true, one row with all scores is appended to ``scores.csv`` in
        the current working directory (AUROC/AUPR cells stay empty when
        ``y_pred_prob`` is None).

    Returns
    -------
    None

    Notes
    -----
    A ratio whose denominator is zero (for example precision when nothing was
    predicted positive) is reported as 0.0 instead of nan/inf.
    """
    import os

    def _ratio(numerator, denominator):
        # an empty denominator means the metric is undefined; report 0.0
        return float(numerator) / float(denominator) if denominator else 0.0

    matrix = confusion_matrix(y_true, y_pred)
    if matrix.size == 1:
        # Only one label occurs in y_true and y_pred, so the matrix collapsed to 1x1.
        # Rebuild it as 2x2, assuming the labels are coded 0 (negative) / 1 (positive).
        matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = matrix.ravel()

    accuracy = _ratio(tp + tn, tp + tn + fp + fn)  # how many are correctly classified
    sensitivity = _ratio(tp, tp + fn)  # how many positive cases are correctly classified
    specificity = _ratio(tn, tn + fp)  # how many negative cases are correctly classified
    precision = _ratio(tp, tp + fp)  # how many of the positive predictions are correct
    f1 = _ratio(2 * precision * sensitivity, precision + sensitivity)

    print(f"========================================================Model: {model_name}===========================================================")

    print("Confusion Matrix:")
    print(f"TN: {tn}, FP: {fp}")
    print(f"FN: {fn}, TP: {tp}")
    print()
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Sensitivity (Recall): {sensitivity:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # threshold-independent scores need the predicted probabilities
    auroc = None
    aupr = None
    if y_pred_prob is not None:
        auroc = roc_auc_score(y_true, y_pred_prob)
        print(f"AUROC: {auroc:.4f}")
        aupr = average_precision_score(y_true, y_pred_prob)
        print(f"AUPR: {aupr:.4f}")

    if not save_scores:
        return

    scores = {
        'Model': model_name,
        'Accuracy': accuracy,
        'Sensitivity': sensitivity,
        'Specificity': specificity,
        'Precision': precision,
        'F1': f1,
        'AUROC': auroc,
        'AUPR': aupr
    }

    # Append one row to the log; the header is written only when the file is
    # created (or is still empty), so the existing rows are never re-read or rewritten.
    scores_path = 'scores.csv'
    write_header = not os.path.exists(scores_path) or os.path.getsize(scores_path) == 0
    pd.DataFrame([scores]).to_csv(scores_path, mode='a', header=write_header, index=False)

In [47]:
# About ROC and PR curves:
# The model outputs the probability of the positive class for each sample.
# A threshold (normally 0.5) converts these probabilities into class labels.
# Changing the threshold changes the confusion matrix, and with it accuracy, precision, recall, etc.
# AUROC and AUPR, however, do not depend on any single threshold,
# because the ROC and PR curves are traced by sweeping the threshold from 0 to 1.
# The ROC curve plots TPR = TP / (TP + FN) against FPR = FP / (FP + TN) for every threshold value.
# The PR curve plots precision = TP / (TP + FP) against recall = TP / (TP + FN) for every threshold value.

MODEL IMPLEMENTATION

In [48]:
import numpy as np

# Sigmoid function
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    The argument is clamped to [-500, 500] before exponentiation: float64
    ``exp`` overflows beyond roughly 709, and at |z| = 500 the result already
    equals 0 or 1 to within floating-point precision.
    """
    return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

# Logistic regression model
class MyLogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    learning_rate : float
        Step size of each gradient-descent update.
    iterations : int
        Number of gradient-descent updates performed by ``fit``.
    regularization : {None, 'l1', 'l2'}
        Penalty applied to the weights (never to the bias).
    strength : float
        Regularization strength; its gradient contribution is divided by the
        number of training samples.
    """

    def __init__(self, learning_rate=0.01, iterations=1000, regularization=None, strength=0.01):
        self.learning_rate = learning_rate
        self.iterations = iterations
        self.regularization = regularization
        self.regularization_strength = strength

    def fit(self, X, y):
        """Learn ``theta`` (weights) and ``bias`` from features X and 0/1 labels y."""
        # Convert once, outside the loop: DataFrame/Series inputs would otherwise
        # be turned into arrays again on every iteration.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        m, n_features = X.shape  # number of samples, number of features

        self.theta = np.zeros(n_features)  # one weight per feature, starting at zero
        self.bias = 0.0

        for _ in range(self.iterations):
            # a = sigmoid(X.theta + b); the error (a - y) drives both gradients
            error = sigmoid(np.dot(X, self.theta) + self.bias) - y

            dw = (1 / m) * np.dot(X.T, error)  # one entry per feature
            db = (1 / m) * np.sum(error)

            if self.regularization == 'l1':
                # derivative of the L1 penalty = sign(theta) * strength / m
                dw += (self.regularization_strength / m) * np.sign(self.theta)
            elif self.regularization == 'l2':
                # derivative of the L2 penalty = theta * strength / m
                dw += (self.regularization_strength / m) * self.theta

            self.theta -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X, threshold=0.5):
        """Return a list of 0/1 labels: 1 where the predicted probability exceeds ``threshold``."""
        return (self.predict_prob(X) > threshold).astype(int).tolist()

    # return prediction probabilities
    def predict_prob(self, X):
        """Return the predicted probability of the positive class for every row of X."""
        linear_model = np.dot(np.asarray(X, dtype=float), self.theta) + self.bias
        return sigmoid(linear_model)

In [49]:
from sklearn.utils import resample
def bagging_models(X_train, y_train, n_models=9,regularization=None,strength=0.01,saving_scores=True):
    """Fit ``n_models`` logistic regressions on bootstrap samples and return their predictions.

    Each model is trained on a bootstrap resample of ``(X_train, y_train)``,
    then used to predict the full ``X_train``; those predictions are
    evaluated against ``y_train`` under the name ``LR_Model_<i>``.

    Note that every call trains brand-new models on the data it receives and
    the fitted models are not returned.

    Returns
    -------
    numpy.ndarray
        Hard 0/1 predictions with one row per sample and one column per model.
    """
    def _train_and_predict(model_index):
        # bootstrap sample of the data this function was given
        X_boot, y_boot = resample(X_train, y_train)

        learner = MyLogisticRegression(regularization=regularization, strength=strength)
        learner.fit(X_boot, y_boot)

        labels = learner.predict(X_train)  # one label per sample
        probabilities = learner.predict_prob(X_train)
        evaluate_model(f'LR_Model_{model_index}',y_train, labels, probabilities,save_scores=saving_scores)
        return labels

    # rows = models, columns = samples ...
    per_model_predictions = [_train_and_predict(model_index) for model_index in range(n_models)]
    # ... transposed so that rows = samples, columns = models
    return np.array(per_model_predictions).T


In [50]:
# returns the meta learner model
def stack_ensembling(X_train, y_train, n_models=9, regularization=None, strength=0.01, saving_scores=True):
    """Train a logistic-regression meta learner on the base models' predictions.

    The predictions returned by ``bagging_models`` (one column per base model)
    serve as the feature matrix of the meta learner; ``y_train`` is its target.

    Returns
    -------
    MyLogisticRegression
        The fitted meta learner.
    """
    base_predictions = bagging_models(
        X_train,
        y_train,
        n_models=n_models,
        regularization=regularization,
        strength=strength,
        saving_scores=saving_scores,
    )
    print("done")

    # the base-model predictions are the meta learner's features
    stacker = MyLogisticRegression(regularization=regularization, strength=strength)
    stacker.fit(base_predictions, y_train)
    return stacker

In [51]:
def _fit_bootstrap_models(X_train, y_train, n_models, regularization, strength):
    """Fit ``n_models`` logistic regressions, each on a bootstrap resample of the training data."""
    fitted = []
    for _ in range(n_models):
        X_boot, y_boot = resample(X_train, y_train)
        model = MyLogisticRegression(regularization=regularization, strength=strength)
        model.fit(X_boot, y_boot)
        fitted.append(model)
    return fitted


def _stack_predictions(models, X):
    """Return the models' hard 0/1 predictions for X, one row per sample and one column per model."""
    return np.array([model.predict(X) for model in models]).T


def predict_with_stack_ensembling(scaled_and_encoded_df, target_column_name, n_models=9, regularization=None, strength=0.01):
    """Train a stacked ensemble on an 80/20 split and predict the held-out 20%.

    The base models are fitted on the training split only, and the SAME fitted
    base models produce the meta features for both splits. This function does
    not call ``bagging_models`` for the test split, because that helper refits
    new models on whatever data it receives, which would train on test labels.

    Each base model is also evaluated on the test split under the name
    ``LR_Model_<i>`` and its scores are saved.

    Parameters
    ----------
    scaled_and_encoded_df : pandas.DataFrame
        Feature columns plus the target column.
    target_column_name : str
        Name of the target column inside ``scaled_and_encoded_df``.
    n_models : int
        Number of bootstrap base models.
    regularization, strength
        Passed to every ``MyLogisticRegression`` (base models and meta learner).

    Returns
    -------
    tuple
        ``(y_test, y_pred, y_pred_prob)``: true test labels, the meta
        learner's predicted labels and its predicted probabilities.
    """
    from sklearn.model_selection import train_test_split
    X = scaled_and_encoded_df.drop(columns=[target_column_name])
    y = scaled_and_encoded_df[target_column_name]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # base models: fitted on the training split only
    base_models = _fit_bootstrap_models(X_train, y_train, n_models, regularization, strength)

    # meta learner: its features are the base models' predictions on the training split
    meta_learner = MyLogisticRegression(regularization=regularization, strength=strength)
    meta_learner.fit(_stack_predictions(base_models, X_train), y_train)

    # report every base model on the held-out split
    for i, model in enumerate(base_models):
        evaluate_model(f'LR_Model_{i}', y_test, model.predict(X_test), model.predict_prob(X_test), save_scores=True)

    # return true value, the predicted values, and the probabilities
    X_meta_test = _stack_predictions(base_models, X_test)
    y_pred = meta_learner.predict(X_meta_test)
    y_pred_prob = meta_learner.predict_prob(X_meta_test)
    return y_test, y_pred, y_pred_prob


In [None]:
# Train the stacked ensemble and collect its predictions for the held-out split
y_test, y_pred, y_pred_prob = predict_with_stack_ensembling(
    scaled_and_encoded_df,
    target_column_name,
    n_models=9,
)

# Print the ensemble's metrics and save them alongside the base models' scores
evaluate_model('StackEnsemble', y_test, y_pred, y_pred_prob, save_scores=True)