In [26]:
import pandas as pd 

***PRE PROCESSING***

In [27]:
def dataset_1_preprocessing():
    """Load and clean the Telco churn dataset.

    Reads ``datasets/telco_churn.csv``, drops the ``customerID`` column,
    converts ``TotalCharges`` to numeric (values that cannot be parsed become
    NaN), removes rows without a ``Churn`` value and exact duplicate rows,
    resets the index, and finally imputes the remaining gaps: most frequent
    value for object columns, column mean for every other column.

    Returns:
        tuple: ``(categorical_df, numerical_df, target_column_name)``. Both
        frames come from the same cleaned frame and share its 0..n-1 index.
    """
    target_column_name = 'Churn'

    raw = pd.read_csv('datasets/telco_churn.csv').drop(columns=['customerID'])
    # Convert TotalCharges to float; unparseable entries turn into NaN
    raw['TotalCharges'] = pd.to_numeric(raw['TotalCharges'], errors='coerce')

    cleaned = (
        raw.dropna(subset=[target_column_name])
        .drop_duplicates()
        .reset_index(drop=True)
    )

    categorical_df = cleaned.select_dtypes(include=['object'])
    numerical_df = cleaned.select_dtypes(exclude=['object'])

    # Object columns: fill gaps with the most frequent value of the column
    for name in list(categorical_df.columns):
        most_frequent = categorical_df[name].value_counts().index[0]
        categorical_df[name] = categorical_df[name].fillna(most_frequent)

    # All other columns: fill gaps with the column mean
    for name in list(numerical_df.columns):
        column_mean = numerical_df[name].mean()
        numerical_df[name] = numerical_df[name].fillna(column_mean)

    return categorical_df, numerical_df, target_column_name

In [28]:
def dataset_2_preprocessing():
    """Load and clean the Adult dataset (train and test files combined).

    Reads ``datasets/adult/adult.data`` and ``datasets/adult/adult.test``
    (skipping the first line of the latter), stacks them, and then:

    * turns the ``' ?'`` placeholder into NaN,
    * rewrites the labels ``' <=50K.'`` / ``' >50K.'`` to ``' <=50K'`` / ``' >50K'``,
    * drops duplicate rows and rows whose column 14 is missing,
    * renames the positional columns 0..14 to ``'0_column'`` .. ``'14_column'``,
    * imputes remaining gaps (most frequent value for object columns, mean
      for every other column).

    Returns:
        tuple: ``(categorical_df, numerical_df, target_column_name)`` with
        ``target_column_name == '14_column'``.
    """
    import numpy as np

    train_part = pd.read_csv('datasets/adult/adult.data', header=None)
    test_part = pd.read_csv('datasets/adult/adult.test', header=None, skiprows=1)

    combined = pd.concat([train_part, test_part], ignore_index=True)

    # The values keep the leading space that follows each comma in the raw
    # files, hence the leading space in every literal below.
    combined = (
        combined.replace(' ?', np.nan)
        .replace(' <=50K.', ' <=50K')
        .replace(' >50K.', ' >50K')
        .drop_duplicates()
        .dropna(subset=[14])
        .reset_index(drop=True)
        .rename(columns={position: f'{position}_column' for position in range(15)})
    )

    target_column_name = '14_column'

    categorical_df = combined.select_dtypes(include=['object'])
    numerical_df = combined.select_dtypes(exclude=['object'])

    # Object columns: fill gaps with the most frequent value of the column
    for name in list(categorical_df.columns):
        most_frequent = categorical_df[name].value_counts().index[0]
        categorical_df[name] = categorical_df[name].fillna(most_frequent)

    # All other columns: fill gaps with the column mean
    for name in list(numerical_df.columns):
        column_mean = numerical_df[name].mean()
        numerical_df[name] = numerical_df[name].fillna(column_mean)

    return categorical_df, numerical_df, target_column_name


    
    

In [29]:
def dataset_3_preprocessing(small_dataset=True, n_negative_samples=20000, random_state=42):
    """Load and clean the credit-card dataset from ``datasets/creditcard.csv``.

    Args:
        small_dataset: When true, keep every row with ``Class == 1`` plus a
            random sample of the ``Class == 0`` rows, then shuffle the result.
        n_negative_samples: Number of ``Class == 0`` rows to sample when
            ``small_dataset`` is true. Capped at the number of such rows
            available, so a file with fewer negatives no longer raises.
        random_state: Seed used for both the negative sampling and the shuffle.

    Returns:
        tuple: ``(categorical_df, numerical_df, target_column_name)`` with
        ``target_column_name == 'Class'``. Both frames share a 0..n-1 index.
    """
    target_column_name = 'Class'
    df = pd.read_csv('datasets/creditcard.csv')

    if small_dataset:
        positive_data = df[df[target_column_name] == 1]
        negative_pool = df[df[target_column_name] == 0]
        # sample() without replacement raises if n exceeds the population size
        n_negatives = min(n_negative_samples, len(negative_pool))
        negative_data = negative_pool.sample(n=n_negatives, random_state=random_state)
        df = pd.concat([positive_data, negative_data], ignore_index=True)
        df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Duplicates and every row holding a missing value are removed, so no
    # imputation is needed afterwards: the frames below contain no NaN.
    df = df.drop_duplicates().dropna().reset_index(drop=True)

    # separate categorical and numerical columns
    categorical_df = df.select_dtypes(include=['object'])
    numerical_df = df.select_dtypes(exclude=['object'])

    return categorical_df, numerical_df, target_column_name

In [None]:
# Choose which dataset feeds the rest of the notebook. To switch, call
# dataset_1_preprocessing() (Telco churn) or dataset_2_preprocessing() (Adult)
# here instead.
categorical_df, numerical_df, target_column_name = dataset_3_preprocessing()

# Summarise both halves of the split
for _frame in (categorical_df, numerical_df):
    _frame.info()


***ENCODING AND SCALING***

In [31]:
# Number of distinct categories in each categorical column
for _name, _n_categories in categorical_df.nunique().items():
    print(f'{_name}: {_n_categories} categories')

In [None]:
# Number of categorical columns, followed by a preview of the first rows
print(categorical_df.shape[1])
categorical_df.head()

In [33]:
# Encode the categorical columns: a column with exactly two categories is
# label-encoded, every other object column is one-hot encoded.
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


# Step 1: label-encode the two-category columns
le = LabelEncoder()
_binary_columns = [name for name in categorical_df.columns if categorical_df[name].nunique() == 2]
for _name in _binary_columns:
    categorical_df[_name] = le.fit_transform(categorical_df[_name])

# Step 2: one-hot encode whatever is still non-numeric, then store everything as int64
if categorical_df.shape[1] > 0:
    categorical_df = pd.get_dummies(categorical_df).astype('int64')

In [None]:
# Column count and schema of the categorical frame after encoding
print(categorical_df.shape[1])
categorical_df.info()

In [None]:
# Preview the first rows of the numerical columns before they are scaled
numerical_df.head()

In [36]:
def standard_scaling(numerical_df,categorical_df):
    """Standardise the numerical columns and join them with the categorical ones.

    Args:
        numerical_df: Frame whose columns are all passed through
            ``StandardScaler`` (this includes a numeric target column if one
            is present in the frame).
        categorical_df: Frame appended column-wise, unchanged.

    Returns:
        DataFrame: scaled numerical columns followed by the categorical columns.
    """
    # Nothing to scale: just join the (column-less) frame with the categorical part
    if numerical_df.shape[1] == 0:
        return pd.concat([numerical_df, categorical_df], axis=1)

    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    # Keep the original index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign rows and create NaNs for any other index.
    scaled_numerical_df = pd.DataFrame(
        scaler.fit_transform(numerical_df),
        columns=numerical_df.columns,
        index=numerical_df.index,
    )
    # merge them back together
    return pd.concat([scaled_numerical_df, categorical_df], axis=1)
    

In [37]:
def minmax_scaling(numerical_df,categorical_df):
    """Min-max scale the numerical columns and join them with the categorical ones.

    Args:
        numerical_df: Frame whose columns are all passed through
            ``MinMaxScaler``.
        categorical_df: Frame appended column-wise, unchanged.

    Returns:
        DataFrame: scaled numerical columns followed by the categorical columns.
    """
    # Nothing to scale: just join the (column-less) frame with the categorical part
    if numerical_df.shape[1] == 0:
        return pd.concat([numerical_df, categorical_df], axis=1)

    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    # Keep the original index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign rows and create NaNs for any other index.
    scaled_numerical_df = pd.DataFrame(
        scaler.fit_transform(numerical_df),
        columns=numerical_df.columns,
        index=numerical_df.index,
    )
    # merge them back together
    return pd.concat([scaled_numerical_df, categorical_df], axis=1)

In [None]:
# Min-max scale the numerical columns and join them with the encoded categorical columns
scaled_and_encoded_df = minmax_scaling(numerical_df,categorical_df)
scaled_and_encoded_df.info()

**FEATURE_SELECTION**

In [39]:
def correlation_feature_selection(scaled_and_encoded_df, target_column_name, top_n_features=20):
    """Keep the features whose correlation with the target is strongest.

    Args:
        scaled_and_encoded_df: Frame holding the features and the target column.
        target_column_name: Name of the target column.
        top_n_features: How many features to keep, ranked by the absolute
            value of their correlation with the target.

    Returns:
        DataFrame: the selected feature columns, strongest first, followed by
        the target column.
    """
    target = scaled_and_encoded_df[target_column_name]
    features = scaled_and_encoded_df.drop(columns=[target_column_name])

    # Rank by |correlation| so strongly negative relations count as much as positive ones
    strength = features.corrwith(target).abs()
    ranked = strength.sort_values(ascending=False)

    keep = list(ranked.head(top_n_features).index)
    keep.append(target_column_name)
    return scaled_and_encoded_df[keep]


In [40]:
def info_gain_feature_selection(scaled_and_encoded_df,target_column_name,top_n_features=20,random_state=42):
    """Keep the features with the highest mutual information with the target.

    Args:
        scaled_and_encoded_df: Frame holding the features and the target column.
        target_column_name: Name of the target column.
        top_n_features: Maximum number of features to keep. Capped at the
            number of available features so that a narrow frame does not make
            ``SelectKBest`` reject ``k``.
        random_state: Seed forwarded to ``mutual_info_classif`` so that the
            selection is reproducible between runs.

    Returns:
        DataFrame: the selected feature columns (in their original order)
        followed by the target column.
    """
    from functools import partial
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import mutual_info_classif

    X = scaled_and_encoded_df.drop(columns=[target_column_name])
    y = scaled_and_encoded_df[target_column_name]

    # k must not exceed the number of feature columns
    k = min(top_n_features, X.shape[1])
    score_func = partial(mutual_info_classif, random_state=random_state)

    selector = SelectKBest(score_func, k=k)
    selector.fit(X, y)

    # get the selected feature names
    selected_features = X.columns[selector.get_support()]
    return pd.concat(
        [scaled_and_encoded_df[selected_features], scaled_and_encoded_df[target_column_name]],
        axis=1,
    )


In [None]:
# Select features by information gain (top_n_features=20); the target column is kept in the result
selected_df = info_gain_feature_selection(scaled_and_encoded_df,target_column_name,top_n_features=20)
selected_df.head()

In [None]:
# Alternative selection by absolute correlation with the target (top_n_features=20).
# NOTE(review): the ensemble cell visible in this notebook trains on selected_df, not selected_df_2.
selected_df_2 = correlation_feature_selection(scaled_and_encoded_df,target_column_name,top_n_features=20)
selected_df_2.head()

***PERFORMANCE METRICS IMPLEMENTATION***

In [43]:
from sklearn.metrics import confusion_matrix, roc_auc_score, average_precision_score
def _append_scores_row(csv_path, row):
    """Append ``row`` (a dict of scores) to ``csv_path``; the header is written only for a new file."""
    import os
    is_new_file = not os.path.exists(csv_path)
    pd.DataFrame([row]).to_csv(csv_path, mode='a', header=is_new_file, index=False)


def evaluate_model(model_name,y_true, y_pred, y_pred_prob=None,save_scores=True):
    """Print binary-classification metrics and optionally append them to a CSV file.

    Labels are expected to be encoded as 0 (negative) and 1 (positive).

    Args:
        model_name: Label printed in the report and stored in the ``Model``
            column. ``'StackEnsemble'`` and ``'VotingEnsemble'`` are written to
            ``final_scores.csv``; every other name goes to ``LR_scores.csv``.
        y_true: True labels.
        y_pred: Predicted labels.
        y_pred_prob: Optional positive-class scores. When given, AUROC and
            AUPR are computed as well; otherwise those cells stay empty.
        save_scores: When false, only print the report and write nothing.

    Returns:
        None.
    """
    # labels=[0, 1] pins the matrix to 2x2, so the unpacking below also works
    # when only one class occurs in y_true and y_pred.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    def _ratio(numerator, denominator):
        # A ratio with an empty denominator is undefined: report NaN without
        # triggering a divide-by-zero warning.
        return numerator / denominator if denominator else float('nan')

    accuracy = _ratio(tp + tn, tp + tn + fp + fn)  # how many are correctly classified
    sensitivity = _ratio(tp, tp + fn)  # how many positive cases are correctly classified
    specificity = _ratio(tn, tn + fp)  # how many negative cases are correctly classified
    precision = _ratio(tp, tp + fp)  # how many of the positive predictions are correct
    f1 = _ratio(2 * (precision * sensitivity), precision + sensitivity)

    print(f"========================================================Model: {model_name}===========================================================")

    print("Confusion Matrix:")
    print(f"TN: {tn}, FP: {fp}")
    print(f"FN: {fn}, TP: {tp}")
    print()
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Sensitivity (Recall): {sensitivity:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"F1 Score: {f1:.4f}")

    auroc = None
    aupr = None
    if y_pred_prob is not None:
        auroc = roc_auc_score(y_true, y_pred_prob)
        print(f"AUROC: {auroc:.4f}")
        aupr = average_precision_score(y_true, y_pred_prob)
        print(f"AUPR: {aupr:.4f}")

    if not save_scores:
        return

    # AUROC / AUPR stay None (an empty CSV cell) when no scores were supplied
    scores = {
        'Model': model_name,
        'Accuracy': accuracy,
        'Sensitivity': sensitivity,
        'Specificity': specificity,
        'Precision': precision,
        'F1': f1,
        'AUROC': auroc,
        'AUPR': aupr
    }

    # Ensemble results are collected separately from the per-model results
    ensemble_names = ('StackEnsemble', 'VotingEnsemble')
    destination = 'final_scores.csv' if model_name in ensemble_names else 'LR_scores.csv'
    _append_scores_row(destination, scores)

In [44]:
# About the ROC and PR curves:
# The model outputs the probability of the positive class for each sample.
# A threshold (normally 0.5) converts these probabilities into class labels.
# Changing the threshold changes the confusion matrix and every metric derived from it,
# but it does not change the ROC or PR curve,
# because both curves are built by sweeping the threshold over its whole range from 0 to 1.
# The ROC curve plots TPR = TP / (TP + FN) against FPR = FP / (FP + TN) for every threshold value.
# The PR curve plots precision = TP / (TP + FP) against recall = TP / (TP + FN) for every threshold value.

In [45]:
def violin_plot():
    """Plot the per-model scores from ``LR_scores.csv`` and record their summary.

    Draws one violin per metric column of ``LR_scores.csv`` and then appends a
    single row to ``mean_scores.csv`` (created if missing) in which every
    metric is formatted as ``mean ± standard deviation`` with two decimals and
    the ``Model`` column is ``'LR'``.
    """
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt
    import os

    per_model_scores = pd.read_csv('LR_scores.csv')

    # Long format: "Model" stays fixed, every metric column becomes a
    # (Metric, Score) pair so that seaborn can group the scores by metric.
    long_form = per_model_scores.melt(id_vars=["Model"], var_name="Metric", value_name="Score")

    plt.figure(figsize=(12, 6))
    sns.violinplot(x='Metric', y='Score', data=long_form, palette='Set2')

    plt.title('Violin Plot of Model Metrics Across Different Logistic Regression Models')
    plt.xticks(rotation=45)
    plt.show()

    metrics_only = per_model_scores.drop(columns=['Model'])
    mean_scores = metrics_only.mean()
    standard_deviation = metrics_only.std()

    # One summary row: the model label first, then "mean ± std" per metric
    scores = {'Model': 'LR'}
    for metric in ('Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'F1', 'AUROC', 'AUPR'):
        scores[metric] = f"{mean_scores[metric]:.2f} ± {standard_deviation[metric]:.2f}"

    # Append the row to mean_scores.csv, creating the file on first use
    summary = pd.DataFrame([scores])
    if os.path.exists('mean_scores.csv'):
        summary = pd.concat([pd.read_csv('mean_scores.csv'), summary], ignore_index=True)
    summary.to_csv('mean_scores.csv', index=False)



**MODEL IMPLEMENTATION**

In [46]:
import numpy as np

def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    Args:
        z: Scalar or array-like of real values.

    Returns:
        The element-wise sigmoid of ``z``.
    """
    # exp(-z) overflows for z below roughly -709. Inputs are clamped at -700,
    # where the sigmoid is already smaller than 1e-303, so the result is
    # unchanged for all practical purposes.
    return 1 / (1 + np.exp(-np.clip(z, -700.0, None)))


class MyLogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Args:
        learning_rate: Step size of each gradient update.
        iterations: Number of gradient updates performed by ``fit``.
        regularization: ``'l1'``, ``'l2'`` or ``None``; any other value adds
            no penalty term.
        strength: Regularization strength (divided by the number of samples
            when the penalty gradient is added).
    """

    def __init__(self, learning_rate=0.01, iterations=1000, regularization=None, strength=0.01):
        self.learning_rate = learning_rate
        self.iterations = iterations
        self.regularization = regularization
        self.regularization_strength = strength

    def _positive_probability(self, X):
        # a = sigmoid(X . theta + b)
        return sigmoid(np.dot(X, self.theta) + self.bias)

    def fit(self, X, y):
        """Learn ``theta`` (one weight per feature) and ``bias`` from ``X`` and ``y``."""
        n_samples, n_features = X.shape[0], X.shape[1]
        self.theta = np.zeros(n_features)  # weights start at zero
        self.bias = 0  # bias starts at zero

        for _ in range(self.iterations):
            residual = self._positive_probability(X) - y

            # Gradients of the mean log-loss with respect to the weights and the bias
            dw = (1 / n_samples) * np.dot(X.T, residual)
            db = (1 / n_samples) * np.sum(residual)

            if self.regularization == 'l1':
                # d/d(theta) of the L1 penalty: sign(theta) * strength / m
                dw += (self.regularization_strength / n_samples) * np.sign(self.theta)
            elif self.regularization == 'l2':
                # d/d(theta) of the L2 penalty: theta * strength / m
                dw += (self.regularization_strength / n_samples) * self.theta

            self.theta -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        """Return a list of 0/1 labels: 1 where the predicted probability exceeds 0.5."""
        return [int(probability > 0.5) for probability in self._positive_probability(X)]

    def predict_proba(self, X):
        """Return the predicted probability of the positive class for each row of ``X``."""
        return self._positive_probability(X)

In [47]:
from sklearn.utils import resample
def bagging_models(X_train, y_train,n_models=9,regularization=None,strength=0.01):
    """Train ``n_models`` logistic-regression models on bootstrap resamples.

    Model ``i`` is fitted on ``resample(X_train, y_train, random_state=i)``,
    so every model sees a different but reproducible resample.

    Returns:
        list: the fitted ``MyLogisticRegression`` instances, in training order.
    """
    def _fit_on_resample(seed):
        X_boot, y_boot = resample(X_train, y_train, random_state=seed)
        learner = MyLogisticRegression(regularization=regularization, strength=strength)
        learner.fit(X_boot, y_boot)
        return learner

    return [_fit_on_resample(seed) for seed in range(n_models)]
        
        

In [48]:
def predict_with_stack_ensembling(scaled_and_encoded_df, target_column_name, n_models=9, regularization=None, strength=0.01):
    """Train a stacked ensemble and predict on a held-out test split.

    The data is split 80/20 into train/test and the train part again 80/20
    into train/validation (``random_state=73`` for both splits). Bagged base
    models are fitted on the train part. The meta learner (a default
    ``MyLogisticRegression``) is fitted on the validation part, using the
    original features plus one column of 0/1 predictions per base model.

    Returns:
        tuple: ``(y_test, y_pred, y_pred_prob)`` where the predictions and
        probabilities come from the meta learner on the test split.
    """
    from sklearn.model_selection import train_test_split

    features = scaled_and_encoded_df.drop(columns=[target_column_name])
    labels = scaled_and_encoded_df[target_column_name]

    X_fit, X_test, y_fit, y_test = train_test_split(features, labels, test_size=0.2, random_state=73)
    # carve the validation set out of the training part
    X_train, X_val, y_train, y_val = train_test_split(X_fit, y_fit, test_size=0.2, random_state=73)

    # ------------------------------------------ fitting ------------------------------------------
    models = bagging_models(X_train, y_train, n_models=n_models, regularization=regularization, strength=strength)

    def _with_base_predictions(X_part):
        # Stack the base predictions as (n_models, n_samples), transpose to
        # (n_samples, n_models) and append them to the original features.
        base_predictions = np.array([model.predict(X_part) for model in models]).T
        return np.concatenate([X_part, base_predictions], axis=1)

    meta_learner = MyLogisticRegression()
    meta_learner.fit(_with_base_predictions(X_val), y_val)

    # ----------------------------------------- predicting ----------------------------------------
    X_meta_test = _with_base_predictions(X_test)
    y_pred = meta_learner.predict(X_meta_test)
    y_pred_prob = meta_learner.predict_proba(X_meta_test)

    return y_test, y_pred, y_pred_prob


In [49]:
def predict_with_voting_ensembling(scaled_and_encoded_df, target_column_name, n_models=9, regularization=None, strength=0.01,soft_voting=True):
    """Train bagged logistic-regression models and combine them by voting.

    The data is split 80/20 into train/test (``random_state=73``). Every base
    model is evaluated on the test split with ``evaluate_model`` under the
    name ``LR_Model_<i>`` (``save_scores=True``), then the ensemble prediction
    is formed.

    Args:
        scaled_and_encoded_df: Frame holding the features and the target column.
        target_column_name: Name of the target column.
        n_models: Number of bagged base models.
        regularization: Forwarded to the base models.
        strength: Forwarded to the base models.
        soft_voting: When true, average the base-model probabilities and
            threshold the mean at 0.5. Otherwise take the majority of the 0/1
            votes (a tie yields 0).

    Returns:
        tuple: ``(y_test, y_pred, y_pred_prob)``. With soft voting
        ``y_pred_prob`` is the mean probability; with hard voting it is the
        share of base models voting 1.
    """
    from sklearn.model_selection import train_test_split

    X = scaled_and_encoded_df.drop(columns=[target_column_name])
    y = scaled_and_encoded_df[target_column_name]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=73)

    #--------------------------------------------fitting----------------------------------------------
    models=bagging_models(X_train, y_train,n_models=n_models,regularization=regularization,strength=strength)

    #--------------------------------------------predicting----------------------------------------------
    member_probs = [model.predict_proba(X_test) for model in models]
    member_preds = [[1 if p > 0.5 else 0 for p in probs] for probs in member_probs]

    # evaluate each base model on its own
    for i, (preds, probs) in enumerate(zip(member_preds, member_probs)):
        evaluate_model(f'LR_Model_{i}',y_test, preds, probs,save_scores=True)

    #--------------------------------------------voting----------------------------------------------
    if soft_voting:
        y_pred_prob = np.mean(np.array(member_probs), axis=0)
        y_pred = [1 if p > 0.5 else 0 for p in y_pred_prob]
    else:
        # Share of positive votes per sample. Previously the value returned
        # here was the last base model's probabilities (a leaked loop
        # variable), not an ensemble score.
        y_pred_prob = np.array(member_preds).mean(axis=0)
        # strict majority wins; a tie falls back to class 0
        y_pred = (y_pred_prob > 0.5).astype(int)

    return y_test, y_pred, y_pred_prob

In [None]:

# Stacking ensemble: train on selected_df and predict on the held-out test split
y_test,y_pred, y_pred_prob = predict_with_stack_ensembling(selected_df, target_column_name, n_models=9)

# Print the stacking metrics and append them to final_scores.csv

evaluate_model('StackEnsemble',y_test, y_pred, y_pred_prob,save_scores=True)



# Voting ensemble (soft voting); each base model is also evaluated and its scores go to LR_scores.csv
y_test,y_pred, y_pred_prob = predict_with_voting_ensembling(selected_df, target_column_name, n_models=9,soft_voting=True)

# Print the voting metrics and append them to final_scores.csv

evaluate_model('VotingEnsemble',y_test, y_pred, y_pred_prob,save_scores=True)
# Plot the per-model scores from LR_scores.csv and append their mean ± std to mean_scores.csv
violin_plot()

# Delete the per-model scores file so the next run starts from an empty LR_scores.csv
import os
os.remove('LR_scores.csv')
