# **Default of Credit Card Prediction**

# **Import**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix #Cross checking the values
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [None]:
df=pd.read_excel("default of credit card clients.xls", header=1)

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df.head(5)

**We split the dataset into two subsets: clients whose repayment status was "pay duly" (-1) in all six months, and clients whose repayment status was never "pay duly" in any of the six months. Clients with a mix of statuses fall into neither subset. Comparing the two subsets aids in analyzing credit behavior and assessing default risk.**

In [None]:
# Repayment-status columns for the six observed months.
pay_status_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
paid_duly = df[pay_status_cols] == -1

# df1: status was -1 ("pay duly") in every month.
# df2: status was different from -1 in every month.
# Rows with a mix of -1 and other statuses belong to neither subset.
# .copy() makes both subsets independent frames, so the in-place clean-up done on
# them later does not operate on a view of df (avoids SettingWithCopyWarning).
df1 = df[paid_duly.all(axis=1)].copy()
df2 = df[(~paid_duly).all(axis=1)].copy()

In [None]:
df

# Variable information
> LIMIT_BAL : Amount of the given credit ($) (individual and his/her family)

> SEX : Gender (1 = male; 2 = female)

> EDUCATION : 1 = graduate school; 2 = university; 3 = high school; 4 = others

> MARRIAGE : 1 = Married; 2 = Single; 3 = Others

> AGE : in years

> PAY_0,PAY_2 to PAY_6 : History of past payment (September 2005, August 2005, July 2005, June 2005, May 2005, April 2005)
                         -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; . . .
                         8 = payment delay for eight months; 9 = payment delay for nine months and above

> BILL_AMT1 to BILL_AMT6 : Amount of bill ($) (Sep 2005 to April 2005)

> PAY_AMT1 to PAY_AMT6 : Amount of previous payment ($) (Sep 2005 to April 2005)

> default payment next month : response variable (Yes = 1, No = 0)

In [None]:
df.info()

In [None]:
# converting data type to int
df = df.astype(int)
df.info()

In [None]:
df.describe()

# Data Analysis

**Default Payment Analysis**

In [None]:
# One count plot of the target per dataset: full data, df1 and df2.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
panels = [
    (df, 'All Data'),
    (df1, 'Timely Payments'),
    (df2, 'Delayed Payments'),
]
for ax, (subset, panel_title) in zip(axes, panels):
    sns.countplot(x='default payment next month', data=subset, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Default Payment')
    ax.set_ylabel('Count')
    # The target takes the values 0 and 1; label them No / Yes.
    ax.set_xticks([0, 1])
    ax.set_xticklabels(['No', 'Yes'])
plt.suptitle('Counts of Default Payment Next Month')
plt.tight_layout()
plt.show()

This highlights a lower default rate among clients who made timely payments compared to those with delayed payments, emphasizing the significance of payment punctuality in credit risk management.

**Age Distribution**

In [None]:
# Number of clients per age, drawn as one line per dataset.
plt.figure(figsize=(20, 6))
age_series = [
    (df['AGE'], 'All Data'),
    (df1['AGE'], 'Timely Payments'),
    (df2['AGE'], 'Delayed Payments'),
    (df[df['default payment next month'] == 1]['AGE'], 'Defaulted'),
]
for ages, legend_label in age_series:
    sns.lineplot(data=ages.value_counts(), label=legend_label)
plt.title('Age Count Distribution')
plt.xlabel('Age')
plt.ylabel('Count')
plt.legend()
plt.show()

This graph reveals that most of our data covers clients between their late 20s and mid 40s.
It can also be observed that most defaulters, and most clients who have consistently delayed their payments, are aged 20-30.

**Gender Count**

In [None]:
df[['SEX','EDUCATION','MARRIAGE']].describe()

In [None]:
# Gender counts for the full data and for the two payment-behaviour subsets.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
gender_panels = [
    (df, 'All Data'),
    (df1, 'Timely Payments'),
    (df2, 'Delayed Payments'),
]
for ax, (subset, panel_title) in zip(axes, gender_panels):
    sns.countplot(x='SEX', data=subset, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Gender')
    ax.set_ylabel('Count')
    # SEX is coded 1 = male, 2 = female; the two bars sit at positions 0 and 1.
    ax.set_xticks([0, 1])
    ax.set_xticklabels(['Male', 'Female'])
plt.suptitle('Gender Distribution Comparison')
plt.tight_layout()
plt.show()

This analysis reveals that the dataset comprises more females than males. It can also be observed that, although females are roughly equally distributed between timely and delayed payments, a larger percentage of males have delayed payments.

**Education Level**

In [None]:
print(df['EDUCATION'].value_counts())

In [None]:
# EDUCATION codes 0, 5 and 6 are outside the documented categories (1-4);
# fold them into 4 ("others") in the full data and in both subsets,
# printing the resulting distribution after each frame is cleaned.
invalid_education_codes = [5, 6, 0]
for frame in (df, df1, df2):
    frame.loc[frame['EDUCATION'].isin(invalid_education_codes), 'EDUCATION'] = 4
    print(frame['EDUCATION'].value_counts())

The education feature in our dataset contains numerical values such as 0, 5, and 6, which do not correspond to valid educational categories. To rectify this inconsistency, these values were replaced with 4, representing a more appropriate category.

In [None]:
# Stack the three frames, tagging each row with the dataset it came from,
# so a single count plot can compare them side by side.
labelled_frames = [
    df.assign(Dataset='All Data'),
    df1.assign(Dataset='Timely Payments'),
    df2.assign(Dataset='Delayed Payments'),
]
combined_df = pd.concat(labelled_frames)

education_labels = ['Grad School', 'University', 'High School', 'Others']
plt.figure(figsize=(10, 6))
sns.countplot(data=combined_df, x='EDUCATION', hue='Dataset')
plt.title('Education Background')
plt.xlabel('Education Level')
plt.ylabel('Count')
plt.xticks(ticks=[0, 1, 2, 3], labels=education_labels)
plt.legend(title='Dataset')
plt.show()

It indicates that individuals with university education form the largest proportion across all datasets, while those with graduate school education are the next most common. However individuals categorized as "Others" (level 4) have the highest percentage of delayed payments compared to other education levels.

**Marital Status**

In [None]:
print(df['MARRIAGE'].value_counts())

In [None]:
# MARRIAGE code 0 is not a documented category; map it to 3 ("others")
# in the full data and in both subsets, printing each cleaned distribution.
for frame in (df, df1, df2):
    frame.loc[frame['MARRIAGE'] == 0, 'MARRIAGE'] = 3
    print(frame['MARRIAGE'].value_counts())

**Marriage category '0' is not valid and hence replaced with category '3' that is others in the dataset**

In [None]:
# Stack the three frames with a Dataset tag so one count plot can compare them.
labelled_frames = [
    df.assign(Dataset='All Data'),
    df1.assign(Dataset='Timely Payments'),
    df2.assign(Dataset='Delayed Payments'),
]
combined_df = pd.concat(labelled_frames)

marital_labels = ['Married', 'Single', 'Others']
plt.figure(figsize=(10, 6))
sns.countplot(data=combined_df, x='MARRIAGE', hue='Dataset')
plt.title('Marital Status')
plt.xlabel('Marital Status')
plt.ylabel('Count')
plt.xticks(ticks=[0, 1, 2], labels=marital_labels)
plt.legend(title='Dataset')
plt.show()

Marriage category 1 (Married) appears to have higher proportion of delayed payments compared to timely payments, with approximately 65.42% of clients paying late. This suggests that individuals who are married may be more prone to delaying their credit card payments.
Further analysis could explore potential reasons behind this trend, such as financial responsibilities shared within a household or differing spending habits between married and unmarried individuals.

**Delays in payment**

In [None]:
df[['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']].describe()

In [None]:
df[['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']].apply(pd.Series.value_counts)

In [None]:
df[(df['PAY_0'] == -2)]

In [None]:
df[(df['PAY_0'] == 0)]

It was observed that -2 indicates that the balance was paid in full, and there were no transactions during this period, implying that the credit card account was inactive.
0 signifies that the customer paid the minimum due amount but did not settle the entire balance. This means the customer paid enough to maintain their account in good standing but did carry forward a balance.

In [None]:
delay = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
delay_month = ['September 2005', 'August 2005', 'July 2005', 'June 2005', 'May 2005', 'April 2005']

# 2 x 3 grid: one count plot of the repayment-status codes per month.
plt.figure(figsize=(12, 8))
for position, (col, month) in enumerate(zip(delay, delay_month), start=1):
    plt.subplot(2, 3, position)
    sns.countplot(x=col, data=df)
    plt.title(f'Count of {col} ({month})')
    plt.xlabel(col)
    plt.ylabel('Count')
plt.tight_layout()
plt.show()

**Bill amount over months (Sept 2005 to April 2005)**

In [None]:
df[['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']].describe()

Let us assume -ve is credit in account

In [None]:
bill_col = ['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']
bill_month = ['Sep 2005', 'Aug 2005', 'Jul 2005', 'Jun 2005', 'May 2005', 'April 2005']

# 2 x 3 grid: histogram (with KDE overlay) of the bill amount for each month.
plt.figure(figsize=(12, 8))
for position, (col, month) in enumerate(zip(bill_col, bill_month), start=1):
    plt.subplot(2, 3, position)
    sns.histplot(df[col], bins=30, kde=True)
    plt.title(f'Distribution of {col} ({month})')
    plt.xlabel(col)
    plt.ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Per-month mean / median / standard deviation of the bill amounts.
bill_amounts = df[bill_col]
data = {
    'Bill Month': bill_month,
    'Bill Amount (mean)': bill_amounts.mean().tolist(),
    'Bill Amount (median)': bill_amounts.median().tolist(),
    'Bill Amount (std)': bill_amounts.std().tolist(),
}
payment_summary_df = pd.DataFrame(data)
payment_summary_df



**Paid amount over months (Sept 2005 to April 2005)**

In [None]:
df[['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']].describe()

In [None]:
paid_col = ['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
payment_month = ['September 2005', 'August 2005', 'July 2005', 'June 2005', 'May 2005', 'April 2005']

# 2 x 3 grid: histogram (with KDE overlay) of the amount paid in each month.
plt.figure(figsize=(12, 8))
for position, (col, month) in enumerate(zip(paid_col, payment_month), start=1):
    plt.subplot(2, 3, position)
    sns.histplot(df[col], bins=30, kde=True)
    plt.title(f'Distribution of {col} ({month})')
    plt.xlabel('Payment Amount')
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()


In [None]:
# Per-month mean / median / standard deviation of the amounts paid.
paid_amounts = df[paid_col]
data = {
    'Payment Month': payment_month,
    'Payment Amount (mean)': paid_amounts.mean().tolist(),
    'Payment Amount (median)': paid_amounts.median().tolist(),
    'Payment Amount (std)': paid_amounts.std().tolist(),
}
payment_summary_df = pd.DataFrame(data)
payment_summary_df

In [None]:
plt.figure(figsize=(10, 6))

sns.histplot(df['LIMIT_BAL'], bins=30, color='skyblue', edgecolor='black')

plt.title('Distribution of Credit Limit')

plt.xlabel('Credit Limit')

plt.ylabel('Frequency')

plt.grid(True)

plt.show()

## Correlations

**Gender & Default Payment**

In [None]:
def _percent_of_group(counts):
    """Express each count as a percentage of its group's total."""
    return 100 * counts / float(counts.sum())


# default payment next month : response variable (Yes = 1, No = 0)
# SEX : Gender (1 = male; 2 = female)
df_gender_y = df.groupby(['SEX', 'default payment next month']).size()
print(df_gender_y)

# Default / non-default share within each gender.
p_sex = df_gender_y.groupby(['SEX'], group_keys=False).apply(_percent_of_group)
print(p_sex)

In [None]:
# Labels are written in (SEX, default) order: SEX 1 = male, 2 = female; default 0 = No, 1 = Yes.
labels = ['Male - No', 'Male - Yes', 'Female - No', 'Female - Yes']
# NOTE(review): p_sex holds percentages computed within each gender, so the four
# values total 200. Presumably plt.pie rescales them to 100%, which would show
# each slice at half of its within-gender share -- confirm this is the intended reading.
plt.pie(p_sex.values, labels=labels, autopct='%1.1f%%')
plt.axis('equal')
plt.title('Default Payment by Gender')
plt.show()

The analysis reveals that females have a slightly lower default rate compared to males, indicating potential gender-based differences in credit risk.

**Education vs Default Payment**

In [None]:
def _percent_of_group(counts):
    """Express each count as a percentage of its group's total."""
    return 100 * counts / float(counts.sum())


# EDUCATION : 1 = graduate school; 2 = university; 3 = high school; 4 = others
df_education_y = df.groupby(['EDUCATION', 'default payment next month']).size()
print(df_education_y)

# Default / non-default share within each education level.
p_education = df_education_y.groupby('EDUCATION', group_keys=False).apply(_percent_of_group)
print(p_education)

It is observed that the default rate is slightly correlated with gender: males are more likely to default next month.

In [None]:
# Labels are written in (EDUCATION, default) order: education 1-4, then default 0 = No, 1 = Yes.
labels = ['Graduate School - No', 'Graduate School - Yes','University - No', 'University - Yes',
                    'High School - No', 'High School - Yes','Others - No', 'Others - Yes']
plt.figure(figsize=(4, 4))
# NOTE(review): p_education holds percentages computed within each education level,
# so the eight values total 400. Presumably plt.pie rescales them to 100%, so the
# displayed slice sizes are not the within-level default rates -- confirm this is intended.
plt.pie(p_education.values, labels=labels, autopct='%1.1f%%')
plt.axis('equal')
plt.title('Default Payment by Education Level')
plt.show()

The data suggests that individuals with higher education levels tend to have lower default rates, with the default rate decreasing as education level increases. This highlights the potential correlation between education level and credit worthiness.

**Outlier Analysis**

In [None]:
def outliers_plot(df, columns):
    """Draw one horizontal box plot per column, stacked vertically in one figure.

    df: object indexed by column name (``df[column]`` is passed to ``boxplot``).
    columns: sequence of column names to plot; each gets its own 10 x 5 inch row.
    """
    n_rows = len(columns)
    # squeeze=False always returns a 2-D axes array, so a single column needs
    # no special case: the panels are simply the first (only) grid column.
    _, axes_grid = plt.subplots(nrows=n_rows, ncols=1, figsize=(10, n_rows * 5), squeeze=False)

    for panel, column in zip(axes_grid[:, 0], columns):
        panel.boxplot(df[column], vert=False)
        panel.set_title(f'Outlier Analysis for {column}')
        panel.set_xlabel(column)
        panel.set_ylabel('Outlier Detection')
        panel.grid(True)

    plt.tight_layout()
    plt.show()


columns_to_analyze = ['LIMIT_BAL', 'AGE', 'BILL_AMT1','BILL_AMT2','BILL_AMT3','BILL_AMT4','BILL_AMT5','BILL_AMT6','PAY_AMT1','PAY_AMT2' ,'PAY_AMT3','PAY_AMT4','PAY_AMT5','PAY_AMT6']
outliers_plot(df, columns_to_analyze)

**Correlation Matrix**

In [None]:
# Pairwise correlation between all columns of the dataset.
corr = df.corr()
# Mask the upper triangle (including the diagonal) so that each pair of
# variables is shown once, in the lower triangle of the heatmap.
mask = np.triu(np.ones_like(corr, dtype=bool))
# A single figure: creating a second one here would leave an empty plot behind.
plt.figure(figsize=(18, 12))
sns.heatmap(corr, mask=mask, annot=True, annot_kws={'size': 10})
plt.show()

The above code generates a lower-triangular correlation matrix.
Each cell denotes the correlation coefficient between two variables, where one variable corresponds to the row index and the other variable corresponds to the column index. The diagonal of the correlation matrix always consists of correlation coefficients of 1.0, indicating perfect correlation, as each variable is perfectly correlated with itself. Correlation is a statistical tool used to quantify the strength and direction of the relationship between two variables. It measures how much the variables change together. The correlation coefficient, denoted by $r$, varies between $-1$ and $1$:

- $r = 1$: Represents a perfect positive correlation, where both variables increase together.
- $r = -1$: Signifies a perfect negative correlation, where one variable increases as the other decreases.
- $r = 0$: Indicates no correlation, suggesting no linear relationship between the variables.


Standardisation

In [None]:
df_stnd = df.drop('ID', axis=1)

In [None]:
categorical_data=df_stnd[['SEX','EDUCATION','MARRIAGE','PAY_0','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6','default payment next month']]

In [None]:
continuous_data = df_stnd[['LIMIT_BAL', 'AGE', 'BILL_AMT1','BILL_AMT2','BILL_AMT3','BILL_AMT4','BILL_AMT5','BILL_AMT6','PAY_AMT1','PAY_AMT2' ,'PAY_AMT3','PAY_AMT4','PAY_AMT5','PAY_AMT6']]
print(continuous_data)

In [None]:
# Rescale the continuous columns with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in the notebook, so held-out rows contribute to the scaling
# statistics -- consider fitting on the training split only.
scaler = StandardScaler()
X_standardized = scaler.fit_transform(continuous_data)

In [None]:
X_standardized_df= pd.DataFrame(X_standardized, columns=continuous_data.columns)

In [None]:
X_standardized_df.head()

In [None]:
df_final = pd.concat([X_standardized_df, categorical_data],axis=1)

In [None]:
# Append the categorical columns (the target is the last of them) to the
# standardized frame. Using .values assigns by position and bypasses index alignment.
for col in categorical_data.columns:
    X_standardized_df[col] = categorical_data[col].values
# df_final is the same object as X_standardized_df (no copy is made), and it
# replaces the df_final built by the preceding pd.concat cell.
df_final = X_standardized_df

In [None]:
df_final

**Balancing the dataset**
Using Oversample (smote)

In [None]:
class_count_0, class_count_1 = df_final['default payment next month'].value_counts()

class_0 = df_final [df_final['default payment next month'] == 0]
class_1 =df_final[df_final['default payment next month'] == 1]

In [None]:
print('class 0:', class_0.shape)
print('\nclass 1:', class_1.shape)

In [None]:
#class_1_over = class_1.sample(class_count_0, replace=True)

#test_under = pd.concat([class_1_over, class_0], axis=0)

# print the number of class count
#print('class count of 1 and 0:\n', test_under['default payment next month'].value_counts())

#test_under

In [None]:
x = df_final.iloc[:, :-1]
y = df_final.iloc[:, -1]

In [None]:
x.shape

In [None]:
x.head()

In [None]:
# Hold out 30% of the rows for testing, keeping the class ratio the same in both
# parts. A fixed random_state makes the split (and everything downstream) reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, stratify=y, test_size=0.3, random_state=42)

# Oversample the minority class with SMOTE, using 20 nearest neighbours (fixed,
# not derived from the class distribution) and a fixed seed for reproducibility.
smote = SMOTE(k_neighbors=20, random_state=42)

# Resample the training split only, so the test split keeps its original distribution.
x_smote, y_smote = smote.fit_resample(X_train, Y_train)

print('Original dataset shape:', Counter(Y_train))
print('Resampled dataset shape:', Counter(y_smote))

In [None]:
x_smote.shape

In [None]:
y_smote.shape

In [None]:
x_smote.head()

**Implementation of PCA**

In [None]:
class PCA:
    """Principal component analysis via eigendecomposition of the covariance matrix.

    After ``fit`` the instance keeps the smallest number of leading components
    whose cumulative explained-variance ratio reaches
    ``explained_variance_threshold``.

    Attributes set by ``fit``:
        mean: per-feature mean of the fitting data (1-D array).
        n_components: number of retained components.
        components: array of shape (n_components, n_features), one component per row.
    """

    def __init__(self, explained_variance_threshold=0.95):
        self.explained_variance_threshold = explained_variance_threshold
        self.n_components = None
        self.components = None
        self.mean = None

    def fit(self, X):
        """Learn the mean and the principal components of X (rows = samples).

        X is never modified: centring is done on a new array, so calling
        ``transform`` on the same data afterwards centres it exactly once.
        """
        X = np.asarray(X, dtype=float)

        # Mean of each feature; PCA directions are defined on centred data.
        self.mean = X.mean(axis=0)
        X_centered = X - self.mean

        # Covariance matrix between features (atleast_2d covers a single feature).
        cov_matrix = np.atleast_2d(np.cov(X_centered, rowvar=False))

        # The covariance matrix is symmetric, so eigh applies: it returns real
        # eigenvalues, whereas the general eig solver may return complex ones.
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

        # Order components by decreasing variance; eigenvectors become rows.
        order = np.argsort(eigenvalues)[::-1]
        eigenvalues = eigenvalues[order]
        eigenvectors = eigenvectors[:, order].T

        explained_variance_ratio = np.cumsum(eigenvalues) / np.sum(eigenvalues)

        # Smallest number of components reaching the threshold. If it is never
        # reached (threshold above 1, or rounding just below it) keep them all
        # instead of silently collapsing to a single component.
        reached = explained_variance_ratio >= self.explained_variance_threshold
        if reached.any():
            self.n_components = int(np.argmax(reached)) + 1
        else:
            self.n_components = len(eigenvalues)

        self.components = eigenvectors[:self.n_components]

    def transform(self, X):
        """Project X onto the principal components without modifying X."""
        X = np.asarray(X, dtype=float)
        return np.dot(X - self.mean, self.components.T)

    def inverse_transform(self, X_transformed):
        """Map projected data back to the original feature space."""
        return np.dot(X_transformed, self.components) + self.mean

In [None]:
# Apply PCA to the oversampled data with 95% explained variance threshold
pca = PCA(explained_variance_threshold=0.95)

# Copies are passed so that centring inside PCA can never modify x_smote or
# X_test, and so the training data is centred exactly once (in transform), the
# same as the test data -- keeping both in the same coordinate system.
pca.fit(x_smote.copy())

# Transform the oversampled data onto the new lower-dimensional space
X_train_transformed = pca.transform(x_smote.copy())
X_test_transformed = pca.transform(X_test.copy())

In [None]:
X_train_transformed.shape

# MODEL IMPLEMENTATION ON OVERSAMPLED DATA


*   Logistic Regression (Implementation of Logistic Regression with regularization and hyperparameter tuning)



In [None]:
class LogisticRegressionWithRegularization:
    """Binary logistic regression trained with batch gradient descent.

    Parameters:
        learning_rate: step size of each gradient-descent update.
        num_iterations: number of full-batch updates performed by ``fit``.
        lambda_param: strength of the penalty term.
        regularization: 'ridge' (sum of squared weights), 'lasso' (sum of
            absolute weights) or None for no penalty. The bias is never penalised.
    """

    def __init__(self, learning_rate=0.01, num_iterations=3000, lambda_param=0.01, regularization=None):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.lambda_param = lambda_param
        self.regularization = regularization
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        """Logistic function 1 / (1 + exp(-z)).

        z is clipped to [-500, 500] first so np.exp cannot overflow; at those
        magnitudes the result is already indistinguishable from 0 or 1.
        """
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def _compute_regularization_term(self):
        """Return the penalty on the current weights (0 when no regularization)."""
        if self.regularization == 'ridge':
            return self.lambda_param * np.sum(self.weights**2)
        elif self.regularization == 'lasso':
            return self.lambda_param * np.sum(np.abs(self.weights))
        else:
            return 0

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for each row of X."""
        linear_model = np.dot(X, self.weights) + self.bias
        return self.sigmoid(linear_model)

    def compute_loss(self, y, predictions):
        """Mean binary cross-entropy of ``predictions`` against ``y`` plus the penalty.

        1e-15 is added inside the logarithms so probabilities of exactly 0 or 1
        do not produce log(0).
        """
        y = np.asarray(y, dtype=float)
        predictions = np.asarray(predictions, dtype=float)
        cross_entropy = -np.mean(
            y * np.log(predictions + 1e-15) + (1 - y) * np.log(1 - predictions + 1e-15)
        )
        return cross_entropy + self._compute_regularization_term()

    def fit(self, X, y, return_history=True):
        """Train from zero-initialised weights on X (rows = samples) and labels y.

        Every call restarts training: weights and bias are reset to zero.

        Returns:
            The list of per-iteration losses (each measured before that
            iteration's update) when ``return_history`` is true, otherwise None.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)  # positional labels, independent of any pandas index
        num_samples, num_features = X.shape
        self.weights = np.zeros(num_features)
        self.bias = 0
        loss_history = []

        for _ in range(self.num_iterations):
            y_predicted = self.predict_proba(X)

            # The loss is only needed for the history, so skip it otherwise.
            if return_history:
                loss_history.append(self.compute_loss(y, y_predicted))

            # Gradients of the mean cross-entropy.
            error = y_predicted - y
            dw = np.dot(X.T, error) / num_samples
            db = np.sum(error) / num_samples

            # Gradient of the penalty term (weights only).
            if self.regularization == 'ridge':
                dw += 2 * self.lambda_param * self.weights
            elif self.regularization == 'lasso':
                dw += self.lambda_param * np.sign(self.weights)

            # Update parameters
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

        if return_history:
            return loss_history
        return None

    def predict(self, X):
        """Return a list of 0/1 labels: 1 where the predicted probability exceeds 0.5."""
        probabilities = np.asarray(self.predict_proba(X))
        return (probabilities > 0.5).astype(int).tolist()

In [None]:
learning_rates = [0.1, 0.01, 0.001]
lambda_params = [0.1, 0.01, 0.001]
regularizations = ['ridge', 'lasso', None]

best_accuracy = 0
best_params = {}
train_losses = []
cv_losses = []
validation_losses = []

In [None]:
# Exhaustive search over learning rate x lambda x regularization type.
# NOTE(review): the configuration is selected by accuracy on the test split, so
# best_accuracy is an optimistic estimate of performance on unseen data.
best_grid_model = None
best_y_pred = None

for lr in learning_rates:
    for lambda_param in lambda_params:
        for regularization in regularizations:
            model = LogisticRegressionWithRegularization(learning_rate=lr, lambda_param=lambda_param, regularization=regularization)
            model.fit(X_train_transformed, y_smote, return_history=False)
            y_pred = model.predict(X_test_transformed)
            train_pred = model.predict_proba(X_train_transformed)
            val_pred = model.predict_proba(X_test_transformed)
            train_loss = model.compute_loss(y_smote, train_pred)
            validation_loss = model.compute_loss(Y_test, val_pred)
            train_losses.append(train_loss)
            validation_losses.append(validation_loss)
            accuracy = np.mean(y_pred == Y_test)
            print(f"Accuracy with lr={lr}, lambda={lambda_param}, regularization={regularization}  : {round(accuracy,3)}")
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params = {'learning_rate': lr, 'lambda_param': lambda_param, 'regularization': regularization}
                best_grid_model, best_y_pred = model, y_pred

# Leave `model` and `y_pred` bound to the best configuration rather than to the
# last one tried, so the evaluation cells that follow describe the selected model.
if best_grid_model is not None:
    model, y_pred = best_grid_model, best_y_pred

print("Best parameters:", best_params)

In [None]:
def compute_confusion_matrix(y_true, y_pred):
    """Count binary classification outcomes for labels coded as 0 and 1.

    Pairs where either value is neither 0 nor 1 are not counted.

    Returns:
        (matrix, TP, TN, FP, FN) where matrix is [[TN, FP], [FN, TP]].
    """
    pairs = list(zip(y_true, y_pred))

    def tally(actual_value, predicted_value):
        # Number of (actual, predicted) pairs equal to the requested combination.
        return sum(
            1
            for actual, predicted in pairs
            if actual == actual_value and predicted == predicted_value
        )

    TP = tally(1, 1)
    TN = tally(0, 0)
    FP = tally(0, 1)
    FN = tally(1, 0)

    matrix = [[TN, FP], [FN, TP]]
    return matrix, TP, TN, FP, FN

In [None]:
cm = compute_confusion_matrix(Y_test, y_pred)
print(cm)

In [None]:
cm = confusion_matrix(Y_test, y_pred)
# For binary 0/1 labels scikit-learn lays the matrix out as [[TN, FP], [FN, TP]]
# (rows = actual, columns = predicted), so ravel() yields TN, FP, FN, TP in that order.
TN, FP, FN, TP = cm.ravel()


print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")

In [None]:
def calculate_metrics(TP, TN, FP, FN):
    """Return (precision, recall, f1_score) from confusion-matrix counts.

    Each metric falls back to 0 when its denominator is zero. TN is accepted
    for a uniform call signature but does not enter any of the three metrics.
    """
    predicted_positive = TP + FP
    actual_positive = TP + FN

    precision = 0
    if predicted_positive > 0:
        precision = TP / predicted_positive

    recall = 0
    if actual_positive > 0:
        recall = TP / actual_positive

    f1_score = 0
    if (precision + recall) > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)

    return precision, recall, f1_score


precision, recall, f1_score = calculate_metrics(TP, TN, FP, FN) # Calculate Precision, Recall, F1 Score

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")


In [None]:
# ROC-AUC measures how well the model ranks positives above negatives, so it is
# computed from the predicted probabilities; thresholded 0/1 labels would reduce
# the ROC curve to a single operating point.
auc_score = roc_auc_score(Y_test, model.predict_proba(X_test_transformed))


print(f"ROC-AUC Score: {auc_score:.4f}") # Print the ROC-AUC Score

In [None]:
# Test accuracy as a function of lambda, one line per regularization type,
# with the learning rate fixed at 0.01 and 1000 iterations per fit.
fig, ax = plt.subplots()
for regularization in regularizations:
    accuracies = []  # accuracy for each lambda, in lambda_params order
    for lambda_param in lambda_params:
        # Note: this rebinds the module-level `model` and `y_pred` on every iteration.
        model = LogisticRegressionWithRegularization(learning_rate=0.01, num_iterations=1000, lambda_param=lambda_param, regularization=regularization)
        model.fit(X_train_transformed, y_smote)
        y_pred = model.predict(X_test_transformed)
        accuracy = np.mean(y_pred == Y_test)
        accuracies.append(accuracy)
    # With regularization=None no penalty is applied, so lambda has no effect on that line.
    ax.plot(lambda_params, accuracies, label=f'{regularization}')
ax.set_xlabel('Lambda')
ax.set_ylabel('Accuracy')
ax.set_title('Comparison of Model Accuracy with Varying Lambda for Regularization Techniques')
ax.legend()
plt.show()

**Bias - Variance Trade-off**

In [None]:
# 60% train / 20% cross-validation / 20% test, stratified on the target.
X_train_1, X_temp, y_train_1, y_temp = train_test_split(x, y, stratify=y, test_size=0.4, random_state=42)
X_cv_1, X_test_1, y_cv_1, y_test_1 = train_test_split(X_temp, y_temp, stratify=y_temp, test_size=0.5, random_state=42)

# Applying SMOTE to handle imbalance in the training dataset (seeded for reproducibility)
smote = SMOTE(k_neighbors=20, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_1, y_train_1)

# Print the class distributions after resampling
print('Original dataset shape:', Counter(y_train_1))
print('Resampled dataset shape:', Counter(y_train_smote))

# Apply PCA to the oversampled data with 95% explained variance threshold.
# Copies are passed so centring inside PCA can never modify the splits and every
# split is centred exactly once, keeping them in the same coordinate system.
pca = PCA(explained_variance_threshold=0.95)
pca.fit(X_train_smote.copy())

X_train_transformed_1 = pca.transform(X_train_smote.copy())
X_cv_transformed_1 = pca.transform(X_cv_1.copy())
X_test_transformed_1 = pca.transform(X_test_1.copy())


def _loss_curves(model, X_fit, y_fit, eval_sets):
    """Train `model` on (X_fit, y_fit) only and record per-iteration losses.

    The model is trained once, by the same full-batch gradient descent its own
    fit() performs; before every update the current weights are scored on each
    (X, y) pair in `eval_sets`. The evaluation sets are never trained on.

    Returns a dict mapping each eval-set name to its list of losses.
    """
    X_fit = np.asarray(X_fit, dtype=float)
    y_fit = np.asarray(y_fit, dtype=float)
    eval_arrays = {
        name: (np.asarray(X_eval, dtype=float), np.asarray(y_eval, dtype=float))
        for name, (X_eval, y_eval) in eval_sets.items()
    }
    num_samples, num_features = X_fit.shape
    model.weights = np.zeros(num_features)
    model.bias = 0
    curves = {name: [] for name in eval_arrays}

    for _ in range(model.num_iterations):
        for name, (X_eval, y_eval) in eval_arrays.items():
            curves[name].append(model.compute_loss(y_eval, model.predict_proba(X_eval)))

        error = model.predict_proba(X_fit) - y_fit
        dw = np.dot(X_fit.T, error) / num_samples
        db = np.sum(error) / num_samples
        if model.regularization == 'ridge':
            dw += 2 * model.lambda_param * model.weights
        elif model.regularization == 'lasso':
            dw += model.lambda_param * np.sign(model.weights)

        model.weights -= model.learning_rate * dw
        model.bias -= model.learning_rate * db

    return curves


lambdas = [0.1, 0.01, 0.001]  # Varying regularization strengths
learning_rate = 0.1  # Use an optimal or constant learning rate
regularization_type = 'ridge'  # Choose from 'ridge', 'lasso'
train_errors = []
test_errors = []
cv_errors = []

plt.figure(figsize=(10, 8))

colors = ['r', 'g', 'b']
labels = ['Lambda = 0.1', 'Lambda = 0.01', 'Lambda = 0.001']

for lambda_param, color, label in zip(lambdas, colors, labels):
    model = LogisticRegressionWithRegularization(learning_rate=learning_rate, lambda_param=lambda_param, regularization=regularization_type)

    # Train on the training split only. Calling model.fit on the CV / test splits
    # would reset the weights and retrain on them, so the curves would not
    # measure generalisation and the final model would have seen the test data.
    curves = _loss_curves(
        model,
        X_train_transformed_1,
        y_train_smote,
        {
            'train': (X_train_transformed_1, y_train_smote),
            'cv': (X_cv_transformed_1, y_cv_1),
            'test': (X_test_transformed_1, y_test_1),
        },
    )
    training_loss, cv_loss, test_loss = curves['train'], curves['cv'], curves['test']

    y_pred_train = model.predict(X_train_transformed_1)
    y_pred_val = model.predict(X_test_transformed_1)
    cv_pred = model.predict(X_cv_transformed_1)

    train_error = 1 - accuracy_score(y_train_smote, y_pred_train)
    val_error = 1 - accuracy_score(y_test_1, y_pred_val)
    cv_error = 1 - accuracy_score(y_cv_1, cv_pred)
    train_errors.append(train_error)
    test_errors.append(val_error)
    cv_errors.append(cv_error)

    # Plotting
    iterations = range(1, model.num_iterations + 1)
    plt.plot(iterations, training_loss, color=color, linestyle='-', label=f'Training Loss {label}')
    plt.plot(iterations, cv_loss, color=color, linestyle='--', label=f'CV Loss {label}')
    plt.plot(iterations, test_loss, color=color, linestyle=':', label=f'Test Loss {label}')


# Final plot adjustments
plt.title('Training, CV, and Test Losses for Different Lambda Values - Bias Variance Tradeoff')
plt.xlabel('Number of Iterations')
plt.ylabel('Loss')
plt.legend()
plt.show()


In [None]:

sns.set(style="whitegrid")

# Data setup: accuracies recorded from the grid search, hard-coded so the plot
# can be redrawn without retraining.
data = {
    'Learning Rate': [0.1]*9 + [0.01]*9 + [0.001]*9,
    'Lambda': [0.1, 0.1, 0.1, 0.01, 0.01, 0.01, 0.001, 0.001, 0.001]*3,
    'Regularization': ['ridge', 'lasso', 'None', 'ridge', 'lasso', 'None', 'ridge', 'lasso', 'None']*3,
    'Accuracy': [
        0.781, 0.63, 0.807, 0.804, 0.791, 0.807, 0.807, 0.806, 0.807,
        0.766, 0.623, 0.782, 0.784, 0.757, 0.782, 0.782, 0.781, 0.782,
        0.682, 0.635, 0.683, 0.682, 0.668, 0.683, 0.683, 0.681, 0.683
    ]
}


# Stored under its own name so the credit-card dataset held in `df` is not overwritten.
results_df = pd.DataFrame(data)


# Rows: lambda. Columns: (regularization, learning rate) after swapping the levels.
pivot_df = results_df.pivot_table(values='Accuracy', index='Lambda', columns=['Learning Rate', 'Regularization'], aggfunc='first')
pivot_df = pivot_df.swaplevel(axis=1).sort_index(axis=1)

fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=True)
fig.suptitle('Accuracy by Lambda and Learning Rate for Different Regularizations', fontsize=16)

# Plot each regularization method in a different subplot
for ax, reg in zip(axes, ['ridge', 'lasso', 'None']):
    for lr in pivot_df.columns.levels[1]:
        pivot_df[reg][lr].plot(ax=ax, marker='o', label=f'LR={lr}')
    ax.set_title(f'Regularization: {reg}')
    ax.set_xlabel('Lambda')
    ax.set_ylabel('Accuracy')
    ax.legend(title='Learning Rate')


plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


In [None]:

# Data setup: accuracies recorded from the grid search, hard-coded so the plot
# can be redrawn without retraining.
data = {
    'Learning Rate': [0.1]*9 + [0.01]*9 + [0.001]*9,
    'Lambda': [0.1, 0.1, 0.1, 0.01, 0.01, 0.01, 0.001, 0.001, 0.001]*3,
    'Regularization': ['ridge', 'lasso', 'None', 'ridge', 'lasso', 'None', 'ridge', 'lasso', 'None']*3,
    'Accuracy': [
        0.781, 0.63, 0.807, 0.804, 0.791, 0.807, 0.807, 0.806, 0.807,
        0.766, 0.623, 0.782, 0.784, 0.757, 0.782, 0.782, 0.781, 0.782,
        0.682, 0.635, 0.683, 0.682, 0.668, 0.683, 0.683, 0.681, 0.683
    ]
}


# Stored under its own name so the credit-card dataset held in `df` is not overwritten.
results_df = pd.DataFrame(data)


# Rows: (lambda, regularization). Columns: learning rate.
pivot_df = results_df.pivot_table(values='Accuracy', index=['Lambda', 'Regularization'], columns='Learning Rate', aggfunc='mean')

# Creating the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(pivot_df, annot=True, cmap='viridis', fmt=".3f", linewidths=.5, cbar_kws={'label': 'Accuracy'})
plt.title('Heatmap of Accuracy vs. Learning Rate, Lambda, and Regularization')
plt.xlabel('Learning Rate')
plt.ylabel('Lambda, Regularization')
plt.show()


In [None]:
# Report the selected hyperparameters and build a model configured with them.
print("Best parameters:", best_params)
# NOTE: this only constructs the model; it is not fitted in this cell, so it has
# no weights until fit() is called on it.
best_model = LogisticRegressionWithRegularization(learning_rate=best_params['learning_rate'],
                                                  lambda_param=best_params['lambda_param'],
                                                  regularization=best_params['regularization'])

Observations:The accuracy levels for the dataset that has been balanced using SMOTE and trained using logistic regression
are generally higher

*Scenario 2*

**LOGISTIC REGRESSION FOR UNDERSAMPLED DATA**

In [None]:
df_final.shape

In [None]:
df_final.head(5)

In [None]:
x_for_undersampled= df_final.iloc[:, :-1]
y_for_undersampled= df_final.iloc[:, -1]

In [None]:
y.head(5)

In [None]:
# Split the dataset: hold out 30% for testing, stratified on the target so both
# parts keep the original class ratio (as in the oversampling scenario), with a
# fixed random_state so the split is reproducible.
X_undersample_train, X_undersample_test, Y_undersample_train, Y_undersample_test = train_test_split(
    x_for_undersampled,
    y_for_undersampled,
    stratify=y_for_undersampled,
    test_size=0.3,
    random_state=42,
)

In [None]:
X_undersample_train.shape

In [None]:
X_undersample_test.shape

**Under-Sampling**

In [None]:
# Random under-sampler with a fixed seed; replacement=True is passed through to imblearn.
rus = RandomUnderSampler(random_state=42, replacement=True)

# Fit on the predictors and the target variable of the training split only.
x_rus, y_rus = rus.fit_resample(X_undersample_train,Y_undersample_train)

# Class counts before and after resampling.
print('original dataset shape:', Counter(Y_undersample_train))
print('Resample dataset shape', Counter(y_rus))

PCA IMPLEMENTATION

In [None]:
class PCA:
    """
    Principal component analysis that keeps as many components as are needed
    to reach a target share of explained variance.

    Args:
    explained_variance_threshold (float): Cumulative explained-variance ratio
        (0-1] that the retained components must reach.

    Attributes set by ``fit``:
    n_components (int): Number of retained principal components.
    components (ndarray): Shape (n_components, n_features); one principal axis
        per row, ordered by decreasing explained variance.
    mean (ndarray): Per-feature mean of the data passed to ``fit``.
    """

    def __init__(self, explained_variance_threshold=0.95):
        self.explained_variance_threshold = explained_variance_threshold
        self.n_components = None
        self.components = None
        self.mean = None

    def fit(self, X):
        """
        Learn the feature means and principal axes from X.

        Args:
        X (array-like): Shape (n_samples, n_features). It is never modified.
        """
        # Work on a float copy: an in-place `X -= mean` would silently centre the
        # caller's data (so a later transform centres it twice) and fails outright
        # on integer arrays.
        data = np.asarray(X, dtype=float)
        self.mean = data.mean(axis=0)
        centered = data - self.mean

        # atleast_2d keeps the single-feature case (scalar covariance) valid.
        cov_matrix = np.atleast_2d(np.cov(centered, rowvar=False))

        # The covariance matrix is symmetric, so eigh applies: it guarantees real
        # eigenvalues/eigenvectors, whereas eig can return complex round-off.
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

        order = np.argsort(eigenvalues)[::-1]  # largest variance first
        # Tiny negative eigenvalues are numerical noise; clip them so the
        # cumulative ratio below is non-decreasing.
        eigenvalues = np.clip(eigenvalues[order], 0.0, None)
        eigenvectors = eigenvectors[:, order].T  # one eigenvector per row

        total_variance = eigenvalues.sum()
        if total_variance > 0:
            cumulative_ratio = np.cumsum(eigenvalues) / total_variance
            # Index of the first component at which the threshold is reached.
            n_needed = int(np.searchsorted(cumulative_ratio, self.explained_variance_threshold)) + 1
        else:
            # Constant data carries no variance; keep a single component.
            n_needed = 1

        # If round-off keeps the ratio just under the threshold, keep everything
        # instead of collapsing to one component.
        self.n_components = min(n_needed, eigenvalues.size)
        # Store the first n_components eigenvectors as principal components
        self.components = eigenvectors[:self.n_components]

    def transform(self, X):
        """
        Project X onto the retained principal components.

        Args:
        X (array-like): Shape (n_samples, n_features). It is never modified.

        Returns:
        ndarray: Shape (n_samples, n_components).
        """
        centered = np.asarray(X, dtype=float) - self.mean
        return np.dot(centered, self.components.T)  # Project data onto the principal components

    def inverse_transform(self, X_transformed):
        """
        Map projected data back to the original feature space.

        Args:
        X_transformed (array-like): Shape (n_samples, n_components).

        Returns:
        ndarray: Shape (n_samples, n_features); an approximation of the original
        data when fewer than n_features components were kept.
        """
        return np.dot(X_transformed, self.components) + self.mean  # Transform data back to the original space

In [None]:
# Apply PCA to the undersampled training data with 95% explained variance threshold
pca = PCA(explained_variance_threshold=0.95)
pca.fit(x_rus)


# Project both the undersampled training set and the untouched test set with the same fitted PCA.
X_train_undersample_transformed = pca.transform(x_rus)
X_test_undersample_transformed = pca.transform(X_undersample_test)

In [None]:
# (rows, retained components) of the projected training set.
X_train_undersample_transformed.shape

In [None]:
# (rows, retained components) of the projected test set.
X_test_undersample_transformed.shape

In [None]:
# Running best for the undersampled grid search: no score seen yet, no parameters chosen yet.
best_accuracy_rus, best_params_rus = 0, dict()


In [None]:
# Exhaustive grid search over learning rate, regularization strength and penalty type.
# Each combination is trained on the PCA-projected undersampled training set and scored by
# accuracy on the PCA-projected test set.
for lr in learning_rates:
    for lambda_param in lambda_params:
        for regularization in regularizations:
            model = LogisticRegressionWithRegularization(learning_rate=lr, lambda_param=lambda_param, regularization=regularization)
            model.fit(X_train_undersample_transformed,y_rus)
            y_pred_rus = model.predict(X_test_undersample_transformed)
            accuracy_rus = np.mean(y_pred_rus == Y_undersample_test)
            print(f"Accuracy with lr={lr}, lambda={lambda_param}, regularization={regularization}  : {round(accuracy_rus,3)}")
            if accuracy_rus > best_accuracy_rus:
                # Update this scenario's own tracker. Writing to `best_accuracy` instead left
                # best_accuracy_rus at its initial value, so every combination "won" and the
                # last one tried was reported as best.
                best_accuracy_rus = accuracy_rus
                best_params_rus = {'learning_rate': lr, 'lambda_param': lambda_param, 'regularization': regularization}

# NOTE(review): after the loop, `model` and `y_pred_rus` belong to the LAST combination tried,
# not to best_params_rus; refit with best_params_rus before reporting metrics if that is the intent.
print("Best parameters:", best_params_rus)

In [None]:
def compute_confusion_matrix(y_true, y_pred):
    """
    Tally a 2x2 confusion matrix for 0/1 labels by walking the label pairs.

    Pairs are taken in lockstep from the two inputs; any pair containing a
    label other than 0 or 1 is left out of every count.

    Args:
    y_true (list or array): Actual true class labels.
    y_pred (list or array): Predicted class labels.

    Returns:
    tuple: (confusion_matrix, TP, TN, FP, FN), where confusion_matrix is
    [[TN, FP], [FN, TP]].
    """
    tp = tn = fp = fn = 0

    for truth, guess in zip(y_true, y_pred):
        if truth == 1:
            # Actual positive: either caught (TP) or missed (FN).
            if guess == 1:
                tp += 1
            elif guess == 0:
                fn += 1
        elif truth == 0:
            # Actual negative: either correctly rejected (TN) or a false alarm (FP).
            if guess == 0:
                tn += 1
            elif guess == 1:
                fp += 1

    matrix = [[tn, fp], [fn, tp]]
    return matrix, tp, tn, fp, fn

In [None]:
# Confusion matrix (sklearn's confusion_matrix) of the test labels against y_pred_rus, i.e. the
# predictions left over from the most recent predict call in the grid-search loop.
cm_rus = confusion_matrix(Y_undersample_test, y_pred_rus)

In [None]:
# Flatten this scenario's confusion matrix into its four cells (row-major: TN, FP, FN, TP).
# It must be cm_rus; `cm` is the matrix of a different scenario.
TN_rus, FP_rus, FN_rus, TP_rus = cm_rus.ravel()

# Display the confusion matrix and the individual components with formatted strings
print(f"True Positives (TP): {TP_rus}")
print(f"True Negatives (TN): {TN_rus}")
print(f"False Positives (FP): {FP_rus}")
print(f"False Negatives (FN): {FN_rus}")

In [None]:
# Calculate Precision, Recall, F1 Score
# Each ratio falls back to 0.0 when its denominator is zero (e.g. the model predicted no
# positives at all), instead of producing NaN or raising a division error.
precision_rus = TP_rus / (TP_rus + FP_rus) if (TP_rus + FP_rus) > 0 else 0.0
recall_rus = TP_rus / (TP_rus + FN_rus) if (TP_rus + FN_rus) > 0 else 0.0

# F1 is the harmonic mean of precision and recall.
f1_score_rus = (2 * (precision_rus * recall_rus) / (precision_rus + recall_rus)
                if (precision_rus + recall_rus) > 0 else 0.0)
print(f"Precision: {precision_rus:.4f}")
print(f"Recall: {recall_rus:.4f}")
print(f"F1 Score: {f1_score_rus:.4f}")


In [None]:
# ROC-AUC of the test labels against y_pred_rus.
# NOTE(review): y_pred_rus comes from model.predict and is compared to labels with `==` elsewhere,
# so it presumably holds hard class labels rather than scores/probabilities; AUC computed from
# hard labels reflects a single operating point only — confirm whether scores are available.
auc_score_rus = roc_auc_score(Y_undersample_test, y_pred_rus)


print(f"ROC-AUC Score: {auc_score_rus:.4f}") # Print the ROC-AUC Score

In [None]:

sns.set(style="whitegrid")

# Accuracy table for the undersampled scenario, entered as literal values:
# 3 learning rates x 3 lambdas x 3 regularization settings = 27 rows, learning rate varying slowest.
data_rus = {
    'Learning Rate': [rate for rate in (0.1, 0.01, 0.001) for _ in range(9)],
    'Lambda': [lam for _block in range(3) for lam in (0.1, 0.01, 0.001) for _ in range(3)],
    'Regularization': ['ridge', 'lasso', 'None'] * 9,
    'Accuracy': [
        # learning rate 0.1
        0.651, 0.62, 0.699, 0.687, 0.604, 0.699, 0.698, 0.696, 0.699,
        # learning rate 0.01
        0.657, 0.617, 0.688, 0.681, 0.682, 0.688, 0.687, 0.684, 0.688,
        # learning rate 0.001
        0.668, 0.631, 0.671, 0.671, 0.661, 0.671, 0.671, 0.67, 0.671,
    ],
}

df_rus = pd.DataFrame(data_rus)

# One row per lambda; columns end up keyed by (Regularization, Learning Rate) after the level swap.
pivot_df_rus = (
    df_rus
    .pivot_table(values='Accuracy', index='Lambda', columns=['Learning Rate', 'Regularization'], aggfunc='first')
    .swaplevel(axis=1)
    .sort_index(axis=1)
)

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6), sharey=True)
fig.suptitle('Accuracy by Lambda and Learning Rate for Different Regularizations', fontsize=16)

# One panel per regularization setting, one line per learning rate.
for ax, reg in zip(axes, ('ridge', 'lasso', 'None')):
    for lr in pivot_df_rus.columns.levels[1]:
        pivot_df_rus[reg][lr].plot(ax=ax, marker='o', label=f'LR={lr}')
    ax.set(title=f'Regularization: {reg}', xlabel='Lambda', ylabel='Accuracy')
    ax.legend(title='Learning Rate')

# Leave the top 5% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


In [None]:

# Accuracy table for the undersampled scenario, entered as literal values:
# 3 learning rates x 3 lambdas x 3 regularization settings = 27 rows, learning rate varying slowest.
data_rus = {
    'Learning Rate': [rate for rate in (0.1, 0.01, 0.001) for _ in range(9)],
    'Lambda': [lam for _block in range(3) for lam in (0.1, 0.01, 0.001) for _ in range(3)],
    'Regularization': ['ridge', 'lasso', 'None'] * 9,
    'Accuracy': [
        # learning rate 0.1
        0.651, 0.62, 0.699, 0.687, 0.604, 0.699, 0.698, 0.696, 0.699,
        # learning rate 0.01
        0.657, 0.617, 0.688, 0.681, 0.682, 0.688, 0.687, 0.684, 0.688,
        # learning rate 0.001
        0.668, 0.631, 0.671, 0.671, 0.661, 0.671, 0.671, 0.67, 0.671,
    ],
}

df_rus = pd.DataFrame(data_rus)

# Rows are (Lambda, Regularization) pairs, columns are learning rates, cells are the mean accuracy.
pivot_df_rus = df_rus.pivot_table(
    index=['Lambda', 'Regularization'],
    columns='Learning Rate',
    values='Accuracy',
    aggfunc='mean',
)

# Annotated heatmap of the pivoted accuracies.
plt.figure(figsize=(10, 8))
sns.heatmap(
    pivot_df_rus,
    annot=True,
    fmt=".3f",
    cmap='viridis',
    linewidths=.5,
    cbar_kws={'label': 'Accuracy'},
)
plt.title('Heatmap of Accuracy vs. Learning Rate, Lambda, and Regularization')
plt.xlabel('Learning Rate')
plt.ylabel('Lambda, Regularization')
plt.show()


In [None]:
# Three-way split for the bias-variance study: 60% train, then the remaining 40% is halved
# into a cross-validation set and a test set. Both splits are stratified and seeded so the
# class ratio is the same in every part and the figure is repeatable.
X_undersample_train_rus, X_undersample_temp, Y_undersample_train_rus, Y_undersample_temp = train_test_split(x_for_undersampled, y_for_undersampled, stratify=y_for_undersampled, test_size=0.4, random_state=42)
X_undersample_cv_rus, X_undersample_test_rus, y_undersample_cv_rus, y_undersample_test_rus = train_test_split(X_undersample_temp, Y_undersample_temp, stratify=Y_undersample_temp, test_size=0.5, random_state=42)



# Fit PCA (95% explained variance threshold) on THIS cell's training split only. Fitting on the
# training frame of the earlier 70/30 split would mix in rows that now sit in the CV/test sets.
# A copy is handed to fit so the frame passed to transform below is untouched whatever fit
# does to its argument.
pca = PCA(explained_variance_threshold=0.95)
pca.fit(X_undersample_train_rus.copy())

# Project all three splits onto the same lower-dimensional space
X_train_undersample_transformed_1 = pca.transform(X_undersample_train_rus)
X_cv_undersample_transformed_1 = pca.transform(X_undersample_cv_rus)
X_test_undersample_transformed_1 = pca.transform(X_undersample_test_rus)



lambdas = [0.1, 0.01, 0.001]  # Varying regularization strengths
learning_rate = 0.1  # Use an optimal or constant learning rate
regularization_type = 'ridge'  # Choose from 'ridge', 'lasso'
# Misclassification rate per lambda, in the same order as `lambdas`.
train_errors = []
test_errors = []
cv_errors = []

plt.figure(figsize=(10, 8))

colors = ['r', 'g', 'b']
labels = ['Lambda = 0.1', 'Lambda = 0.01', 'Lambda = 0.001']

for lambda_param, color, label in zip(lambdas, colors, labels):
    model_rus = LogisticRegressionWithRegularization(learning_rate=learning_rate, lambda_param=lambda_param, regularization=regularization_type)
    training_loss_rus = model_rus.fit(X_train_undersample_transformed_1, Y_undersample_train_rus, return_history=True)

    # Score the model fitted on the training split NOW, with this loop's own model (not the
    # leftover `model` from the grid search) and before it is refitted below.
    y_pred_train_rus = model_rus.predict(X_train_undersample_transformed_1)
    y_pred_val_rus = model_rus.predict(X_test_undersample_transformed_1)
    cv_pred_rus = model_rus.predict(X_cv_undersample_transformed_1)

    train_error_rus = 1 - accuracy_score(Y_undersample_train_rus, y_pred_train_rus)
    val_error_rus = 1 - accuracy_score(y_undersample_test_rus, y_pred_val_rus)
    cv_error_rus = 1 - accuracy_score(y_undersample_cv_rus, cv_pred_rus)

    train_errors.append(train_error_rus)
    test_errors.append(val_error_rus)
    cv_errors.append(cv_error_rus)

    # NOTE(review): the CV and test curves below come from calling fit on those splits, so they
    # are loss histories of training ON the CV/test data, not held-out losses of the model
    # trained above. Confirm whether the model can report held-out loss per iteration instead.
    cv_loss_rus = model_rus.fit(X_cv_undersample_transformed_1, y_undersample_cv_rus, return_history=True)
    test_loss_rus = model_rus.fit(X_test_undersample_transformed_1, y_undersample_test_rus, return_history=True)

    # Plotting
    iteration_axis_rus = range(1, model_rus.num_iterations + 1)
    plt.plot(iteration_axis_rus, training_loss_rus, color=color, linestyle='-', label=f'Training Loss {label}')
    plt.plot(iteration_axis_rus, cv_loss_rus, color=color, linestyle='--', label=f'CV Loss {label}')
    plt.plot(iteration_axis_rus, test_loss_rus, color=color, linestyle=':', label=f'Test Loss {label}')


# Final plot adjustments
plt.title('Training, CV, and Test Losses for Different Lambda Values - Bias Variance Tradeoff')
plt.xlabel('Number of Iterations')
plt.ylabel('Loss')
plt.legend()
plt.show()
