In [2]:
# Credit Score Prediction Neural Network
# Enhanced neural network achieving 74.6% accuracy on 80K+ customer records
# Course: CMPS3500 - Class Project
# Students: Priscilla Zavala, Jennifer Miranda, Ana Rivera, Francisco Andrade
# Date: 12/6/24

# Install libraries if needed
!pip install tabulate --user

# General Packages
import math
import os
from pathlib import Path

# data handling libraries
import pandas as pd
import numpy as np
from tabulate import tabulate

# visualization libraries
from matplotlib import pyplot as plt
import seaborn as sns

# extra libraries
import warnings
warnings.filterwarnings('ignore')

# Packages to support NN
# sklearn
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

#tensorflow
import tensorflow as tf
from tensorflow import keras

# Keras
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense

# import libraries for timing
from datetime import datetime
import pytz

############################################
# Shared pipeline state: df is set by upload_data(); the rest are set by train_model()
df = None
X_train, X_test, model, encoder, y_test = None, None, None, None, None
# train_model() also publishes the row labels of the test split as a global,
# so give it a defined default like the other shared variables.
indices_test = None

# Time zone used by current_time_formatted() when stamping progress messages
local_time_zone = pytz.timezone('America/Los_Angeles')

############################################
# Helper functions

def current_time_formatted():
    '''Return the current time in local_time_zone formatted as 12-hour "HH:MM:SS AM/PM".'''
    now_local = datetime.now(local_time_zone)
    return now_local.strftime("%I:%M:%S %p")

def describe_numerical_column(series, col_name):
    '''Return the min, max and 1.5*IQR outlier fences of a numerical series.

    col_name is accepted for call-site symmetry but is not used in the computation.
    The result is a dict with the keys 'Min. value', 'Outlier lower range',
    'Outlier upper range' and 'Max. value', in that order.
    '''
    lower_quartile = series.quantile(0.25)
    upper_quartile = series.quantile(0.75)
    fence_width = 1.5 * (upper_quartile - lower_quartile)

    summary = {}
    summary['Min. value'] = series.min()
    summary['Outlier lower range'] = lower_quartile - fence_width
    summary['Outlier upper range'] = upper_quartile + fence_width
    summary['Max. value'] = series.max()
    return summary

def summarize_numerical_column_with_deviation(data, num_col, group_col = 'Customer_ID', absolute_summary = True, median_standardization_summary = False):
    '''Summarize a numerical column and, optionally, its per-group median standardization.

    Parameters:
        data: DataFrame holding num_col and group_col.
        num_col: name of the numerical column to summarize.
        group_col: column used to group rows for the median standardization.
        absolute_summary: when truthy, add describe_numerical_column(data[num_col])
            under the key num_col.
        median_standardization_summary: when truthy, add the summary of the
            group-wise median-standardized values under
            'Median standardization of <num_col>' and the largest per-group MAD
            under 'Max. MAD'.

    Returns:
        dict containing only the requested summaries.
    '''
    summary = {}

    if absolute_summary:
        summary[num_col] = describe_numerical_column(data[num_col], num_col)

    if median_standardization_summary:
        standardized_name = f'Median standardization of {num_col}'
        # The largest group MAD is the fallback divisor for groups whose own MAD is zero
        default_MAD = return_max_MAD(data, num_col, group_col)
        standardized = data.groupby(group_col)[num_col].apply(median_standardization, default_value = default_MAD)
        summary[standardized_name] = describe_numerical_column(standardized, standardized_name)
        summary['Max. MAD'] = default_MAD
    return summary

def return_max_MAD(data, num_col, group_col = 'Customer_ID'):
    '''Return the largest median absolute deviation (MAD) of num_col over the groups of group_col.'''
    def _median_abs_deviation(values):
        centered = values - values.median()
        return centered.abs().median()

    per_group_mad = data.groupby(group_col)[num_col].agg(_median_abs_deviation)
    return per_group_mad.max()

def median_standardization(x, default_value):
    '''Center x on its median and scale by its median absolute deviation (MAD).

    When the MAD is zero the values cannot be scaled by it: if every non-null
    value equals the median the result is all zeros, otherwise the centered
    values are divided by default_value instead.
    '''
    center = x.median()
    deviation = (x - center).abs()
    MAD = deviation.median()

    # Non-zero MAD: ordinary robust z-score
    if MAD != 0:
        return (x - center) / MAD

    # Zero MAD with every non-null value sitting on the median
    if (deviation == 0).sum() == deviation.notnull().sum():
        return x * 0

    # Zero MAD but some values differ from the median
    return (x - center) / default_value

def forward_backward_fill(x):
    '''Return x with missing values forward-filled, then backward-filled.

    The backward pass only affects leading gaps that the forward pass could not
    reach. Uses ffill()/bfill() rather than the deprecated fillna(method=...).
    The input is not modified.
    '''
    return x.ffill().bfill()

def return_mode_median_filled_int(x):
    '''Return x with missing values filled from its mode.

    A single mode is used directly; with several modes the integer part of
    their median is used. If x has no non-null values there is no mode to fill
    from, so an unchanged copy is returned instead of raising on int(NaN).
    '''
    modes = x.mode()
    if modes.empty:
        # Nothing observed in this series: leave the gaps for the caller to handle
        return x.copy()
    if len(modes) == 1:
        return x.fillna(modes.iloc[0])
    return x.fillna(int(modes.median()))

def fill_month_history(x):
    '''Rebuild a consecutive month count for every row of x.

    The smallest non-null value is taken as the anchor and every other position
    is set to that value plus its row offset from the anchor, so the result
    increases by one per row. The output has the same length and index as x
    (the original implementation only worked for exactly 8 rows). If x has no
    non-null values, an all-NaN series is returned.
    '''
    if x.notna().sum() == 0:
        return pd.Series(np.nan, index = x.index, dtype = float)

    # argmin skips NaN, so this is the position of the smallest observed value
    anchor_pos = x.argmin()
    anchor_value = x.iloc[anchor_pos]
    offsets = np.arange(len(x)) - anchor_pos
    return pd.Series(anchor_value + offsets, index = x.index)

def calculate_performance_multiclass(y_true, y_pred):
    '''Return accuracy, macro precision/recall/F1 and the confusion matrix for a multiclass prediction.

    The result is a dict with the keys 'accuracy', 'precision', 'recall',
    'f1_score' and 'confusion_matrix'.
    '''
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='macro'),
        'recall': recall_score(y_true, y_pred, average='macro'),
        'f1_score': f1_score(y_true, y_pred, average='macro'),
        'confusion_matrix': confusion_matrix(y_true, y_pred),
    }

############################################
# Main functions

def upload_data(file_):
    '''Load a CSV file into the global df and report its size and load time.

    Parameters:
        file_: path of the CSV file. A relative path is resolved against the
            current working directory; an absolute path is used as given.

    On success df holds the loaded DataFrame. Every failure path prints a
    message and returns without printing the timing line:
      * file missing, unparsable-empty file or any other read error: df is
        left as it was before the call;
      * file parsed but contains no rows: df is reset to None so that the
        "No data loaded" guards in the later steps trigger.
    '''
    global df

    print("Loading credit score dataset...")
    start_time = datetime.now()

    # os.path.join returns file_ unchanged when it is already absolute
    file_path = os.path.join(os.getcwd(), file_)

    try:
        loaded = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
        return
    except pd.errors.EmptyDataError:
        print("Error: The file is empty.")
        return
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return

    if loaded.empty:
        # Do not leave an empty frame behind for clean_data()/train_model() to choke on
        df = None
        print("File is empty.")
        return

    df = loaded
    rows, columns = df.shape
    print(f"Dataset loaded: {rows} rows, {columns} columns")

    total_time = datetime.now() - start_time
    print(f"Time to load: {total_time}")

def _mask_invalid(frame, column, invalid):
    '''Set frame[column] to NaN wherever the boolean Series invalid is True.'''
    # Whole-column assignment instead of df[col][mask] = ..., which is a chained
    # assignment and does not write through under pandas Copy-on-Write.
    frame[column] = frame[column].mask(invalid)


def _strip_underscores_to_float(series):
    '''Drop every "_" from the string form of series and parse it as float ("" becomes NaN).'''
    return series.astype(str).str.replace('_', '', regex=False).replace('', np.nan).astype(float)


def _ffill_per_customer_then_bfill(frame, column):
    '''Forward-fill column within each Customer_ID, then backward-fill what is still missing.'''
    # NOTE(review): only the forward fill is per customer. The backward fill runs
    # over the whole column (as in the original code), so a value can be taken
    # from the rows of a following customer. Confirm whether that is intended.
    return frame.groupby('Customer_ID')[column].ffill().bfill()


def clean_data():
    '''Clean the global df in place so it can be fed to train_model().

    Drops columns the model does not use, strips stray characters from numeric
    text columns, replaces out-of-range or placeholder values with NaN and
    fills the gaps from the other rows of the same Customer_ID. Prints progress
    and timing. Does nothing except print an error when no data is loaded.
    '''
    global df
    if df is None:
        print("Error: No data loaded. Please upload data first.")
        return

    print("Cleaning data...")
    print(f"[{current_time_formatted()}] Performing Data Clean Up")
    start_time = datetime.now()

    # Dropping unrelated columns
    columns_to_drop_unrelated = ['Unnamed: 0', 'Month', 'Name', 'SSN']
    df.drop(columns=columns_to_drop_unrelated, inplace=True)

    # Dropping columns not used in this model
    columns_to_drop_not_used = ['Type_of_Loan', 'Changed_Credit_Limit', 'Total_EMI_per_month',
                                'Amount_invested_monthly', 'Monthly_Balance']
    df.drop(columns=columns_to_drop_not_used, inplace=True)

    ### cleaning age ###
    age = df['Age'].str.replace('_', '', regex=False).str.replace('#', '', regex=False).astype(int)
    df['Age'] = age.mask((age > 100) | (age <= 0))
    df['Age'] = _ffill_per_customer_then_bfill(df, 'Age').astype(int)

    ### cleaning occupation ###
    _mask_invalid(df, 'Occupation', df['Occupation'] == '_______')
    df['Occupation'] = _ffill_per_customer_then_bfill(df, 'Occupation').astype("string")

    ### annual income ###
    income = df['Annual_Income'].str.replace('_', '', regex=False).astype(float)
    df['Annual_Income'] = income.mask(income > 180000)
    df['Annual_Income'] = _ffill_per_customer_then_bfill(df, 'Annual_Income')

    ### monthly inhand salary ###
    df['Monthly_Inhand_Salary'] = _ffill_per_customer_then_bfill(df, 'Monthly_Inhand_Salary')

    ### number of credit cards ###
    _mask_invalid(df, 'Num_Credit_Card', df['Num_Credit_Card'] > 11)
    df['Num_Credit_Card'] = _ffill_per_customer_then_bfill(df, 'Num_Credit_Card')

    ### interest rate ###
    # Every row of a customer gets that customer's median of the in-range rates
    _mask_invalid(df, 'Interest_Rate', df['Interest_Rate'] > 34)
    df['Interest_Rate'] = df.groupby('Customer_ID')['Interest_Rate'].transform('median')

    ### credit mix ###
    _mask_invalid(df, 'Credit_Mix', df['Credit_Mix'] == '_')
    df['Credit_Mix'] = _ffill_per_customer_then_bfill(df, 'Credit_Mix').astype("string")

    ### credit score ###
    df['Credit_Score'] = df['Credit_Score'].astype("string")

    ### number of loans ###
    df['Num_of_Loan'] = _strip_underscores_to_float(df['Num_of_Loan'])
    # NOTE(review): "<= 0" also blanks a count of exactly 0, which is then
    # overwritten by the fill below. Confirm that zero loans should count as missing.
    _mask_invalid(df, 'Num_of_Loan', (df['Num_of_Loan'] > 15) | (df['Num_of_Loan'] <= 0))
    df['Num_of_Loan'] = _ffill_per_customer_then_bfill(df, 'Num_of_Loan')

    ### number of bank accounts ###
    _mask_invalid(df, 'Num_Bank_Accounts', df['Num_Bank_Accounts'] < 0)
    # Blank values more than 2 MADs away from the customer's own median
    accounts_deviation = df.groupby('Customer_ID')['Num_Bank_Accounts'].transform(
        median_standardization, default_value=return_max_MAD(df, 'Num_Bank_Accounts'))
    _mask_invalid(df, 'Num_Bank_Accounts', accounts_deviation.abs() > 2)
    df['Num_Bank_Accounts'] = df.groupby('Customer_ID')['Num_Bank_Accounts'].transform(forward_backward_fill).astype(int)

    ### payment of min. amount ###
    # 'NM' is treated as missing and replaced by the customer's most frequent answer
    df['Payment_of_Min_Amount'] = df['Payment_of_Min_Amount'].map({'Yes': 1, 'No': 0, 'NM': np.nan})
    df['Payment_of_Min_Amount'] = df.groupby('Customer_ID')['Payment_of_Min_Amount'].transform(lambda x: x.fillna(x.mode()[0]))
    df['Payment_of_Min_Amount'] = df['Payment_of_Min_Amount'].map({1: 'Paid', 0: 'NotPaid'})

    ### number of delayed payments ###
    df['Num_of_Delayed_Payment'] = _strip_underscores_to_float(df['Num_of_Delayed_Payment'])
    # Only the column-wide upper outlier fence is needed here, so the per-customer
    # deviation summary the original also computed (and never read) is skipped.
    delayed_upper = describe_numerical_column(df['Num_of_Delayed_Payment'], 'Num_of_Delayed_Payment')['Outlier upper range']
    _mask_invalid(df, 'Num_of_Delayed_Payment',
                  (df['Num_of_Delayed_Payment'] > delayed_upper) | (df['Num_of_Delayed_Payment'] < 0))
    df['Num_of_Delayed_Payment'] = df.groupby('Customer_ID')['Num_of_Delayed_Payment'].transform(return_mode_median_filled_int).astype(int)

    ### delay from due date ###
    # Used as-is: the original computed a per-customer deviation here but never
    # applied it, so that dead computation has been removed.

    ### outstanding debt ###
    df['Outstanding_Debt'] = df['Outstanding_Debt'].str.replace('_', '', regex=False).astype(float)

    ### num. of credit inquiries ###
    inquiries_upper = describe_numerical_column(df['Num_Credit_Inquiries'], 'Num_Credit_Inquiries')['Outlier upper range']
    _mask_invalid(df, 'Num_Credit_Inquiries',
                  (df['Num_Credit_Inquiries'] > inquiries_upper) | (df['Num_Credit_Inquiries'] < 0))
    df['Num_Credit_Inquiries'] = df.groupby('Customer_ID')['Num_Credit_Inquiries'].transform(forward_backward_fill).astype(int)

    ### credit history age ###
    # "<Y> Years and <M> Months" -> total months; rows that do not match stay NaN
    # and are rebuilt per customer by fill_month_history.
    history_parts = df['Credit_History_Age'].str.extract(r'(?P<Years>\d+) Years and (?P<Months>\d+) Months').astype(float)
    df['Credit_History_Age'] = history_parts['Years'] * 12 + history_parts['Months']
    df['Credit_History_Age'] = df.groupby('Customer_ID')['Credit_History_Age'].transform(fill_month_history).astype(int)

    print(f"Data cleaning completed. Final shape: ({df.shape[0]}, {df.shape[1]})")

    total_time = datetime.now() - start_time
    print(f"Time to clean: {total_time}")

def train_model():
    '''Encode the cleaned data, split it 80/20, then build, train and evaluate the network.

    Reads the global df and publishes X_train, X_test, y_test, indices_test,
    model and encoder as globals for generate_predictions(). The global
    encoder ends up fitted on Credit_Score, so it maps one-hot/softmax columns
    back to class labels. The one-hot columns for the categorical features and
    the target are also appended to df, as before.

    The StandardScaler is fitted on the training rows only and then applied to
    the test rows, so no test-set statistics leak into training. The scaler is
    local to this function and is not kept afterwards.
    '''
    global df, X_train, X_test, model, encoder, y_test, indices_test

    print("Building enhanced neural network...")
    start_time = datetime.now()

    if df is None:
        print("Error: No data loaded. Please upload data first.")
        return

    ### Feature selection ###
    target = ['Credit_Score']
    continuous_features = ['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
                           'Num_Credit_Card', 'Interest_Rate', 'Credit_Utilization_Ratio', 'Num_of_Loan',
                           'Num_of_Delayed_Payment', 'Delay_from_due_date', 'Outstanding_Debt',
                           'Num_Credit_Inquiries', 'Credit_History_Age']
    categorical_features = ['Occupation', 'Credit_Mix', 'Payment_of_Min_Amount']

    # One-hot encode the categorical inputs with their own encoder so it is not
    # overwritten when the target is encoded below.
    feature_encoder = OneHotEncoder(handle_unknown='ignore')
    encoded_features = feature_encoder.fit_transform(df[categorical_features]).toarray()
    encoded_df = pd.DataFrame(encoded_features, columns=feature_encoder.get_feature_names_out(categorical_features))

    # One-hot encode the target; this is the encoder shared with generate_predictions()
    encoder = OneHotEncoder(handle_unknown='ignore')
    y = encoder.fit_transform(df[target]).toarray()
    encoded_target_df = pd.DataFrame(y, columns=encoder.get_feature_names_out(target))

    # Unscaled continuous columns first, one-hot columns after them
    X = np.hstack([df[continuous_features].to_numpy(dtype=float), encoded_features])

    # Keep the encoded columns on df, in the same order the original appended them
    df = pd.concat([df, encoded_df, encoded_target_df], axis=1)

    ### Train ###
    # Basic train-test split - 80% training and 20% test
    X_train, X_test, y_train, y_test, _, indices_test = train_test_split(
        X, y, df.index, test_size=0.20, random_state=42)

    # Scale continuous features using statistics from the training rows only
    n_continuous = len(continuous_features)
    scaler = StandardScaler()
    X_train = np.hstack([scaler.fit_transform(X_train[:, :n_continuous]), X_train[:, n_continuous:]])
    X_test = np.hstack([scaler.transform(X_test[:, :n_continuous]), X_test[:, n_continuous:]])

    print(f"Feature preparation completed. Feature matrix shape: {X.shape}")
    print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

    # Network topology: 32 -> 64 -> 128 -> 128 -> 64 ReLU units, softmax output
    # with one unit per target class.
    model = keras.Sequential([
        keras.Input(shape=(X_train.shape[1],)),
        keras.layers.Dense(32, activation="relu"),
        keras.layers.Dense(64, activation="relu"),
        keras.layers.Dense(128, activation="relu"),
        keras.layers.Dense(128, activation="relu"),
        keras.layers.Dense(64, activation="relu"),
        keras.layers.Dense(y_train.shape[1], activation="softmax"),
    ])

    # Compile the model
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.CategoricalCrossentropy(),
                  metrics=['accuracy'])

    print("Model built successfully. Total parameters:", model.count_params())
    print("Training model...")

    # Train the model for 13 epochs
    model.fit(X_train, y_train, epochs=13, batch_size=20)

    # Evaluate accuracy on the held-out split
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)

    print(f"Model Accuracy: {test_acc:.1%}")

    total_time = datetime.now() - start_time
    print(f"Time to train: {total_time}")

def generate_predictions():
    '''Score the held-out split, save the predicted labels to CSV and print the metrics.

    Uses the globals published by train_model(): model, encoder (fitted on the
    target), X_test, y_test and indices_test, plus df for the 'ID' column.
    Writes predictionClassProject9.csv with columns ID and Credit_Score.
    Prints a message and returns early when the model has not been trained.
    '''
    global df, X_train, X_test, model, encoder, y_test, indices_test

    print("Evaluating model...")
    start_time = datetime.now()

    if X_train is not None and X_test is not None:
        print(f"Size of training set: {len(X_train)}")
        print(f"Size of testing set: {len(X_test)}")
    else:
        print("X_train or X_test is not initialized. Please train the model first.")
        return

    if model is None or encoder is None:
        print("Model is not initialized. Please train the model first.")
        return

    print("Generating predictions using Neural Network...")

    # Make Predictions
    predictions = model.predict(X_test)

    # Map one-hot rows (true) and softmax rows (predicted) back to class labels
    # by taking the arg-max column explicitly; both results are 1-D label arrays.
    class_labels = encoder.categories_[0]
    y_tested = class_labels[np.asarray(y_test).argmax(axis=1)]
    y_predicted = class_labels[np.asarray(predictions).argmax(axis=1)]

    # Save predictions to CSV
    customer_ids_for_predictions = df.loc[indices_test, 'ID']
    results_df = pd.DataFrame({
        'ID': customer_ids_for_predictions.to_numpy(),
        'Credit_Score': y_predicted
    })
    results_df.to_csv('predictionClassProject9.csv', index=False)
    print("Predictions saved to predictionClassProject9.csv")

    # Calculate performance metrics
    performance_metrics = calculate_performance_multiclass(y_tested, y_predicted)

    # Display results
    print("\n" + "="*50)
    print("MODEL PERFORMANCE RESULTS")
    print("="*50)
    print(f"Accuracy: {performance_metrics['accuracy']:.1%}")
    print(f"Precision: {performance_metrics['precision']:.1%}")
    print(f"Recall: {performance_metrics['recall']:.1%}")
    print(f"F1-Score: {performance_metrics['f1_score']:.1%}")

    print("\nConfusion Matrix:")
    cm = performance_metrics['confusion_matrix']
    # Rows follow the sorted labels that occur in the true or predicted values,
    # so label them from the data instead of assuming all three classes appear.
    present_labels = np.unique(np.concatenate([y_tested, y_predicted]))
    for label, row in zip(present_labels, cm):
        print(f"Actual {label}:", row)

    total_time = datetime.now() - start_time
    print(f"\nTime to generate predictions: {total_time}")
    print("Training completed successfully!")
    print(f"Final accuracy: {performance_metrics['accuracy']:.1%}")

############################################
# Execute the complete pipeline

# Step 1: load the raw CSV into the shared df
upload_data('credit_score_data.csv')

# Steps 2-4: clean the data, train the model, then generate predictions
for pipeline_step in (clean_data, train_model, generate_predictions):
    pipeline_step()

print("\nCredit Score Prediction Pipeline Complete!")

Loading credit score dataset...
Dataset loaded: 80000 rows, 29 columns
Time to load: 0:00:02.476619
Cleaning data...
[04:14:23 PM] Performing Data Clean Up
Data cleaning completed. Final shape: (80000, 20)
Time to clean: 0:01:00.484875
Building enhanced neural network...
Feature preparation completed. Feature matrix shape: (80000, 33)
Training set: (64000, 33), Test set: (16000, 33)
Model built successfully. Total parameters: 36483
Training model...
Epoch 1/13
[1m3200/3200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.6575 - loss: 0.7433
Epoch 2/13
[1m3200/3200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - accuracy: 0.6869 - loss: 0.6889
Epoch 3/13
[1m3200/3200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.6898 - loss: 0.6802
Epoch 4/13
[1m3200/3200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.7014 - loss: 0.6622
Epoch 5/13
[1m3200/3200[0m [32m━━━━━━━━━━━━━━━━━━━━[