In [2]:
%config InteractiveShell.ast_node_interactivity='all'

import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score


In [None]:
# Outlier inspection: draw one box plot per selected field of df_orig.

# Only "Age" is inspected here; the wider candidate list was
# FeatureFieldN + NumberFields.
for feature in ["Age"]:
    print(feature)
    single_field = df_orig[[feature]]
    single_field.boxplot()
    plt.title(f'Distribution of {feature} Features')
    plt.ylabel('Values')
    plt.show()


### Correlation Finding

In [None]:
# Heat map of the pairwise correlation between all columns of df.
correlation_matrix = df.corr()

heatmap_options = dict(annot=True, cmap='viridis', linewidths=0.5)
plt.figure(figsize=(40, 40))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix')
plt.show()

### Find out whether loan applications can be grouped into approve/reject

In [None]:
# Use the unsupervised K-Means method to group the data into two clusters and
# check how well the clusters line up with the manual LoanApproved flag.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Extract the features (first 10,000 rows). LoanApproved is kept in df_train
# for evaluation only and is removed from the training matrix X.
df_train = df[FeatureFieldNH + FeatureFieldOH + FeatureFieldB + NumberFields + ["MonthlyLoanPayment", "LoanApproved"]].head(10000)
X = df_train.drop(["LoanApproved"], axis=1)

# Standardize the data.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply K-Means with two clusters only (approve/reject). Several random seeds
# are tried and the run whose raw cluster labels agree best with LoanApproved
# is kept. The score is the ratio of matching to non-matching rows.
best_score = -1
best_labels = None
best_model = None
b = pd.DataFrame()
# random_state = 58 # 58 is best for full dataset, 48 is best for first half dataset
for r in range(40, 70):
    kmeans_model = KMeans(n_clusters=2, init='k-means++', random_state=r)
    labels = kmeans_model.fit_predict(X_scaled)
    # score = silhouette_score(X_scaled, labels)
    b["LoanApprovedCompare"] = df_train["LoanApproved"] == labels.astype('bool')
    # Count explicitly instead of value_counts()[True] / value_counts()[False]:
    # that form raises KeyError whenever every row (or no row) matches.
    n_match = int(b["LoanApprovedCompare"].sum())
    n_mismatch = len(b) - n_match
    score = n_match / n_mismatch if n_mismatch else float("inf")
    if score > best_score:
        best_score = score
        best_labels = labels
        best_model = kmeans_model
        print(f"best random_state={r} -> score={score}")
    else:
        print(f"random_state={r} -> score={score}")

# Save the trained model.
dump(best_model, 'kmeans_model.joblib')

# Add the cluster labels to a COPY of the features. Assigning to an alias
# (C = X) would write LoanApproved and the derived columns into X itself.
C = X.copy()
C['LoanApproved'] = df_train["LoanApproved"]
C['LoanApprovedAI'] = best_labels.astype('bool')
C["LoanApprovedCompare"] = C["LoanApproved"] == C["LoanApprovedAI"]

# Match / mismatch ratio of the best run (same metric as the loop score).
n_match = int(C["LoanApprovedCompare"].sum())
n_mismatch = len(C) - n_match
n_match / n_mismatch if n_mismatch else float("inf")

In [121]:
# Re-use the persisted model.

# Load the model from file.
new_model = load('kmeans_model.joblib')
# Predict the clusters again with the reloaded model.
new_labels = new_model.predict(X_scaled)

# Build the comparison frame on a copy of X; with a plain alias (C = X) the
# label and comparison columns below would be added to the feature frame X.
C = X.copy()
C['LoanApproved'] = df_train["LoanApproved"]
C['LoanApprovedAINew'] = new_labels.astype('bool')
C['LoanApprovedAI'] = best_labels.astype('bool')
# Manual label vs. the labels of the in-memory best model.
C["LoanApprovedCompare"] = C["LoanApproved"] == C["LoanApprovedAI"]
# In-memory best model vs. the model reloaded from disk.
C["LoanApprovedAICompare"] = C["LoanApprovedAI"] == C["LoanApprovedAINew"]


In [None]:
# logic before change to Loan Approved Seed
# Use the unsupervised Gaussian Mixture method to train the model.

# Construct the training data (first 20,000 rows).
df_train = df[FeatureFieldNH + FeatureFieldOH + FeatureFieldB + NumberFields + ["LoanAmount", "LoanDuration", "LoanApproved", "CreditScore", "RiskScore"]].head(20000)

# Derive X for training: remove the real LoanApproved indicator and
# CreditScore, RiskScore. (Outlier records are NOT excluded in this cell.)
X = df_train.drop(["LoanApproved", "CreditScore", "RiskScore"], axis=1)

# Standardize the data.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply GaussianMixture clustering with two components only (approve/reject).
gmm = None
best_score = -1
best_labels = None
best_model = None
label_swap_indicator = False
for r in range(40, 60):
    # Train the model and predict the component of every row.
    gmm = GaussianMixture(n_components=2, init_params="random", random_state=r)
    labels = gmm.fit_predict(X_scaled)

    # Evaluate the clustering with two assumptions:
    # 1. False is more frequent than True, i.e. Approve is less than Reject
    # 2. the bigger the gap between False and True the better
    b = pd.DataFrame()
    b["LoanApprovedAI"] = labels.astype('bool')
    # Explicit counts: value_counts()[True] raises KeyError if a component is empty.
    t = int(b["LoanApprovedAI"].sum())
    f = len(b) - t
    score = abs(t - f)/(t+f)
    if score > best_score:
        best_score = score
        best_model = gmm
        # Always keep the raw labels of the best model so best_labels and
        # best_model describe the same run; label_swap_indicator records
        # whether True/False must be inverted to mean approve/reject.
        best_labels = b["LoanApprovedAI"]
        if (t < f): # Approve is less than Reject
            label_swap_indicator = False
            print(f"best random_state={r} -> score={score}, (t={t}/f={f})")
        else:
            label_swap_indicator = True
            print(f"best random_state={r} -> score={score}, (f={f}/t={t})")

    else:
        print(f"random_state={r} -> score={score}/({t}/{f})")

# Print the means and covariances of the components of the selected model
# (not of the last model fitted in the loop).
print("Means:", best_model.means_)
print("Covariances:", best_model.covariances_)


In [None]:
# Logic before remove score calculation


# Use the unsupervised Gaussian Mixture method to train the model.

# Clone Y from the training data; Y is used for result evaluation later.
Y = df_train.copy(deep=True)
# to exclude Outlier data
# Y = Y[Y["Outlier"] == False]

# Derive X for training.
X = Y.copy(deep=True)

# Remove the manual label LoanApproved and CreditScore, RiskScore; they may affect the training.
drop_fields = ["LoanApproved", "CreditScore", "RiskScore"]
# Remove fields that are used to calculate other fields; they may be redundant for training.
drop_fields = drop_fields + ["AnnualIncome", "TotalAssets", "TotalLiabilities", "LoanAmount", ]
drop_fields = drop_fields + ["SavingsAccountBalance", "CheckingAccountBalance", ]
# drop_fields = drop_fields + ["MonthlyIncome", "MonthlyDebtPayments", "MonthlyLoanPayment", ]
drop_fields = drop_fields + ["Outlier"]
X = X.drop(drop_fields, axis=1)

# Standardize the data, keeping X's columns and index on the result.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index) # to align the index

# Train the model and predict the cluster.
best_model = None
best_labels = None
for e in range(1, 2):
    label_col = "LoanApprovedAIl" + str(e)  # raw component label of this epoch
    flag_col = "LoanApprovedAI" + str(e)    # True when the row shares the seed's component

    # Seed record that is treated as approved for sure. It identifies which
    # cluster is marked as approved.
    LoanApprovedSeed = X.nlargest(64, "NetWorth").nlargest(16, "MonthlyIncome").nsmallest(4, "MonthlyDebtPayments").nsmallest(1, "MonthlyLoanPayment")
    seed_index = LoanApprovedSeed.index[0]

    best_score = -1
    for r in range(48, 49):
        # Use GaussianMixture for clustering, approve/reject.
        gmm = GaussianMixture(n_components=4, init_params="random", random_state=r)

        labels = gmm.fit_predict(X_scaled)
        b = pd.DataFrame({label_col: labels}, index=X_scaled.index)
        # seed_index is an index LABEL, so look it up with .at (label based);
        # .iat is positional and only agrees while the index is 0..n-1.
        seed_label = b.at[seed_index, label_col]
        # Mark the row as approved if its label equals the seed record's label.
        b[flag_col] = b[label_col] == seed_label

        # Evaluate the picking of the cluster.
        t = int(b[flag_col].sum())
        f = len(b) - t
        # Alternatives that were tried and are not in effect:
        #   abs(t - f)/(t + f)     Balance Ratio (bigger gap is better)
        #   -abs(t - f)/(t + f)    Negative Balance Ratio (smaller gap is better)
        #   min(t, f) / max(t, f)  Class Balance Measure
        score = 1 - (t/(t + f))**2 - (f/(t + f))**2 # Gini Coefficient, assume Approve and Reject is similar
        if score > best_score and t < f:
            best_model = gmm
            best_labels = b
            best_score = score
            print(f"best epoch={e}, random_state={r}, Score={score}, True={t}, False={f}, SeedIndex={seed_index}, SeedLabel={seed_label}")
        else:
            print(f"epoch={e}, random_state={r}, Score={score}, True={t}, False={f}, SeedIndex={seed_index}, SeedLabel={seed_label}")

    # No run satisfied "fewer approvals than rejections": there is nothing to
    # copy into Y for this epoch (best_labels would be None or lack the columns).
    if best_labels is None or flag_col not in best_labels.columns:
        print(f"epoch={e}: no clustering with True < False was found; Y is left unchanged")
        continue

    Y[flag_col] = best_labels[flag_col]
    Y[label_col] = best_labels[label_col]

# Predict the component labels for each data point

# Print the means and covariances of the components
# print("Means:", gmm.means_)
# print("Covariances:", gmm.covariances_)


In [None]:
# logic to use repeat prediction for same set of data. result is fixed and no improvement.

# to use unsupervised Gaussian Mixture method to train the model

# to clone Y from training data, Y is used for result evaluation later
Y = df_train.copy(deep=True).head(20000)
Y["LoanApprovedAI"] = False

# to exclude Outlier data
Y = Y[Y["Outlier"] == False]

# to derive X for training
X = Y.copy(deep=True)

# to remove the manual label LoanApproved indicator and CreditScore, RiskScore, it may affect the training
drop_fields = ["LoanApproved", "CreditScore", "RiskScore"]
# to remove fields that are used to calculate other field. it may be redundant for training.
drop_fields = drop_fields + ["AnnualIncome", "TotalAssets", "TotalLiabilities", "LoanAmount", ]
drop_fields = drop_fields + ["SavingsAccountBalance", "CheckingAccountBalance", ]
# drop_fields = drop_fields + ["MonthlyIncome", "MonthlyDebtPayments", "MonthlyLoanPayment", ]
drop_fields = drop_fields + ["Outlier"]
X = X.drop(drop_fields, axis=1)

# to train the model and predict the cluster
# to define hyper-parameter
hp_r = 48 # random_state
hp_c = 6 # number of cluster for labeling
hp_e = 5 # epoch for training
hp_s = 0.2 # evaluation score

# to standardize the data
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index) # to align the index

# To use GaussianMixture model for clustering, approve/reject
gmm = GaussianMixture(n_components=hp_c, init_params="random", random_state=hp_r)
    
best_model = None
best_labels = None
for e in range(1, hp_e + 1):
    # to prepare X scaled of the epoch
    # a = Y[Y["LoanApprovedAI"] == False].index
    Xe_scaled = X_scaled[X_scaled.index.isin(Y[Y["LoanApprovedAI"] == False].index)]

    # to calc the Seed Record that is approved for sure. It is used to identify which cluster is marked as approved during un-supervised training, semi-un-supervised
    LoanApprovedSeed = Y[Y["LoanApprovedAI"] == False].nlargest(64, "NetWorth").nlargest(16, "MonthlyIncome").nsmallest(4, "MonthlyDebtPayments").nsmallest(1, "MonthlyLoanPayment")

    # to standardize the labels
    labels = gmm.fit_predict(Xe_scaled)
    b = pd.DataFrame({"LoanApprovedAIl" + str(e): labels}, index=Xe_scaled.index)
    b["LoanApprovedAI" + str(e)] = b["LoanApprovedAIl" + str(e)] == b.loc[LoanApprovedSeed.index[0]][0]

    # to evaluate the picking of cluster
    t = b["LoanApprovedAI" + str(e)].value_counts()[True]
    f = b["LoanApprovedAI" + str(e)].value_counts()[False]
    score = -abs(t - f)/(t + f) # Negative Balance Ratio, assume the gap between False and True, smaller is better, ie. Approve and Reject is similar
    score = 1 - (t/(t + f))**2 - (f/(t + f))**2 # Gini Coefficient, assume Approve and Reject is similar
    score = abs(t - f)/(t + f) # Balance Ratio, assume the gap between False and True, bigger is better, ie. lesser Approve is better
    score = min(t, f) / max(t, f) # Class Balance Measure, assume Approve and Reject is similar
    if score > hp_s: 
        print(f"skip, epoch={e}, random_state={hp_r}, Score={score}, True={t}, False={f}")
        continue
    else:
        print(f"epoch={e}, random_state={hp_r}, Score={score}, True={t}, False={f}")
        
    # to keep the label
    Y["LoanApprovedAIl" + str(e)] = b["LoanApprovedAIl" + str(e)]
    # to mark LoanApprovedAI of current epoch as True if the label is same as the label of the seed record
    Y["LoanApprovedAI" + str(e)] = b["LoanApprovedAI" + str(e)]

    # to mark the overall LoanApprovedAI as True if any of epoch mark the it as Approved
    col_approved =[]
    for i in range(1, e + 1):
        col_approved.append("LoanApprovedAI" + str(i))
    Y["LoanApprovedAI"] = Y[col_approved].any(axis=1)

    print(f"epoch={e}, random_state={hp_r}, SeedIndex={LoanApprovedSeed.index[0]}, SeedLabel={b.loc[LoanApprovedSeed.index[0]][0]}")
    b.value_counts()
    df_group = Y.groupby(["LoanApprovedAI", "LoanApproved"]).agg({'MonthlyLoanPayment': ['sum', 'count']})
    formatted_sum = df_group.applymap(lambda x: f"{x:,.2f}")
    print(formatted_sum)

# Predict the component labels for each data point

# Print the means and covariances of the components
# print("Means:", gmm.means_)
# print("Covariances:", gmm.covariances_)


In [None]:
# Static 3D scatter of the evaluation runs:
# x = Clusters, y = Random, z = Score, marker size driven by Accuracy.

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Scale Accuracy to a suitable dot size.
marker_sizes = eval_result['Accuracy']*1000
ax.scatter(
    eval_result['Clusters'],
    eval_result['Random'],
    eval_result['Score'],
    s=marker_sizes,
    alpha=0.7,
    edgecolors='black',
)

# Axis labels and title
ax.set_xlabel('Clusters')
ax.set_ylabel('Random')
ax.set_zlabel('Score')
ax.set_title('3D Scatter Plot of Clusters, Random, and Score')

plt.show()

In [None]:
# Radar chart (plotly): one trace per distinct value of data['Clusters'],
# showing the mean TT / FF / TF / FT of the rows in that cluster.
categories = ['TT', 'FF', 'TF', 'FT']
N = len(categories)

# Evenly spaced angles, one per category.
angles = np.linspace(0, 2*np.pi, N, endpoint=False)

fig = go.Figure()
for i, cluster in enumerate(data['Clusters'].unique()):
    cluster_rows = data[data['Clusters'] == cluster]
    values = cluster_rows[['TT', 'FF', 'TF', 'FT']].mean().values
    # The line colour is derived from the trace position.
    trace_line = dict(color=f'rgb({i*50}, {i*20}, {i*100})')
    trace = go.Scatterpolar(
        r=values,
        theta=categories,
        name=f'Cluster {cluster}',
        line=trace_line,
    )
    fig.add_trace(trace)

# The radial axis spans 0 .. the largest single count in the four columns.
radial_upper = data[['TT', 'FF', 'TF', 'FT']].max().max()
fig.update_layout(
    title='Radar Chart',
    polar=dict(radialaxis=dict(visible=True, range=[0, radial_upper])),
    showlegend=True
)


In [None]:
# Radar chart (matplotlib): one polar line per distinct value of
# data['Clusters'], showing the mean TT / FF / TF / FT of that cluster.
categories = ['TT', 'FF', 'TF', 'FT']
N = len(categories)

plt.figure(figsize=(10, 8))
for i, cluster in enumerate(data['Clusters'].unique()):
    in_cluster = data['Clusters'] == cluster
    values = data[in_cluster][['TT', 'FF', 'TF', 'FT']].mean().values
    angles = np.linspace(0, 2*np.pi, N, endpoint=False)
    plt.polar(angles, values, 'o-', linewidth=2, label=cluster)

# Label each spoke with its category (thetagrids expects degrees).
spoke_degrees = angles * 180/np.pi
plt.thetagrids(spoke_degrees, categories)
plt.legend()
plt.show()

In [None]:
# Heat map of the TT / FF / TF / FT columns of `data`
# (one heat-map row per row of `data`).
conf_mat = data[['TT', 'FF', 'TF', 'FT']]

plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(conf_mat, annot=True, cmap='Blues')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Reshape to long form: one row per (Clusters, Random, outcome) with its count,
# so the four outcome columns can be compared in one bar chart.
data_melt = data.melt(
    id_vars=['Clusters', 'Random'],
    value_vars=['TT', 'FF', 'TF', 'FT'],
    var_name='Outcome',
    value_name='Count',
)

# Plot the melted outcome/count columns, grouped by cluster. Plotting
# x='Clusters', y='Random' would ignore the melted values entirely and would
# not show the comparison named in the title.
plt.figure(figsize=(10, 6))
sns.barplot(x='Outcome', y='Count', hue='Clusters', data=data_melt)
plt.title('TT, FF, TF, FT Comparison')
plt.show()

In [None]:
# Prepare one evaluation record for visualization analysis and append it to eval_result.
C = pd.DataFrame()
C["LoanApproved"] = Y["LoanApproved"]
C["LoanApprovedAI"] = b["LoanApprovedAI"]
# The comparison must live in C because the accuracy below reads it from C.
# It is also stored on Y, which is where this cell used to write it.
C["LoanApprovedCompare"] = C["LoanApproved"] == C["LoanApprovedAI"]
Y["LoanApprovedCompare"] = C["LoanApprovedCompare"]

# Share of rows where the manual flag and the AI flag agree.
accuracy = C["LoanApprovedCompare"].mean()
print(f"Accuracy={accuracy}")

# Row counts per (LoanApprovedAI, LoanApproved) pair; the first letter of the
# TT/FF/TF/FT keys is the AI flag, the second the manual flag.
la_tf = C.groupby(["LoanApprovedAI", "LoanApproved"]).size()


def _pair_count(pair):
    """Return the size of one (LoanApprovedAI, LoanApproved) group, 0 if it does not occur."""
    try:
        return la_tf.loc[pair]
    except KeyError:
        return 0


eval_result = pd.concat([eval_result, pd.Series({"Clusters": c, "Random": r, "Score": score, "Accuracy": accuracy, "TT": _pair_count((True, True)), "FF": _pair_count((False, False)), "TF": _pair_count((True, False)), "FT": _pair_count((False, True))}).to_frame().T], ignore_index=True)


In [None]:
# Scatter-plot matrix of the TT / FF / TF / FT columns, first row of `data` only.
first_row_outcomes = data[['TT', 'FF', 'TF', 'FT']].head(1)
sns.pairplot(first_row_outcomes)
plt.show()

In [None]:
# Interactive 3D view of the evaluation runs (x=Clusters, y=Random, z=Score,
# marker size from Accuracy). The run with the highest Accuracy and the run
# with the highest Score are overlaid as separate single-point traces.
max_acc_idx = eval_result['Accuracy'].idxmax()
max_score_idx = eval_result['Score'].idxmax()


def _accuracy_highlight_trace(row_idx, color, hover_prefix):
    """Build a one-point Scatter3d trace for the eval_result row `row_idx`."""
    row_accuracy = eval_result.loc[row_idx, 'Accuracy']
    return go.Scatter3d(
        x=[eval_result.loc[row_idx, 'Clusters']],
        y=[eval_result.loc[row_idx, 'Random']],
        z=[eval_result.loc[row_idx, 'Score']],
        mode='markers',
        marker=dict(
            size=[row_accuracy*50],
            color=color,
            symbol='circle',
            line=dict(width=1)
        ),
        hoverinfo='text',
        hovertext=[hover_prefix + f"{row_accuracy*100:.2f}%"])


# All runs
all_runs_trace = go.Scatter3d(
    x=eval_result['Clusters'],
    y=eval_result['Random'],
    z=eval_result['Score'],
    mode='markers',
    marker=dict(size=eval_result['Accuracy']*50, color='blue'),
    hoverinfo='text',
    hovertext=[f"Accuracy: {acc:.2f}%" for acc in eval_result['Accuracy']*100])

# 3D scatter plot
fig = go.Figure(data=[
    all_runs_trace,
    _accuracy_highlight_trace(max_acc_idx, 'orange', "Max Accuracy: "),
    _accuracy_highlight_trace(max_score_idx, 'green', "Accuracy of Max Score: "),
])

# Labels and title; integer ticks on the Clusters and Random axes.
integer_ticks = dict(tickformat="d", dtick=1)
fig.update_layout(
    title='Clusters, Random, Score and Accuracy',
    scene=dict(
        xaxis_title='Clusters',
        xaxis=dict(integer_ticks),
        yaxis_title='Random',
        yaxis=dict(integer_ticks),
        zaxis_title='Score'
    )
)

# Show interactive plot
# fig.show()

In [None]:
# Evaluate the result: add the manual and AI labels to a copy of the features.

C1 = X1.copy(deep=True)
C1['LoanApproved'] = C["LoanApproved"]
C1['LoanApprovedAI'] = best_labels1["LoanApprovedAI"]
C1["LoanApprovedCompare"] = C1["LoanApproved"] == C1["LoanApprovedAI"]

# Agreement counts, then accuracy. The mean of the boolean column equals
# True / (True + False) and does not fail when one of the outcomes is absent.
C1["LoanApprovedCompare"].value_counts()
C1["LoanApprovedCompare"].mean()

df_group = C1.groupby(["LoanApprovedCompare", "LoanApprovedAI", "LoanApproved"]).agg({'MonthlyLoanPayment': ['sum', 'count']})
# Column-wise Series.map formats every cell without the deprecated DataFrame.applymap.
formatted_sum = df_group.apply(lambda col: col.map(lambda x: f"{x:,.2f}"))
print(formatted_sum)

# Show the seed record. LoanApprovedSeed.index[0] is an index label, so it is
# looked up with .loc; .iloc would treat it as a row position.
C1.loc[LoanApprovedSeed.index[0]]


In [None]:
# to find out the relation between features and labels

# Calculate silhouette score
# silhouette_avg = silhouette_score(C, best_labels)
# print("Silhouette score:", silhouette_avg)

# Calculate correlation coefficients
# correlations = C.corrwith(pd.Series(best_labels))
# print("Correlations:")
# print(correlations)

# Create box plots
# sns.boxplot(x=labels, y='Age', data=C)
# plt.title('Age 1 vs. Cluster')
# plt.show()

# Create scatter plots
# sns.scatterplot(x='TotalAssets', y='EmploymentStatus_Employed', hue=C['LoanApproved'], data=C)
# plt.title('EmploymentStatus_Employed 1 vs. TotalAssets 2 by Cluster')
# plt.show()

# sns.pairplot(df_train[["MonthlyLoanPayment", "TotalAssets", "TotalLiabilities", "MonthlyDebtPayments", "AnnualIncome"] + ['LoanApproved']], hue='LoanApproved', diag_kind='kde', palette='viridis', markers=['o', 'o', 'o'])
# plt.show()

# sns.factorplot()

# to find out what are the fields that may affect LoanApproved
# to find out what are the fields that may affect CreditScore
# to find out what are the fields that may affect RiskScore
# to find out what are the fields that may affect the BaseInterestRate

In [None]:
# Analyse C to find the pattern behind the discrepancies between the manual
# result and the AI result.
CorrFields = [
    "TotalDebtToIncomeRatio", "BankruptcyHistory", "NetWorth", "PreviousLoanDefaults",
    # "LoanAmount", "TotalAssets", "TotalLiabilities", "MonthlyDebtPayments", "AnnualIncome",
]
# Rows where the AI flag is True but disagrees with the manual flag. Both
# conditions are combined into one mask; chaining C[mask1][mask2] makes pandas
# reindex the second mask and emit a warning.
df_false = C[(C["LoanApprovedCompare"] == False) & (C["LoanApprovedAI"] == True)]

# Correlation between the selected fields on those rows only.
correlation_matrix = df_false[CorrFields].corr()

# Blank out coefficients whose magnitude is not above the threshold.
threshold = 0.01
mask = abs(correlation_matrix) > threshold
correlation_matrix_filtered = correlation_matrix.where(mask, np.nan)

# Figure size grows with the number of fields.
fz = len(CorrFields) * 2
plt.figure(figsize=(fz, fz))
sns.heatmap(correlation_matrix_filtered, annot=True, cmap='viridis', linewidths=0.5)
plt.xticks(rotation=45)
plt.title('AI Approved by Manual Rejected Correlation Matrix')
plt.show()


In [None]:
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted

# Load the persisted scaler and report whether it has been fitted.
new_scaler = load('credit-risk-scaler.joblib')
try:
    check_is_fitted(new_scaler)  # raises NotFittedError if the scaler is not fitted
except NotFittedError:
    # Catch only the "not fitted" case; a bare except would also hide unrelated
    # errors (and KeyboardInterrupt) behind the same message.
    print("Scaler is not fitted")

In [None]:
# Basic Processing

# object fields using factorize encoding
Best Score=0.45213374731665806, cluster=2, random_state=42, True=1658, False=18342, SeedIndex=9567, SeedLabel=0
Clusters	Random	Score	AT	AF	Accuracy	Weighted_Score	TT	FF	TF	FT
0	2.0	42.0	0.452134	1658.0	18342.0	0.7225	0.6078	444.0	14006.0	1214.0	4336.0

# object fields using one-hot encoding
Best Score=0.03861680396615764, cluster=2, random_state=42, True=16773, False=3227, SeedIndex=9567, SeedLabel=0
Clusters	Random	Score	AT	AF	Accuracy	Weighted_Score	TT	FF	TF	FT
0	2.0	42.0	0.038617	16773.0	3227.0	0.31475	0.61325	3924.0	2371.0	12849.0	856.0

# number fields using bin + one-hot encoding
Best Score=0.04699579069683076, cluster=2, random_state=42, True=15892, False=4108, SeedIndex=9567, SeedLabel=0
Clusters	Random	Score	AT	AF	Accuracy	Weighted_Score	TT	FF	TF	FT
0	2.0	42.0	0.046996	15892.0	4108.0	0.3261	0.607345	3597.0	2925.0	12295.0	1183.0

# keep SavingsAccountBalance, CheckingAccountBalance
Best Score=0.03942458468960572, cluster=2, random_state=42, True=15541, False=4459, SeedIndex=9567, SeedLabel=0
Clusters	Random	Score	AT	AF	Accuracy	Weighted_Score	TT	FF	TF	FT
0	2.0	42.0	0.039425	15541.0	4459.0	0.33935	0.60892	3554.0	3233.0	11987.0	1226.0

# removed calculated field
Best Score=0.18485707762202727, cluster=2, random_state=42, True=2103, False=17897, SeedIndex=9567, SeedLabel=0
Clusters	Random	Score	AT	AF	Accuracy	Weighted_Score	TT	FF	TF	FT
0	2.0	42.0	0.184857	2103.0	17897.0	0.71775	0.611225	619.0	13736.0	1484.0	4161.0
# removed original fields
Best Score=0.22915200000000002, cluster=6, random_state=42, True=2640, False=17360, SeedIndex=9567, SeedLabel=2
Accuracy=0.6755
Clusters        6.000000
Random         42.000000
Score           0.229152
AT           2640.000000
AF          17360.000000
Accuracy        0.675500
TT            465.000000
FF          13045.000000
TF           2175.000000
FT           4315.000000

# to use k-means++
Best Score=0.20285659500000008, cluster=6, random_state=42, True=2291, False=17709, SeedIndex=9567, SeedLabel=4
Accuracy=0.70525
Clusters        6.000000
Random         42.000000
Score           0.202857
AT           2291.000000
AF          17709.000000
Accuracy        0.705250
TT            588.000000
FF          13517.000000
TF           1703.000000
FT           4192.000000

# to use silhouette_score to evaluate
Best Score=-0.00399347717175795, cluster=6, random_state=42, True=2640, False=17360, SeedIndex=9567, SeedLabel=2
Accuracy=0.6755
Clusters        6.000000
Random         42.000000
Score          -0.003993
AT           2640.000000
AF          17360.000000
Accuracy        0.675500
TT            465.000000
FF          13045.000000
TF           2175.000000
FT           4315.000000

## final best result with all above tuning
Best Score=0.45614964715122097, cluster=2, random_state=43, True=1793, False=18207, SeedIndex=9567, SeedLabel=1
Accuracy=0.74435
Clusters        2.00000
Random         43.00000
Score           0.45615
AT           1793.00000
AF          18207.00000
Accuracy        0.74435
TT            730.00000
FF          14157.00000
TF           1063.00000
FT           4050.00000


In [None]:
Best Score=0.499654155, cluster=2, random_state=42, True=10263, False=9737, SeedIndex=9567, SeedLabel=1
Clusters	Random	Score	AT	AF	Accuracy	Weighted_Score	TT	FF	TF	FT
0	2.0	42.0	0.499654	10263.0	9737.0	0.68585	0.69887	4380.0	9337.0	5883.0	400.0


Best Score=0.4993043549999999, cluster=2, random_state=42, True=10373, False=9627, SeedIndex=9567, SeedLabel=1
Clusters	Random	Score	AT	AF	Accuracy	Weighted_Score	TT	FF	TF	FT
0	2.0	42.0	0.499304	10373.0	9627.0	0.68585	0.700245	4435.0	9282.0	5938.0	345.0

Best Score=0.48019949999999995, cluster=2, random_state=42, True=8010, False=11990, SeedIndex=9567, SeedLabel=1
Clusters	Random	Score	AT	AF	Accuracy	Weighted_Score	TT	FF	TF	FT
0	2.0	42.0	0.480199	8010.0	11990.0	0.5992	0.631715	2387.0	9597.0	5623.0	2393.0


Best Score=0.49982328000000004, cluster=2, random_state=42, True=10188, False=9812, SeedIndex=9567, SeedLabel=0
Clusters	Random	Score	AT	AF	Accuracy	Weighted_Score	TT	FF	TF	FT
0	2.0	42.0	0.499823	10188.0	9812.0	0.6571	0.684995	4055.0	9087.0	6133.0	725.0



In [20]:
# Outcome counts of each tuning experiment. In the TT/FF/TF/FT keys the first
# letter is the predicted (AI) value and the second letter the actual value.
j = [
    {"TT": 682, "FF": 13636, "TF": 1584, "FT": 4098, "Description": "Factorize Encoding"},
    {"TT": 484, "FF": 13601, "TF": 1619, "FT": 4296, "Description": "One-Hot Encoding"},
    {"TT": 88,  "FF": 15049, "TF": 171,  "FT": 4692, "Description": "Bin + One-Hot Encoding"},
    {"TT": 386, "FF": 13957, "TF": 1263, "FT": 4394, "Description": "Remove A/C Balance"},
    {"TT": 367, "FF": 14067, "TF": 1153, "FT": 4413, "Description": "Remove DebtToIncomeRatio"},
    {"TT": 465, "FF": 13045, "TF": 2175, "FT": 4315, "Description": "Remove Original Fields to calc DebtToIncomeRatio"},
    {"TT": 588, "FF": 13517, "TF": 1703, "FT": 4192, "Description": "Use KMeans ++"},
    {"TT": 465, "FF": 13045, "TF": 2175, "FT": 4315, "Description": "Use SilHouette Score"},
    {"TT": 730, "FF": 14157, "TF": 1063, "FT": 4050, "Description": "Use 2 Clusters and Random State 43"},
]
eval_result = pd.DataFrame(j)
eval_result

# Custom weights, one per outcome column.
W_TT = 1.0   # TT (true positives):  predicted True,  actual True
W_FF = 0.7   # FF (true negatives):  predicted False, actual False
W_TF = 0.5   # TF (false positives): predicted True,  actual False
W_FT = 0.3   # FT (false negatives): predicted False, actual True

# Column-wise computation for all experiments at once (no per-row loop and no
# chained .loc[i]["TT"] lookups).
total_outcomes = eval_result[["TT", "FF", "TF", "FT"]].sum(axis=1)
eval_result["WeightedScore"] = (
    (W_TT * eval_result["TT"])
    + (W_FF * eval_result["FF"])
    + (W_TF * eval_result["TF"])
    + (W_FT * eval_result["FT"])
) / total_outcomes
# Accuracy counts only the agreeing outcomes (TT and FF).
eval_result["Accuracy"] = (eval_result["TT"] + eval_result["FF"]) / total_outcomes

eval_result


Unnamed: 0,TT,FF,TF,FT,Description
0,682,13636,1584,4098,Factorize Encoding
1,484,13601,1619,4296,One-Hot Encoding
2,88,15049,171,4692,Bin + One-Hot Encoding
3,386,13957,1263,4394,Remove A/C Balance
4,367,14067,1153,4413,Remove DebtToIncomeRatio
5,465,13045,2175,4315,Remove Original Fields to calc DebtToIncomeRatio
6,588,13517,1703,4192,Use KMeans ++
7,465,13045,2175,4315,Use SilHouette Score
8,730,14157,1063,4050,Use 2 Clusters and Random State 43


Unnamed: 0,TT,FF,TF,FT,Description,WeightedScore,Accuracy
0,682,13636,1584,4098,Factorize Encoding,0.61243,0.7159
1,484,13601,1619,4296,One-Hot Encoding,0.60515,0.70425
2,88,15049,171,4692,Bin + One-Hot Encoding,0.60577,0.75685
3,386,13957,1263,4394,Remove A/C Balance,0.60528,0.71715
4,367,14067,1153,4413,Remove DebtToIncomeRatio,0.605715,0.7217
5,465,13045,2175,4315,Remove Original Fields to calc DebtToIncomeRatio,0.598925,0.6755
6,588,13517,1703,4192,Use KMeans ++,0.60795,0.70525
7,465,13045,2175,4315,Use SilHouette Score,0.598925,0.6755
8,730,14157,1063,4050,Use 2 Clusters and Random State 43,0.61932,0.74435


In [2]:
# Worked example of the custom weighted score on ten hand-written labels,
# with accuracy / precision / recall / F1 printed next to it for comparison.
y_true = np.array([1, 1, 0, 0, 1, 1, 0, 0, 1, 0])  # actual values
y_pred = np.array([1, 1, 0, 1, 1, 0, 0, 0, 1, 0])  # predicted values

y = pd.DataFrame()
y["LoanApproved"] = y_true
y["LoanApprovedAI"] = y_pred
# y["LoanApprovedCompare"] = y["LoanApproved"] == y["LoanApprovedAI"]

# Row counts per (predicted, actual) pair.
da_group = y.groupby(["LoanApprovedAI", "LoanApproved"]).size()
print(da_group)

# Custom weights. Each weight is applied to the confusion-matrix cell that the
# formula further down pairs it with.
# NOTE(review): here TF/FT are read as actual-then-predicted, while the
# groupby above lists the predicted value first - confirm which order is meant.
W_TT = 1.0   # true positives:  actual 1, predicted 1
W_FF = 0.75  # true negatives:  actual 0, predicted 0
W_TF = 0.5   # false negatives: actual 1, predicted 0
W_FT = 0.25  # false positives: actual 0, predicted 1

# sklearn's confusion matrix is indexed [actual, predicted]; flattening the
# 2x2 matrix row by row yields TN, FP, FN, TP.
cm = confusion_matrix(y_true, y_pred)
TN, FP, FN, TP = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]

# Total number of outcomes
total_outcomes = len(y_true)

# Custom weighted score
weighted_sum = (W_TT * TP) + (W_FF * TN) + (W_TF * FN) + (W_FT * FP)
weighted_score = weighted_sum / total_outcomes

# Basic accuracy: share of positions where prediction and truth agree.
n_correct = np.sum(y_true == y_pred)
accuracy = n_correct / total_outcomes

# Precision, recall, and F1 score for comparison
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Display results
for line in (
    f"Weighted Score: {weighted_score}",
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
):
    print(line)


LoanApprovedAI  LoanApproved
0               0               4
                1               1
1               0               1
                1               4
dtype: int64
Weighted Score: 0.775
Accuracy: 0.8
Precision: 0.8
Recall: 0.8
F1 Score: 0.8


In [None]:
# Interactive 3D view of the evaluation runs (x=Clusters, y=Random, z=Score,
# marker size from the Weighted column). The run with the highest Weighted
# value and the run with the highest Score are overlaid as separate traces.
max_acc_idx = eval_result['Weighted'].idxmax()   # row with the highest Weighted value
max_score_idx = eval_result['Score'].idxmax()    # row with the highest Score


def _weighted_highlight_trace(row_idx, color, hover_prefix, trace_name):
    """Build a one-point Scatter3d trace for the eval_result row `row_idx`."""
    row_weighted = eval_result.loc[row_idx, 'Weighted']
    return go.Scatter3d(
        x=[eval_result.loc[row_idx, 'Clusters']],
        y=[eval_result.loc[row_idx, 'Random']],
        z=[eval_result.loc[row_idx, 'Score']],
        mode='markers',
        marker=dict(
            size=[row_weighted*50],
            color=color,
            symbol='circle',
            line=dict(width=1)
        ),
        hoverinfo='text',
        hovertext=[hover_prefix + f"{row_weighted*100:.2f}%"],
        name=trace_name,)


# 3D scatter plot
fig = go.Figure(data=[
        go.Scatter3d(
            x=eval_result['Clusters'],
            y=eval_result['Random'],
            z=eval_result['Score'],
            mode='markers',
            marker=dict(size=eval_result['Weighted']*50, color='blue'),
            hoverinfo='text',
            hovertext=[f"Weighted: {acc:.2f}%" for acc in eval_result['Weighted']*100],
            name="Score",),
        _weighted_highlight_trace(max_acc_idx, 'orange', "Max Weighted: ", "Max Weighted"),
        # The green point is the run with the highest Score, so its hover text
        # reports the Weighted value OF that run rather than claiming it is
        # the maximum Weighted value.
        _weighted_highlight_trace(max_score_idx, 'green', "Weighted of Max Score: ", "Max Score"),
    ])

# Labels and title; integer ticks on the Clusters and Random axes.
fig.update_layout(
    title='Clusters, Random, Score and Weighted',
    scene=dict(
        xaxis_title='Clusters',
        xaxis=dict(tickformat="d", dtick=1),
        yaxis_title='Random',
        yaxis=dict(tickformat="d", dtick=1),
        zaxis_title='Score'
    )
)

# Show interactive plot
# fig.show()

In [None]:

# Correlation between the training features in X.
correlation_matrix = X.corr()
plt.figure(figsize=(40, 40))
sns.heatmap(correlation_matrix, annot=True, cmap='viridis', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

# Age vs. AnnualIncome coloured by cluster label. The figure has to be created
# BEFORE the scatter plot is drawn: creating it afterwards opens a second,
# empty figure that receives the title while the scatter plot gets none.
plt.figure(figsize=(10, 6))
sns.scatterplot(x="Age", y="AnnualIncome", data=X, hue=labels, palette='viridis')
plt.title('Scatterplot of Cluster Categories')
plt.show()

1. "BaseInterestRate", "InterestRate", and "MonthlyLoanPayment" could be output fields; to be tested in the future.
2. To do: pairplot for all feature fields.
3. To find out why outlier detection (IsolationForest) impacts the accuracy.