# **Part 1 (FINANCIAL LOAN DATA SIMULATION AND FEATURE ENGINEERING WITH LLM/SLM)**

### **Generating Financial Loan Dataset from ChatGPT**

In [None]:
# import necessary library for LLM Mistral 7B
!pip install torch torchvision torchaudio transformers accelerate
!pip install -U bitsandbytes

# create hugging face token and setup
import os
# remove the hugging face token in order to push to github sucessfully
os.environ["HUGGINGFACE_TOKEN"]="INSERTHUGGINGFACETOKEN"
from huggingface_hub import login
login(token=os.environ["HUGGINGFACE_TOKEN"])

# load llm mistral 7b model and move model to device
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
token = os.environ["HUGGINGFACE_TOKEN"]
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config)
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

In [None]:
# Prompt text for the LLM (ChatGPT) that describes the synthetic loan-applicant
# dataset to generate: role, task, rules, the 8 required columns and CSV output.
# NOTE(review): this cell only defines `prompt`; nothing here sends it to a
# model. Presumably it was pasted into ChatGPT manually and the resulting CSV
# is what the following cell loads - confirm.

prompt = """

Role:
You are a data simulation assistant helping to generate a realistic financial loan applicant dataset.

Task:
Generate 1500 raw csv format data of synthetic customer records representing individual loan applicants.
The dataset must be realistic, diverse, and internally consistent, but fully synthetic.

Important Rules (VERY IMPORTANT):
Only generate raw, observable customer and financial attributes.
Values should reflect real-world financial constraints (eg., higher income usually correlates with higher balances).


8 Attributes to generate (exactly these):

CustomerID - Unique identifier (e.g., CUST0001, CUST0002, …)
Occupation - Examples: Student, Engineer, Teacher, Sales Executive, Self-Employed, Manager, Clerk, Freelancer, Retired
Monthly Income (MYR) - Numeric (Range: 1,500 – 25,000) and should correlate reasonably with occupation
Account Balance (MYR) - Numeric May be positive or low positive and should generally correlate with income level
Credit Score - Integer (Range: 300 – 850) and distribution should resemble real-world credit scores (more medium-range than extremes)
Total Loan Applied (MYR) - Numeric (Range: 5,000 – 500,000) and larger loans more common for higher-income customers
Loan Duration (Years) - Integer (Range: 1 – 30)
Loan Purpose Text - Short natural-language sentence (10–20 words)

For Loan Purpose Text:
Examples:
“Applying for a personal loan to consolidate existing debts”
“Seeking financing to expand my small business operations”
“Loan needed to cover education expenses and tuition fees”
Text should reflect realistic financial motivations

Output Format:
Output the dataset in CSV format
Include a header row
One row per customer
Ensure no missing values

"""

In [None]:
# Load the LLM-generated loan dataset and preview its first ten rows.
import pandas as pd

# The CSV was placed at this path manually.
df = pd.read_csv(filepath_or_buffer="/content/loan_applicants_dataset.csv")
df.head(n=10)

### **First Feature Engineering on Topic Detection based on Loan Purpose Text by SLM BART-MNLI**

In [None]:
## Topic Detection by SLM
from transformers import pipeline

# Zero-shot classifier (SLM BART-MNLI) used to assign one topic to each
# loan purpose sentence.
topic_slm = pipeline("zero-shot-classification", model = "facebook/bart-large-mnli")
topics = ["Business Expansion","Home Improvement","Education","Vehicle Purchase","Medical Expenses","Emergency Expenses"]

def get_topic(text):
  """Return the topic assigned to one loan purpose text.

  Args:
    text: the loan purpose value of a single row. Null values and blank
      strings are not classified.

  Returns:
    "Missing" for null or blank input; otherwise the first entry of the
    classifier's ``labels`` list for the candidate labels in ``topics``.
  """
  if pd.isna(text):
    return "Missing"
  # A non-string cell (e.g. a number parsed from the CSV) has no .strip();
  # coerce it so the row is classified instead of raising AttributeError.
  if not isinstance(text, str):
    text = str(text)
  if text.strip() == "":
    return "Missing"
  result = topic_slm(text, candidate_labels = topics)
  return result['labels'][0]

df['Topic'] = df['Loan Purpose Text'].apply(get_topic)
print(df[['CustomerID', 'Loan Purpose Text', 'Topic']].head(10))

# display aggregate topics
topic_counts = df['Topic'].value_counts()
print("Topic Counts:")
print(topic_counts)

In [None]:
# Preview the first ten rows of the current dataset (Topic column included).

df.head(n=10)

### **Exporting a New CSV File After Updating or Adding a Feature**

In [None]:
# Save the enriched dataset to a new CSV for future use
# (run once, after a feature has been added or updated).

df.to_csv(path_or_buf="loan_applicants_dataset_latest.csv", index=False)

# **Part 2 (PREDICTIVE MODELLING)** **&** **Part 3 (MODEL EVALUATION)**


## **2.1 Based on First Feature (Topic Detection) by Decision Tree & Random Forest**

### **Decision Tree Modelling**

In [None]:
## decision tree

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode the topic strings as integer class ids; this is the prediction target.
label_encoder = LabelEncoder()
label_encoder.fit(df['Topic'])
df['Encoded_Topic'] = label_encoder.transform(df['Topic'])

# Model inputs: the three numeric financial columns.
X = df.loc[:, ['Monthly Income (MYR)','Account Balance (MYR)','Total Loan Applied (MYR)']]
# Model target: the encoded topic.
y = df.loc[:, 'Encoded_Topic']

In [None]:
# Split the dataset and compute balanced weights for every training sample.
# numpy is imported here because np.unique is used below and no earlier cell
# imports it (a fresh kernel would otherwise raise NameError).
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

# 80/20 split with a fixed seed so that metrics and charts are reproducible
# between runs (same seed as the models' random_state).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The detected topics are naturally imbalanced, so compute one weight per
# class from the training labels ...
classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train
)
# ... and expand them to one weight per training row (same index as y_train).
weights_dict = dict(zip(classes, class_weights))
sample_weights = y_train.map(weights_dict)

### **Evaluation for Decision Tree Model by Classification Report & PieChart Distribution**

In [None]:
# Train and evaluate the decision tree model.
from sklearn.metrics import classification_report

# Class imbalance is handled through `sample_weights` passed to fit().
# class_weight='balanced' is deliberately NOT set as well: scikit-learn
# multiplies class_weight with sample_weight, so using both would apply the
# balancing correction twice.
dtc = DecisionTreeClassifier(
    max_depth=7,
    min_samples_leaf=10,
    random_state = 42
)
dtc.fit(X_train, y_train, sample_weight = sample_weights)

# Predict on the held-out split and print the per-class report.
dtc_pred = dtc.predict(X_test)
print("Decision Tree Results with Class Weights")
print(classification_report(y_test, dtc_pred))

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Plot how the decision tree's predictions are distributed over the classes
# (donut chart) and compare them with the actual test labels (bar chart).

# Take the class ids from the fitted label encoder instead of assuming exactly
# six: with a fixed range, reindex() below would silently drop the counts of
# any further class (e.g. when the "Missing" topic is present).
classes = np.arange(len(label_encoder.classes_))
class_labels = [f'Class {i}' for i in classes]

# create a DataFrame with predictions
results_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': dtc_pred
})

# count rows per class; reindex so classes with no rows show up as 0
prediction_counts = results_df['Predicted'].value_counts().reindex(classes, fill_value=0)
actual_counts = results_df['Actual'].value_counts().reindex(classes, fill_value=0)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Donut chart: a pie chart with a white circle drawn over its centre
colors = plt.cm.Set3(np.arange(len(classes)))
wedges, texts, autotexts = ax1.pie(
    prediction_counts.values,
    labels=class_labels,
    colors=colors,
    autopct='%1.1f%%',
    pctdistance=0.85,
    startangle=90
)
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
ax1.add_artist(centre_circle)
ax1.set_title('DecisionTree Predictions Distribution\n(Donut Chart)', fontsize=14, fontweight='bold')
ax1.axis('equal')

# Bar chart: Actual vs Predicted, drawn side by side for each class
x = np.arange(len(classes))
width = 0.35
ax2.bar(x - width/2, actual_counts.values, width, label='Actual', alpha=0.7)
ax2.bar(x + width/2, prediction_counts.values, width, label='Predicted', alpha=0.7)
ax2.set_xlabel('Class')
ax2.set_ylabel('Count')
ax2.set_title('Comparison: Actual vs Predicted Distribution')
ax2.set_xticks(x)
ax2.set_xticklabels(class_labels)
ax2.legend()
ax2.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()


### **Visualise Decision Tree Rule (Only Show Once in This Project)**

In [None]:
# Decision rule visualisation: print the fitted tree as nested if/else text.
from sklearn.tree import export_text

# Export the fitted decision tree `dtc` (not `model`, which is the LLM loaded
# in Part 1). Feature names come from the training frame so they always match
# the columns the tree was fitted on.
rules = export_text(dtc, feature_names=list(X_train.columns))
print("Decision Tree Rules:\n", rules)

### **Bias and Under/Overfitting Occurred in Decision Tree: Overcome by Applying the Bagging Method with Random Forest**

### **Random Forest Modelling & Evaluation by Classification Report & PieChart Distribution**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyperparameters of the forest, collected in one place.
rf_params = {
    "n_estimators": 300,
    "max_depth": 10,
    "min_samples_split": 5,
    "random_state": 42,
}
rf = RandomForestClassifier(**rf_params)

# Fit with the balanced per-sample weights, then score the held-out split.
rf.fit(X_train, y_train, sample_weight=sample_weights)
rf_pred = rf.predict(X_test)

print("Random Forest Results")
print(classification_report(y_test, rf_pred))

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Plot how the random forest's predictions are distributed over the classes
# (donut chart) and compare them with the actual test labels (bar chart).

# Take the class ids from the fitted label encoder instead of assuming exactly
# six: with a fixed range, reindex() below would silently drop the counts of
# any further class (e.g. when the "Missing" topic is present).
classes = np.arange(len(label_encoder.classes_))
class_labels = [f'Class {i}' for i in classes]

# create a DataFrame with predictions
results_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': rf_pred
})

# count rows per class; reindex so classes with no rows show up as 0
prediction_counts = results_df['Predicted'].value_counts().reindex(classes, fill_value=0)
actual_counts = results_df['Actual'].value_counts().reindex(classes, fill_value=0)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Donut chart: a pie chart with a white circle drawn over its centre
colors = plt.cm.Set3(np.arange(len(classes)))
wedges, texts, autotexts = ax1.pie(
    prediction_counts.values,
    labels=class_labels,
    colors=colors,
    autopct='%1.1f%%',
    pctdistance=0.85,
    startangle=90
)
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
ax1.add_artist(centre_circle)
ax1.set_title('RandomForest Predictions Distribution\n(Donut Chart)', fontsize=14, fontweight='bold')
ax1.axis('equal')

# Bar chart: Actual vs Predicted, drawn side by side for each class
x = np.arange(len(classes))
width = 0.35
ax2.bar(x - width/2, actual_counts.values, width, label='Actual', alpha=0.7)
ax2.bar(x + width/2, prediction_counts.values, width, label='Predicted', alpha=0.7)
ax2.set_xlabel('Class')
ax2.set_ylabel('Count')
ax2.set_title('Comparison: Actual vs Predicted Distribution')
ax2.set_xticks(x)
ax2.set_xticklabels(class_labels)
ax2.legend()
ax2.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()