In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from quantum_random_forest import QuantumRandomForest, set_multiprocessing
from split_function import SplitCriterion
from data_construction import data_preprocessing
from sklearn import metrics, datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
data = pd.read_csv('loan_approval_dataset.csv') 

In [3]:
data.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


### Data pre-processing

In [4]:
df = data.copy()
df.isnull().sum()
df.columns = df.columns.str.strip()
# Create a LabelEncoder instance
label_encoder = LabelEncoder()

# Apply label encoding to the 'education' column
df['education'] = 1 - label_encoder.fit_transform(df['education'])

# Apply label encoding to the 'self_employed' column
df['self_employed'] = label_encoder.fit_transform(df['self_employed'])

# Apply label encoding to the 'loan_status' column
df['loan_status'] = 1 - label_encoder.fit_transform(df['loan_status'])

# Display the updated DataFrame with encoded columns
print(df[['education', 'self_employed','loan_status']])

      education  self_employed  loan_status
0             1              0            1
1             0              1            0
2             1              0            0
3             1              0            0
4             0              1            0
...         ...            ...          ...
4264          1              1            0
4265          0              1            1
4266          0              0            0
4267          0              0            1
4268          1              0            1

[4269 rows x 3 columns]


#### normalization

In [5]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Define the feature columns (X) and target column (y)
X = df.drop(columns=['loan_status'])  # Drop 'loan_status' column to get feature columns
y = df['loan_status']  # Target variable
num_classes = 2

# Select only the numerical columns for scaling (excluding 'loan_status')
numerical_columns = ['no_of_dependents', 'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
                      'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value',
                      'bank_asset_value']

# Apply scaling to the numerical columns
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

# Display the scaled feature variables (X) and the target variable (y)
print("Scaled Feature Variables (x):")
print(X.head())

print("\nTarget Variable (y):")
print(y.head())

Scaled Feature Variables (x):
   loan_id  no_of_dependents  education  self_employed  income_annum  \
0        1         -0.294102          1              0      1.617979   
1        2         -1.473548          0              1     -0.341750   
2        3          0.295621          1              0      1.439822   
3        4          0.295621          1              0      1.119139   
4        5          1.475067          0              1      1.689242   

   loan_amount  loan_term  cibil_score  residential_assets_value  \
0     1.633052   0.192617     1.032792                 -0.780058   
1    -0.324414  -0.508091    -1.061051                 -0.733924   
2     1.610933   1.594031    -0.544840                 -0.057300   
3     1.721525  -0.508091    -0.771045                  1.649637   
4     1.002681   1.594031    -1.264055                  0.757724   

   commercial_assets_value  luxury_assets_value  bank_asset_value  
0                 2.877289             0.832028          0.9

### Classic training

In [6]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


# Create a RandomForestClassifier instance
random_forest = RandomForestClassifier(random_state=42)

# Train the random forest model
random_forest.fit(x_train, y_train)

# Predict on the test set
y_pred_rf = random_forest.predict(x_test)


# Compute confusion matrix
cm_rf = confusion_matrix(y_test, y_pred_rf.round())

# Compute classification report
report_rf = classification_report(y_test, y_pred_rf.round())

# Create heatmap trace
trace2 = go.Heatmap(z=cm_rf,
                   x=['Approved', 'Rejected'],
                   y=['Approved', 'Rejected'],
                   showscale=False,
                   colorscale=[
                       [0.0, "#041d4f"],  # Light blue
                       [0.5, "#365799"],  # Medium blue
                       [1.0, "#8cb1fa"],  # Dark purple
                   ],
                   xgap=18,
                   ygap=18,
                   text=cm_rf,
                   texttemplate="%{text}")

# Create layout
layout = go.Layout(title='Random Forest Confusion Matrix',
                   xaxis=dict(title='Predicted Label', range=[-0.5, 1.5]),
                   yaxis=dict(title='True Label', range=[-0.5, 1.5]))

# Create figure
fig = go.Figure(data=[trace2], layout=layout)

# Dynamically adjust figure size based on matrix size
fig.update_layout(width=630, height=630)  # Adjust the width and height as needed


# Display the figure
fig.show()

In [11]:
print(report_rf)

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       318
           1       0.98      0.98      0.98       536

    accuracy                           0.98       854
   macro avg       0.98      0.98      0.98       854
weighted avg       0.98      0.98      0.98       854



In [8]:
models1 = pd.DataFrame({'Model': ['Random Forest'],
                       'False Positives': [cm_rf[0,1]]})

models1.sort_values(by='False Positives', ascending=False)

Unnamed: 0,Model,False Positives
0,Random Forest,10


In [9]:
models3 = pd.DataFrame({'Model': ['Random Forest'],
                       'True negative': [cm_rf[1,1]]})

models3.sort_values(by='True negative', ascending=False)

Unnamed: 0,Model,True negative
0,Random Forest,527


### Quantum Training

In [52]:
training_set, testing_set = data_preprocessing(X, y, 
                                               train_prop=0.8,           # Proportion of dataset allocated for training
                                               X_dim=None)                # Determine the required dimension of the dataset. None for default.

### Model parameters 

In [54]:
n_qubits = len(training_set['X'][0])                 # Number of qubits for the embedding
dt_type = 'qke'                                      # Do not touch
ensemble_var = None                                  # Do not touch
branch_var = ['eff_anz_pqc_arch', 
              'iqp_anz_pqc_arch', 
              'eff_anz_pqc_arch']                    # Type of Anzatz, or as a list for different down the tree - as given 
num_trees = 3                                        # Number of trees in ensemble 
split_num = 2                                        # Do not touch
pqc_sample_num = 2024                                # Number of circuit samples per kernel estimation
num_classes = num_classes                            # Number of classes in dataset
max_depth = 4                                        # Maximum depth of the tree
num_params_split = n_qubits*(n_qubits +1)            # Number of parameters in the embedding (different for different anzatz), list for different down the tree [2 * n_qubits ** 2 , n_qubits*(n_qubits +1), 2 * n_qubits ** 2]
num_rand_gen = 1                                     # Do not touch
num_rand_meas_q = n_qubits                           # Do not touch 
svm_num_train = 5                                    # L, Number of Landmarks
svm_c = 10                                           # C term in SVM optimisation, or list down the tree [100, 50, 20]
min_samples_split = svm_num_train                    # Minimum number of samples
embedding_type = ['as_params_all', 
                  'as_params_iqp', 
                  'as_params_all']                   # Type of embedding, or as a list - as given
criterion = SplitCriterion.init_info_gain('clas')    # Do not touch
device = 'cirq'                                      # Choose a device. Also possible to run on IBM

### Set up model

In [55]:
qrf = QuantumRandomForest(n_qubits, 'clas', num_trees, criterion, max_depth=max_depth, 
                          min_samples_split=min_samples_split, tree_split_num=split_num, num_rand_meas_q=num_rand_meas_q,
                          ensemble_var=ensemble_var, dt_type=dt_type, num_classes=num_classes, ensemble_vote_type='ave',
                          num_params_split=num_params_split, num_rand_gen=num_rand_gen, pqc_sample_num=pqc_sample_num,
                          embed=embedding_type, branch_var=branch_var, svm_num_train=svm_num_train, svm_c=svm_c, 
                          nystrom_approx=True, device=device)

### Train

In [56]:
cores = 3
set_multiprocessing(True, cores)                   # Set to False if you don't want parallel computation
qrf.train(training_set, 
          partition_sample_size=180)               # Partition size is the number of instances given to each tree. Set to None to use all the data for all trees

### Test

In [57]:
acc, preds_qrf = qrf.test(testing_set, 
                          ret_pred=True, 
                          parallel=False,            # Set to False if you don't want parallel computation. Needs to be False for calc_tree_corr to be True.
                          calc_tree_corr=True)       # True is required to later look at correlations between trees

100%|██████████| 5/5 [00:01<00:00,  3.96it/s]
100%|██████████| 5/5 [00:02<00:00,  1.77it/s]
100%|██████████| 5/5 [00:00<00:00,  5.86it/s]
100%|██████████| 5/5 [00:00<00:00,  6.15it/s]
100%|██████████| 5/5 [00:02<00:00,  2.30it/s]
100%|██████████| 5/5 [00:00<00:00,  6.23it/s]
100%|██████████| 5/5 [00:01<00:00,  3.61it/s]
100%|██████████| 5/5 [00:02<00:00,  2.08it/s]
100%|██████████| 5/5 [00:00<00:00,  6.48it/s]
100%|██████████| 5/5 [00:00<00:00,  6.60it/s]s/it]
100%|██████████| 5/5 [00:01<00:00,  2.60it/s]
100%|██████████| 5/5 [00:00<00:00,  6.28it/s]
100%|██████████| 5/5 [00:00<00:00,  6.03it/s]
100%|██████████| 5/5 [00:02<00:00,  2.29it/s]
100%|██████████| 5/5 [00:00<00:00,  6.28it/s]
100%|██████████| 5/5 [00:00<00:00,  5.74it/s]
100%|██████████| 5/5 [00:02<00:00,  2.24it/s]
100%|██████████| 5/5 [00:00<00:00,  6.51it/s]
100%|██████████| 5/5 [00:00<00:00,  6.70it/s]s/it]
100%|██████████| 5/5 [00:01<00:00,  2.57it/s]
100%|██████████| 5/5 [00:00<00:00,  7.56it/s]
100%|██████████| 5/5 [00

In [58]:
# Classification report
print(f"Classification report for QRF:\n"
      f"{metrics.classification_report(testing_set.y, preds_qrf)}\n")

Classification report for QRF:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       375
           1       0.65      1.00      0.79       693

    accuracy                           0.65      1068
   macro avg       0.32      0.50      0.39      1068
weighted avg       0.42      0.65      0.51      1068




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
