## Import libraries and load data

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pyswarms as ps
from deap import base, creator, tools, algorithms
import warnings
warnings.filterwarnings('ignore')

In [21]:
# Read the train and test splits from the CSV files in the working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('loan_train.csv', 'loan_test.csv')
)

# Report how many rows each split holds and which columns the train split has.
print(f"Row Number Train: {len(train_df)}")
print(f"Row Number Test: {len(test_df)}")
print("Columns :", train_df.columns.tolist(), sep="\n")

Row Number Train: 614
Row Number Test: 367
Columns :
['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Applicant_Income', 'Coapplicant_Income', 'Loan_Amount', 'Term', 'Credit_History', 'Area', 'Status']


## Data Preprocessing

In [22]:
def preprocess_data(train_data, test_data, categorical_columns=None):
    """
    Clean and label-encode the train and test frames.

    Steps:
    1. Drop every row that contains at least one NaN (train and test are
       filtered independently).
    2. Convert nominal columns to integer codes with a LabelEncoder that is
       fitted on the train split only.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Raw training frame. It is not modified.
    test_data : pandas.DataFrame
        Raw test frame. It is not modified. Columns listed in
        ``categorical_columns`` that are absent from it are simply skipped.
    categorical_columns : list of str, optional
        Nominal columns to encode. Defaults to
        ['Gender', 'Married', 'Education', 'Self_Employed', 'Area', 'Status'].
        NOTE(review): 'Dependents' is not in the default list and is returned
        exactly as loaded -- confirm it is numeric or pass it explicitly.

    Returns
    -------
    train_clean : pandas.DataFrame
        NaN-free training rows with the nominal columns replaced by codes.
    test_clean : pandas.DataFrame
        NaN-free test rows encoded with the train-fitted encoders; a value
        that was never seen in the train split is encoded as -1.
    label_encoders : dict
        Maps each encoded column name to its fitted LabelEncoder.
    """
    if categorical_columns is None:
        categorical_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Area', 'Status']

    print("\n Number of NaN before deletion:")
    print(train_data.isnull().sum())

    # Delete rows with NaN. The explicit copy makes the cleaned frames fully
    # independent of the inputs, so the column assignments below cannot touch
    # the caller's data or raise chained-assignment warnings.
    train_clean = train_data.dropna().copy()
    test_clean = test_data.dropna().copy()

    label_encoders = {}

    for col in categorical_columns:
        if col not in train_clean.columns:
            continue

        le = LabelEncoder()

        # Fit on the train split only so the test split cannot influence codes.
        train_clean[col] = le.fit_transform(train_clean[col].astype(str))

        # Keep and report the encoder for every encoded column. Previously this
        # happened only when the column also existed in the test frame, so the
        # encoder of a train-only column (e.g. the target) was silently lost.
        label_encoders[col] = le
        mapping = dict(zip(le.classes_, le.transform(le.classes_)))
        print(f"{col} mapping: {mapping}")

        if col in test_clean.columns:
            # Reuse the classes learned on train. A value unseen during
            # training has no code, so it is mapped to the sentinel -1.
            test_clean[col] = (
                test_clean[col]
                .astype(str)
                .map(mapping)
                .fillna(-1)
                .astype('int64')
            )

    return train_clean, test_clean, label_encoders

In [23]:
# Run the preprocessing on the raw frames; unpacks the cleaned train frame,
# the cleaned test frame and the dict of fitted label encoders.
train_processed, test_processed, encoders = preprocess_data(train_df, test_df)



 Number of NaN before deletion:
Gender                13
Married                3
Dependents            15
Education              0
Self_Employed         32
Applicant_Income       0
Coapplicant_Income     0
Loan_Amount            0
Term                  14
Credit_History        50
Area                   0
Status                 0
dtype: int64
Gender mapping: {'Female': np.int64(0), 'Male': np.int64(1)}
Married mapping: {'No': np.int64(0), 'Yes': np.int64(1)}
Education mapping: {'Graduate': np.int64(0), 'Not Graduate': np.int64(1)}
Self_Employed mapping: {'No': np.int64(0), 'Yes': np.int64(1)}
Area mapping: {'Rural': np.int64(0), 'Semiurban': np.int64(1), 'Urban': np.int64(2)}


In [27]:
# Split each processed frame into a feature matrix and a label vector.

# Train: work on a copy and pull the target column out of it.
X_train = train_processed.copy()
y_train = X_train.pop('Status')

# Test: the target column may be absent, in which case there are no labels.
X_test = test_processed.copy()
y_test = X_test.pop('Status') if 'Status' in X_test.columns else None

In [29]:
# Summarise the shapes of the arrays that will be fed to the model.
print(
    "Data ready for model:",
    f"X_train shape: {X_train.shape}",
    f"y_train shape: {y_train.shape}",
    f"X_test shape: {X_test.shape}",
    sep="\n",
)

# List every feature with its positional index (iterating a frame yields its
# column labels).
for i, col in enumerate(X_train):
    print(f"  {i}: {col}")

# Show how many training rows fall into each label class.
print("\n Label distribution:", y_train.value_counts(), sep="\n")


Data ready for model:
X_train shape: (499, 11)
y_train shape: (499,)
X_test shape: (293, 11)
  0: Gender
  1: Married
  2: Dependents
  3: Education
  4: Self_Employed
  5: Applicant_Income
  6: Coapplicant_Income
  7: Loan_Amount
  8: Term
  9: Credit_History
  10: Area

 Label distribution:
Status
1    341
0    158
Name: count, dtype: int64
