## Here we preprocess the data and save the training, testing, and validation data into their respective .csv files for easy access

In [128]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE, SMOTENC

In [129]:
# Load the raw loan table and trim stray whitespace from every column header
# in a single expression.
data = pd.read_csv("loan_approval_dataset.csv").rename(columns=str.strip)

### one-hot encoding the education, self_employed, and loan_status attributes

In [131]:
categorical_features = ['education', 'self_employed', 'loan_status']

# One-hot encode the three categorical columns into a dense array.
encoder = OneHotEncoder(sparse_output=False)
encoded_features = encoder.fit_transform(data[categorical_features])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())

# Swap the raw categorical columns for their one-hot indicator columns.
data = pd.concat([data.drop(columns=categorical_features), encoded_df], axis=1)

# Each attribute is binary, so a single 0/1 column is enough: keep the
# "positive" indicator under the original attribute name and discard both
# one-hot columns. The space after the underscore in the indicator names
# matches the names the encoder generated for this dataset.
binary_indicators = {
    'education': ('education_ Graduate', 'education_ Not Graduate'),
    'self_employed': ('self_employed_ Yes', 'self_employed_ No'),
    'loan_status': ('loan_status_ Approved', 'loan_status_ Rejected'),
}
for feature, (kept, discarded) in binary_indicators.items():
    data[feature] = data[kept]
    data = data.drop(columns=[kept, discarded])

print(data.columns) # ensure that all the column names are correct

Index(['loan_id', 'no_of_dependents', 'income_annum', 'loan_amount',
       'loan_term', 'cibil_score', 'residential_assets_value',
       'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value',
       'education', 'self_employed', 'loan_status'],
      dtype='object')


### feature scaling

In [133]:
# Numerical features to standardise
numeric_features = [
    'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
    'residential_assets_value', 'commercial_assets_value',
    'luxury_assets_value', 'bank_asset_value'
]

# Fit the scaler on the future *training* rows only. Fitting on the whole table
# would let validation/test rows influence the mean and std used to transform
# the training features (data leakage).
# The call below must keep the same test_size and random_state as the first
# train_test_split in the data-splitting cell: without stratification the
# shuffle depends only on the number of rows and the seed, so both calls pick
# the same training rows.
train_positions, holdout_positions = train_test_split(
    list(range(len(data))), test_size=0.3, random_state=42
)

scaler = StandardScaler()
scaler.fit(data[numeric_features].iloc[train_positions])

# Apply the training-set statistics to every row. The training rows end up with
# mean 0 / std 1; validation and test rows are only approximately standardised.
data[numeric_features] = scaler.transform(data[numeric_features])

### data splitting

In [135]:
X = data.drop('loan_status', axis=1)  # Features
y = data['loan_status']              # Target
# NOTE(review): X still contains loan_id, a row identifier that is not in the
# scaled feature list. It probably should not be a model/SMOTE input; confirm
# with the notebooks that consume the saved CSVs before dropping it.

# 70% train, 15% validation, 15% test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Uneven class distribution: oversample the minority class in the training set
# only, so validation and test keep their real distribution.
print("Class distribution before SMOTE:", pd.Series(y_train).value_counts())

# education and self_employed are 0/1 indicators. Plain SMOTE interpolates
# every column and would give synthetic rows fractional values for them;
# SMOTENC treats the listed columns as categorical and only emits categories
# that already exist.
binary_positions = [X_train.columns.get_loc(col) for col in ('education', 'self_employed')]
smote = SMOTENC(categorical_features=binary_positions, random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)
print("Class distribution after SMOTE:", pd.Series(y_train).value_counts())


# making sure the number of labels match up
print("Training set:", X_train.shape, y_train.shape)
print("Validation set:", X_val.shape, y_val.shape)
print("Test set:", X_test.shape, y_test.shape)

Class distribution before SMOTE: loan_status
1.0    1846
0.0    1142
Name: count, dtype: int64
Class distribution after SMOTE: loan_status
0.0    1846
1.0    1846
Name: count, dtype: int64
Training set: (3692, 12) (3692,)
Validation set: (640, 12) (640,)
Test set: (641, 12) (641,)


### Finally, save the training, validation, and testing samples to their respective .csv files

In [137]:
# Persist every split to its own CSV (row index omitted), features first and
# targets second, in the same order as before.
split_files = {
    "X_train.csv": X_train,
    "X_val.csv": X_val,
    "X_test.csv": X_test,
    "y_train.csv": y_train,
    "y_val.csv": y_val,
    "y_test.csv": y_test,
}
for csv_name, split in split_files.items():
    split.to_csv(csv_name, index=False)