In [10]:
import pandas as pd

data = pd.read_csv('data/data.csv')
data.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [11]:
# check null values
print(data.isnull().sum())

person_age                        0
person_gender                     0
person_education                  0
person_income                     0
person_emp_exp                    0
person_home_ownership             0
loan_amnt                         0
loan_intent                       0
loan_int_rate                     0
loan_percent_income               0
cb_person_cred_hist_length        0
credit_score                      0
previous_loan_defaults_on_file    0
loan_status                       0
dtype: int64


In [12]:
# separate feature dataframes
# categorical features
c_column = ["person_gender", "person_education", "person_home_ownership", "previous_loan_defaults_on_file", "loan_intent"]
# numerical features
n_column = ["person_age", "person_income", "person_emp_exp", "loan_amnt","loan_int_rate",
        "loan_percent_income","cb_person_cred_hist_length","credit_score"]

In [13]:
# Encode the categorical variables
from sklearn.preprocessing import LabelEncoder

# Create a copy of the DataFrame to avoid modifying the original data
df_encoded = data.copy()

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Encode each categorical column
for col in c_column:
    df_encoded[col] = label_encoder.fit_transform(df_encoded[col])

# Display the first few rows of the encoded DataFrame
df_encoded.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,0,4,71948.0,0,3,35000.0,4,16.02,0.49,3.0,561,0,1
1,21.0,0,3,12282.0,0,2,1000.0,1,11.14,0.08,2.0,504,1,0
2,25.0,0,3,12438.0,3,0,5500.0,3,12.87,0.44,3.0,635,0,1
3,23.0,0,1,79753.0,0,3,35000.0,3,15.23,0.44,2.0,675,0,1
4,24.0,1,4,66135.0,1,3,35000.0,3,14.27,0.53,4.0,586,0,1


In [14]:
# Show encoding mapping for each categorical column
for col in c_column:
    label_encoder.fit(data[col])
    print(f"Encoding for '{col}':")
    for idx, class_ in enumerate(label_encoder.classes_):
        print(f"  {idx}: {class_}")
    print()

Encoding for 'person_gender':
  0: female
  1: male

Encoding for 'person_education':
  0: Associate
  1: Bachelor
  2: Doctorate
  3: High School
  4: Master

Encoding for 'person_home_ownership':
  0: MORTGAGE
  1: OTHER
  2: OWN
  3: RENT

Encoding for 'previous_loan_defaults_on_file':
  0: No
  1: Yes

Encoding for 'loan_intent':
  0: DEBTCONSOLIDATION
  1: EDUCATION
  2: HOMEIMPROVEMENT
  3: MEDICAL
  4: PERSONAL
  5: VENTURE



In [15]:
# Scale person_age, person_income, person_emp_exp, loan_amnt, loan_int_rate, loan_percent_income, cb_person_cred_hist_length, credit_score
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Scale the specified columns
scaled_columns = ["person_age", "person_income", "person_emp_exp", "loan_amnt",
                   "loan_int_rate", "loan_percent_income", "cb_person_cred_hist_length",
                   "credit_score"]
scaled_data = scaler.fit_transform(data[scaled_columns])

# Create a DataFrame with the scaled data
scaled_df = pd.DataFrame(scaled_data, columns=scaled_columns)

# Display the first few rows of the scaled DataFrame
scaled_df.head()

# Concatenate the scaled data with the original DataFrame (excluding the original columns)
df_scaled = pd.concat([df_encoded.drop(columns=scaled_columns), scaled_df], axis=1)
df_scaled.head()

Unnamed: 0,person_gender,person_education,person_home_ownership,loan_intent,previous_loan_defaults_on_file,loan_status,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score
0,0,4,3,4,0,1,-0.953538,-0.10409,-0.892284,4.024953,1.683039,4.016394,-0.739109,-1.419814
1,0,3,2,1,1,0,-1.118963,-0.846005,-0.892284,-1.359209,0.044782,-0.684829,-0.996863,-2.549975
2,0,3,0,3,0,1,-0.457264,-0.844065,-0.397517,-0.6466,0.625557,3.443074,-0.739109,0.047412
3,0,1,3,3,0,1,-0.788113,-0.007039,-0.892284,4.024953,1.417829,3.443074,-0.996863,0.840507
4,1,4,3,3,0,1,-0.622689,-0.176371,-0.727362,4.024953,1.095549,4.47505,-0.481354,-0.92413


In [16]:
# train test split
from sklearn.model_selection import train_test_split

# Define the feature columns and target column
X = df_scaled.drop(columns=["loan_status"])
y = df_scaled["loan_status"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the training and testing sets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (36000, 13)
X_test shape: (9000, 13)
y_train shape: (36000,)
y_test shape: (9000,)


In [18]:
# use decision tree and random forest classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the classifiers
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(random_state=42)

# Fit the Decision Tree classifier to the training data
decision_tree.fit(X_train, y_train)

# Make predictions using the Decision Tree classifier
y_pred_dt = decision_tree.predict(X_test)

# Evaluate the Decision Tree classifier's performance
print("Decision Tree Classifier:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_dt):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_dt))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt))
print()

# Fit the Random Forest classifier to the training data
random_forest.fit(X_train, y_train)

# Make predictions using the Random Forest classifier
y_pred_rf = random_forest.predict(X_test)

# Evaluate the Random Forest classifier's performance
print("Random Forest Classifier:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

Decision Tree Classifier:
Accuracy: 0.90
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.93      0.93      6990
           1       0.77      0.78      0.77      2010

    accuracy                           0.90      9000
   macro avg       0.85      0.86      0.85      9000
weighted avg       0.90      0.90      0.90      9000

Confusion Matrix:
[[6522  468]
 [ 448 1562]]

Random Forest Classifier:
Accuracy: 0.93
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.95      6990
           1       0.88      0.77      0.82      2010

    accuracy                           0.93      9000
   macro avg       0.91      0.87      0.89      9000
weighted avg       0.92      0.93      0.92      9000

Confusion Matrix:
[[6787  203]
 [ 460 1550]]
