In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix

# Step 1: Load the data
loan_data = pd.read_csv("newBankLoanApproval.csv")
applicants_data = pd.read_csv("newApplicants1.csv")

# Step 2: Separate features and target variable
X = loan_data.drop(columns=['HasMortgage'])  # Features
y = loan_data['HasMortgage']  # Target variable

# Step 3: Define preprocessing steps for numerical features
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Step 4: Combine preprocessing steps using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ])

# Step 5: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Implement the Decision Tree algorithm within a pipeline
decision_tree = Pipeline(steps=[('preprocessor', preprocessor),
                                ('classifier', DecisionTreeClassifier())])

# Step 7: Define hyperparameters to tune
param_grid = {
    'classifier__max_depth': [None, 5, 10, 15],  # Maximum depth of the tree
    'classifier__min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'classifier__min_samples_leaf': [1, 2, 4],  # Minimum samples required at each leaf node
}

# Step 8: Perform grid search
grid_search = GridSearchCV(decision_tree, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Step 9: Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Step 10: Evaluate the model with best hyperparameters
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy with Best Hyperparameters:", accuracy)

# Step 11: Print confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Step 12: Predict loan eligibility for applicants in newApplicants1.csv
predictions = best_model.predict(applicants_data)
for i, prediction in enumerate(predictions):
    print(f"Applicant {i+1}: {'Eligible' if prediction == 1 else 'Not Eligible'} for loan")


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Best Hyperparameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5}
Accuracy with Best Hyperparameters: 0.494791054713508
Confusion Matrix:
[[13698 11840]
 [13959 11569]]
Applicant 1: Not Eligible for loan
Applicant 2: Not Eligible for loan
Applicant 3: Not Eligible for loan
Applicant 4: Not Eligible for loan
Applicant 5: Eligible for loan
Applicant 6: Eligible for loan
Applicant 7: Not Eligible for loan
Applicant 8: Not Eligible for loan
Applicant 9: Not Eligible for loan
Applicant 10: Not Eligible for loan
Applicant 11: Not Eligible for loan
Applicant 12: Not Eligible for loan
Applicant 13: Eligible for loan
Applicant 14: Eligible for loan
Applicant 15: Eligible for loan
Applicant 16: Not Eligible for loan
Applicant 17: Not Eligible for loan
Applicant 18: Not Eligible for loan
Applicant 19: Not Eligible for loan
Applicant 20: Not Eligible for loan
