In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Load the dataset
loan_data = pd.read_csv("loan_data_train.csv")

# Drop irrelevant features (Loan_ID)
loan_data.drop("Loan_ID", axis=1, inplace=True)

# Handle missing values
loan_data.ffill(inplace=True)  # Forward fill missing values

# Encode categorical variables
encoder = LabelEncoder()
categorical_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']
for col in categorical_cols:
    loan_data[col] = encoder.fit_transform(loan_data[col])

# Manually encode 'Dependents' column to ensure consistency
dependents_mapping = {'0': 0, '1': 1, '2': 2, '3+': 3}
loan_data['Dependents'] = loan_data['Dependents'].map(dependents_mapping)

# Encode target variable 'Loan_Status'
loan_data['Loan_Status'] = loan_data['Loan_Status'].map({'Y': 1, 'N': 0})

# Split dataset into features and target variable
X = loan_data.drop("Loan_Status", axis=1)
y = loan_data["Loan_Status"]

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numerical features in the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Initialize and train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Preprocess test data
# Scale numerical features in the test data
X_test_scaled = scaler.transform(X_test)

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# # Output predictions
# test_data_with_predictions = X_test.copy()
# test_data_with_predictions['Loan_Status'] = y_pred
# test_data_with_predictions['Loan_Status'] = test_data_with_predictions['Loan_Status'].map({1: 'Y', 0: 'N'})

# Save the test data with predicted loan status to a CSV file
# test_data_with_predictions.to_csv('loan_data_test_with_predictions.csv', index=False)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)


Accuracy: 0.7886178861788617
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.47      0.61        43
           1       0.77      0.96      0.86        80

    accuracy                           0.79       123
   macro avg       0.82      0.71      0.73       123
weighted avg       0.80      0.79      0.77       123

