# Loan Prediction

### Load and clean data

In [15]:
import pandas as pd

# Load datasets
train = pd.read_csv('https://raw.githubusercontent.com/aafaz/Loan-Prediction-System/refs/heads/master/Data/train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/aafaz/Loan-Prediction-System/refs/heads/master/Data/test.csv')

# Preserve Loan_IDs for test predictions
test_ids = test['Loan_ID']

# Add a marker to split later
train['is_train'] = 1
test['is_train'] = 0
test['Loan_Status'] = None  # Add placeholder column for consistency

# Combine for uniform preprocessing (both splits end up with the same dummy
# columns) and drop the row identifier without mutating in place.
combined = pd.concat([train, test], axis=0).drop(columns="Loan_ID")

# Fill missing values.
# The fill statistics are computed from the training rows only, so test rows
# do not influence the features the model is fitted on. The split marker and
# the target are skipped: the test rows' target is only a placeholder and must
# not be imputed with a made-up label.
# A positional boolean mask is used because `combined` keeps the original
# (duplicated) row labels from both frames.
train_mask = (combined["is_train"] == 1).to_numpy()
non_feature_cols = {"is_train", "Loan_Status"}
for col in combined.columns:
    if col in non_feature_cols:
        continue
    train_values = combined.loc[train_mask, col]
    # is_numeric_dtype (rather than `dtype == "object"`) sends every
    # non-numeric column, including string-typed ones, to the mode branch.
    if pd.api.types.is_numeric_dtype(train_values):
        fill_value = train_values.median()
    else:
        fill_value = train_values.mode()[0]
    combined[col] = combined[col].fillna(fill_value)

# Encode categorical features.
# Missing target values in the test rows simply get no dummy set; the encoded
# target column is expected to be "Loan_Status_Y" (checked below).
combined = pd.get_dummies(combined, drop_first=True)
assert "Loan_Status_Y" in combined.columns, "expected encoded target column 'Loan_Status_Y'"

# Split back
train_cleaned = combined[combined["is_train"] == 1].drop("is_train", axis=1)
test_cleaned = combined[combined["is_train"] == 0].drop(["is_train", "Loan_Status_Y"], axis=1)  # Drop target from test

# Sanity checks: no rows lost in the round trip and no missing values left.
assert len(train_cleaned) == len(train) and len(test_cleaned) == len(test)
assert not train_cleaned.isna().any().any()
assert not test_cleaned.isna().any().any()


### Train a decision tree classifier

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Pull the encoded label out of the cleaned training frame; everything else is a feature
y = train_cleaned["Loan_Status_Y"]
X = train_cleaned.drop(columns=["Loan_Status_Y"])

# Hold out 20% of the labelled rows for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a decision tree with a fixed seed on the training portion
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out rows and report accuracy plus per-class metrics
y_val_pred = model.predict(X_val)
val_accuracy_pct = round(accuracy_score(y_val, y_val_pred) * 100, 2)
val_report = classification_report(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy_pct, "%")
print("\nClassification Report:\n", val_report)


Validation Accuracy: 67.48 %

Classification Report:
               precision    recall  f1-score   support

       False       0.54      0.47      0.50        43
        True       0.73      0.79      0.76        80

    accuracy                           0.67       123
   macro avg       0.64      0.63      0.63       123
weighted avg       0.67      0.67      0.67       123



### Predict on the test set

In [17]:
# Run the fitted model over the cleaned test features
test_preds = model.predict(test_cleaned)

# Translate the predictions into the 'Y' / 'N' labels used by the dataset:
# anything equal to 1 becomes 'Y', everything else 'N'
label_by_flag = {True: 'Y', False: 'N'}
predicted_labels = pd.Series(test_preds == 1).map(label_by_flag).tolist()

# Pair every preserved Loan_ID with its predicted label for submission
submission = test_ids.to_frame(name='Loan_ID').assign(Loan_Status=predicted_labels)

print(submission.head())


    Loan_ID Loan_Status
0  LP001015           Y
1  LP001022           N
2  LP001031           Y
3  LP001035           Y
4  LP001051           N
