In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

# Load dataset
df = pd.read_csv("/content/loans_full_schema Task 4.csv")

# Drop irrelevant columns
cols_to_drop = [
    'Unnamed: 0', 'emp_title', 'annual_income_joint', 'verification_income_joint',
    'debt_to_income_joint', 'months_since_last_delinq', 'months_since_90d_late'
]
df.drop(columns=cols_to_drop, inplace=True)

# Create binary target
default_statuses = ['Charged Off', 'Default', 'Late (31-120 days)', 'Late (16-30 days)']
df['default'] = df['loan_status'].apply(lambda x: 1 if x in default_statuses else 0)
df.drop(columns=['loan_status'], inplace=True)

# Drop columns with >30% missing and fill the rest
df.dropna(thresh=0.7 * len(df), axis=1, inplace=True)
df.fillna(df.median(numeric_only=True), inplace=True)

# Encode categorical variables
cat_cols = df.select_dtypes(include='object').columns
df[cat_cols] = df[cat_cols].apply(LabelEncoder().fit_transform)

# Split features and target
X = df.drop(columns=['default'])
y = df['default']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Apply SMOTE to balance the training set
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Train LightGBM
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_resampled, y_resampled)
y_pred_lgbm = lgbm.predict(X_test)

# Train SVM
svm = SVC(random_state=42)
svm.fit(X_resampled, y_resampled)
y_pred_svm = svm.predict(X_test)

# Evaluation
print("LightGBM Performance:")
print(classification_report(y_test, y_pred_lgbm, target_names=["Non-Default", "Default"]))

print("\nSVM Performance:")
print(classification_report(y_test, y_pred_svm, target_names=["Non-Default", "Default"]))


[LightGBM] [Info] Number of positive: 7911, number of negative: 7911
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002601 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4936
[LightGBM] [Info] Number of data points in the train set: 15822, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LightGBM Performance:
              precision    recall  f1-score   support

 Non-Default       1.00      1.00      1.00      1978
     Default       0.93      0.59      0.72        22

    accuracy                           0.99      2000
   macro avg       0.96      0.80      0.86      2000
weighted avg       0.99      0.99      0.99      2000


SVM Performance:
              precision    recall  f1-score   support

 Non-Default       0.99      0.57      0.72      1978
     Default       0.01