In [12]:
import pandas as pd

# Loading dataset with a different encoding
file_path = "Anonymize_Loan_Default_data.csv"
data = pd.read_csv(file_path, encoding='ISO-8859-1')  # You can also try 'latin1' or 'cp1252'
data.head(10)

Unnamed: 0.1,Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_length,...,total_acc,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,repay_fail
0,2,2,2,0.0,0.0,0.0,36 months,0.0,0.0,< 1 year,...,1.0,0.0,0.0,0.0,0.0,Jan-07,0.0,Jan-07,Jan-07,1
1,3,545583,703644,2500.0,2500.0,2500.0,36 months,13.98,85.42,4 years,...,10.0,3075.291779,3075.29,2500.0,575.29,Jul-13,90.85,Aug-13,Jun-16,0
2,4,532101,687836,5000.0,5000.0,5000.0,36 months,15.95,175.67,4 years,...,15.0,2948.76,2948.76,1909.02,873.81,Nov-11,175.67,,Mar-12,1
3,5,877788,1092507,7000.0,7000.0,7000.0,36 months,9.91,225.58,10+ years,...,20.0,8082.39188,8082.39,7000.0,1082.39,Mar-14,1550.27,,Mar-14,0
4,6,875406,1089981,2000.0,2000.0,2000.0,36 months,5.42,60.32,10+ years,...,15.0,2161.663244,2161.66,2000.0,161.66,Feb-14,53.12,,Jun-16,0
5,7,506439,652909,3600.0,3600.0,3600.0,36 months,10.25,116.59,10+ years,...,25.0,4206.031191,4206.03,3600.0,606.03,May-13,146.75,Jun-13,Jun-16,0
6,8,981465,1204637,8000.0,8000.0,8000.0,36 months,6.03,243.49,,...,49.0,8724.971815,8724.97,8000.0,724.97,Apr-14,1423.66,,Apr-14,0
7,9,749050,948200,6000.0,6000.0,6000.0,36 months,7.49,186.61,3 years,...,9.0,6717.950109,6717.95,6000.0,717.95,May-14,211.41,,May-14,0
8,10,1016373,1243872,25600.0,25600.0,25472.82947,60 months,14.27,599.26,4 years,...,32.0,32840.05674,32659.13,25600.0,7240.06,Apr-14,16083.78,,Jun-16,0
9,11,786870,990345,19750.0,19750.0,19750.0,60 months,23.22,559.27,10+ years,...,44.0,27544.89116,27544.89,19750.0,7794.89,Jun-13,15264.34,,Feb-16,0


In [13]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

columns_to_use = ['int_rate', 'dti', 'annual_inc', 'loan_amnt', 'inq_last_6mths', 'mths_since_last_delinq', 'repay_fail']
data = data[columns_to_use]

In [14]:
# Step 2: Separate numerical and categorical columns
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = data.select_dtypes(include=['object']).columns

#check
print("Numerical Columns:", numerical_cols)
print("Categorical Columns:", categorical_cols)

Numerical Columns: Index(['int_rate', 'dti', 'annual_inc', 'loan_amnt', 'inq_last_6mths',
       'mths_since_last_delinq', 'repay_fail'],
      dtype='object')
Categorical Columns: Index([], dtype='object')


In [15]:
# Step 3: Handle missing values for numerical and categorical columns
# Impute missing values for numerical columns with the median
numerical_imputer = SimpleImputer(strategy='median')
data[numerical_cols] = numerical_imputer.fit_transform(data[numerical_cols])

# Impute missing values for categorical columns with the most frequent value
#categorical_imputer = SimpleImputer(strategy='most_frequent')
#data[categorical_cols] = categorical_imputer.fit_transform(data[categorical_cols])

# Step 4: Encode categorical columns using OneHotEncoder
#encoder = OneHotEncoder(sparse_output=False, drop='first')
#encoded_data = encoder.fit_transform(data[categorical_cols])

# Step 5: Apply scaling to numerical features excluding the target column
numerical_cols = [col for col in numerical_cols if col != 'repay_fail']  # Exclude target column
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data[numerical_cols])

# Step 6: Combine numerical and categorical data
#encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_cols))
scaled_df = pd.DataFrame(scaled_data, columns=numerical_cols)

# Re-include 'repay_fail' from the original dataset
data_preprocessed = pd.concat([scaled_df, data[['repay_fail']]], axis=1)

# Check the result
print(data_preprocessed.head())


   int_rate       dti  annual_inc  loan_amnt  inq_last_6mths  \
0 -3.254756 -1.983628   -1.070119  -1.498225       -0.706254   
1  0.485820  0.961095   -0.759854  -1.160625        2.551555   
2  1.012926  0.918095   -0.155020  -0.823026       -0.054692   
3 -0.603175 -0.382268   -0.235735  -0.552946        1.248431   
4 -1.804547 -1.449841   -0.604815  -1.228145       -0.706254   

   mths_since_last_delinq  repay_fail  
0               -0.054145         1.0  
1               -0.054145         0.0  
2                1.855116         1.0  
3               -2.257140         0.0  
4                2.809747         0.0  


In [16]:
data.columns

Index(['int_rate', 'dti', 'annual_inc', 'loan_amnt', 'inq_last_6mths',
       'mths_since_last_delinq', 'repay_fail'],
      dtype='object')

In [17]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Step 1: Split data into features and target
X = data_preprocessed.drop(columns=['repay_fail'], errors='ignore')  # Features
y = data_preprocessed['repay_fail']  # Target

# Step 2: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Apply scaling to the features (important for Logistic Regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Initialize models
log_reg = LogisticRegression(random_state=42)
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(random_state=42)

# Step 5: Train and evaluate Logistic Regression
log_reg.fit(X_train_scaled, y_train)
y_pred_log_reg = log_reg.predict(X_test_scaled)
log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)
print(f"Logistic Regression Accuracy: {log_reg_accuracy:.4f}")
print("Classification Report for Logistic Regression:\n", classification_report(y_test, y_pred_log_reg))

# Step 6: Train and evaluate Decision Tree
decision_tree.fit(X_train_scaled, y_train)
y_pred_tree = decision_tree.predict(X_test_scaled)
tree_accuracy = accuracy_score(y_test, y_pred_tree)
print(f"Decision Tree Accuracy: {tree_accuracy:.4f}")
print("Classification Report for Decision Tree:\n", classification_report(y_test, y_pred_tree))

# Step 7: Train and evaluate Random Forest
random_forest.fit(X_train_scaled, y_train)
y_pred_rf = random_forest.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print("Classification Report for Random Forest:\n", classification_report(y_test, y_pred_rf))


Logistic Regression Accuracy: 0.8413
Classification Report for Logistic Regression:
               precision    recall  f1-score   support

         0.0       0.84      1.00      0.91      6479
         1.0       0.41      0.01      0.01      1217

    accuracy                           0.84      7696
   macro avg       0.63      0.50      0.46      7696
weighted avg       0.77      0.84      0.77      7696

Decision Tree Accuracy: 0.7475
Classification Report for Decision Tree:
               precision    recall  f1-score   support

         0.0       0.85      0.84      0.85      6479
         1.0       0.22      0.23      0.23      1217

    accuracy                           0.75      7696
   macro avg       0.54      0.54      0.54      7696
weighted avg       0.75      0.75      0.75      7696

Random Forest Accuracy: 0.8394
Classification Report for Random Forest:
               precision    recall  f1-score   support

         0.0       0.85      0.99      0.91      6479
      

In [18]:
#shape
data_preprocessed.shape

(38480, 7)

In [19]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Step 1: Define features (X) and target (y)
X = data_preprocessed.drop(columns=['repay_fail'], errors='ignore')  # Features
y = data_preprocessed['repay_fail']  # Target

# Step 2: Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # X contains your features

# Step 3: Apply PCA to reduce dimensionality
pca = PCA(n_components=0.90)  # 95% variance explained, can adjust this threshold
X_pca = pca.fit_transform(X_scaled)

# Step 4: Check the shape of the transformed data
print(f"Original shape: {X.shape}")
print(f"Transformed shape: {X_pca.shape}")

# Step 5: Explained variance ratio (how much variance each principal component explains)
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")

# Step 6: Total explained variance
total_explained_variance = sum(pca.explained_variance_ratio_)
print(f"Total variance explained by selected components: {total_explained_variance:.4f}")

# Step 7: Now you can use X_pca in your model


Original shape: (38480, 6)
Transformed shape: (38480, 5)
Explained variance ratio: [0.24230202 0.19443921 0.17640912 0.16073272 0.12875249]
Total variance explained by selected components: 0.9026


In [20]:
# Step 1: Split PCA-transformed data into features and target
X_pca = X_pca  # PCA-transformed data (already done)
y = data['repay_fail']  # Target variable from the original data

# Step 2: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Step 3: Initialize models
log_reg = LogisticRegression(random_state=42)
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(random_state=42)

# Step 4: Train and evaluate Logistic Regression
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)
print(f"Logistic Regression Accuracy: {log_reg_accuracy:.4f}")
print("Classification Report for Logistic Regression:")
print(classification_report(y_test, y_pred_log_reg))

# Step 5: Train and evaluate Decision Tree
decision_tree.fit(X_train, y_train)
y_pred_tree = decision_tree.predict(X_test)
tree_accuracy = accuracy_score(y_test, y_pred_tree)
print(f"Decision Tree Accuracy: {tree_accuracy:.4f}")
print("Classification Report for Decision Tree:")
print(classification_report(y_test, y_pred_tree))

# Step 6: Train and evaluate Random Forest
random_forest.fit(X_train, y_train)
y_pred_rf = random_forest.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print("Classification Report for Random Forest:")
print(classification_report(y_test, y_pred_rf))


Logistic Regression Accuracy: 0.8412
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

         0.0       0.84      1.00      0.91      6479
         1.0       0.39      0.01      0.01      1217

    accuracy                           0.84      7696
   macro avg       0.62      0.50      0.46      7696
weighted avg       0.77      0.84      0.77      7696

Decision Tree Accuracy: 0.7427
Classification Report for Decision Tree:
              precision    recall  f1-score   support

         0.0       0.85      0.84      0.85      6479
         1.0       0.21      0.22      0.21      1217

    accuracy                           0.74      7696
   macro avg       0.53      0.53      0.53      7696
weighted avg       0.75      0.74      0.75      7696

Random Forest Accuracy: 0.8394
Classification Report for Random Forest:
              precision    recall  f1-score   support

         0.0       0.85      0.99      0.91      6479
         

In [21]:
import pickle

pickle.dump(random_forest, open('model.pkl', 'wb'))


In [22]:


pickle.dump(pca, open('pca.pkl', 'wb'))
