In [1]:
# Import necessary libraries
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation and analysis
import matplotlib.pyplot as plt  # For data visualization
import seaborn as sns  # For enhanced data visualization

# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised by the
# modelling cells are hidden as well.
import warnings
warnings.filterwarnings('ignore')

# Lift pandas' display truncation: None means "no limit" for both the number
# of columns and the number of rows rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier


from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score,f1_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, auc

from scipy import stats
from scipy.stats import shapiro, levene, mannwhitneyu
from scipy.stats import chi2_contingency

In [2]:
# Read the source dataset into `df`; every later cell in this section starts from this frame.
df = pd.read_csv(filepath_or_buffer='Data_without_Scaling_&_without_Outliers_Treatment.csv')

In [3]:
# Derive aggregate features from the raw V* columns, then discard the raw
# columns that were folded into them.

# Products of raw columns.
df['Total_Repayment'] = df['V6'] * df['V8']
df['Total_Interest_Paid'] = df['V7'] * (df['V12'] / 100) * df['V8']

# Unweighted sum, weighted sum and mean of V28/V29/V30 (the two scores also
# include V5).
df['Total_Past_Due'] = df['V29'] + df['V28'] + df['V30']
df['RepaymentRiskScore'] = (df['V29'] * 2) + (df['V28'] * 1.5) + (df['V30'] * 3) + (df['V5'] * 2)
df['Default_Risk_Score'] = (df['V29'] + df['V28'] + df['V30'] + df['V5']) / 4

# Ratio features. A zero denominator is mapped to NaN before dividing so these
# columns hold NaN rather than +/-inf. The later cleaning steps in this
# notebook drop rows with NaN and rows with inf alike, so the surviving rows
# are unchanged, but no inf values from these two features reach the saved CSVs.
df['LoanBurden'] = (df['V7'] / df['V8'].replace(0, np.nan)) + df['V6']
df['Loan_Affordability_Index'] = df['V6'] / df['V12'].replace(0, np.nan)

# Remove the raw inputs in place. This cell is not re-runnable on its own:
# once these columns are gone, executing it again raises KeyError, so reload
# `df` before re-running.
df.drop(columns=['V5', 'V6', 'V8', 'V7', 'V12', 'V29', 'V28', 'V30'], inplace=True)


In [4]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Build the matrix the VIFs are computed on: numeric columns only, with
# +/-inf treated as missing and every incomplete row removed.
# NOTE(review): this keeps every numeric column of `df`, including V32 (which
# a later cell uses as the target), and no intercept column is added before
# calling variance_inflation_factor -- confirm both are intended.
numeric_df = (
    df.select_dtypes(include=np.number)
      .replace([np.inf, -np.inf], np.nan)
      .dropna()
)

# One VIF per column, computed against all remaining columns of the matrix.
design_matrix = numeric_df.values
vif_data = pd.DataFrame({
    "feature": numeric_df.columns,
    "VIF": [
        variance_inflation_factor(design_matrix, col_idx)
        for col_idx in range(design_matrix.shape[1])
    ],
})

# Largest VIF first.
print(vif_data.sort_values(by='VIF', ascending=False))

                     feature         VIF
16        Default_Risk_Score  977.272398
14        RepaymentRiskScore  702.123663
13            Total_Past_Due  232.702680
11           Total_Repayment  199.875248
15                LoanBurden   55.604280
12       Total_Interest_Paid   47.668877
9                        V25   30.883121
2                         V4   21.360281
6                        V18   19.308391
7                        V19   13.204559
5                        V17   11.708599
1                         V3    4.815238
8                        V20    3.851674
4                        V11    2.039439
3                         V9    1.991506
0                         V2    1.122952
10                       V32    1.060361
17  Loan_Affordability_Index    1.035865


In [5]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Second VIF pass: remove six columns from the matrix, then re-apply the
# inf -> NaN replacement and the incomplete-row removal.
# Note: `numeric_df` is rebound here, so re-running this cell without first
# re-running the previous VIF cell raises KeyError (the columns are already gone).
numeric_df = (
    numeric_df
    .drop(columns=['Default_Risk_Score', 'Total_Repayment', 'RepaymentRiskScore', 'V25', 'V18', 'V4'])
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# Recompute one VIF per remaining column.
design_matrix = numeric_df.values
vif_data = pd.DataFrame({
    "feature": numeric_df.columns,
    "VIF": [
        variance_inflation_factor(design_matrix, col_idx)
        for col_idx in range(design_matrix.shape[1])
    ],
})

# Largest VIF first.
print(vif_data.sort_values(by='VIF', ascending=False))


                     feature       VIF
10                LoanBurden  8.791182
4                        V17  7.981317
8        Total_Interest_Paid  3.910787
2                         V9  1.912674
3                        V11  1.504192
1                         V3  1.309438
5                        V19  1.301457
6                        V20  1.257824
9             Total_Past_Due  1.247883
7                        V32  1.052889
11  Loan_Affordability_Index  1.035704
0                         V2  1.031324


In [6]:
# Remove from the main frame the same six columns that were taken out of the
# VIF matrix. The drop is in place, so this cell cannot be re-run on the
# already reduced frame.
high_vif_columns = ['Default_Risk_Score', 'Total_Repayment', 'RepaymentRiskScore', 'V25', 'V18', 'V4']
df.drop(columns=high_vif_columns, inplace=True)

In [8]:
# Persist the reduced frame, without the row index, as the input file for the
# feature-selection stage.
df.to_csv(path_or_buf='Data_for_Feature_Selection.csv', index=False)

**High VIFs: Features LoanBurden, V17 (> 5).**

**Moderate VIFs: Features Total_Interest_Paid (between 1 and 5).**

**Low VIFs: Features V2, V3, V9, V11, V19, V20, Total_Past_Due, Loan_Affordability_Index (all below 2, i.e. close to 1).**

**If removing further features is not an option, explore alternative modeling techniques such as ridge regression or LASSO regression, which are less sensitive to multicollinearity.**

# <font color = 'green'>Feature Selection</font>

In [11]:
# Reload the reduced dataset from disk.
encode_df = pd.read_csv(filepath_or_buffer='Data_for_Feature_Selection.csv')

# Columns to expand into indicator (dummy) columns.
columns_to_encode = ['V10', 'V13', 'V14', 'V15', 'V31']

# One-hot encode those columns, naming each indicator "<column>_<level>".
# drop_first=True omits the first level of every encoded column.
encoded_data = pd.get_dummies(
    data=encode_df,
    columns=columns_to_encode,
    prefix=columns_to_encode,
    drop_first=True,
)

# Write the encoded frame out and read it straight back as `dt`, the frame the
# feature-selection cells work on.
encoded_data.to_csv(path_or_buf='Data_with_encoding.csv', index=False)
dt = pd.read_csv(filepath_or_buffer='Data_with_encoding.csv')

# Cell output: (rows, columns) of the encoded data.
dt.shape

(85048, 26)

## <font color = 'purple'>Feature Selection using LassoCV Method</font>

In [12]:
# LassoCV-based feature selection, scored by a downstream logistic regression.
# This cell also creates the split and the scaled frames (X_train, y_train,
# X_train_scaled, X_test_scaled, ...) that the RFECV and tree cells reuse.

# V32 is the target; every other column is a candidate feature. Rows holding
# NaN or +/-inf in any feature are discarded.
X = dt.drop(columns='V32').replace([np.inf, -np.inf], np.nan).dropna()
print(X.shape)

# Keep only the target values of the rows that survived the cleaning of X.
y = dt.loc[X.index, 'V32']

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)

# Sanity check on the value range before scaling.
print("Max value in X_train:", X_train.max().max())
print("Min value in X_train:", X_train.min().min())

# Standardise using statistics learned from the training split only, then
# restore the column labels that the scaler's array output loses.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Fit a 5-fold cross-validated Lasso and let SelectFromModel keep features
# based on the already fitted model (prefit=True, no explicit threshold).
lasso_cv = LassoCV(cv=5).fit(X_train_scaled, y_train)
model = SelectFromModel(lasso_cv, prefit=True)
X_train_selected = model.transform(X_train_scaled)
X_test_selected = model.transform(X_test_scaled)

# Names of the retained features.
selected_features = X_train.columns[model.get_support()]
print("Selected Features:", selected_features)

# Score the selected subset with a logistic regression. max_iter=1000 matches
# the classifier used in the RFECV and tree-based cells, so the three
# accuracies are comparable and the solver is not cut off at the default
# iteration cap (a cut-off would go unnoticed because warnings are suppressed).
classifier = LogisticRegression(max_iter=1000).fit(X_train_selected, y_train)

# Hold-out accuracy.
y_pred = classifier.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


(82742, 25)
Max value in X_train: 475084.8000000001
Min value in X_train: 0.0
Selected Features: Index(['V2', 'V3', 'V9', 'V11', 'V17', 'V19', 'V20', 'Total_Interest_Paid',
       'Total_Past_Due', 'LoanBurden', 'V10_MO', 'V10_SC', 'V10_TL',
       'V13_MALE', 'V14_SELF', 'V14_STUDENT', 'V15_RENT', 'V31_TIER 2',
       'V31_TIER 3', 'V31_TIER 4'],
      dtype='object')
Accuracy: 0.9779236997945454


## <font color = 'purple'>Feature Selection using RFECV Method</font>

In [13]:
# Recursive feature elimination with cross-validation, using a logistic
# regression to rank features. One feature is removed per step and at least
# 10 features are always kept. Reuses the scaled split built in the LassoCV cell.
estimator = LogisticRegression(max_iter=1000)
selector = RFECV(estimator=estimator, step=1, min_features_to_select=10)
selector.fit(X_train_scaled, y_train)

# Names of the features the selector retained.
selected_features = X_train.columns[selector.get_support()]
print("Selected Features with RFE:", selected_features)

# Reduce both splits to the retained features.
X_train_rfe_selected = selector.transform(X_train_scaled)
X_test_rfe_selected = selector.transform(X_test_scaled)

# Score the subset with a fresh logistic regression.
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_rfe_selected, y_train)

# Hold-out accuracy.
y_pred = classifier.predict(X_test_rfe_selected)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)


Selected Features with RFE: Index(['V2', 'V3', 'V9', 'V11', 'V17', 'V19', 'V20', 'Total_Interest_Paid',
       'Total_Past_Due', 'V10_MO', 'V10_TL', 'V13_MALE', 'V14_SELF',
       'V14_STUDENT', 'V31_TIER 2', 'V31_TIER 3', 'V31_TIER 4'],
      dtype='object')
Accuracy: 0.9779639850138984


## <font color = 'purple'>Feature Selection using Tree Method</font>

In [14]:
# Tree-based feature selection, scored by a downstream logistic regression.
# Reuses the scaled split built in the LassoCV cell.

# Fit an extra-trees ensemble to obtain feature importances. The ensemble is
# randomised, so random_state is fixed (same seed as the train/test split) to
# make the selected feature set and the accuracy reproducible across runs.
tree_clf = ExtraTreesClassifier(n_estimators=100, random_state=50)
tree_clf = tree_clf.fit(X_train_scaled, y_train)

# Keep features by importance, using the already fitted ensemble. No threshold
# is passed, so SelectFromModel's default cut-off applies.
# Note: this rebinds `model`, `X_train_selected` and `X_test_selected` from
# the LassoCV cell.
model = SelectFromModel(tree_clf, prefit=True)
X_train_selected = model.transform(X_train_scaled)
X_test_selected = model.transform(X_test_scaled)

# Names of the retained features.
selected_features = X_train.columns[model.get_support()]
print("Selected Features:", selected_features)

# Score the selected subset with a logistic regression.
classifier = LogisticRegression(max_iter=1000).fit(X_train_selected, y_train)

# Hold-out accuracy.
y_pred = classifier.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Selected Features: Index(['V3', 'V9', 'V17', 'V19', 'V20', 'Total_Interest_Paid',
       'Total_Past_Due', 'LoanBurden', 'Loan_Affordability_Index'],
      dtype='object')
Accuracy: 0.9778431293558394
