In [None]:
# Read the loan dataset from disk and print a quick overview of it.
import pandas as pd

# Parse the CSV into a DataFrame
data = pd.read_csv('loan_classification.csv')

# Preview the top rows (head() defaults to 5)
print(data.head(5))

# Per-column overview, indexed by column name:
# dtype, number of distinct values, number of nulls, and the value taken from the first row.
summary_columns = {
    'Data Type': data.dtypes,
    'Unique Values': data.nunique(),
    'Missing Values': data.isna().sum(),
    'First Record': data.iloc[0],
}
data_info = pd.DataFrame(summary_columns)

print(data_info)



In [None]:
# Data preprocessing
#
# The train/test split is done BEFORE the preprocessor is fitted, so the
# scaler's mean/std and the encoder's category list are learned from the
# training rows only. Fitting on the full dataset first would leak test-set
# statistics into training and make the evaluation optimistic.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Selecting features and target: 1 where loan_status is 'Approved', 0 otherwise
X = data.drop('loan_status', axis=1)
y = data['loan_status'].eq('Approved').astype(int)

# Defining the columns that need encoding and scaling
categorical_features = ['gender', 'occupation', 'education_level', 'marital_status']
numeric_features = ['age', 'income', 'credit_score']

# Creating transformers for preprocessing.
# handle_unknown='ignore': a category that appears only in held-out rows is
# encoded as all zeros instead of raising at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Splitting the raw data into training and test sets
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows only, then apply the fitted transform to the test rows
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Full dataset transformed with the train-fitted preprocessor; kept so that any
# code still referring to X_preprocessed continues to work.
X_preprocessed = preprocessor.transform(X)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Baseline: logistic regression with default settings,
# scored with 5-fold cross-validation on the training split.
logreg = LogisticRegression()
scores = cross_val_score(estimator=logreg, X=X_train, y=y_train, cv=5)

# Report the per-fold scores and their mean
print("Cross-validation scores:", scores)
print("Average cross-validation score:", scores.mean())


In [None]:
# hyperparameter tuning with GridSearchCV

from sklearn.model_selection import GridSearchCV

# Candidate values: regularization strength C crossed with the penalty type
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
)

# 5-fold grid search over a liblinear logistic regression
grid_search = GridSearchCV(
    estimator=LogisticRegression(solver='liblinear'),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination and its mean cross-validated score
print("Best parameters:", grid_search.best_params_)
print(f"Best cross-validation score: {grid_search.best_score_:.2f}")


In [None]:
### FEATURE SELECTION - Univariate

from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif

# Applying Variance Threshold to remove features with zero variance
var_thresh = VarianceThreshold(threshold=0)
X_train_var = var_thresh.fit_transform(X_train)

# Keep the 10 best-scoring features, capped at the number of features that
# survived the variance filter so k never exceeds what is available.
k_best = min(10, X_train_var.shape[1])
selector_kbest = SelectKBest(f_classif, k=k_best)
X_train_kbest = selector_kbest.fit_transform(X_train_var, y_train)

# Names for EVERY column the preprocessor emits (scaled numeric columns followed
# by the one-hot columns). Asking only the one-hot encoder for names would leave
# out the numeric columns and shift every name against the selector masks.
feature_names = preprocessor.get_feature_names_out()
assert len(feature_names) == X_train.shape[1], "feature names do not line up with X_train columns"

# Names that survived VarianceThreshold
full_features_mask = var_thresh.get_support()  # mask over the preprocessor output columns
adjusted_feature_names = feature_names[full_features_mask].tolist()

# Apply SelectKBest mask to the adjusted feature names
selected_features_mask = selector_kbest.get_support()  # mask over the variance-filtered columns
final_selected_features = [
    name for name, select in zip(adjusted_feature_names, selected_features_mask) if select
]

print("Features selected by Univariate Selection:", final_selected_features)


In [None]:
## FEATURE SELECTION - Model based

from sklearn.feature_selection import SelectFromModel

# Applying model-based selection: SelectFromModel fits the logistic regression
# and keeps a subset of the columns of X_train.
selector_model = SelectFromModel(LogisticRegression(max_iter=1000))
X_train_model = selector_model.fit_transform(X_train, y_train)

# Identifying which features were selected.
# The mask has one entry per column of X_train, so the names must cover the whole
# preprocessor output (numeric + one-hot columns), not just the one-hot encoder's.
mask_model = selector_model.get_support()
model_feature_names = preprocessor.get_feature_names_out()
assert len(model_feature_names) == len(mask_model), "feature names do not line up with the selection mask"

selected_columns_model = [col for col, selected in zip(model_feature_names, mask_model) if selected]
print("Features selected by Model-based Selection:", selected_columns_model)


In [None]:
## FEATURE SELECTION - Recursive Feature Elimination (RFE)

from sklearn.feature_selection import RFE

# Applying RFE with a logistic regression estimator, targeting 10 features
rfe = RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=10)
X_train_rfe = rfe.fit_transform(X_train, y_train)

# Identifying which features were selected.
# rfe.support_ has one entry per column of X_train, so the names must cover the
# whole preprocessor output (numeric + one-hot columns), not just the one-hot encoder's.
mask_rfe = rfe.support_
rfe_feature_names = preprocessor.get_feature_names_out()
assert len(rfe_feature_names) == len(mask_rfe), "feature names do not line up with the RFE mask"

selected_columns_rfe = [col for col, selected in zip(rfe_feature_names, mask_rfe) if selected]
print("Features selected by RFE:", selected_columns_rfe)
