In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE

# Load the dataset
file_path = r'C:\Users\Harshan\Documents\dfa\bankloans.csv'
data = pd.read_csv(file_path)

# Check and convert 'default' column to binary if necessary
data['default'] = data['default'].apply(lambda x: 1 if x > 0.5 else 0)

# Drop rows with missing values in 'default' column
data_cleaned = data.dropna(subset=['default'])

# Split features and target
X = data_cleaned.drop('default', axis=1)
y = data_cleaned['default']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Handle imbalanced data using SMOTE
smote = SMOTE()
X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)

# Initialize Logistic Regression model
log_reg = LogisticRegression()

# Train the model
log_reg.fit(X_train_bal, y_train_bal)

# Make predictions
y_pred_log = log_reg.predict(X_test_scaled)

# Evaluate the Logistic Regression model
print("Logistic Regression Report:")
print(classification_report(y_test, y_pred_log))
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))
print("Logistic Regression AUC-ROC:", roc_auc_score(y_test, log_reg.predict_proba(X_test_scaled)[:, 1]))

# Initialize Random Forest model
rf_model = RandomForestClassifier()

# Train the Random Forest model
rf_model.fit(X_train_bal, y_train_bal)

# Make predictions
y_pred_rf = rf_model.predict(X_test_scaled)

# Evaluate the Random Forest model
print("\nRandom Forest Report:")
print(classification_report(y_test, y_pred_rf))
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest AUC-ROC:", roc_auc_score(y_test, rf_model.predict_proba(X_test_scaled)[:, 1]))


Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.88      0.70      0.78       284
           1       0.29      0.57      0.38        61

    accuracy                           0.68       345
   macro avg       0.59      0.64      0.58       345
weighted avg       0.78      0.68      0.71       345

Logistic Regression Accuracy: 0.6753623188405797
Logistic Regression AUC-ROC: 0.7456130223966752

Random Forest Report:
              precision    recall  f1-score   support

           0       0.87      0.89      0.88       284
           1       0.42      0.36      0.39        61

    accuracy                           0.80       345
   macro avg       0.64      0.63      0.63       345
weighted avg       0.79      0.80      0.79       345

Random Forest Accuracy: 0.8
Random Forest AUC-ROC: 0.7880108519972293


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE

# Load the dataset
file_path = r'C:\Users\Harshan\Documents\dfa\bankloans.csv'
data = pd.read_csv(file_path)

# Check and convert 'default' column to binary if necessary
data['default'] = data['default'].apply(lambda x: 1 if x > 0.5 else 0)

# Drop rows with missing values in 'default' column
data_cleaned = data.dropna(subset=['default'])

# Split features and target
X = data_cleaned.drop('default', axis=1)
y = data_cleaned['default']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Handle imbalanced data using SMOTE
smote = SMOTE()
X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)

# Initialize Logistic Regression model
log_reg = LogisticRegression()

# Train the model
log_reg.fit(X_train_bal, y_train_bal)

# Make predictions
y_pred_log = log_reg.predict(X_test_scaled)

# Evaluate the Logistic Regression model
print("Logistic Regression Report:")
print(classification_report(y_test, y_pred_log))
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))
print("Logistic Regression AUC-ROC:", roc_auc_score(y_test, log_reg.predict_proba(X_test_scaled)[:, 1]))

# Initialize Random Forest model
rf_model = RandomForestClassifier()

# Train the Random Forest model
rf_model.fit(X_train_bal, y_train_bal)

# Make predictions
y_pred_rf = rf_model.predict(X_test_scaled)

# Evaluate the Random Forest model
print("\nRandom Forest Report:")
print(classification_report(y_test, y_pred_rf))
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest AUC-ROC:", roc_auc_score(y_test, rf_model.predict_proba(X_test_scaled)[:, 1]))


# Function to predict default status for new input
def predict_default(input_data):
    # Convert input to DataFrame
    input_df = pd.DataFrame([input_data], columns=X.columns)
    
    # Scale the input data
    input_scaled = scaler.transform(input_df)
    
    # Predict using Logistic Regression
    log_reg_pred = log_reg.predict(input_scaled)[0]
    log_reg_prob = log_reg.predict_proba(input_scaled)[0][1]
    
    # Predict using Random Forest
    rf_pred = rf_model.predict(input_scaled)[0]
    rf_prob = rf_model.predict_proba(input_scaled)[0][1]
    
    return {
        'Logistic Regression': {'Default Prediction': log_reg_pred, 'Probability': log_reg_prob},
        'Random Forest': {'Default Prediction': rf_pred, 'Probability': rf_prob}
    }

# Example input for a new customer: [age, ed, employ, address, income, debtinc, creddebt, othdebt]
new_input = [45, 2, 10, 5, 85, 10, 5.5, 3]

# Predict default status for the new input
result = predict_default(new_input)

# Display the result
print("\nPrediction for the new input:")
print(result)


Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.89      0.68      0.77       284
           1       0.29      0.59      0.39        61

    accuracy                           0.67       345
   macro avg       0.59      0.64      0.58       345
weighted avg       0.78      0.67      0.70       345

Logistic Regression Accuracy: 0.6666666666666666
Logistic Regression AUC-ROC: 0.7418032786885247

Random Forest Report:
              precision    recall  f1-score   support

           0       0.87      0.89      0.88       284
           1       0.44      0.39      0.41        61

    accuracy                           0.80       345
   macro avg       0.65      0.64      0.65       345
weighted avg       0.80      0.80      0.80       345

Random Forest Accuracy: 0.8028985507246377
Random Forest AUC-ROC: 0.7781401523897483

Prediction for the new input:
{'Logistic Regression': {'Default Prediction': 1, 'Probability': 0.659341654781644

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE

# Load the dataset
file_path = r'C:\Users\Harshan\Documents\dfa\bankloans.csv'
data = pd.read_csv(file_path)

# Check and convert 'default' column to binary if necessary
data['default'] = data['default'].apply(lambda x: 1 if x > 0.5 else 0)

# Drop rows with missing values in 'default' column
data_cleaned = data.dropna(subset=['default'])

# Split features and target
X = data_cleaned.drop('default', axis=1)
y = data_cleaned['default']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Handle imbalanced data using SMOTE
smote = SMOTE()
X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)

# Initialize Logistic Regression model
log_reg = LogisticRegression()

# Train the model
log_reg.fit(X_train_bal, y_train_bal)

# Make predictions
y_pred_log = log_reg.predict(X_test_scaled)

# Evaluate the Logistic Regression model
print("Logistic Regression Report:")
print(classification_report(y_test, y_pred_log))
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))
print("Logistic Regression AUC-ROC:", roc_auc_score(y_test, log_reg.predict_proba(X_test_scaled)[:, 1]))

# Initialize Random Forest model
rf_model = RandomForestClassifier()

# Train the Random Forest model
rf_model.fit(X_train_bal, y_train_bal)

# Make predictions
y_pred_rf = rf_model.predict(X_test_scaled)

# Evaluate the Random Forest model
print("\nRandom Forest Report:")
print(classification_report(y_test, y_pred_rf))
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest AUC-ROC:", roc_auc_score(y_test, rf_model.predict_proba(X_test_scaled)[:, 1]))


# Function to predict default status for new input
def predict_default(input_data):
    # Convert input to DataFrame
    input_df = pd.DataFrame([input_data], columns=X.columns)
    
    # Scale the input data
    input_scaled = scaler.transform(input_df)
    
    # Predict using Logistic Regression
    log_reg_pred = log_reg.predict(input_scaled)[0]
    log_reg_prob = log_reg.predict_proba(input_scaled)[0][1]
    
    # Predict using Random Forest
    rf_pred = rf_model.predict(input_scaled)[0]
    rf_prob = rf_model.predict_proba(input_scaled)[0][1]
    
    return {
        'Logistic Regression': {'Default Prediction': log_reg_pred, 'Probability': log_reg_prob},
        'Random Forest': {'Default Prediction': rf_pred, 'Probability': rf_prob}
    }

# Manual input from user
age = float(input("Enter age: "))
ed = int(input("Enter education level (e.g., 1, 2, 3...): "))
employ = float(input("Enter years of employment: "))
address = float(input("Enter number of years at current address: "))
income = float(input("Enter annual income (in thousands): "))
debtinc = float(input("Enter debt-to-income ratio (as a percentage): "))
creddebt = float(input("Enter credit card debt: "))
othdebt = float(input("Enter other debts: "))

# Create the input list
new_input = [age, ed, employ, address, income, debtinc, creddebt, othdebt]

# Predict default status for the new input
result = predict_default(new_input)

# Display the result
print("\nPrediction for the new input:")
print(result)


Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.89      0.67      0.77       284
           1       0.28      0.61      0.39        61

    accuracy                           0.66       345
   macro avg       0.59      0.64      0.58       345
weighted avg       0.78      0.66      0.70       345

Logistic Regression Accuracy: 0.6608695652173913
Logistic Regression AUC-ROC: 0.7466520434079889

Random Forest Report:
              precision    recall  f1-score   support

           0       0.88      0.89      0.89       284
           1       0.47      0.46      0.47        61

    accuracy                           0.81       345
   macro avg       0.68      0.67      0.68       345
weighted avg       0.81      0.81      0.81       345

Random Forest Accuracy: 0.8144927536231884
Random Forest AUC-ROC: 0.782555991687832


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE

# Load and preprocess data
file_path = r'C:\Users\Harshan\Documents\dfa\bankloans.csv'
data = pd.read_csv(file_path)
data['default'] = data['default'].apply(lambda x: 1 if x > 0.5 else 0)
data_cleaned = data.dropna(subset=['default'])

# Split features and target
X = data_cleaned.drop('default', axis=1)
y = data_cleaned['default']

# Train-test split and scaling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Handle imbalanced data
smote = SMOTE()
X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)

# Train and evaluate Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train_bal, y_train_bal)
y_pred_log = log_reg.predict(X_test_scaled)
print("Logistic Regression Report:")
print(classification_report(y_test, y_pred_log))
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))
print("Logistic Regression AUC-ROC:", roc_auc_score(y_test, log_reg.predict_proba(X_test_scaled)[:, 1]))

# Train and evaluate Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train_bal, y_train_bal)
y_pred_rf = rf_model.predict(X_test_scaled)
print("\nRandom Forest Report:")
print(classification_report(y_test, y_pred_rf))
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest AUC-ROC:", roc_auc_score(y_test, rf_model.predict_proba(X_test_scaled)[:, 1]))

# Function to predict default status
def predict_default(input_data):
    input_df = pd.DataFrame([input_data], columns=X.columns)
    input_scaled = scaler.transform(input_df)
    
    log_reg_pred = log_reg.predict(input_scaled)[0]
    log_reg_prob = log_reg.predict_proba(input_scaled)[0][1]
    
    rf_pred = rf_model.predict(input_scaled)[0]
    rf_prob = rf_model.predict_proba(input_scaled)[0][1]
    
    return {
        'Logistic Regression': {'Default Prediction': log_reg_pred, 'Probability': log_reg_prob},
        'Random Forest': {'Default Prediction': rf_pred, 'Probability': rf_prob}
    }

# Get user input and make prediction
new_input = [
    float(input("Enter age: ")),
    int(input("Enter education level (e.g., 1, 2, 3...): ")),
    float(input("Enter years of employment: ")),
    float(input("Enter number of years at current address: ")),
    float(input("Enter annual income (in thousands): ")),
    float(input("Enter debt-to-income ratio (as a percentage): ")),
    float(input("Enter credit card debt: ")),
    float(input("Enter other debts: "))
]

result = predict_default(new_input)
print("\nPrediction for the new input:")
print(result)

Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.88      0.67      0.76       284
           1       0.28      0.59      0.38        61

    accuracy                           0.66       345
   macro avg       0.58      0.63      0.57       345
weighted avg       0.78      0.66      0.69       345

Logistic Regression Accuracy: 0.6550724637681159
Logistic Regression AUC-ROC: 0.7462479796813668

Random Forest Report:
              precision    recall  f1-score   support

           0       0.88      0.89      0.89       284
           1       0.47      0.46      0.47        61

    accuracy                           0.81       345
   macro avg       0.68      0.67      0.68       345
weighted avg       0.81      0.81      0.81       345

Random Forest Accuracy: 0.8144927536231884
Random Forest AUC-ROC: 0.7843742784576312


Enter age:  18
Enter education level (e.g., 1, 2, 3...):  3
Enter years of employment:  23
Enter number of years at current address:  2
Enter annual income (in thousands):  100
Enter debt-to-income ratio (as a percentage):  23
Enter credit card debt:  3.3
Enter other debts:  23



Prediction for the new input:
{'Logistic Regression': {'Default Prediction': 0, 'Probability': 0.21364808821584302}, 'Random Forest': {'Default Prediction': 1, 'Probability': 0.51}}
