In [None]:
import json
import pandas as pd

# Load the dataset, treating the file as JSON Lines (one JSON document per line).
# Raw string: '\l' is not a valid escape sequence, so the plain literal only kept
# its backslashes by accident and triggers a warning on recent Python versions.
file_path = r'D:\loan pre\loan_approval_dataset.json'

# Parse each non-blank line; a blank or trailing empty line would otherwise make
# json.loads raise.
with open(file_path, 'r') as file:
    chunks = [json.loads(line) for line in file if line.strip()]

# Convert the parsed records to a DataFrame
data = pd.DataFrame(chunks)

# Display basic information about the dataset
print(data.head())
data.info()  # info() prints its own report and returns None, so it is not wrapped in print()
print(data.describe())


In [None]:
import json

# Load the whole file as a single JSON document.
# Raw string: the path contains backslashes that are not valid escape sequences.
file_path = r'D:\loan pre\loan_approval_dataset.json'

with open(file_path, 'r') as file:
    data = json.load(file)

# Inspect the JSON structure
print(type(data))  # Check the type of the loaded data
print(data.keys())  # Print the keys of the top-level dictionary (assumes a dict at the top level)

In [None]:
import pandas as pd

# Assuming the JSON structure is a dictionary of dictionaries (column -> {row label -> value})
# Example structure:
# {
#     "Car_Ownership": {"0": "no", "1": "no", ...},
#     "Profession": {"0": "Mechanical_engineer", "1": "Software_Developer", ...},
#     ...
# }

# Build the frame in one call: the DataFrame constructor already maps each outer
# key to a column and aligns the inner keys as the index. Concatenating one
# single-column frame per key in a loop re-copies the growing frame every time.
# The string row labels are then replaced by a positional index.
df = pd.DataFrame(data).reset_index(drop=True)

# Display basic information about the DataFrame
print(df.head())
df.info()  # prints its own report and returns None, so it is not wrapped in print()
print(df.describe())


In [None]:
# Check for missing values
print(df.isnull().sum())

# Forward-fill missing values in place.
# DataFrame.ffill replaces the deprecated fillna(method='ffill') spelling.
df.ffill(inplace=True)

# Ensure correct data types
print(df.dtypes)

# Convert data types if necessary
# Example: df['column_name'] = df['column_name'].astype('desired_type')

# Display the final DataFrame
print(df.head())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Class balance of the target variable (Risk_Flag)
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='Risk_Flag', ax=ax)
ax.set_title('Distribution of Risk_Flag')
ax.set_xlabel('Risk_Flag')
ax.set_ylabel('Count')
plt.show()

# Histogram with a KDE overlay for Age, then for Income
for feature in ('Age', 'Income'):
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(data=df, x=feature, bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')
    plt.show()

In [None]:
# Box plot of Age, then Income, grouped by the target class
for feature in ('Age', 'Income'):
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=df, x='Risk_Flag', y=feature, ax=ax)
    ax.set_title(f'Relationship between {feature} and Risk_Flag')
    ax.set_xlabel('Risk_Flag')
    ax.set_ylabel(feature)
    plt.show()

In [None]:
# Correlations are only computed between the numeric columns
numeric_cols = df.select_dtypes(include='number').columns
correlation_matrix = df.loc[:, numeric_cols].corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


In [None]:
# Numeric subset of the DataFrame, plus the names of the columns it contains
numeric_df = df.select_dtypes(include='number')
numeric_cols = numeric_df.columns

# Pairwise scatter plots and per-feature distributions
sns.pairplot(numeric_df)
plt.suptitle('Pairplot of Numeric Features')
plt.show()


In [None]:
# Count plots split by Risk_Flag for each low-cardinality categorical feature
for column in ('Married/Single', 'House_Ownership', 'Car_Ownership'):
    plt.figure(figsize=(8, 6))
    sns.countplot(data=df, x=column, hue='Risk_Flag')
    plt.title(f'{column} vs. Risk_Flag')
    plt.xlabel(column)
    plt.ylabel('Count')
    plt.legend(title='Risk_Flag', loc='upper right')
    plt.show()

# Profession has many levels, so only the ten most frequent ones are plotted
top_professions = df['Profession'].value_counts().nlargest(10).index
in_top_professions = df['Profession'].isin(top_professions)
plt.figure(figsize=(10, 6))
sns.countplot(data=df[in_top_professions], x='Profession', hue='Risk_Flag')
plt.title('Top Professions vs. Risk_Flag')
plt.xlabel('Profession')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')  # long profession names need slanted tick labels
plt.legend(title='Risk_Flag', loc='upper right')
plt.show()


In [None]:
# (categorical column, numeric column) pairs drawn as box plots, in display order
box_specs = [
    ('Married/Single', 'Age'),
    ('House_Ownership', 'Income'),
    ('Car_Ownership', 'Experience'),
]
for category, measure in box_specs:
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=df, x=category, y=measure)
    plt.title(f'{measure} vs. {category}')
    plt.xlabel(category)
    plt.ylabel(measure)
    plt.show()

# Age by Profession, restricted to the ten most frequent professions
top_professions = df['Profession'].value_counts().nlargest(10).index
top_profession_rows = df[df['Profession'].isin(top_professions)]
plt.figure(figsize=(12, 8))
sns.boxplot(data=top_profession_rows, x='Profession', y='Age')
plt.title('Age vs. Profession (Top Professions)')
plt.xlabel('Profession')
plt.ylabel('Age')
plt.xticks(rotation=45, ha='right')
plt.show()


In [None]:
!pip install matplotlib
!pip install seaborn


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Sample DataFrame (replace this with your actual DataFrame)
# NOTE(review): this rebinds the notebook-level names `data` and `df`; any frame
# loaded by an earlier cell is no longer reachable through `df` after this runs.
sample_columns = [
    'Income', 'Age', 'Experience', 'Married/Single', 'House_Ownership',
    'Car_Ownership', 'Profession', 'CITY', 'STATE',
    'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS', 'Risk_Flag',
]
sample_rows = [
    (1303834, 23, 3, 'single', 'rented', 'no', 'Mechanical_engineer', 'Rewa', 'Madhya_Pradesh', 3, 13, 0),
    (7574516, 40, 10, 'single', 'rented', 'no', 'Software_Developer', 'Parbhani', 'Maharashtra', 9, 13, 0),
    (3991815, 66, 4, 'married', 'rented', 'no', 'Technical_writer', 'Alappuzha', 'Kerala', 4, 10, 0),
    (6256451, 41, 2, 'single', 'rented', 'yes', 'Software_Developer', 'Bhubaneswar', 'Odisha', 2, 12, 1),
    (5768871, 47, 11, 'single', 'rented', 'no', 'Civil_servant', 'Tiruchirappalli[10]', 'Tamil_Nadu', 3, 14, 1),
]
# Transpose the row tuples into the column -> list-of-values mapping
data = {name: list(values) for name, values in zip(sample_columns, zip(*sample_rows))}
df = pd.DataFrame(data)

# Scatter Plot for Income vs. Age (colored by Risk_Flag)
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='Income', y='Age', hue='Risk_Flag', palette='Set2', ax=ax)
ax.set_title('Income vs. Age (Colored by Risk_Flag)')
ax.set_xlabel('Income')
ax.set_ylabel('Age')
ax.legend(title='Risk_Flag')
plt.show()


In [None]:
pip install dask


In [None]:
pip install dask-ml


In [None]:
import os

# Specify the path to your JSON file.
# Raw string: the backslashes are literal path separators, not escape sequences
# ('\l' is not a valid escape and triggers a warning on recent Python versions).
json_file_path = r'D:\loan pre\loan_approval_dataset.json'

# Report whether the path points at an existing regular file
if os.path.isfile(json_file_path):
    print("JSON file exists.")
else:
    print("JSON file does not exist or path is incorrect.")


In [None]:
import json

# Specify the absolute path to your JSON file.
# Raw string: the backslashes are literal path separators, not escape sequences.
absolute_json_path = r'D:\loan pre\loan_approval_dataset.json'

# Load JSON data from the file
with open(absolute_json_path, 'r') as json_file:
    data = json.load(json_file)

# Now 'data' contains the JSON data from your file, and you can work with it as needed.
# Print a short summary rather than the whole object: dumping every record floods
# the cell output and bloats the saved notebook.
if isinstance(data, dict):
    print("Top-level keys:", list(data.keys()))
else:
    print("Top-level type:", type(data).__name__, "with", len(data), "items")


In [None]:
# Assuming your DataFrame is named 'df'

# Convert object columns to categorical (this modifies df itself)
object_cols = df.select_dtypes(include=['object']).columns
df[object_cols] = df[object_cols].astype('category')

# One-hot encode the categorical columns; drop_first removes one level per
# feature so the dummy columns are not perfectly collinear.
df_encoded = pd.get_dummies(df, columns=object_cols, drop_first=True)

# Standardize the integer feature columns.
# The target must keep its original 0/1 labels, so 'Risk_Flag' is excluded;
# scaling it would turn the class labels into arbitrary floats.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
numerical_cols = df.select_dtypes(include=['int64']).columns.drop('Risk_Flag', errors='ignore')
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

# Now df_encoded contains the preprocessed features plus the untouched target

# Now df_encoded contains your preprocessed data


In [None]:
import pandas as pd

# Categorical columns of df that are expanded into indicator columns
columns_to_encode = [
    'Married/Single',
    'House_Ownership',
    'Car_Ownership',
    'Profession',
    'CITY',
    'STATE',
]

# One-hot encode every listed column (all levels kept)
df_encoded = pd.get_dummies(data=df, columns=columns_to_encode)

# Preview the encoded frame
print(df_encoded.head())


In [None]:
from sklearn.model_selection import train_test_split

# Target column and feature matrix taken from the encoded frame
y = df_encoded['Risk_Flag']
X = df_encoded.drop(columns=['Risk_Flag'])

# 70% training / 30% testing, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

# Report the shape of each part of the split
for heading, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(heading, part.shape)


In [None]:
from sklearn.model_selection import train_test_split

# Target column and feature matrix taken from the encoded frame
y = df_encoded['Risk_Flag']
X = df_encoded.drop(columns=['Risk_Flag'])

# 80% training / 20% testing, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Report the shape of each part of the split
for heading, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(heading, part.shape)


In [None]:
import pandas as pd

# Six sample records, written column by column; the row labels are the strings '0'..'5'
row_labels = [str(position) for position in range(6)]
column_values = {
    'Id': [1, 2, 3, 4, 5, 6],
    'Income': [1303834, 7574516, 3991815, 6256451, 5768871, 2419865],
    'Age': [23, 40, 66, 41, 47, 48],
    'Experience': [3, 10, 4, 2, 11, 9],
    'Married/Single': ['single', 'single', 'married', 'single', 'single', 'married'],
    'House_Ownership': ['rented', 'rented', 'rented', 'rented', 'rented', 'owned'],
    'Car_Ownership': ['no', 'no', 'no', 'yes', 'no', 'yes'],
    'Profession': ['Mechanical_engineer', 'Software_Developer', 'Technical_writer', 'Software_Developer', 'Civil_servant', 'Economist'],
    'CITY': ['Rewa', 'Parbhani', 'Alappuzha', 'Bhubaneswar', 'Tiruchirappalli[10]', 'Pune'],
    'STATE': ['Madhya_Pradesh', 'Maharashtra', 'Kerala', 'Odisha', 'Tamil_Nadu', 'Maharashtra'],
    'CURRENT_JOB_YRS': [3, 9, 4, 2, 3, 8],
    'CURRENT_HOUSE_YRS': [13, 13, 10, 12, 14, 15],
    'Risk_Flag': [0, 0, 0, 1, 1, 0],
}

# Same nested layout as the JSON file: column name -> {row label -> value}
data = {column: dict(zip(row_labels, values)) for column, values in column_values.items()}

# Convert dictionary to DataFrame (inner keys become the index)
df = pd.DataFrame.from_dict(data)

# Print the number of data points
print("Number of data points:", len(df.index))


In [None]:
import pandas as pd
import json

# Load the dataset.
# Raw string: the backslashes are literal path separators, not escape sequences
# ('\l' is not a valid escape and triggers a warning on recent Python versions).
file_path = r'D:\loan pre\loan_approval_dataset.json'  # Update with actual path
with open(file_path) as f:
    data = json.load(f)

# Convert to DataFrame
df = pd.DataFrame(data)
print("Number of data points:", df.shape[0])


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Assuming df is your DataFrame
X = df.drop('Risk_Flag', axis=1)
y = df['Risk_Flag']

# Identify the column groups.
# String/category columns are collected explicitly: ColumnTransformer drops every
# column that is not listed in a transformer, so selecting only `bool` columns
# silently discarded all text features of a frame that was not already encoded.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
bool_cols = X.select_dtypes(include=['bool']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Preprocess the data:
#   - numeric columns are standardized,
#   - boolean (already encoded) columns pass through unchanged,
#   - string/category columns are one-hot encoded; levels unseen during fit are
#     encoded as all zeros instead of raising.
# sparse_threshold=0 forces a dense result so it can be wrapped in a DataFrame.
# NOTE(review): dense one-hot output can be large for high-cardinality columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('bool', 'passthrough', bool_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
    sparse_threshold=0,
    verbose_feature_names_out=False,
)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the transformations on the training set only, then apply them to both sets
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Convert the transformed data back to a DataFrame for easier inspection.
# Category values may contain '[', ']' or '<' (e.g. 'Tiruchirappalli[10]');
# they are replaced because some estimators reject such feature names.
_name_cleanup = str.maketrans({'[': '_', ']': '_', '<': '_'})
feature_names = [str(name).translate(_name_cleanup) for name in preprocessor.get_feature_names_out()]
X_train = pd.DataFrame(X_train, columns=feature_names)
X_test = pd.DataFrame(X_test, columns=feature_names)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest with a fixed seed so repeated runs build the same trees
forest_settings = {'random_state': 42}
rf_model = RandomForestClassifier(**forest_settings)

# Fit on the training split
rf_model.fit(X=X_train, y=y_train)


In [None]:
# Hard class predictions of the fitted forest for the held-out test features
y_pred = rf_model.predict(X=X_test)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

# Per-class summary and raw confusion counts
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Probability of the positive class (second column), used for ROC AUC
y_prob = rf_model.predict_proba(X_test)[:, 1]

# ROC AUC is scored on probabilities; the other metrics on the hard predictions
roc_auc = roc_auc_score(y_test, y_prob)
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

for label, value in (
    ("ROC AUC Score:", roc_auc),
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, value)


In [None]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

# Initialize the XGBoost Classifier used as the template for the search
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Reduced parameter grid for RandomizedSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'gamma': [0, 0.1, 0.2]
}

# RandomizedSearchCV with fewer iterations: 10 sampled settings, 3-fold CV, ranked by recall
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid, n_iter=10, cv=3, scoring='recall', n_jobs=-1, verbose=2, random_state=42)
random_search.fit(X_train, y_train)

# Get the best model.
# The search is created with the default refit behaviour, so best_estimator_ has
# already been retrained on the full training set; fitting it again would only
# repeat that training.
best_xgb_model = random_search.best_estimator_

# Make predictions
y_pred = best_xgb_model.predict(X_test)

# Classification report and confusion matrix
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Predict probabilities for ROC AUC (column 1 = positive class)
y_prob = best_xgb_model.predict_proba(X_test)[:, 1]

# Calculate metrics
roc_auc = roc_auc_score(y_test, y_prob)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("ROC AUC Score:", roc_auc)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


In [None]:
from xgboost import XGBClassifier

# Weight the positive class by the negative-to-positive ratio of the training labels
negative_count = len(y_train[y_train == 0])
positive_count = len(y_train[y_train == 1])
xgb_model = XGBClassifier(scale_pos_weight=negative_count / positive_count)

# Fit on the training split
xgb_model.fit(X_train, y_train)

# Hard predictions and positive-class probabilities for the test split
y_pred = xgb_model.predict(X_test)
y_prob = xgb_model.predict_proba(X_test)[:, 1]

# ROC AUC is scored on probabilities; the other metrics on the hard predictions
roc_auc = roc_auc_score(y_test, y_prob)
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

for label, value in (
    ("ROC AUC Score:", roc_auc),
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, value)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


In [None]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data only (the test split is left untouched)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Train a fresh classifier on the resampled data.
# The resampled classes are already balanced, so the scale_pos_weight carried by
# `xgb_model` (computed from the unbalanced labels) would correct for the
# imbalance a second time. Using a separate estimator also keeps `xgb_model`
# as the class-weighted model for later cells.
xgb_smote_model = XGBClassifier()
xgb_smote_model.fit(X_train_resampled, y_train_resampled)

# Evaluate the model as before
y_pred = xgb_smote_model.predict(X_test)
y_prob = xgb_smote_model.predict_proba(X_test)[:, 1]

# Calculate metrics
roc_auc = roc_auc_score(y_test, y_prob)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("ROC AUC Score:", roc_auc)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


In [None]:
from sklearn.model_selection import GridSearchCV

# Negative-to-positive ratio of the training labels, offered as one candidate weight
imbalance_ratio = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

# Exhaustive grid: every combination of these values is cross-validated
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 6, 10],
    learning_rate=[0.01, 0.1, 0.2],
    scale_pos_weight=[1, imbalance_ratio],
)

# 3-fold search ranked by recall, using all available cores
grid_search = GridSearchCV(xgb_model, param_grid, scoring='recall', cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_xgb_model = grid_search.best_estimator_

# Hard predictions and positive-class probabilities of the best model
y_pred = best_xgb_model.predict(X_test)
y_prob = best_xgb_model.predict_proba(X_test)[:, 1]

# ROC AUC is scored on probabilities; the other metrics on the hard predictions
roc_auc = roc_auc_score(y_test, y_prob)
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

for label, value in (
    ("ROC AUC Score:", roc_auc),
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, value)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


In [None]:
from xgboost import XGBClassifier
# RandomizedSearchCV is used below, so it is imported here rather than relying on an earlier cell
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

# Function to evaluate the model
def evaluate_model(y_test, y_pred, y_prob=None):
    """Print a set of classification metrics for one model.

    Args:
        y_test: true labels.
        y_pred: predicted class labels.
        y_prob: optional scores/probabilities for the positive class. When given,
            ROC AUC is computed from them; otherwise it falls back to the hard
            predictions, which only evaluates a single operating point.
    """
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    roc_input = y_pred if y_prob is None else y_prob
    print("ROC AUC Score:", roc_auc_score(y_test, roc_input))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))

# Load your data (assuming X_train, X_test, y_train, y_test are already defined)

# Step 1: Adjust Class Weights
class_weight_ratio = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
model = XGBClassifier(scale_pos_weight=class_weight_ratio)

# Train the model with class weights
model.fit(X_train, y_train)

# Evaluate the model with class weights
y_pred = model.predict(X_test)
evaluate_model(y_test, y_pred, model.predict_proba(X_test)[:, 1])

# Step 2: Apply SMOTE for Balancing (seeded so the synthetic samples are repeatable)
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

# Train a separate, unweighted model on the balanced data: the classes are already
# balanced by SMOTE, so keeping scale_pos_weight would correct for the imbalance twice.
smote_model = XGBClassifier()
smote_model.fit(X_smote, y_smote)

# Evaluate the model with SMOTE
y_pred = smote_model.predict(X_test)
evaluate_model(y_test, y_pred, smote_model.predict_proba(X_test)[:, 1])

# Step 3: Hyperparameter Tuning
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Use RandomizedSearchCV instead of GridSearchCV
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=50,  # Number of parameter settings that are sampled
    scoring='recall',
    cv=3,
    n_jobs=-1,  # Use all available cores
    verbose=2,
    random_state=42  # Fixed seed so the sampled settings are the same on every run
)

# Fit the random search model
random_search.fit(X_train, y_train)

# Get the best model
best_model = random_search.best_estimator_

# Evaluate the best model after hyperparameter tuning
y_pred = best_model.predict(X_test)
evaluate_model(y_test, y_pred, best_model.predict_proba(X_test)[:, 1])


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.class_weight import compute_sample_weight

# Negative-to-positive ratio of the training labels, used as the positive-class weight
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

# Initialize the model with class weights
model = XGBClassifier(scale_pos_weight=scale_pos_weight, use_label_encoder=False, eval_metric='logloss')

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Use RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=50,  # Number of parameter settings that are sampled
    scoring='recall',  # Focus on improving recall
    cv=3,
    n_jobs=-1,  # Use all available cores
    verbose=2,
    random_state=42  # Fixed seed, matching the other searches, so the sampled settings are repeatable
)

# Fit the random search model
random_search.fit(X_train, y_train)

# Get the best model (already refit on the full training set by the search)
best_model = random_search.best_estimator_

# Evaluate the best model after hyperparameter tuning
y_pred = best_model.predict(X_test)
evaluate_model(y_test, y_pred)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

def evaluate_model(y_test, y_pred, y_prob=None):
    """Print the classification report, confusion matrix and ROC AUC.

    Args:
        y_test: true labels.
        y_pred: predicted class labels.
        y_prob: optional scores/probabilities for the positive class. When given,
            ROC AUC is computed from them; otherwise it falls back to the hard
            predictions (the previous behaviour), which only evaluates a single
            operating point of the classifier.
    """
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    roc_input = y_pred if y_prob is None else y_prob
    print("ROC AUC Score:", roc_auc_score(y_test, roc_input))


In [None]:
from sklearn.metrics import precision_recall_curve

# Get the probabilities for the positive class
y_probs = best_model.predict_proba(X_test)[:, 1]

# Compute precision-recall curve
# NOTE(review): the threshold is selected on the test set, so the metrics printed
# below are optimistic; a separate validation split would avoid that.
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)

# F1 at every curve point. Where precision + recall is 0 the score is defined as 0;
# a plain division would yield NaN there, and argmax would then select the NaN entry.
pr_sum = precision + recall
f1_scores = np.divide(
    2 * precision * recall,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# Find the optimal threshold.
# The curve has one more point than there are thresholds (its final point has no
# threshold), so that point is excluded to keep the index valid for `thresholds`.
optimal_idx = np.argmax(f1_scores[:-1])
optimal_threshold = thresholds[optimal_idx]

# Apply the optimal threshold
y_pred_optimal = (y_probs >= optimal_threshold).astype(int)

# Evaluate the model with the optimal threshold
evaluate_model(y_test, y_pred_optimal)

print(f"Optimal Threshold: {optimal_threshold}")


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Decision threshold applied to the positive-class probability (hard-coded value)
optimal_threshold = 0.5866

# Positive-class probabilities, and the labels obtained by thresholding them
y_probs = best_model.predict_proba(X_test)[:, 1]
y_pred_optimal = (y_probs >= optimal_threshold).astype(int)

# Report and confusion matrix for the thresholded predictions
for heading, result in (
    ("Classification Report:", classification_report(y_test, y_pred_optimal)),
    ("Confusion Matrix:", confusion_matrix(y_test, y_pred_optimal)),
):
    print(heading)
    print(result)

# ROC AUC is threshold-independent, so it is scored on the raw probabilities
roc_auc = roc_auc_score(y_test, y_probs)
print(f"ROC AUC Score: {roc_auc}")
print(f"Optimal Threshold: {optimal_threshold}")


In [None]:
pip install reportlab


In [None]:
pip install fpdf


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.utils import resample
from fpdf import FPDF

# ColumnTransformer is used below but is not part of this cell's import list
from sklearn.compose import ColumnTransformer

# Load the dataset
df = pd.read_json('loan_approval_dataset.json')

# Data Exploration
# Display basic information about the dataset
print("Dataset Info:")
df.info()  # prints its own report and returns None, so it is not wrapped in print()

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Check class balance
print("\nClass Balance:")
print(df['Risk_Flag'].value_counts())

# Visualize class distribution; the figure is also saved so the PDF report can embed it
sns.countplot(x='Risk_Flag', data=df)
plt.title('Class Distribution')
plt.savefig('class_distribution.png')
plt.show()

# Feature Engineering
# Split data into features and target
X = df.drop('Risk_Flag', axis=1)
y = df['Risk_Flag']

# Identify numerical and categorical columns
numerical_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Define preprocessing steps for numerical and categorical data
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine preprocessing steps for both types of data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply preprocessing to training data
X_train_preprocessed = preprocessor.fit_transform(X_train)
# Apply preprocessing to test data
X_test_preprocessed = preprocessor.transform(X_test)

# Model Building
# Initialize and train the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_preprocessed, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_preprocessed)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nROC AUC Score:")
roc_auc = roc_auc_score(y_test, rf_model.predict_proba(X_test_preprocessed)[:, 1])
print(roc_auc)

# Resampling for Class Imbalance
# Upsample the minority class inside the TRAINING split only. Upsampling the whole
# dataset before splitting puts copies of the same minority rows into both the
# training and the test set, which inflates every test metric.
train_df = X_train.assign(Risk_Flag=y_train)
df_majority = train_df[train_df['Risk_Flag'] == 0]
df_minority = train_df[train_df['Risk_Flag'] == 1]
df_minority_upsampled = resample(df_minority, replace=True, n_samples=len(df_majority), random_state=42)
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Upsampled training data; the test data is the original, untouched test split
X_upsampled = df_upsampled.drop('Risk_Flag', axis=1)
y_upsampled = df_upsampled['Risk_Flag']
X_train_up, y_train_up = X_upsampled, y_upsampled
X_test_up, y_test_up = X_test, y_test

# Apply preprocessing to training data for upsampled data
# (this refits `preprocessor`, so it no longer matches what `rf_model` was trained on)
X_train_up_preprocessed = preprocessor.fit_transform(X_train_up)
# Apply preprocessing to test data for upsampled data
X_test_up_preprocessed = preprocessor.transform(X_test_up)

# Train a new model on upsampled data
rf_model_upsampled = RandomForestClassifier(random_state=42)
rf_model_upsampled.fit(X_train_up_preprocessed, y_train_up)

# Evaluate the upsampled model
y_pred_up = rf_model_upsampled.predict(X_test_up_preprocessed)
print("\nUpsampled Classification Report:")
print(classification_report(y_test_up, y_pred_up))
print("\nUpsampled Confusion Matrix:")
print(confusion_matrix(y_test_up, y_pred_up))
print("\nUpsampled ROC AUC Score:")
roc_auc_up = roc_auc_score(y_test_up, rf_model_upsampled.predict_proba(X_test_up_preprocessed)[:, 1])
print(roc_auc_up)

# Generate PDF Report
class PDF(FPDF):
    """FPDF subclass with helpers for the sections of the analysis report."""

    def header(self):
        """Write the report title in Arial bold 12, centered, followed by a 10-unit line break.

        NOTE(review): presumably invoked by FPDF itself for every new page - confirm
        against the installed fpdf version.
        """
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'Loan Approval Prediction Analysis Report', align='C', ln=True)
        self.ln(10)

    def chapter_title(self, title):
        """Write `title` as a section heading in Arial bold 12, followed by a 5-unit line break."""
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, title, ln=True)
        self.ln(5)

    def chapter_body(self, body):
        """Write `body` as multi-line text in regular Arial 12, followed by a line break."""
        self.set_font('Arial', '', 12)
        self.multi_cell(0, 10, body)
        self.ln()

    def add_plot(self, title, plot):
        """Write `title` as a section heading, then insert the image `plot` at width 180.

        `plot` is passed straight to `self.image`; presumably a path to an image
        file - verify against callers.
        """
        self.chapter_title(title)
        self.image(plot, x=None, y=None, w=180)
        self.ln()

# Assemble the report: one title + body pair per section, all on a single document
pdf = PDF()
pdf.add_page()

# Introduction
pdf.chapter_title("Introduction")
pdf.chapter_body("This report presents the analysis of loan approval prediction using machine learning models.")

# Data Exploration
# The description lists columns this dataset actually has; the earlier wording
# mentioned a credit score and a loan amount, which are not among its columns.
pdf.chapter_title("Data Exploration")
pdf.chapter_body("The dataset contains information about loan applicants, including features such as income, age, work experience, profession, and house and car ownership.")

# Feature Engineering
pdf.chapter_title("Feature Engineering")
pdf.chapter_body("Numerical features were scaled using StandardScaler and categorical features were one-hot encoded for model training.")

# Model Building
pdf.chapter_title("Model Building")
pdf.chapter_body("A Random Forest Classifier was trained on the dataset.")

# Evaluation Metrics
pdf.chapter_title("Evaluation Metrics")
pdf.chapter_body(f"ROC AUC Score: {roc_auc}\n\nClassification Report:\n{classification_report(y_test, y_pred)}\n\nConfusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

# Recommendations
# The conclusion is derived from the measured ROC AUC values instead of being hard-coded.
if roc_auc_up > roc_auc:
    recommendation = "Upsampling the minority class improved model performance. Consider using resampling techniques for imbalanced datasets."
else:
    recommendation = "Upsampling the minority class did not improve the ROC AUC score in this run. Consider other techniques for imbalanced datasets, such as class weighting or threshold tuning."
pdf.chapter_title("Recommendations")
pdf.chapter_body(recommendation)

# Save PDF Report
pdf_file = "loan_approval_analysis_report.pdf"
pdf.output(pdf_file)
print(f"\nPDF Report generated successfully: {pdf_file}")


In [None]:
# Seeded forest fitted on the upsampled, preprocessed training data
rf_model_upsampled = RandomForestClassifier(random_state=42)
rf_model_upsampled.fit(X=X_train_up_preprocessed, y=y_train_up)

# Hard predictions and positive-class probabilities for the upsampled test features
y_pred_up = rf_model_upsampled.predict(X_test_up_preprocessed)
upsampled_scores = rf_model_upsampled.predict_proba(X_test_up_preprocessed)[:, 1]
roc_auc_up = roc_auc_score(y_test_up, upsampled_scores)

# Print each metric under its heading
for heading, result in (
    ("\nUpsampled Classification Report:", classification_report(y_test_up, y_pred_up)),
    ("\nUpsampled Confusion Matrix:", confusion_matrix(y_test_up, y_pred_up)),
    ("\nUpsampled ROC AUC Score:", roc_auc_up),
):
    print(heading)
    print(result)

# Generate PDF Report
class PDF(FPDF):
    """FPDF subclass with helpers for the sections, code listing and plots of the report."""

    def header(self):
        """Write the report title in Arial bold 12, centered, followed by a 10-unit line break.

        NOTE(review): presumably invoked by FPDF itself for every new page - confirm
        against the installed fpdf version.
        """
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'Loan Approval Prediction Analysis Report', align='C', ln=True)
        self.ln(10)

    def chapter_title(self, title):
        """Write `title` as a section heading in Arial bold 12, followed by a 5-unit line break."""
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, title, ln=True)
        self.ln(5)

    def chapter_body(self, body):
        """Write `body` as multi-line text in regular Arial 12, followed by a line break."""
        self.set_font('Arial', '', 12)
        self.multi_cell(0, 10, body)
        self.ln()

    def add_code(self, code):
        """Write `code` as multi-line text in Courier 10, followed by a line break."""
        self.set_font('Courier', '', 10)
        self.multi_cell(0, 10, code)
        self.ln()

    def add_plot(self, title, plot):
        """Write `title` as a section heading, then insert the image `plot` at width 180.

        `plot` is passed straight to `self.image`; presumably a path to an image
        file - verify against callers.
        """
        self.chapter_title(title)
        self.image(plot, x=None, y=None, w=180)
        self.ln()

# Assemble the report: one title + body pair per section, all on a single document
pdf = PDF()
pdf.add_page()

# Introduction
pdf.chapter_title("Introduction")
pdf.chapter_body("This report presents the analysis of loan approval prediction using machine learning models.")

# Data Exploration
# The description lists columns this dataset actually has; the earlier wording
# mentioned a credit score and a loan amount, which are not among its columns.
pdf.chapter_title("Data Exploration")
pdf.chapter_body("The dataset contains information about loan applicants, including features such as income, age, work experience, profession, and house and car ownership.")

# Render the class-distribution figure to disk before embedding it: no executed
# cell writes this file, so add_plot would otherwise fail on a missing image.
class_plot_path = "class_distribution.png"
plt.figure(figsize=(10, 6))
sns.countplot(x='Risk_Flag', data=df)
plt.title('Class Distribution')
plt.savefig(class_plot_path)
plt.close()
pdf.add_plot("Class Distribution", class_plot_path)

# Feature Engineering
pdf.chapter_title("Feature Engineering")
pdf.chapter_body("Numerical features were scaled using StandardScaler and categorical features were one-hot encoded for model training.")

# Model Building
pdf.chapter_title("Model Building")
pdf.chapter_body("A Random Forest Classifier was trained on the dataset.")

# Evaluation Metrics
pdf.chapter_title("Evaluation Metrics")
pdf.chapter_body(f"ROC AUC Score: {roc_auc}\n\nClassification Report:\n{classification_report(y_test, y_pred)}\n\nConfusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

# Upsampled Model Evaluation Metrics
pdf.chapter_title("Upsampled Model Evaluation Metrics")
pdf.chapter_body(f"ROC AUC Score: {roc_auc_up}\n\nClassification Report:\n{classification_report(y_test_up, y_pred_up)}\n\nConfusion Matrix:\n{confusion_matrix(y_test_up, y_pred_up)}")

# Recommendations
# The conclusion is derived from the measured ROC AUC values instead of being hard-coded.
if roc_auc_up > roc_auc:
    recommendation = "Upsampling the minority class improved model performance. Consider using resampling techniques for imbalanced datasets."
else:
    recommendation = "Upsampling the minority class did not improve the ROC AUC score in this run. Consider other techniques for imbalanced datasets, such as class weighting or threshold tuning."
pdf.chapter_title("Recommendations")
pdf.chapter_body(recommendation)

# Code Section
# Append a listing of the analysis pipeline to the report. The listing is a
# hand-written string literal rather than being read from the notebook, so it
# has to be updated manually whenever the pipeline code changes. Backslashes
# are doubled ("\\n") so the listing shows the two characters \n instead of
# starting a new line.
pdf.chapter_title("Code Implementation")
code = """
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.utils import resample
from fpdf import FPDF

# Load the dataset
df = pd.read_json('loan_approval_dataset.json')

# Data Exploration
print("Dataset Info:")
print(df.info())
print("\\nMissing Values:")
print(df.isnull().sum())
print("\\nClass Balance:")
print(df['Risk_Flag'].value_counts())

# Visualize class distribution
plt.figure(figsize=(10, 6))
sns.countplot(x='Risk_Flag', data=df)
plt.title('Class Distribution')
plt.savefig('class_distribution.png')
plt.show()

# Feature Engineering
X = df.drop('Risk_Flag', axis=1)
y = df['Risk_Flag']

# Identify numerical and categorical columns
numerical_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Define preprocessing steps for numerical and categorical data
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine preprocessing steps for both types of data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Model Building
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_preprocessed, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_preprocessed)
print("\\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\\nROC AUC Score:")
roc_auc = roc_auc_score(y_test, rf_model.predict_proba(X_test_preprocessed)[:, 1])
print(roc_auc)

# Resampling for Class Imbalance
df_majority = df[df['Risk_Flag'] == 0]
df_minority = df[df['Risk_Flag'] == 1]
df_minority_upsampled = resample(df_minority, replace=True, n_samples=len(df_majority), random_state=42)
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Perform train-test split again
X_upsampled = df_upsampled.drop('Risk_Flag', axis=1)
y_upsampled = df_upsampled['Risk_Flag']
X_train_up, X_test_up, y_train_up, y_test_up = train_test_split(X_upsampled, y_upsampled, test_size=0.2, random_state=42)

# Apply preprocessing to training data for upsampled data
X_train_up_preprocessed = preprocessor.fit_transform(X_train_up)
X_test_up_preprocessed = preprocessor.transform(X_test_up)

# Train a new model on upsampled data
rf_model_upsampled = RandomForestClassifier(random_state=42)
rf_model_upsampled.fit(X_train_up_preprocessed, y_train_up)

# Evaluate the upsampled model
y_pred_up = rf_model_upsampled.predict(X_test_up_preprocessed)
print("\\nUpsampled Classification Report:")
print(classification_report(y_test_up, y_pred_up))
print("\\nUpsampled Confusion Matrix:")
print(confusion_matrix(y_test_up, y_pred_up))
print("\\nUpsampled ROC AUC Score:")
roc_auc_up = roc_auc_score(y_test_up, rf_model_upsampled.predict_proba(X_test_up_preprocessed)[:, 1])
print(roc_auc_up)
"""
# Rendered through add_code, i.e. in the Courier font that method selects.
pdf.add_code(code)

# Save the PDF
# Bind the output path once: the report is written a single time, the success
# message names the file that was actually produced, and the cell no longer
# relies on `pdf_file` having been assigned somewhere else.
pdf_file = 'Loan_Approval_Prediction_Report.pdf'
pdf.output(pdf_file)
print(f"\nPDF Report generated successfully: {pdf_file}")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.utils import resample
from fpdf import FPDF

# Load the dataset
df = pd.read_json('loan_approval_dataset.json')

# Data Exploration
print("Dataset Info:")
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())
print("\nClass Balance:")
print(df['Risk_Flag'].value_counts())

# Visualize class distribution
# The figure is saved before plt.show() so the PNG embedded in the PDF report
# is written from the still-open figure.
plt.figure(figsize=(10, 6))
sns.countplot(x='Risk_Flag', data=df)
plt.title('Class Distribution')
plt.savefig('class_distribution.png')
plt.show()

# Features and target: Risk_Flag is the label, every other column is a predictor.
y = df['Risk_Flag']
X = df.drop(columns='Risk_Flag')

# Column groups by dtype; each group gets its own transformer below.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(include='object').columns

# Scale the numeric columns and one-hot encode the object columns
# (categories not seen during fit are ignored at transform time).
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# 80/20 hold-out split; the preprocessor is fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Baseline random forest (fit() returns the estimator itself).
rf_model = RandomForestClassifier(random_state=42).fit(X_train_preprocessed, y_train)

# Hold-out evaluation: hard predictions for the report and confusion matrix,
# positive-class scores (column 1 of predict_proba) for ROC AUC.
y_pred = rf_model.predict(X_test_preprocessed)
positive_class_scores = rf_model.predict_proba(X_test_preprocessed)[:, 1]
roc_auc = roc_auc_score(y_test, positive_class_scores)

print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nROC AUC Score:", roc_auc, sep="\n")

# Resampling for Class Imbalance
# Upsample the minority class inside the TRAINING split only. Resampling the
# whole frame before splitting puts copies of the same minority rows in both
# the training and the test set, which inflates every test metric.
train_df = pd.concat([X_train, y_train], axis=1)
df_majority = train_df[train_df['Risk_Flag'] == 0]
df_minority = train_df[train_df['Risk_Flag'] == 1]
df_minority_upsampled = resample(df_minority, replace=True, n_samples=len(df_majority), random_state=42)
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Balanced training data; the evaluation data is the untouched hold-out from
# the baseline split, so both models are scored on the same rows.
X_train_up = df_upsampled.drop('Risk_Flag', axis=1)
y_train_up = df_upsampled['Risk_Flag']
X_test_up, y_test_up = X_test, y_test

# NOTE(review): this refits the shared `preprocessor`, so after this point it
# no longer matches the statistics the baseline rf_model was trained with.
X_train_up_preprocessed = preprocessor.fit_transform(X_train_up)
X_test_up_preprocessed = preprocessor.transform(X_test_up)

# Train a new model on upsampled data
rf_model_upsampled = RandomForestClassifier(random_state=42)
rf_model_upsampled.fit(X_train_up_preprocessed, y_train_up)

# Evaluate the upsampled model
y_pred_up = rf_model_upsampled.predict(X_test_up_preprocessed)
print("\nUpsampled Classification Report:")
print(classification_report(y_test_up, y_pred_up))
print("\nUpsampled Confusion Matrix:")
print(confusion_matrix(y_test_up, y_pred_up))
print("\nUpsampled ROC AUC Score:")
roc_auc_up = roc_auc_score(y_test_up, rf_model_upsampled.predict_proba(X_test_up_preprocessed)[:, 1])
print(roc_auc_up)

# Generate PDF Report
class PDF(FPDF):
    """FPDF subclass bundling the page header and section helpers of the report."""

    def _report_heading(self, text, gap, **cell_options):
        """Write one bold 12 pt Arial line, then break by ``gap``."""
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, text, ln=True, **cell_options)
        self.ln(gap)

    def _report_paragraph(self, family, size, text):
        """Write ``text`` with ``multi_cell`` in the regular style of ``family``."""
        self.set_font(family, '', size)
        self.multi_cell(0, 10, text)
        self.ln()

    def header(self):
        """Write the report title line with ``align='C'`` and a break of 10."""
        self._report_heading('Loan Approval Prediction Analysis Report', 10, align='C')

    def chapter_title(self, title):
        """Write ``title`` as a section heading followed by a break of 5."""
        self._report_heading(title, 5)

    def chapter_body(self, body):
        """Write ``body`` as a paragraph in regular 12 pt Arial."""
        self._report_paragraph('Arial', 12, body)

    def add_code(self, code):
        """Write ``code`` as a block of regular 10 pt Courier text."""
        self._report_paragraph('Courier', 10, code)

    def add_plot(self, title, plot):
        """Add the image ``plot`` (width 180) under a heading reading ``title``."""
        self.chapter_title(title)
        self.image(plot, x=None, y=None, w=180)
        self.ln()

# Build the report document; each section below is a heading followed by its content.
pdf = PDF()
pdf.add_page()

# Introduction
pdf.chapter_title("Introduction")
pdf.chapter_body("This report presents the analysis of loan approval prediction using machine learning models.")

# Data Exploration
pdf.chapter_title("Data Exploration")
pdf.chapter_body("The dataset contains information about loan applications, including various features like income, credit score, and loan amount.")
pdf.add_plot("Class Distribution", "class_distribution.png")

# Feature Engineering
pdf.chapter_title("Feature Engineering")
pdf.chapter_body("Numerical features were scaled using StandardScaler for model training.")

# Model Building
pdf.chapter_title("Model Building")
pdf.chapter_body("A Random Forest Classifier was trained on the dataset.")

# Evaluation Metrics
# classification_report() and the confusion-matrix repr are space-aligned
# text tables. They are written with add_code (Courier) because the
# proportional Arial used by chapter_body breaks the column alignment.
pdf.chapter_title("Evaluation Metrics")
pdf.chapter_body(f"ROC AUC Score: {roc_auc}")
pdf.add_code(f"Classification Report:\n{classification_report(y_test, y_pred)}\n\nConfusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

# Upsampled Model Evaluation Metrics
# The recommendation below talks about the upsampled model, so its metrics
# are reported next to the baseline ones.
pdf.chapter_title("Upsampled Model Evaluation Metrics")
pdf.chapter_body(f"ROC AUC Score: {roc_auc_up}")
pdf.add_code(f"Classification Report:\n{classification_report(y_test_up, y_pred_up)}\n\nConfusion Matrix:\n{confusion_matrix(y_test_up, y_pred_up)}")

# Recommendations
# NOTE(review): this conclusion is a fixed sentence, not derived from the
# metrics above -- confirm it still matches the numbers before sharing.
pdf.chapter_title("Recommendations")
pdf.chapter_body("Upsampling the minority class improved model performance. Consider using resampling techniques for imbalanced datasets.")

# Code Section
# Append a listing of the analysis pipeline to the report. The listing is a
# hand-written string literal rather than being read from the notebook, so it
# has to be updated manually whenever the pipeline code changes. Backslashes
# are doubled ("\\n") so the listing shows the two characters \n instead of
# starting a new line.
pdf.chapter_title("Code Implementation")
code = """
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.utils import resample
from fpdf import FPDF

# Load the dataset
df = pd.read_json('loan_approval_dataset.json')

# Data Exploration
print("Dataset Info:")
print(df.info())
print("\\nMissing Values:")
print(df.isnull().sum())
print("\\nClass Balance:")
print(df['Risk_Flag'].value_counts())

# Visualize class distribution
plt.figure(figsize=(10, 6))
sns.countplot(x='Risk_Flag', data=df)
plt.title('Class Distribution')
plt.savefig('class_distribution.png')
plt.show()

# Feature Engineering
X = df.drop('Risk_Flag', axis=1)
y = df['Risk_Flag']

# Identify numerical and categorical columns
numerical_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Define preprocessing steps for numerical and categorical data
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine preprocessing steps for both types of data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Model Building
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_preprocessed, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_preprocessed)
print("\\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\\nROC AUC Score:")
roc_auc = roc_auc_score(y_test, rf_model.predict_proba(X_test_preprocessed)[:, 1])
print(roc_auc)

# Resampling for Class Imbalance
df_majority = df[df['Risk_Flag'] == 0]
df_minority = df[df['Risk_Flag'] == 1]
df_minority_upsampled = resample(df_minority, replace=True, n_samples=len(df_majority), random_state=42)
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Perform train-test split again
X_upsampled = df_upsampled.drop('Risk_Flag', axis=1)
y_upsampled = df_upsampled['Risk_Flag']
X_train_up, X_test_up, y_train_up, y_test_up = train_test_split(X_upsampled, y_upsampled, test_size=0.2, random_state=42)
X_train_up_preprocessed = preprocessor.fit_transform(X_train_up)
X_test_up_preprocessed = preprocessor.transform(X_test_up)

# Train a new model on upsampled data
rf_model_upsampled = RandomForestClassifier(random_state=42)
rf_model_upsampled.fit(X_train_up_preprocessed, y_train_up)

# Evaluate the upsampled model
y_pred_up = rf_model_upsampled.predict(X_test_up_preprocessed)
print("\\nUpsampled Classification Report:")
print(classification_report(y_test_up, y_pred_up))
print("\\nUpsampled Confusion Matrix:")
print(confusion_matrix(y_test_up, y_pred_up))
print("\\nUpsampled ROC AUC Score:")
roc_auc_up = roc_auc_score(y_test_up, rf_model_upsampled.predict_proba(X_test_up_preprocessed)[:, 1])
print(roc_auc_up)
"""
# Rendered through add_code, i.e. in the Courier font that method selects.
pdf.add_code(code)

# Save the report under a fixed, relative file name.
pdf.output("loan_approval_analysis_report_v2.pdf")
