In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the loan data from the Excel workbook (path is relative to the working directory).
df = pd.read_excel("loans_current.xlsx")

In [None]:
# Number of missing values in each column of the raw frame.
df.isnull().sum()

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Columns excluded from the working frame; each label is listed exactly once.
columns_to_drop = [
    'principal_amount_proposed', 'principal_amount', 'principal_disbursed_derived',
    'repay_every', 'term_frequency', 'duedate', 'obligations_met_on_date',
    'loan_status_600', 'Unnamed: 24', 'total_withdrawals_derived',
    'account_balance_derived',
]
# `columns=` names the axis explicitly; `df` itself is left untouched.
loan = df.drop(columns=columns_to_drop)

In [None]:
# Rename columns to friendlier names.
# The mapping is deliberately not called `dict`: that name would shadow the
# built-in for every later cell in the kernel session.
column_renames = {
    'approved_principal': 'loan_amount',
    'principal_amount_arrears': 'loan_arrear_amount',
    # NOTE(review): this key ends with a space. Unless the source column really
    # carries that trailing space, the entry matches nothing. Later cells still
    # read 'gender_cv_id', so the key is kept exactly as written — confirm intent.
    'gender_cv_id ': 'client_gender',
}
# Re-assign instead of mutating in place so the cell has explicit lineage.
loan = loan.rename(columns=column_renames)

In [None]:
# Number of missing values in each column of the working frame `loan`.
loan.isnull().sum()

In [None]:
# Discard every row that has at least one missing value, then confirm that
# no nulls remain in any column.
loan = loan.dropna(axis=0, how='any')
loan.isna().sum()

In [None]:
# Column dtypes, non-null counts and memory footprint of the working frame.
loan.info()

In [None]:
# Column labels remaining in the working frame.
column_names = loan.columns
print(column_names)

In [None]:
# Share of rows whose Loan_status equals 1, as a percentage
# (falls back to 0 when that label never occurs).
status_share = loan['Loan_status'].value_counts(normalize=True)
status_share.get(1, 0) * 100


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Share of each Loan_status value, expressed as a percentage of all rows.
# value_counts() orders by frequency, so sort_index() is applied to order the
# bars by label value; otherwise the fixed tick labels below could be attached
# to the wrong bars. Multiplying by 100 makes the data match the axis label.
label_counts = loan['Loan_status'].value_counts(normalize=True).sort_index() * 100

# Bar chart of the label ratio
plt.figure(figsize=(6, 4))
label_counts.plot(kind='bar', color=['#008ac5', '#00c698'])
plt.xlabel('Loan Status')
plt.ylabel('Percentage')
plt.title('Loan Status Ratio')
# Assumes Loan_status is coded 0 = non-default, 1 = default — TODO confirm.
plt.xticks([0, 1], ['Non-Default', 'Default'], rotation=0)

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Number of loans per gender code, split by loan status.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=loan, x='gender_cv_id', hue='Loan_status', ax=ax)

In [None]:
# Number of loans per loan year, split by loan status.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=loan, x='loan_year', hue='Loan_status', ax=ax)

In [None]:

import seaborn as sns
import matplotlib.pyplot as plt

# Percentage of each loan status within every loan year.
status_counts = loan.groupby(['loan_year', 'Loan_status']).size()
year_totals = loan.groupby('loan_year').size()
percentage_df = (
    status_counts
    .div(year_totals, level='loan_year')  # align explicitly on the year level
    .mul(100)                             # real percentages, matching the axis label
    .reset_index(name='percentage')
)

# Grouped bar chart of the percentages
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x='loan_year', y='percentage', hue='Loan_status', data=percentage_df, ax=ax)

# Label each bar with its value. Patches without a positive height are skipped
# so that empty placeholder rectangles do not receive a stray "0.00%" / "nan%".
for bar in ax.patches:
    height = bar.get_height()
    if not height > 0:
        continue
    ax.annotate(f'{height:.2f}%', (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='bottom')

# Axis labels and title
ax.set_xlabel('Loan Year')
ax.set_ylabel('Percentage')
ax.set_title('Loan Status Percentage by Year')

plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Percentage of each Loan_status value within every loan_status_id.
status_counts = loan.groupby(['loan_status_id', 'Loan_status']).size()
status_id_totals = loan.groupby('loan_status_id').size()
percentage_df = (
    status_counts
    .div(status_id_totals, level='loan_status_id')  # align explicitly on loan_status_id
    .mul(100)                                       # real percentages, matching the axis label
    .reset_index(name='percentage')
)

# Grouped bar chart of the percentages
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x='loan_status_id', y='percentage', hue='Loan_status', data=percentage_df, ax=ax)

# Label each bar with its value. Patches without a positive height are skipped
# so that empty placeholder rectangles do not receive a stray "0.00%" / "nan%".
for bar in ax.patches:
    height = bar.get_height()
    if not height > 0:
        continue
    ax.annotate(f'{height:.2f}%', (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='bottom')

# Axis labels and title
ax.set_xlabel('Loan Status ID')
ax.set_ylabel('Percentage')
ax.set_title('Loan Status Percentage by Loan Status ID')

plt.show()


In [None]:
# Horizontal box plot of client age; showmeans adds a marker for the mean.
fig, ax = plt.subplots(figsize=(4, 2))
sns.boxplot(x=loan['client_age'], showmeans=True, color='#008ac5', ax=ax)

ax.set_xlabel('Client Age')
ax.set_ylabel('Value')
ax.set_title('Box Plot for Client Age')

plt.show()

In [None]:
# Apply seaborn's theme with the font scale set to 1.
sns.set(font_scale=1)

# Scatter-plot matrix over the columns of `loan`.
pair_grid = sns.pairplot(loan)

plt.show()

In [None]:
# Pairwise correlations between the numeric/boolean columns of `loan`.
# The frame is restricted first because, from pandas 2.0 on, DataFrame.corr()
# no longer drops non-numeric columns on its own and raises instead.
corr_matrix = loan.select_dtypes(include=['number', 'bool']).corr()
corr_matrix

In [None]:
# Keep only the cells strictly above the diagonal of the correlation matrix;
# every other cell becomes NaN.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)
upper

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Heatmap of the upper-triangle correlations, annotated to two decimals.
fig, ax = plt.subplots(figsize=(8, 4))
sns.heatmap(upper, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)

# Write the figure to disk as a 300-dpi PNG with a tight bounding box.
fig.savefig('correlation_heatmap.png', format='png', dpi=300, bbox_inches='tight')

In [None]:
import seaborn as sns
import pandas as pd

# Long-format table of pairwise correlations, one row per variable pair.
# Non-numeric columns are excluded up front so corr() also works on pandas >= 2.0.
correlation_matrix = loan.select_dtypes(include=['number', 'bool']).corr()

# Keep each pair once (strict upper triangle). The result gets its own name so
# that `upper`, the matrix drawn by the heatmap cell, is not overwritten with
# an object of a different shape.
pair_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
correlation_pairs = (
    correlation_matrix.where(pair_mask)
    .stack()
    .dropna()  # explicit: do not rely on stack() removing the masked NaN cells
    .reset_index()
)
correlation_pairs.columns = ['Variable 1', 'Variable 2', 'Correlation']

# Save the table as a CSV file
correlation_pairs.to_csv('correlation_table.csv', index=False)



In [None]:
# `data` is a second name for the same DataFrame object as `loan`; no copy is made.
data=loan

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [None]:
# Features for GRU model.
# Every entry is followed by a comma: two adjacent string literals without one
# are silently concatenated into a single, non-existent column name.
gru_features = [
    "loan_amount", "principal_writtenoff_derived", "gender_cv_id",
    "loan_year", "number_of_repayments", "loan_arrear_amount",
    "client_age", "total_deposits_derived", "account_balance",
    "days_in_arrears",
]


In [None]:
# Feature matrix / label vector for the GRU section, then an 80/20 split
# with a fixed seed.
X, y = data[gru_features], data["Loan_status"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Standardise the features: the scaler's statistics are learned from the
# training split only and then applied unchanged to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Shapes of the scaled matrices and of the label vectors
(X_train_scaled.shape, X_test_scaled.shape, y_train.shape, y_test.shape)


The data has been successfully split into training and testing sets. We have 694,739 samples in the training set and 173,685 samples in the testing set, with each sample having 11 features.

Next, we define and train the GRU model.


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU


In [None]:
# GRU network: two stacked GRU layers followed by one sigmoid output unit.
# The input shape is (number of scaled feature columns, 1).
n_steps = X_train_scaled.shape[1]

model_gru = Sequential()
model_gru.add(GRU(32, input_shape=(n_steps, 1), return_sequences=True))
model_gru.add(GRU(16, return_sequences=False))
model_gru.add(Dense(1, activation='sigmoid'))


In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy metric.
model_gru.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Append a trailing axis of length 1 so each array becomes
# (rows, feature columns, 1), the layout declared in the GRU input shape.
X_train_reshaped = np.expand_dims(X_train_scaled, axis=-1)
X_test_reshaped = np.expand_dims(X_test_scaled, axis=-1)


In [None]:
# Fit the GRU for 5 epochs in batches of 256; the test split is passed as
# validation data, so its loss/accuracy are reported after every epoch.
history_gru = model_gru.fit(
    X_train_reshaped,
    y_train,
    validation_data=(X_test_reshaped, y_test),
    batch_size=256,
    epochs=5,
    verbose=1,
)


In [None]:
# GRU outputs on the training split, side by side with the true labels.
# The network was fitted on the scaled, reshaped array, so predictions must use
# X_train_reshaped; the raw X_train frame is unscaled and has no trailing axis.
train_predictions = model_gru.predict(X_train_reshaped).flatten()
train_results = pd.DataFrame(data={'Train Predictions': train_predictions,
                                   'Actual': y_train})
train_results.head()

In [None]:
# GRU outputs on the test split, side by side with the true labels.
# The network was fitted on scaled, reshaped arrays, so predictions must use
# X_test_reshaped; the raw X_test frame is unscaled and has no trailing axis.
test_predictions = model_gru.predict(X_test_reshaped).flatten()
test_results = pd.DataFrame(data={'Test Predictions': test_predictions,
                                  'Actual': y_test})
test_results.head()

In [None]:
# GRU output for every test row (sigmoid activations, not thresholded labels);
# later stacked as an input column for the hybrid model.
y_gru_pred = model_gru.predict(X_test_reshaped)


In [None]:
from sklearn.ensemble import GradientBoostingClassifier


In [None]:
# Gradient-boosted trees classifier: 100 boosting stages, learning rate 0.1,
# fixed seed for reproducibility.
gbt_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)


In [None]:
# Fit the GBT model on the unscaled training features (X_train, not X_train_scaled).
gbt_model.fit(X_train, y_train)


In [None]:
# Predicted class labels (not probabilities) from the GBT model for the test split.
y_gbt_pred = gbt_model.predict(X_test)


Prepare data for the hybrid model: to build the hybrid model's input, we combine the original features with the predictions from the GRU and GBT models.

In [None]:
# Hybrid input for the test split, stacked column-wise:
# raw (unscaled) features, then the GRU output, then the GBT predicted label.
hybrid_columns = (X_test, y_gru_pred, y_gbt_pred)
X_test_hybrid = np.column_stack(hybrid_columns)


In [None]:
# Hybrid model: a small dense network (64 -> 32 -> 1) with a sigmoid output.
# Its input width equals the number of columns in X_test_hybrid.
n_hybrid_inputs = X_test_hybrid.shape[1]

model_hybrid = Sequential()
model_hybrid.add(Dense(64, activation='relu', input_shape=(n_hybrid_inputs,)))
model_hybrid.add(Dense(32, activation='relu'))
model_hybrid.add(Dense(1, activation='sigmoid'))


In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy metric.
model_hybrid.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Train the hybrid model on the TRAINING split.
# Fitting on X_test_hybrid / y_test and then scoring on those same rows would
# make the reported metrics meaningless, so the training matrix is built here
# with the same column layout as X_test_hybrid:
# raw features, GRU output, GBT predicted label.
X_train_hybrid = np.column_stack((
    X_train,
    model_gru.predict(X_train_reshaped),
    gbt_model.predict(X_train),
))
# NOTE(review): the base models were fitted on these rows, so their
# training-set predictions are optimistic; out-of-fold predictions would be
# a cleaner stacking input.
model_hybrid.fit(X_train_hybrid, y_train, epochs=5, batch_size=256, verbose=1)


In [None]:
# Hybrid-model output (sigmoid activations) for the stacked test matrix;
# thresholded at 0.5 in the metrics cell.
y_hybrid_pred = model_hybrid.predict(X_test_hybrid)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [None]:
# Threshold the hybrid output once at 0.5, then score it against the test labels.
y_hybrid_label = y_hybrid_pred > 0.5

accuracy = accuracy_score(y_test, y_hybrid_label)
precision = precision_score(y_test, y_hybrid_label)
recall = recall_score(y_test, y_hybrid_label)
f1 = f1_score(y_test, y_hybrid_label)


In [None]:
# Print order: accuracy, precision, recall, F1.
print(accuracy,precision,recall,f1)

In [None]:


# Summary table of the hybrid model's metrics.
# `accuracy`, `precision`, `recall` and `f1` are the values computed in the
# evaluation cell; they are not re-assigned here, because hard-coded numbers
# would silently overwrite them and report results this run did not produce.
metrics_df = pd.DataFrame({'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score'],
                           'Value': [accuracy, precision, recall, f1]})

# Print the DataFrame
print(metrics_df)


Logistic Regression Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# Label column and predictor columns for the logistic-regression baseline.
target = 'Loan_status'

features = [
    'loan_amount',
    'principal_writtenoff_derived',
    'gender_cv_id',
    'loan_year',
    'number_of_repayments',
    'client_age',
    'total_deposits_derived',
    'account_balance',
]

In [None]:
# Select the predictor matrix and the label vector for the baseline model.
X, y = data[features], data[target]

In [None]:
# 80/20 train/test split with a fixed seed. This rebinds X_train, X_test,
# y_train and y_test, replacing the splits created for the GRU/GBT section.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Logistic-regression baseline.
# The solver is sensitive to feature scale, and the default max_iter=100 is
# often too low on unscaled inputs. The model is therefore wrapped in a pipeline
# that standardises the features (statistics learned when .fit() is called on
# the training split) and is given a larger iteration budget.
# The pipeline exposes the same fit/predict interface as the bare estimator.
from sklearn.pipeline import make_pipeline

logistic_model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))


In [None]:
# Fit the baseline model on the training split.
logistic_model.fit(X_train, y_train)


In [None]:
# Predicted class labels for the held-out test split.
y_pred = logistic_model.predict(X_test)

In [None]:
# Score the baseline on the test split: per-class report, confusion matrix
# and overall accuracy.
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

In [None]:
# Emit the three evaluation results, one print call per result.
report_lines = (
    f'Accuracy: {accuracy}',
    f'Confusion Matrix:\n{conf_matrix}',
    f'Classification Report:\n{classification_rep}',
)
for report_line in report_lines:
    print(report_line)