In [32]:
# Import necessary libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix
import plotly.figure_factory as ff
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix, roc_curve, auc
from imblearn.over_sampling import SMOTE

In [33]:
# Read the two raw CSV tables (applicant attributes and credit history) into DataFrames
app_record_path = '/content/sample_data/application_record.csv'
credit_record_path = '/content/sample_data/credit_record.csv'
app_record = pd.read_csv(app_record_path)
credit_record = pd.read_csv(credit_record_path)

In [34]:
# Print the leading rows of each raw table, applicants first and credit history second
for frame in (app_record, credit_record):
    print(frame.head())

        ID CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  CNT_CHILDREN  \
0  5008804           M            Y               Y             0   
1  5008805           M            Y               Y             0   
2  5008806           M            Y               Y             0   
3  5008808           F            N               Y             0   
4  5008809           F            N               Y             0   

   AMT_INCOME_TOTAL      NAME_INCOME_TYPE            NAME_EDUCATION_TYPE  \
0          427500.0               Working               Higher education   
1          427500.0               Working               Higher education   
2          112500.0               Working  Secondary / secondary special   
3          270000.0  Commercial associate  Secondary / secondary special   
4          270000.0  Commercial associate  Secondary / secondary special   

     NAME_FAMILY_STATUS  NAME_HOUSING_TYPE  DAYS_BIRTH  DAYS_EMPLOYED  \
0        Civil marriage   Rented apartment      -12005 

In [35]:
# Creating subplots for distributions of various features
fig = make_subplots(
    rows=3, cols=3,
    subplot_titles=('Gender', 'Income', 'Education Level', 'Family Status', 'Housing Type', 'Age', 'Years Employed')
)

# Derived columns. They are stored on app_record (not just plotted) because
# later cells read 'AGE' directly.
app_record['AGE'] = app_record['DAYS_BIRTH'] // -365
# NOTE(review): any positive DAYS_EMPLOYED value turns into a negative number of
# years here — confirm how the dataset encodes "not employed" before trusting this panel.
app_record['YEARS_EMPLOYED'] = app_record['DAYS_EMPLOYED'] // -365

# One entry per panel: (column, extra px.histogram arguments, grid row, grid column)
panels = [
    ('CODE_GENDER',
     dict(color='CODE_GENDER', color_discrete_sequence=px.colors.qualitative.Pastel), 1, 1),
    ('AMT_INCOME_TOTAL',
     dict(nbins=50, color_discrete_sequence=['#00CC96']), 1, 2),
    ('NAME_EDUCATION_TYPE',
     dict(color='NAME_EDUCATION_TYPE', color_discrete_sequence=px.colors.qualitative.Set2), 1, 3),
    ('NAME_FAMILY_STATUS',
     dict(color='NAME_FAMILY_STATUS', color_discrete_sequence=px.colors.qualitative.Vivid), 2, 1),
    ('NAME_HOUSING_TYPE',
     dict(color='NAME_HOUSING_TYPE', color_discrete_sequence=px.colors.qualitative.Alphabet), 2, 2),
    ('AGE',
     dict(nbins=50, color_discrete_sequence=['#FFA15A']), 2, 3),
    ('YEARS_EMPLOYED',
     dict(nbins=50, color_discrete_sequence=['#AB63FA']), 3, 1),
]

for column, hist_kwargs, grid_row, grid_col in panels:
    panel_fig = px.histogram(app_record, x=column, **hist_kwargs)
    # When `color` is set, plotly express emits one trace PER CATEGORY.
    # Taking only `.data[0]` would plot just the first category, so copy them all.
    for trace in panel_fig.data:
        fig.add_trace(trace, row=grid_row, col=grid_col)

fig.update_layout(
    height=900, width=900,
    title_text='Distributions of Various Features', title_font_size=24, title_x=0.5,
    # The theme has to be set on the combined figure: a template passed to the
    # individual px.histogram calls is dropped when only their traces are reused.
    template='plotly_dark',
    # Category names already appear on each x axis; a shared legend across seven
    # unrelated panels would only add clutter.
    showlegend=False,
    # Each coloured trace covers its own category, so nothing actually overlaps;
    # 'overlay' keeps every bar at full width instead of reserving a side-by-side
    # slot per trace.
    barmode='overlay',
)
fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [36]:
# Scatter plot of applicant income against age, one colour per gender
scatter_fig = px.scatter(
    app_record,
    x='AGE',
    y='AMT_INCOME_TOTAL',
    color='CODE_GENDER',
    title='Income vs. Age by Gender',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    template='plotly_dark',
)
scatter_layout = dict(title_font_size=20, title_x=0.5, xaxis_title='Age', yaxis_title='Income')
scatter_fig.update_layout(**scatter_layout)
scatter_fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [37]:
# Box plot comparing the income spread across family-status groups
box_fig = px.box(
    app_record,
    x='NAME_FAMILY_STATUS',
    y='AMT_INCOME_TOTAL',
    color='NAME_FAMILY_STATUS',
    title='Income by Family Status',
    color_discrete_sequence=px.colors.qualitative.Vivid,
    template='plotly_dark',
)
box_layout = dict(title_font_size=20, title_x=0.5, xaxis_title='Family Status', yaxis_title='Income')
box_fig.update_layout(**box_layout)
box_fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [38]:
# Pairwise correlations between every numeric column of app_record, drawn as an annotated heatmap
numerical_cols = app_record.select_dtypes(include=[np.number]).columns
correlation_matrix = app_record[numerical_cols].corr()
axis_labels = correlation_matrix.columns

# One text annotation per cell showing the coefficient rounded to two decimals
annotations = [
    dict(
        text=str(round(value, 2)),
        x=axis_labels[j],
        y=axis_labels[i],
        xref='x1', yref='y1',
        font=dict(color='black', size=12),
        showarrow=False,
    )
    for i, row in enumerate(correlation_matrix.values)
    for j, value in enumerate(row)
]

# Colour scale is pinned to the full [-1, 1] correlation range
heatmap_trace = go.Heatmap(
    z=correlation_matrix.values,
    x=axis_labels,
    y=axis_labels,
    colorscale='Viridis',
    colorbar=dict(title='Correlation'),
    zmin=-1, zmax=1,
    showscale=True,
)
heatmap_fig = go.Figure(data=heatmap_trace)

heatmap_fig.update_layout(
    title='Correlation Matrix',
    xaxis_nticks=36,
    template='plotly_dark',
    annotations=annotations,
)

heatmap_fig.show()

In [40]:
# Preprocessing
# Join applicant attributes with their credit history rows on the shared ID key
# (no `how` is given, so pandas' default join type applies).
# NOTE(review): credit_record has a MONTHS_BALANCE column, so it presumably holds
# several rows per ID and the result is one row per client-month — confirm.
merged_df = app_record.merge(credit_record, on='ID')

In [41]:
# Create a new column to classify clients as 'good' or 'bad' based on STATUS
def classify_client(status):
    """Translate a raw STATUS value into a coarse client label.

    Returns 'bad' when `status` equals one of the strings '2', '3', '4' or '5';
    any other value is labelled 'good'.
    """
    bad_codes = ('2', '3', '4', '5')
    return 'bad' if status in bad_codes else 'good'

# Label every merged row by running its STATUS value through classify_client
merged_df['client_status'] = merged_df['STATUS'].map(classify_client)

In [42]:
# Replace the string target with integer codes.
# NOTE(review): LabelEncoder orders classes by sorted label value, so presumably
# 'bad' -> 0 and 'good' -> 1 — check label_encoder.classes_ before reading the metrics.
label_encoder = LabelEncoder()
label_encoder.fit(merged_df['client_status'])
merged_df['client_status'] = label_encoder.transform(merged_df['client_status'])

In [43]:
# Remove the join key, the raw status code (already folded into client_status)
# and the month offset from the modelling frame
unused_columns = ['ID', 'STATUS', 'MONTHS_BALANCE']
merged_df = merged_df.drop(columns=unused_columns)

In [44]:
# One-hot encode every object-typed column, dropping the first level of each
categorical_cols = merged_df.select_dtypes(include='object').columns
merged_df = pd.get_dummies(
    data=merged_df,
    columns=categorical_cols,
    drop_first=True,
)

In [45]:
# Impute any remaining gaps with the per-column median
column_medians = merged_df.median()
merged_df = merged_df.fillna(column_medians)

In [46]:
# Separate the target column (y) from the predictor columns (X)
target_column = 'client_status'
y = merged_df[target_column]
X = merged_df.drop(columns=[target_column])

In [47]:
# Handle imbalanced data using SMOTE
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

In [48]:
# Standardize features
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)

In [49]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [18]:
# Logistic Regression: fit on the training split, then score accuracy on the test split
log_reg = LogisticRegression().fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
accuracy_log_reg = accuracy_score(y_true=y_test, y_pred=y_pred_log_reg)
# Bare expression so the notebook displays the value
accuracy_log_reg

0.644570986110887

In [19]:
# Random Forest Classifier: fit on the training split, then score accuracy on the test split.
# The forest is built with randomness (bootstrap samples, feature subsets), so a
# fixed seed is needed for the reported accuracy to be reproducible across runs.
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
accuracy_rf_clf = accuracy_score(y_test, y_pred_rf)
# Bare expression so the notebook displays the value
accuracy_rf_clf

0.9822619412495363

In [20]:
# Keep whichever model scored the higher test accuracy.
# Logistic regression is chosen only when strictly better; a tie goes to the random forest.
log_reg_wins = accuracy_log_reg > accuracy_rf_clf
chosen_model, y_pred_chosen, chosen_model_name = (
    (log_reg, y_pred_log_reg, 'Logistic Regression')
    if log_reg_wins
    else (rf_clf, y_pred_rf, 'Random Forest Classifier')
)

In [21]:
# Show per-class precision / recall / F1 for the selected model
print(f"{chosen_model_name} Classification Report:")
report_text = classification_report(y_test, y_pred_chosen)
print(report_text)

Random Forest Classifier Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98    155425
           1       1.00      0.97      0.98    154530

    accuracy                           0.98    309955
   macro avg       0.98      0.98      0.98    309955
weighted avg       0.98      0.98      0.98    309955



In [22]:
# ROC curve of the selected model, scored with its predicted probability for class 1
class1_scores = chosen_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, class1_scores)
roc_auc = auc(fpr, tpr)

# Draw the curve; the legend entry carries the model name and the AUC
roc_trace = go.Scatter(
    x=fpr,
    y=tpr,
    mode='lines',
    name=f'{chosen_model_name} (AUC = {roc_auc:.2f})',
    line=dict(color='firebrick', width=2),
)
fig = go.Figure()
fig.add_trace(roc_trace)
fig.update_layout(
    title='ROC Curve',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    template='plotly_dark',
    title_font_size=20,
    title_x=0.5,
)
fig.show()

In [23]:
# Calculate confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred_chosen)

# Quadrant names in row-major order for a 2x2 matrix, treating class 1 as "positive"
labels = ['True Negative', 'False Positive', 'False Negative', 'True Positive']

# Annotate each cell with its quadrant name and its count
class_names = ['0', '1']
n_classes = len(class_names)
cell_text = [
    [f'{labels[r * n_classes + c]}<br>{cm[r][c]}' for c in range(n_classes)]
    for r in range(n_classes)
]

# Create an annotated heatmap
fig = ff.create_annotated_heatmap(cm, x=['Predicted ' + label for label in class_names],
                                  y=['Actual ' + label for label in class_names],
                                  annotation_text=cell_text,
                                  colorscale='Viridis')

# Update the figure layout.
# The heatmap's y axis runs bottom-to-top by default, which would put the
# "Actual 0" row at the bottom; reversing it shows the matrix in the same
# orientation as the `cm` array.
fig.update_layout(title_text='Confusion Matrix',
                  xaxis=dict(title='Predicted label'),
                  yaxis=dict(title='True label', autorange='reversed'))

fig.show()