The objective of this notebook is to build a baseline machine learning model for delinquency variables.


# 1. Import Libraries

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [0]:
# Ignore SettingWithCopyWarning
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

# 2. Loading the datasets and basic Exploration

In [0]:
# Load the two source tables from the `default` schema as Spark DataFrames.
# (The bu_features_level1 / bu_features_level2 tables are intentionally not loaded.)
appl_train, bu_features_final2 = (
    spark.sql(f"SELECT * FROM default.{table_name}")
    for table_name in ("application_train", "bu_features_final2")
)





In [0]:
# Expose both DataFrames to the %sql cells below under the same names.
# (Views for bu_features1 / bu_features2 are intentionally not registered.)
for view_name, frame in (
    ("appl_train", appl_train),
    ("bu_features_final2", bu_features_final2),
):
    frame.createOrReplaceTempView(view_name)

In [0]:
%sql
-- Distinct customers vs. total rows in the application table.
SELECT
  COUNT(DISTINCT SK_ID_CURR) AS UNIQUE_CUSTOMER_COUNT,
  COUNT(*) AS row_count
FROM appl_train

In [0]:
%sql
-- Distinct customers vs. total rows in the bureau feature table.
SELECT
  COUNT(DISTINCT SK_ID_CURR) AS UNIQUE_CUSTOMER_COUNT,
  COUNT(*) AS row_count
FROM bu_features_final2

In [0]:
# Preview the first 10 application rows.
display(appl_train.limit(10))

In [0]:
%sql
-- Distinct customers vs. total rows.
-- NOTE(review): this queries `bu_features_final`, but only `bu_features_final2` is
-- registered as a temp view in this notebook. Confirm whether a metastore table named
-- `bu_features_final` is really intended here, or whether this should be `bu_features_final2`.
select count(distinct SK_ID_CURR) AS UNIQUE_CUSTOMER_COUNT, count(*) as row_count from bu_features_final

In [0]:
# Attach the TARGET label from appl_train to each bureau feature row via SK_ID_CURR.
# This is an INNER join: feature rows without a matching SK_ID_CURR in appl_train
# (and therefore without a label) are dropped.
target_lookup = appl_train.select("SK_ID_CURR", "TARGET")
deq_model_train_data_1 = bu_features_final2.join(
    target_lookup,
    on="SK_ID_CURR",
    how="inner",
)

In [0]:
# Register the labelled frame for the %sql cells, then preview two rows.
deq_model_train_data_1.createOrReplaceTempView("deq_model_train_data_1")
display(deq_model_train_data_1.limit(2))

In [0]:
%sql
-- Distinct customers vs. total rows after joining in the target.
SELECT
  COUNT(DISTINCT SK_ID_CURR) AS UNIQUE_CUSTOMER_COUNT,
  COUNT(*) AS row_count
FROM deq_model_train_data_1

Now our base DataFrame for training is ready, and we are going to store it as a table in the Hive metastore catalog.

In [0]:
%sql
-- Rebuild the persisted training table from the temp view of the same name.
DROP TABLE IF EXISTS default.deq_model_train_data_1;
CREATE TABLE default.deq_model_train_data_1 AS
SELECT *
FROM deq_model_train_data_1;

# 3. Model Training Lifecycle 

In [0]:
!pip install tensorflow

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


from sklearn.ensemble import RandomForestClassifier

from sklearn.utils.class_weight import compute_class_weight

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.callbacks import EarlyStopping



## 3.1 Basic EDA

In [0]:
# Set the maximum number of rows to display
# (None removes the row limit, so the per-column summaries printed below are not truncated.)
pd.set_option('display.max_rows', None)

In [0]:
# Reload the persisted training table and re-register it as a temp view so the
# %sql cell below can query it.
deq_model_train_data_1 = spark.sql("SELECT * FROM default.deq_model_train_data_1")
deq_model_train_data_1.createOrReplaceTempView("deq_model_train_data_1")

In [0]:
%sql
-- Sanity check on the reloaded table: distinct customers vs. total rows.
SELECT
  COUNT(DISTINCT SK_ID_CURR) AS UNIQUE_CUSTOMER_COUNT,
  COUNT(*) AS row_count
FROM deq_model_train_data_1

In [0]:
# Convert the Spark DataFrame to pandas; everything below works on this in-memory copy.
data_raw = deq_model_train_data_1.toPandas()

In [0]:
# (rows, columns) of the raw pandas frame.
data_raw.shape

In [0]:
# dtype of every column in the raw pandas frame.
print(data_raw.dtypes)

In [0]:
# Number of null values per column.
print(data_raw.isnull().sum())

In [0]:
# Fill rate = percentage of non-null values in each column.
fill_rate = data_raw.notnull().mean() * 100

# Lay the result out as a two-column table (column name, fill rate) and print it.
fill_rate_df = pd.DataFrame({
    'Column': fill_rate.index,
    'Fill Rate (%)': fill_rate.values,
})
print(fill_rate_df)

In [0]:
# Percentage of non-null values per column.
fill_rate = data_raw.notnull().mean() * 100

# Keep only the columns that are at least 20% populated.
has_enough_data = fill_rate >= 20
data_1 = data_raw.loc[:, has_enough_data]


In [0]:
# (rows, columns) after dropping columns with a fill rate below 20%.
data_1.shape

In [0]:
# Class frequencies of the label.
target_counts = data_1["TARGET"].value_counts()
print(target_counts)

# Pie chart of the class split; both slices are pulled out slightly for visibility.
pie_options = dict(
    labels=target_counts.index,
    autopct="%1.1f%%",
    colors=["skyblue", "orange"],
    startangle=90,
    explode=(0.05, 0.05),
)
plt.figure(figsize=(6, 6))
plt.pie(target_counts, **pie_options)
plt.title("Distribution of Target Variable")
plt.show()

Clearly, the TARGET variable distribution is highly imbalanced.

## 3.2 Data Preprocessing

Since I plan to use a neural network for this problem, we will have to handle null values. This is a financial dataset and null values have meaning (for example, no loan taken). Hence we cannot just replace them with measures of central tendency or build a model to impute values. Instead we will use an absurd number such as -999 as a placeholder. Ideally I should figure out the categories inside the null values - representing the different reasons for the nulls - and assign each of them a separate placeholder. But for the sake of simplicity, and because I have limited time, I am just going to replace all null values with -999 and add an indicator variable. (Note: the indicator-variable step is currently commented out in the code cell below, so only the -999 replacement is applied.)

In [0]:
# Substitute every null with the sentinel value -999. `fillna` returns a new
# frame, so `data_1` itself is left untouched.
placeholder_value = -999
data_cleaned = data_1.fillna(placeholder_value)

# Missing-value indicator columns are currently disabled:
# # Add missing value indicators for all features with nulls
# for col in data_1.columns:
#     if data_1[col].isnull().any():
#         data_cleaned[f"{col}_isnull"] = data_1[col].isnull().astype(int)



In [0]:
# First rows after null replacement.
data_cleaned.head()

In [0]:
# (rows, columns) after null replacement.
data_cleaned.shape

In [0]:
# Label vector, and the feature matrix with the label and the customer id removed.
y = data_cleaned["TARGET"]
X = data_cleaned.drop(["TARGET", "SK_ID_CURR"], axis=1)


In [0]:
# Partition the feature columns: missing-value flags (suffix '_isnull') are
# left untouched, every other column is standardised.
indicator_columns = [name for name in X.columns if name.endswith('_isnull')]
non_indicator_columns = [name for name in X.columns if name not in indicator_columns]

# NOTE(review): the scaler is fitted on the full feature matrix, i.e. before the
# train/test split performed later in the notebook, and the -999 null
# placeholders are part of the fitted mean/variance.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X[non_indicator_columns])
X_scaled = X.copy()
X_scaled[non_indicator_columns] = scaled_values

In [0]:
# First rows of the standardised feature matrix.
X_scaled.head()

In [0]:
# (rows, features) of the standardised feature matrix.
X_scaled.shape

In [0]:


# Train a random forest (fixed seed) on all rows purely to rank the features.
rf = RandomForestClassifier(random_state=42).fit(X_scaled, y)

# Pair each feature with its importance score and order from most to least important.
importance_table = pd.DataFrame({
    "Feature": X_scaled.columns,
    "Importance": rf.feature_importances_,
})
feature_importances = importance_table.sort_values(by="Importance", ascending=False)

In [0]:
# Feature ranking from the random forest, most important first.
feature_importances

Not making a decision based on the above. Choosing experiment structure 2.1.2 for now.

In [0]:
# 80/20 split, stratified on the label so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [0]:
# (rows, features) of the training split.
X_train.shape

## 3.3 Building Neural Network


In [0]:
# # Define the neural network model
# model = Sequential()

# # Input layer (155 features)
# model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))  # 128 neurons in the first layer

# # Hidden layers
# model.add(Dense(64, activation='relu'))  # Second hidden layer with 64 neurons
# model.add(Dropout(0.5))  # Dropout to prevent overfitting

# model.add(Dense(32, activation='relu'))  # Third hidden layer with 32 neurons
# model.add(Dropout(0.5))  # Dropout

# # Output layer (binary classification, single output unit)
# model.add(Dense(1, activation='sigmoid'))  # Sigmoid for binary classification

In [0]:
# Feed-forward binary classifier. The input width is taken from the training
# matrix; hidden layers have 256 -> 128 -> 64 ReLU units, with dropout (rate 0.6)
# after the second and third hidden layers, and a single sigmoid output unit.
n_features = X_train.shape[1]
model = Sequential([
    Dense(256, input_dim=n_features, activation='relu'),
    Dense(128, activation='relu'),
    Dropout(0.6),
    Dense(64, activation='relu'),
    Dropout(0.6),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss, Adam optimiser, accuracy tracked as a metric.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer architecture.
model.summary()

In [0]:
# Calculate class weights for the imbalance.
# `classes` is passed as a NumPy array: scikit-learn documents this parameter as
# an ndarray and recent releases reject a plain Python list.
class_labels = np.array([0, 1])
class_weights = compute_class_weight('balanced', classes=class_labels, y=y_train)

# Map label -> weight with plain Python scalars, the form passed to
# `model.fit(class_weight=...)`.
class_weight_dict = {
    int(label): float(weight)
    for label, weight in zip(class_labels, class_weights)
}

# Compile the model with binary cross-entropy loss.
# NOTE(review): the model is already compiled with the same optimizer settings,
# loss and metric in the cell that builds it; this call repeats that.
model.compile(optimizer=Adam(learning_rate=0.001),
              loss=BinaryCrossentropy(),
              metrics=['accuracy'])

In [0]:
# Check if class weights are calculated correctly
# (prints the {label: weight} mapping passed to model.fit).
print(class_weight_dict)

In [0]:
# Give the training features and labels a fresh 0..n-1 index (old index discarded).
X_train, y_train = (
    X_train.reset_index(drop=True),
    y_train.reset_index(drop=True),
)

In [0]:

# Hold out a stratified validation slice of the TRAINING data for early stopping.
# The test set must not be used as `validation_data`: with
# `restore_best_weights=True` the final weights would be selected on the very rows
# that the later cells use to report test metrics, making those metrics optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=42,
)

# Early stopping to avoid overfitting: stop after 10 epochs without improvement
# in validation loss and roll back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(X_fit, y_fit,
                    epochs=50,
                    batch_size=32,
                    validation_data=(X_val, y_val),
                    class_weight=class_weight_dict,  # Apply class weights here
                    callbacks=[early_stopping],
                    verbose=1)

In [0]:
# Evaluate the model on the test set
# `evaluate` returns the loss followed by the compiled metric (accuracy).
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_acc:.4f}, Test Loss: {test_loss:.4f}')

In [0]:
from sklearn.metrics import classification_report

# Score the test rows, then cut the sigmoid outputs at 0.5 to obtain hard labels.
y_pred = model.predict(X_test)
y_pred_binary = y_pred > 0.5

# Per-class precision / recall / F1 on the test set.
report_text = classification_report(y_test, y_pred_binary)
print(report_text)

In [0]:
# Create a directory for your model
# (DBFS target directory that the saved model file is copied into below).
dbutils.fs.mkdirs("dbfs:/FileStore/my_model_directory")

In [0]:

# Write the model to a local path first, then copy that file into DBFS.
local_path = "/tmp/ann_model_exp1.keras"
dbfs_path = "dbfs:/FileStore/my_model_directory/ann_model_exp1.keras"

model.save(local_path)
dbutils.fs.cp(f"file:{local_path}", dbfs_path)

# 4. Predicting using the model

In [0]:
# Attach the customer id to the scaled feature frame so predictions can be keyed by it.
# `.values` assigns the ids positionally (no index alignment); X_scaled was derived
# from data_cleaned without reordering rows, so row i in both frames is the same customer.
# NOTE(review): this mutates X_scaled in place; re-running an earlier cell that fits
# on X_scaled without first rebuilding it would then include the id as a feature.
X_scaled["SK_ID_CURR"] = data_cleaned["SK_ID_CURR"].values


In [0]:
# First rows of the scaled features, now including SK_ID_CURR.
X_scaled.head()

In [0]:
# Score every row; the id column is removed first so the model only sees features.
feature_matrix = X_scaled.drop(columns=["SK_ID_CURR"])
predictions = model.predict(feature_matrix)
predicted_probabilities = predictions.flatten()

# One row per scored record: customer id plus predicted probability.
prediction_df = pd.DataFrame({
    "SK_ID_CURR": X_scaled["SK_ID_CURR"],
    "Prediction_Probability": predicted_probabilities,
})

# Cleaned data with the predicted probability joined on by customer id.
final_data_with_predictions = pd.merge(
    data_cleaned,
    prediction_df,
    on="SK_ID_CURR",
    how="left",
)

In [0]:
# First rows of the id / probability table.
prediction_df.head()

In [0]:
# (rows, columns) of the id / probability table.
prediction_df.shape

In [0]:
# Convert the pandas prediction table to a Spark DataFrame so it can be
# registered as a view and written to a table.
bu_final_pred_df = spark.createDataFrame(prediction_df)

In [0]:
# Make the predictions queryable from the %sql cell below.
bu_final_pred_df.createOrReplaceTempView("bu_final_pred_df")


In [0]:
%sql
-- Rebuild the persisted prediction table from the temp view.
DROP TABLE IF EXISTS default.bu_final_prediction_data;
CREATE TABLE default.bu_final_prediction_data AS
SELECT *
FROM bu_final_pred_df;

## Creating Final Residual **DF**

In [0]:
from pyspark.sql import functions as F

In [0]:
# Read back the persisted training data and predictions.
deq_model_train_data_1, df_pred = (
    spark.sql(f"SELECT * FROM default.{table_name}")
    for table_name in ("deq_model_train_data_1", "bu_final_prediction_data")
)

In [0]:
# Inner join predictions to the training data on the customer id.
joined_df = df_pred.join(
    deq_model_train_data_1,
    on="SK_ID_CURR",
    how="inner",
)

In [0]:
# Feature columns carried through unchanged alongside the prediction and residual.
passthrough_columns = [
    "b_DAYS_CREDIT_mean",
    "b_720_DAYS_CREDIT_PLAN_sum",
    "b_720_AMT_CREDIT_MAX_OVERDUE_sum",
    "b_consumer_DAYS_CREDIT_ENDDATE_mean",
    "b_credit_AMT_CREDIT_SUM_DEBT_sum",
    "b_CNT_CREDIT_PROLONG_sum",
    "b_365_AMT_CREDIT_SUM_OVERDUE_sum",
    "b_credit_AMT_CREDIT_DEBT_DIFF_mean",
    "b_credit_DAYS_CREDIT_mean",
    "b_active_DAYS_CREDIT_mean",
    "DEQ_AVG_COUNT_DPD0P_36MOB_ALL",
    "DEQ_AVG_COUNT_DPD0P_3MOB_ALL",
]

# residual = TARGET - predicted probability
residual = (F.col("TARGET") - F.col("Prediction_Probability")).alias("residual")

final_df = joined_df.select(
    "SK_ID_CURR",
    "Prediction_Probability",
    residual,
    *passthrough_columns,
)

# Show the result for verification
final_df.show()



In [0]:
# Make the residual frame queryable from the %sql cell below.
final_df.createOrReplaceTempView("final_df")

In [0]:
%sql
-- Rebuild the persisted residual table from the temp view.
DROP TABLE IF EXISTS default.bu_final_prediction_data2;
CREATE TABLE default.bu_final_prediction_data2 AS
SELECT *
FROM final_df;

In [0]:
# Render the residual frame.
display(final_df)

In [0]:
%python
# Replace spaces with underscores in all column names
for col in pyspark_df.columns:
    new_col = col.replace(" ", "_")
    pyspark_df = pyspark_df.withColumnRenamed(col, new_col)

# Display the DataFrame to verify the changes
display(pyspark_df)