In [0]:
%pip install tensorflow

In [0]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import math

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


from sklearn.ensemble import RandomForestClassifier

from sklearn.utils.class_weight import compute_class_weight

from sklearn.model_selection import StratifiedKFold


# from sklearn.naive_bayes import GaussianNB
# from lightgbm import LGBMClassifier
# from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
# from sklearn.svm import SVC

from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score

from sklearn.utils.class_weight import compute_class_weight

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.callbacks import EarlyStopping

In [0]:
# Silence pandas' SettingWithCopyWarning for the rest of the session.
# NOTE(review): this hides warnings raised when assigning into a frame that was
# derived from another frame; confirm the suppression is still wanted.
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

In [0]:
# Load the feature table as a Spark DataFrame and register it as the temp view
# "data_raw" so that it can be queried from %sql cells.
data_raw = spark.table("hive_metastore.default.credit_card_installment_features_level_3")
data_raw.createOrReplaceTempView("data_raw")

In [0]:
%sql
-- Distinct customer ids vs. total rows; equal values mean one row per SK_ID_CURR.
select count(distinct SK_ID_CURR) AS UNIQUE_CUSTOMER_COUNT, count(*) as row_count FROM data_raw

In [0]:
# Allow up to 1000 rows and 1000 columns when pandas renders a frame.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 1000)

In [0]:
# Materialise the Spark table as a pandas DataFrame for the analysis below.
# The source is the "data_raw" temp view registered when the table was loaded,
# not the `data_raw` variable itself: `data_raw = data_raw.toPandas()` fails on
# a second run because the name then already refers to a pandas DataFrame.
data_raw = spark.table("data_raw").toPandas()

In [0]:
# Preview the first rows of data_raw.
data_raw.head()

In [0]:
# (rows, columns) of data_raw.
data_raw.shape

In [0]:
# Column dtypes and non-null counts of data_raw.
data_raw.info()

In [0]:
# Percentage of non-null values in every column
fill_rate = data_raw.notna().mean().mul(100)

# Tabulate as (column name, fill rate) pairs and print the table
fill_rate_df = fill_rate.rename_axis('Column').reset_index(name='Fill Rate (%)')
print(fill_rate_df)

In [0]:
# Minimum percentage of non-null values a column needs in order to be kept.
MIN_FILL_RATE_PCT = 50

# Percentage of non-null values per column
fill_rate = data_raw.notnull().mean() * 100

# Keep only sufficiently populated columns. The explicit .copy() makes data_1
# an independent frame, so the column drops/assignments performed on it later
# neither touch data_raw nor trigger SettingWithCopyWarning.
data_1 = data_raw.loc[:, fill_rate >= MIN_FILL_RATE_PCT].copy()

In [0]:
# Column dtypes and non-null counts after the fill-rate filter.
data_1.info()

In [0]:
# Class frequencies of the target variable
target_counts = data_1["TARGET"].value_counts()
print(target_counts)

# Pie chart of the class shares, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    target_counts,
    labels=target_counts.index,
    autopct="%1.1f%%",
    colors=["skyblue", "orange"],
    startangle=90,
    explode=(0.05, 0.05),  # pull both slices slightly apart for readability
)
ax.set_title("Distribution of Target Variable")
plt.show()

In [0]:
# Number of missing values per column.
data_1.isnull().sum()

In [0]:
# Drop NAME_CONTRACT_TYPE. Reassigning (instead of inplace=True) together with
# errors="ignore" keeps the cell re-runnable: a second run, or a run where the
# fill-rate filter already removed the column, no longer raises KeyError.
data_1 = data_1.drop(columns=["NAME_CONTRACT_TYPE"], errors="ignore")

In [0]:
# Helper that lists the categorical columns of a frame
def get_categorical_variables(df):
    """Return the names of all ``object``- or ``category``-typed columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column dtypes are inspected.

    Returns
    -------
    list
        Column names, in the frame's column order.
    """
    categorical_frame = df.select_dtypes(include=['object', 'category'])
    return list(categorical_frame.columns)

# List the object/category columns still present in data_1.
categorical_vars = get_categorical_variables(data_1)
print("Categorical Variables:", categorical_vars)

In [0]:
# Frequency of each NAME_FAMILY_STATUS value.
data_1['NAME_FAMILY_STATUS'].value_counts()

In [0]:
# Text-to-0/1 encodings for the binary flag columns. Series.map turns any
# value that is not a key of the mapping (including existing NaN) into NaN.
binary_flag_maps = {
    'CODE_GENDER': {'M': 1, 'F': 0},
    'FLAG_OWN_CAR': {'N': 0, 'Y': 1},
    'FLAG_OWN_REALTY': {'N': 0, 'Y': 1},
}

for flag_column, flag_map in binary_flag_maps.items():
    # Skip columns that are already numeric: pushing 0/1 through the text
    # mapping again (e.g. when the cell is re-run) would turn every value
    # into NaN.
    if pd.api.types.is_numeric_dtype(data_1[flag_column]):
        continue
    data_1[flag_column] = data_1[flag_column].map(flag_map)


In [0]:
label_encoder = LabelEncoder()

# Integer-encode every object-typed column on a copy of data_1; all other
# columns are carried over unchanged. The single encoder is refitted for each
# column, so afterwards it only remembers the classes of the last one.
data_2 = data_1.copy()
for text_column in data_2.columns[data_2.dtypes == 'object']:
    data_2[text_column] = label_encoder.fit_transform(data_2[text_column])

In [0]:
# (rows, columns) after encoding.
data_2.shape

In [0]:
# Column dtypes and non-null counts after encoding.
data_2.info()


#### Missing value imputation

In [0]:
# Number of missing values per column before imputation.
data_2.isnull().sum()

In [0]:
# Drop the rows (not columns) in which CODE_GENDER is null.
data_2 = data_2.dropna(subset=['CODE_GENDER'])

In [0]:
# Sentinel written in place of every remaining missing value.
# NOTE(review): the sentinel is later standardised together with the real
# values, which skews the column mean/variance; confirm this is acceptable.
placeholder_value = -999

# fillna returns a new frame, so data_2 itself is left unchanged
data_cleaned = data_2.fillna(placeholder_value)

In [0]:
# Missing values per column after the placeholder fill.
data_cleaned.isnull().sum()

In [0]:
# Pairwise correlations between all columns of the cleaned frame
correlation_matrix = data_cleaned.corr()

# Draw the matrix as a heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 10))
heatmap_options = dict(
    annot=False,  # set to True to print the coefficients inside the cells
    fmt=".2f",
    cmap="coolwarm",
    cbar=True,
    square=True,
)
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)

ax.set_title("Correlation Heatmap", fontsize=16)
plt.show()

In [0]:
# Separate the model inputs from the label: X holds every column except the
# TARGET label and the SK_ID_CURR identifier, y holds TARGET.
X = data_cleaned.drop(columns=["TARGET","SK_ID_CURR"])
y = data_cleaned["TARGET"]

In [0]:
# Feature scaling (standardisation).
# Columns whose name ends in '_isnull' are left unscaled; everything else is
# standardised.
indicator_columns = [col for col in X.columns if col.endswith('_isnull')]
non_indicator_columns = [col for col in X.columns if col not in indicator_columns]

# Fit the scaler on the training rows only, so that test-set statistics do not
# leak into the features the model is trained on. The row partition is
# reproduced here with the same arguments as the notebook's train/test split
# (test_size=0.2, stratify=y, random_state=42): a stratified split depends only
# on the number of rows, y and the seed, so both calls select the same rows.
# Keep the two calls in sync.
train_positions, _ = train_test_split(
    np.arange(len(X)), test_size=0.2, stratify=y, random_state=42
)
scaler = StandardScaler()
scaler.fit(X.iloc[train_positions][non_indicator_columns])

# Apply the training-set statistics to every row (train and test alike)
X_scaled = X.copy()
X_scaled[non_indicator_columns] = scaler.transform(X[non_indicator_columns])

In [0]:
# (rows, features) of the scaled feature frame.
X_scaled.shape

In [0]:
# Train/test split: 20% of the rows are held out, stratified on y so both
# parts keep the same class proportions; random_state fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)

In [0]:
# # Some useful parameters which will come in handy later on

# ntrain = X_train.shape[0]
# ntest = X_test.shape[0]
# SEED = 42 # for reproducibility
# NFOLDS = 5 # set folds for out-of-fold prediction
# kf = StratifiedKFold(n_splits=NFOLDS, random_state=SEED, shuffle=True)

# # Class to extend the Sklearn classifier
# class SklearnHelper(object):
#     def __init__(self, clf, seed=42, params=None):
#         # Check if the classifier accepts the `random_state` parameter
#         if 'random_state' in clf().get_params():
#             params['random_state'] = seed
#         self.clf = clf(**params)

    
#     def train(self, x_train, y_train):
#         self.clf.fit(x_train, y_train)

#     def predict(self, x):
#         return self.clf.predict(x)
    
#     def fit(self,x,y):
#         return self.clf.fit(x,y)
    
#     def feature_importances(self,x,y):
#         print(self.clf.fit(x,y).feature_importances_)

In [0]:
# def get_oof(clf, x_train, y_train, x_test):
#     oof_train = np.zeros((ntrain,))
#     oof_test = np.zeros((ntest,))
#     oof_test_skf = np.empty((kf.get_n_splits(), ntest))
#     fold_scores = []  # Store scores for each fold

#     for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
#         x_tr = x_train.iloc[train_index]
#         y_tr = y_train.iloc[train_index]
#         x_te = x_train.iloc[test_index]

#         clf.train(x_tr, y_tr)

#         oof_train[test_index] = clf.predict(x_te)
#         oof_test_skf[i, :] = clf.predict(x_test)

#         # Evaluate performance on validation set
#         fold_score = roc_auc_score(y_train.iloc[test_index], oof_train[test_index])  # ROC-AUC
#         fold_scores.append(fold_score)

#     oof_test[:] = oof_test_skf.mean(axis=0)
#     print(f"Mean CV Score for {clf.clf.__class__.__name__}: {np.mean(fold_scores):.4f}")
#     return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1), fold_scores 

In [0]:
# Random Forest parameters
# rf_params = {
#     'n_jobs': -1,
#     'n_estimators': 50,
#      'warm_start': True, 
#      #'max_features': 0.2,
#     'max_depth': 6,
#     'min_samples_leaf': 2,
#     'max_features' : 'sqrt',
#     'verbose': 0
# }


# # AdaBoost parameters
# ada_params = {
#     'n_estimators': 500,
#     'learning_rate' : 0.75
# }

# # Gradient Boosting parameters
# gb_params = {
#     'n_estimators': 50,
#      #'max_features': 0.2,
#     'max_depth': 5,
#     'min_samples_leaf': 2,
#     'verbose': 0
# }

# # Support Vector Classifier parameters 
# svc_params = {
#     'kernel' : 'linear',
#     'C' : 0.025
#     }


# # K-Nearest Neighbors Parameters
# knn_params = {
#     'n_neighbors': 5, 
#     'weights': 'uniform',  
#     'algorithm': 'auto'  
# }

# LightGBM Parameters
# lgb_params = {
#     'n_estimators': 4000,
#     'learning_rate': 0.03,
#     'num_leaves' : 30,
#     'colsample_bytree' : 0.8,
#     'subsample': 0.9,
#     'max_depth': 7,
#     'reg_alpha': 0.1,
#     'reg_lambda': 0.1,
#     'min_split_gain': 0.01,
#     'min_child_weight': 2,
#     'silent': -1,
#     'verbose': -1,
# }

In [0]:
# Create 5 model wrapper objects using the SklearnHelper class

# rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
# knn = SklearnHelper(clf=KNeighborsClassifier, seed=SEED, params=knn_params)
# lgb = SklearnHelper(clf=LGBMClassifier, seed=SEED, params=lgb_params)
# gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
# svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

In [0]:
# Create OOF train and test predictions for all selected models

# gb_oof_train, gb_oof_test, gb_scores = get_oof(gb, X_train, y_train, X_test)
# svc_oof_train, svc_oof_test, svc_scores = get_oof(svc, X_train, y_train, X_test)
# rf_oof_train, rf_oof_test, rf_scores = get_oof(rf,  X_train, y_train, X_test)
# knn_oof_train, knn_oof_test, knn_scores = get_oof(knn,  X_train, y_train, X_test)
# lgb_oof_train, lgb_oof_test, lgb_scores = get_oof(lgb,  X_train, y_train, X_test)

# print("Training is complete")

In [0]:
# Replace the shuffled row labels of the training data with a fresh 0..n-1
# index (the old labels are discarded); X_train and y_train stay row-aligned.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

In [0]:
# Fix the Python, NumPy and TensorFlow seeds so that weight initialisation,
# dropout masks and batch shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# Feed-forward binary classifier. The input width is taken from the training
# frame; the network narrows 256 -> 128 -> 64 units, with dropout after the
# second and third hidden layers to limit overfitting.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),  # one input per feature column
    Dense(256, activation='relu'),              # first hidden layer
    Dense(128, activation='relu'),              # second hidden layer
    Dropout(0.6),
    Dense(64, activation='relu'),               # third hidden layer
    Dropout(0.6),
    Dense(1, activation='sigmoid'),             # probability of the positive class
])

# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked as
# the only metric.
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Print the layer-by-layer architecture
model.summary()

In [0]:
# Class weights that compensate for the class imbalance in the training labels.
# 'balanced' weights each class by n_samples / (n_classes * class_count).
# `classes` is passed as a NumPy array: recent scikit-learn releases validate
# the argument type and reject a plain Python list.
class_labels = np.array([0, 1])
class_weights = compute_class_weight('balanced', classes=class_labels, y=y_train)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}

# The model is not compiled again here: it is already compiled with the same
# optimizer, loss and metric in the cell that defines it.

In [0]:
# Show the weight assigned to each class label as a sanity check.
print(class_weight_dict)

In [0]:
# Stop once the validation loss has not improved for 10 epochs and roll the
# weights back to the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Carve a validation set out of the training data for early stopping. Using
# the test set for this (as validation_data) would let it steer which weights
# are kept, so the test score reported afterwards would be optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, stratify=y_train, random_state=42
)

# Train the model
history = model.fit(X_fit, y_fit,
                    epochs=50,
                    batch_size=32,
                    validation_data=(X_val, y_val),
                    class_weight=class_weight_dict,  # re-weight the loss per class
                    callbacks=[early_stopping],
                    verbose=1)

In [0]:
# Evaluate the model on the test set. evaluate() returns the loss followed by
# the metrics given at compile time (accuracy only).
# NOTE(review): accuracy can look good on imbalanced classes even for a weak
# model; read it together with the classification report.
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_acc:.4f}, Test Loss: {test_loss:.4f}')

In [0]:
from sklearn.metrics import classification_report

# Predicted probabilities for the test rows; the single sigmoid output unit
# yields one column, i.e. shape (n_samples, 1).
y_pred = model.predict(X_test)

# Flatten to 1-D and threshold at 0.5 to obtain integer class labels (0 or 1),
# so the report receives labels of the same type and shape as y_test.
y_pred_binary = (y_pred.ravel() > 0.5).astype(int)

print(classification_report(y_test, y_pred_binary))

# Threshold-independent ranking quality, more informative than accuracy when
# the classes are imbalanced.
print(f"Test ROC-AUC: {roc_auc_score(y_test, y_pred.ravel()):.4f}")

In [0]:
# Attach the customer id to X_scaled by position (.values ignores the index),
# so predictions can be reported per SK_ID_CURR.
# NOTE(review): this mutates X_scaled in place; re-running the train/test
# split after this cell would feed SK_ID_CURR to the model as a feature.
X_scaled["SK_ID_CURR"] = data_cleaned["SK_ID_CURR"].values

In [0]:
# Score every row of the cleaned data set.
# NOTE(review): this includes the rows the model was trained on, so the
# probabilities for those rows are in-sample estimates.
# errors="ignore" lets the cell work whether or not SK_ID_CURR has been
# attached to X_scaled.
feature_frame = X_scaled.drop(columns=["SK_ID_CURR"], errors="ignore")
predictions = model.predict(feature_frame)
predicted_probabilities = predictions.flatten()

# One row per scored record: customer id plus predicted probability. X_scaled
# is derived from data_cleaned without reordering, so the ids are taken from
# data_cleaned by position.
prediction_df = pd.DataFrame({
    "SK_ID_CURR": data_cleaned["SK_ID_CURR"].to_numpy(),
    "Prediction_Probability_pos_cash": predicted_probabilities
})

# Attach the probabilities to the cleaned data by position. A merge on
# SK_ID_CURR yields the same frame while the ids are unique, but multiplies
# rows as soon as an id occurs more than once.
final_data_with_predictions = data_cleaned.reset_index(drop=True).assign(
    Prediction_Probability_pos_cash=predicted_probabilities
)

In [0]:
# Convert the pandas prediction frame (id + probability) to a Spark DataFrame.
amtf_cc_final_pred_df = spark.createDataFrame(prediction_df)

In [0]:
# Register the predictions as the temp view "amtf_cc_final_pred_df".
amtf_cc_final_pred_df.createOrReplaceTempView("amtf_cc_final_pred_df")

In [0]:
# Read back the predictions produced by this run through the temp view
# registered for them. The metastore table of the same name is only
# (re)created in the notebook's final %sql cell, so reading it here fails on a
# first run and returns the previous run's output on later runs.
df_pred = spark.table("amtf_cc_final_pred_df")

In [0]:
# Re-bind data_raw to the Spark table: from here on the name refers to a Spark
# DataFrame again, not the pandas frame used in the earlier cells.
data_raw = spark.table("hive_metastore.default.credit_card_installment_features_level_3")

In [0]:
from pyspark.sql import functions as F

In [0]:
# Number of prediction rows.
df_pred.count()

In [0]:
# Inner join of the raw Spark table with the predictions on SK_ID_CURR; only
# ids present on both sides are kept.
joined_df = data_raw.join(amtf_cc_final_pred_df, on="SK_ID_CURR", how="inner")


In [0]:
# Row count after the join, for comparison with the prediction row count.
joined_df.count()

In [0]:


# Identifier and model output, copied through unchanged
key_columns = ["SK_ID_CURR", "Prediction_Probability_pos_cash"]

# Engineered credit-card features carried along with the predictions
feature_columns = [
    "amtf_avg_days_installment",
    "amtf_avg_balance",
    "amtf_total_distinct_installments",
    "amtf_max_credit_limit",
    "amtf_avg_completed_status_proportion",
    "amtf_max_credit_utilization_ratio",
    "amtf_total_active_contracts",
    "amtf_deferral_proportion",
    "amtf_overdue_proportion",
]

# Residual = observed label minus predicted probability
residual_column = (
    F.col("TARGET") - F.col("Prediction_Probability_pos_cash")
).alias("residual_cc")

# Output order: keys, residual, then the feature columns
final_df = joined_df.select(*key_columns, residual_column, *feature_columns)

# Print the first rows for verification
final_df.show()


In [0]:
# Register the result as the temp view "final_df" so it can be read from SQL.
final_df.createOrReplaceTempView("final_df")

In [0]:
%sql
-- Persist the final_df temp view as a table, replacing any previous version.
-- The name is qualified with the hive_metastore catalog, matching how the
-- spark.table(...) calls in this notebook address the schema, so the statement
-- does not depend on the session's current catalog.
drop table if exists hive_metastore.default.amtf_cc_final_pred_df;
create table hive_metastore.default.amtf_cc_final_pred_df as
select
  *
from
  final_df;

In [0]:
# Render the final Spark DataFrame in the notebook.
display(final_df)