In [1]:
import os
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pprint
import pickle

import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

import pyspark
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType, BooleanType

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
from sklearn.datasets import make_classification
from sklearn.base import BaseEstimator, TransformerMixin

import xgboost as xgb

# Import CreditProcessor 
from model_train_credit_processor import CreditProcessor

import model_inference

In [2]:
# Build a .py script that takes a snapshot date, loads a model artefact, makes an inference and saves it to the datamart

## set up pyspark session

In [2]:
# Build (or reuse) a local Spark session for the single-snapshot walkthrough
spark = (
    pyspark.sql.SparkSession.builder
    .appName("model_inference")
    .master("local[*]")
    .getOrCreate()
)

# Only surface ERROR-level Spark logs so the notebook output stays readable
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/22 09:17:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## set up config

In [3]:
# Snapshot (month) to score, formatted as YYYY-MM-DD
snapshot_date_str = "2024-01-01"
# File name of the pickled model artefact inside the model bank directory
model_name = "credit_model_2024_09_01.pkl"

In [4]:
# Gather every run parameter in one dict so later cells read from a single place
config = {
    "snapshot_date_str": snapshot_date_str,
    "snapshot_date": datetime.strptime(snapshot_date_str, "%Y-%m-%d"),
    "model_name": model_name,
    "model_bank_directory": "model_bank/",
}
# Full path of the pickled artefact: <model bank directory><model file name>
config["model_artefact_filepath"] = f"{config['model_bank_directory']}{config['model_name']}"

pprint.pprint(config)

{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 1, 1, 0, 0),
 'snapshot_date_str': '2024-01-01'}


## load model artefact from model bank

In [5]:
# Deserialize the model artefact (model + preprocessing transformers) from the model bank.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load artefacts
# that come from a trusted model bank.
artefact_path = config["model_artefact_filepath"]
with open(artefact_path, "rb") as artefact_file:
    model_artefact = pickle.load(artefact_file)

print(f"Model loaded successfully! {artefact_path}")

Model loaded successfully! model_bank/credit_model_2024_09_01.pkl


## load feature store based on specific `snapshot_date_str`

For model inference, the tricky part is the timing:
* Inference happens in the month the loan starts (the loan application date).
* Labels only appear at Month on Book (MOB) = 6, i.e. at `snapshot_date_str` + 6 months.

For example:
* CUS_0x1000 applies for a loan on 2023_05_01. We take their features from that month's snapshot and make an inference.
* The label for CUS_0x1000 only appears on 2023_11_01.

Features from silver attributes table

In [28]:
# Location of the silver attributes table
folder_path = "datamart/silver/attr/"

# File names embed the snapshot date with underscores (YYYY_MM_DD); read only that snapshot
snapshot_suffix = config["snapshot_date_str"].replace("-", "_")
attributes_sdf = spark.read.parquet(f"{folder_path}silver_attr_mthly_{snapshot_suffix}.parquet")

# Keep the join key, the snapshot date and the attribute features used downstream
attributes_cols = ['Customer_ID', 'Age', 'Occupation', 'snapshot_date']
attributes_sdf_subset = attributes_sdf.select(*attributes_cols)
print("attributes row_count:", attributes_sdf_subset.count())
attributes_sdf_subset.show(5)

attributes row_count: 485
+-----------+---+------------+-------------+
|Customer_ID|Age|  Occupation|snapshot_date|
+-----------+---+------------+-------------+
| CUS_0x102d| 31|Entrepreneur|   2024-01-01|
| CUS_0x1051| 42|    Engineer|   2024-01-01|
| CUS_0x1269| 22|     Manager|   2024-01-01|
| CUS_0x1290| 31|   Architect|   2024-01-01|
| CUS_0x12d1| 41|  Accountant|   2024-01-01|
+-----------+---+------------+-------------+
only showing top 5 rows



Features from financials table

In [29]:
# Location of the silver financials table
folder_path = "datamart/silver/fin/"

# File names embed the snapshot date with underscores (YYYY_MM_DD); read only that snapshot
snapshot_suffix = config["snapshot_date_str"].replace("-", "_")
financials_sdf = spark.read.parquet(f"{folder_path}silver_fin_mthly_{snapshot_suffix}.parquet")

# Keep the join key plus the financial features used downstream
financials_cols = [
    'Customer_ID', 'Annual_Income', 'Monthly_Inhand_Salary',
    'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan',
    'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Num_Credit_Inquiries',
    'Credit_Mix', 'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Credit_History_Age',
    'Total_EMI_per_month', 'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
    'Num_Fin_Pdts', 'Loans_per_Credit_Item', 'Debt_to_Salary', 'EMI_to_Salary', 'Repayment_Ability', 'Loan_Extent'
]
financials_sdf_subset = financials_sdf.select(*financials_cols)
print("financials row_count:", financials_sdf_subset.count())
financials_sdf_subset.show(5)

financials row_count: 485
+-----------+-------------+---------------------+-----------------+---------------+-------------+-----------+-------------------+----------------------+--------------------+--------------------+----------+----------------+------------------------+------------------+-------------------+-----------------------+-----------------+---------------+------------+---------------------+--------------------+--------------------+-----------------+-----------+
|Customer_ID|Annual_Income|Monthly_Inhand_Salary|Num_Bank_Accounts|Num_Credit_Card|Interest_Rate|Num_of_Loan|Delay_from_due_date|Num_of_Delayed_Payment|Changed_Credit_Limit|Num_Credit_Inquiries|Credit_Mix|Outstanding_Debt|Credit_Utilization_Ratio|Credit_History_Age|Total_EMI_per_month|Amount_invested_monthly|Payment_Behaviour|Monthly_Balance|Num_Fin_Pdts|Loans_per_Credit_Item|      Debt_to_Salary|       EMI_to_Salary|Repayment_Ability|Loan_Extent|
+-----------+-------------+---------------------+-----------------+---

## preprocess data for modeling

In [17]:
# Combine attribute and financial features into one table keyed by Customer_ID
# (no labels exist yet at inference time). The inner join keeps only customers
# present in both tables, because every feature is needed to score a customer.
attributes_copy = attributes_sdf_subset.select(*attributes_sdf_subset.columns)
merged_df = attributes_copy.join(financials_sdf_subset, on="Customer_ID", how="inner")

# Sanity-check the size of the joined table
merged_row_count = merged_df.count()
print(f"merged_df row_count: {merged_row_count}")

Num of rows: 485


In [18]:
# Bring the joined Spark DataFrame into pandas for the pandas-based preprocessing below.
# The name `merged_df` is reused (later cells read it), so the conversion is guarded:
# without the check, re-running this cell fails because a pandas DataFrame has no
# `.toPandas()` method.
if not isinstance(merged_df, pd.DataFrame):
    merged_df = merged_df.toPandas()
merged_df

Unnamed: 0,Customer_ID,Age,Occupation,snapshot_date,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,...,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Num_Fin_Pdts,Loans_per_Credit_Item,Debt_to_Salary,EMI_to_Salary,Repayment_Ability,Loan_Extent
0,CUS_0x102d,31.0,Entrepreneur,2024-01-01,89064.520,7256.043,5,3,1,1,...,37.573,296.094,4.0,641.937,9,0.111111,0.089342,0.005177,7218.470,6
1,CUS_0x1051,42.0,Engineer,2024-01-01,35022.220,2859.518,3,5,4,1,...,21.215,259.345,0.0,295.392,9,0.111111,0.349741,0.007416,2838.303,6
2,CUS_0x1269,22.0,Manager,2024-01-01,42031.090,3762.591,2,1,8,3,...,66.506,108.856,4.0,450.897,6,0.750000,0.022200,0.017671,3696.085,3
3,CUS_0x1290,31.0,Architect,2024-01-01,10455.875,729.323,6,5,12,3,...,25.748,24.491,1.0,302.693,14,0.250000,0.113867,0.035256,703.575,33
4,CUS_0x12d1,41.0,Accountant,2024-01-01,21384.940,1646.078,4,4,15,2,...,22.096,168.245,1.0,254.267,10,0.222222,0.419853,0.013415,1623.982,48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480,CUS_0xc68d,41.0,Musician,2024-01-01,15497.020,1193.418,8,4,19,2,...,24.167,107.719,1.0,267.456,14,0.153846,0.988398,0.020233,1169.251,38
481,CUS_0xc6d8,34.0,Developer,2024-01-01,120955.920,10021.660,2,7,4,1,...,57.505,241.974,5.0,942.686,10,0.100000,0.036852,0.005737,9964.155,28
482,CUS_0xe8d,26.0,Journalist,2024-01-01,14441.510,1453.459,9,8,26,2,...,21.032,48.390,4.0,325.924,19,0.111111,0.918768,0.014460,1432.427,30
483,CUS_0xf9e,43.0,Teacher,2024-01-01,142475.320,12013.943,8,5,11,2,...,124.400,81.779,5.0,1235.216,15,0.142857,0.052316,0.010354,11889.543,46


In [19]:
# Inspect the merged column set (identifiers + attribute and financial features)
merged_df.columns

Index(['Customer_ID', 'Age', 'Occupation', 'snapshot_date', 'Annual_Income',
       'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card',
       'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date',
       'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'Num_Fin_Pdts', 'Loans_per_Credit_Item', 'Debt_to_Salary',
       'EMI_to_Salary', 'Repayment_Ability', 'Loan_Extent'],
      dtype='object')

In [20]:
# Drop the non-feature columns: Customer_ID (join key) and snapshot_date.
# Both stay available in merged_df for labelling the predictions later.
merged_df_clean = merged_df.drop(columns=['Customer_ID', 'snapshot_date'])

In [21]:
# Confirm only feature columns remain after dropping the identifiers
merged_df_clean.columns

Index(['Age', 'Occupation', 'Annual_Income', 'Monthly_Inhand_Salary',
       'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'Num_Fin_Pdts', 'Loans_per_Credit_Item', 'Debt_to_Salary',
       'EMI_to_Salary', 'Repayment_Ability', 'Loan_Extent'],
      dtype='object')

In [22]:
# Reuse the transformers stored in the model artefact so that inference applies
# the same preprocessing the artefact was saved with
preprocessors = model_artefact['preprocessing_transformers']
transformer_processor = preprocessors['credit_cleaner']
transformer_ohe = preprocessors['one_hot_encoder']
cat_cols = preprocessors['one_hot_encoder_columns']

# Step 1: data cleaning
X_inference = transformer_processor.transform(merged_df_clean)

# Step 2: one-hot encode the categorical columns, keeping the row index of X_inference
X_inference_cat = transformer_ohe.transform(X_inference[cat_cols])
X_inference_cat_df = pd.DataFrame(
    X_inference_cat,
    columns=transformer_ohe.get_feature_names_out(cat_cols),
    index=X_inference.index,
)

# Step 3: append the encoded columns, then remove the raw categorical columns
X_inference_fe = pd.concat([X_inference, X_inference_cat_df], axis=1).drop(columns=cat_cols)

In [23]:
# Row count of the engineered feature matrix (compare with the merged_df row count above)
print('X_inference_fe rows: ', X_inference_fe.shape[0])
X_inference_fe

X_inference_fe rows:  485


Unnamed: 0,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Num_Credit_Inquiries,Outstanding_Debt,Monthly_Balance,...,Credit_Mix_1.0,Credit_Mix_2.0,Credit_Mix_nan,Payment_Behaviour_0.0,Payment_Behaviour_1.0,Payment_Behaviour_2.0,Payment_Behaviour_3.0,Payment_Behaviour_4.0,Payment_Behaviour_5.0,Payment_Behaviour_nan
0,89064.520,7256.043,5,3,1,6,5,26,648.36,641.937,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,35022.220,2859.518,3,5,1,6,11,26,1000.44,295.392,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,42031.090,3762.591,2,1,3,1,5,20,83.55,450.897,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,10455.875,729.323,6,5,3,11,19,0,83.16,302.693,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,21384.940,1646.078,4,4,2,24,15,26,691.53,254.267,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480,15497.020,1193.418,8,4,2,19,14,26,1180.56,267.456,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
481,120955.920,10021.660,2,7,1,28,8,26,369.36,942.686,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
482,14441.510,1453.459,9,8,2,15,17,26,1336.31,325.924,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
483,142475.320,12013.943,8,5,2,23,18,26,628.57,1235.216,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [24]:
# Inspect the final feature columns passed to the model
X_inference_fe.columns

Index(['Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Num_of_Loan', 'Delay_from_due_date',
       'Num_of_Delayed_Payment', 'Num_Credit_Inquiries', 'Outstanding_Debt',
       'Monthly_Balance', 'Num_Fin_Pdts', 'Loans_per_Credit_Item',
       'Debt_to_Salary', 'EMI_to_Salary', 'Repayment_Ability', 'Loan_Extent',
       'Is_Interest_Rate_Low', 'Is_Credit_History_Age_Low', 'Credit_Mix_0.0',
       'Credit_Mix_1.0', 'Credit_Mix_2.0', 'Credit_Mix_nan',
       'Payment_Behaviour_0.0', 'Payment_Behaviour_1.0',
       'Payment_Behaviour_2.0', 'Payment_Behaviour_3.0',
       'Payment_Behaviour_4.0', 'Payment_Behaviour_5.0',
       'Payment_Behaviour_nan'],
      dtype='object')

## model prediction inference

In [25]:
# Retrieve the trained estimator from the loaded artefact
model = model_artefact["model"]

# Predictions are attached to merged_df's identifiers purely by row position, so
# fail loudly if preprocessing dropped or reordered rows instead of silently
# assigning scores to the wrong Customer_ID.
if not X_inference_fe.index.equals(merged_df.index):
    raise ValueError(
        "X_inference_fe rows are not aligned with merged_df; "
        "cannot attach predictions to Customer_ID"
    )

# Take the second column of predict_proba as the score
# (presumably the probability of the positive class - confirm against the training code)
y_inference = model.predict_proba(X_inference_fe)[:, 1]

# Output table: identifiers, the model that produced the score, and the score itself
y_inference_pdf = merged_df[["Customer_ID","snapshot_date"]].copy()
y_inference_pdf["model_name"] = config["model_name"]
y_inference_pdf["model_predictions"] = y_inference
y_inference_pdf

Unnamed: 0,Customer_ID,snapshot_date,model_name,model_predictions
0,CUS_0x102d,2024-01-01,credit_model_2024_09_01.pkl,0.080443
1,CUS_0x1051,2024-01-01,credit_model_2024_09_01.pkl,0.165757
2,CUS_0x1269,2024-01-01,credit_model_2024_09_01.pkl,0.029614
3,CUS_0x1290,2024-01-01,credit_model_2024_09_01.pkl,0.109924
4,CUS_0x12d1,2024-01-01,credit_model_2024_09_01.pkl,0.160403
...,...,...,...,...
480,CUS_0xc68d,2024-01-01,credit_model_2024_09_01.pkl,0.198916
481,CUS_0xc6d8,2024-01-01,credit_model_2024_09_01.pkl,0.325137
482,CUS_0xe8d,2024-01-01,credit_model_2024_09_01.pkl,0.385016
483,CUS_0xf9e,2024-01-01,credit_model_2024_09_01.pkl,0.265649


## save model inference to datamart gold table

In [30]:
# Directory and file names are derived from the model name without its extension.
# os.path.splitext replaces slicing off the last 4 characters, so extensions of any
# length (e.g. ".pickle") are handled; for ".pkl" names the result is unchanged.
# NOTE(review): model_inference.main is not visible here - confirm it derives the
# directory the same way before using non-".pkl" model names.
model_stem = os.path.splitext(config["model_name"])[0]

# create gold datalake
gold_directory = f"datamart/gold/model_predictions/{model_stem}/"
print(gold_directory)

# exist_ok=True keeps the cell re-runnable without a separate existence check
os.makedirs(gold_directory, exist_ok=True)

# save gold table - IRL connect to database to write
partition_name = model_stem + "_preds_" + config["snapshot_date_str"].replace('-','_') + '.parquet'
filepath = gold_directory + partition_name

# Convert pandas df to spark df and write to parquet;
# mode("overwrite") replaces any existing output at this path
spark.createDataFrame(y_inference_pdf).write.mode("overwrite").parquet(filepath)
print('saved to:', filepath)

datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_preds_2024_01_01.parquet


## backfill

In [2]:
# Backfill window: score every monthly snapshot in the two years of data provided,
# regardless of whether labels have appeared yet for that snapshot.

start_date_str = "2023-01-01"
end_date_str = "2024-12-01"

In [3]:
# Re-declare model_name so the backfill can run without executing the
# single-snapshot walkthrough cells above (the import cell is still required).
model_name = "credit_model_2024_09_01.pkl"

In [4]:
# generate list of dates to process
def generate_first_of_month_dates(start_date_str, end_date_str):
    """Return the first day of every month in the window, as "YYYY-MM-DD" strings.

    The sequence starts at the first day of the month containing
    ``start_date_str`` (even when that day precedes ``start_date_str`` itself)
    and includes every month start that is on or before ``end_date_str``.

    Args:
        start_date_str: Window start, formatted "YYYY-MM-DD".
        end_date_str: Window end (inclusive), formatted "YYYY-MM-DD".

    Returns:
        List of month-start date strings in ascending order; empty when the
        first month start is already after ``end_date_str``.

    Raises:
        ValueError: If either argument does not match "YYYY-MM-DD".
    """
    window_start = datetime.strptime(start_date_str, "%Y-%m-%d")
    window_end = datetime.strptime(end_date_str, "%Y-%m-%d")

    month_starts = []
    year, month = window_start.year, window_start.month
    cursor = datetime(year, month, 1)

    while cursor <= window_end:
        month_starts.append(cursor.strftime("%Y-%m-%d"))

        # Advance one calendar month, rolling December over into January of the next year
        year, month = (year + 1, 1) if month == 12 else (year, month + 1)
        cursor = datetime(year, month, 1)

    return month_starts

# Month-start snapshot dates covering the backfill window
dates_str_lst = generate_first_of_month_dates(
    start_date_str=start_date_str,
    end_date_str=end_date_str,
)

In [5]:
# Run the packaged inference job once per monthly snapshot in the backfill window
for backfill_date_str in dates_str_lst:
    print('snapshot_date: ', backfill_date_str)
    model_inference.main(backfill_date_str, model_name)

snapshot_date:  2023-01-01

---starting job---



Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/22 09:59:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 1, 1, 0, 0),
 'snapshot_date_str': '2023-01-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
attributes row_count: 530
financials row_count: 530
merged_df row_count: 530
X_inference_fe rows:  530
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_preds_2023_01_01.parquet
Stopping Spark session...


---completed job---


snapshot_date:  2023-02-01

---starting job---

{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 2, 1, 0, 0),
 'snapshot_date_str': '2023-02-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
attributes row_count: 501
financials row_count: 501
merged_df row_count: 501
X_inference_fe rows:  501
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_preds_2023_02_01.parquet
Stopping Spark session...


---completed job---


snapshot_date:  2023-03-01

---starting job---

{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 3, 1, 0, 0),
 'snapshot_date_str': '2023-03-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
attributes row_count: 506
financials row_count: 506
merged_df row_count: 506
X_inference_fe rows:  506
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_preds_2023_03_01.parquet
Stopping Spark session...


---completed job---


snapshot_date:  2023-04-01

---starting job---

{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 4, 1, 0, 0),
 'snapshot_date_str': '2023-04-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
attributes row_count: 510
financials row_count: 510
merged_df row_count: 510
X_inference_fe rows:  510
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_preds_2023_04_01.parquet
Stopping Spark session...


---completed job---


snapshot_date:  2023-05-01

---starting job---

{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 5, 1, 0, 0),
 'snapshot_date_str': '2023-05-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
attributes row_count: 521
financials row_count: 521
merged_df row_count: 521
X_inference_fe rows:  521
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_preds_2023_05_01.parquet
Stopping Spark session...


---completed job---


snapshot_date:  2023-06-01

---starting job---

{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 6, 1, 0, 0),
 'snapshot_date_str': '2023-06-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
attributes row_count: 517
financials row_count: 517
merged_df row_count: 517
X_inference_fe rows:  517
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_preds_2023_06_01.parquet
Stopping Spark session...


---completed job---




## Check datamart

`model_inference.main()` initialises a Spark session and stops it after processing each snapshot date, so no Spark session is active once the backfill loop above has finished.   

We therefore need to initialise a new SparkSession to inspect the datamart.

In [6]:
# Start a fresh local Spark session for inspecting the predictions datamart
spark = (
    pyspark.sql.SparkSession.builder
    .appName("model_prediction_datamart")
    .master("local[*]")
    .getOrCreate()
)

# Only surface ERROR-level Spark logs so the notebook output stays readable
spark.sparkContext.setLogLevel("ERROR")

In [7]:
# Gold predictions folder for this model
folder_path = "datamart/gold/model_predictions/credit_model_2024_09_01/"

# List every entry in the folder and rebuild each path relative to folder_path
matched_paths = glob.glob(os.path.join(folder_path, '*'))
files_list = [folder_path + os.path.basename(path) for path in matched_paths]

# Read all of the listed outputs together as a single DataFrame
df = spark.read.parquet(*files_list)
print("row_count:", df.count())

df.show()

                                                                                

row_count: 11974
+-----------+-------------+--------------------+-------------------+
|Customer_ID|snapshot_date|          model_name|  model_predictions|
+-----------+-------------+--------------------+-------------------+
| CUS_0xb80a|   2024-08-01|credit_model_2024...| 0.5157557725906372|
| CUS_0xb8b3|   2024-08-01|credit_model_2024...|0.39145877957344055|
| CUS_0xb904|   2024-08-01|credit_model_2024...| 0.5961624383926392|
| CUS_0xb975|   2024-08-01|credit_model_2024...|0.03880820795893669|
| CUS_0xb97b|   2024-08-01|credit_model_2024...|0.17554503679275513|
| CUS_0xba90|   2024-08-01|credit_model_2024...|0.15578925609588623|
| CUS_0xbaf2|   2024-08-01|credit_model_2024...| 0.7179205417633057|
| CUS_0xbb93|   2024-08-01|credit_model_2024...|0.04955011233687401|
| CUS_0xbbc6|   2024-08-01|credit_model_2024...|  0.132219597697258|
| CUS_0xbbda|   2024-08-01|credit_model_2024...|0.08531688153743744|
| CUS_0xbc04|   2024-08-01|credit_model_2024...| 0.5796205401420593|
| CUS_0xbc5b|   2

In [8]:
# Stop the Spark session now that the datamart check is done
spark.stop()