In [31]:
# Show which XGBoost version this kernel has installed.
import xgboost as xgb

print(xgb.__version__)

1.6.2


In [1]:
import os
import glob
import pandas as pd
import pickle
#import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

import model_inference


In [2]:
# Build a .py script that takes a snapshot date, loads a model artefact, makes an inference and saves it to the datamart

## set up pyspark session

In [14]:
# Start (or reuse) a Spark session named "dev" on the local master
spark = (
    pyspark.sql.SparkSession.builder
    .appName("dev")
    .master("local[*]")
    .getOrCreate()
)

# Only let ERROR-level Spark log messages through, hiding warnings
spark.sparkContext.setLogLevel("ERROR")

## set up config

In [21]:
# Snapshot date to score, as a "YYYY-MM-DD" string (parsed with "%Y-%m-%d" when the config dict is built)
snapshot_date_str = "2016-10-10"
# File name of the pickled model artefact; it is appended to the model bank directory in the config dict
model_name = "reg_2017_12_04.pkl"

In [22]:
# Gather the run settings in one dict: the snapshot date (raw string and parsed
# datetime), the model file name and the folder the model artefacts live in.
config = {
    "snapshot_date_str": snapshot_date_str,
    "snapshot_date": datetime.strptime(snapshot_date_str, "%Y-%m-%d"),
    "model_name": model_name,
    "model_bank_directory": "model_bank/",
}
# Full path of the artefact = model bank folder + model file name
config["model_artefact_filepath"] = config["model_bank_directory"] + config["model_name"]

## load model artefact from model bank

In [23]:
# Deserialize the model artefact from the path recorded in the config.
# NOTE(review): pickle.load can execute arbitrary code while loading, so only
# trusted files should ever be placed in the model bank.
artefact_path = config["model_artefact_filepath"]
with open(artefact_path, 'rb') as artefact_file:
    model_artefact = pickle.load(artefact_file)

print("Model loaded successfully! " + artefact_path)

Model loaded successfully! model_bank/reg_2017_12_04.pkl


## load feature store

In [28]:
from functools import reduce

feature_location = "datamart/gold/feature_store/"

# Connect to the feature store: read every parquet file found in the folder.
# (Parquet files carry their own schema, so no CSV-style "header" option is needed.)
files_list = glob.glob(os.path.join(feature_location, '*.parquet'))
if not files_list:
    # Fail early with a clear message instead of handing Spark an empty path list
    raise FileNotFoundError(f"no parquet files found in {feature_location}")
features_store_sdf = spark.read.parquet(*files_list)
print("row_count for features:", features_store_sdf.count(), "\n")

# Remove these columns before the null check.
# NOTE(review): the reason for excluding them is not recorded here — presumably
# they are not model inputs; confirm against the training pipeline.
features_store_sdf = features_store_sdf.drop(
    "concentration", "act_days_to_deliver", "total_freight_value", "avg_processing_time",
    "same_state", "total_volume_cm3", "seller_city", "seller_state",
)

# Filter out NA: find the rows that have a null in any remaining column ...
rows_with_nulls = features_store_sdf.filter(
    reduce(lambda a, b: a | b, (col(c).isNull() for c in features_store_sdf.columns))
)
# ... and drop every row belonging to one of those orders, not only the incomplete rows
order_ids_to_drop = [row["order_id"] for row in rows_with_nulls.select("order_id").distinct().collect()]
features_store_sdf = features_store_sdf.filter(~col("order_id").isin(order_ids_to_drop))

# Extract relevant features: delivered orders only
# NOTE(review): the data is not filtered by config["snapshot_date"]; the date is
# only printed below — confirm whether a snapshot filter is required.
features_store_sdf = features_store_sdf.filter(col("order_status") == "delivered")
# Despite the "_sdf" suffix this is a pandas DataFrame from here on
features_sdf = features_store_sdf.toPandas()
# len() is the row count; DataFrame.count() would print a per-column Series instead
print("extracted features_sdf", len(features_sdf), config["snapshot_date"])

features_sdf

row_count for features: 39755 

extracted features_sdf order_id             38474
order_status         38474
total_qty            38474
total_price          38474
total_weight_g       38474
total_density        38474
delivery_distance    38474
same_city            38474
is_weekend           38474
avg_rating           38474
avg_delay_rate       38474
day_of_week          38474
season               38474
dtype: int64 2016-10-10 00:00:00


Unnamed: 0,order_id,order_status,total_qty,total_price,total_weight_g,total_density,delivery_distance,same_city,is_weekend,avg_rating,avg_delay_rate,day_of_week,season
0,00137e170939bba5a3134e2386413108,delivered,1,397.00,2650.0,0.147222,641.981369,0,0,4.000000,1.000000,6,Spring
1,001c85b5f68d2be0cb0797afc9e8ce9a,delivered,1,99.00,1550.0,0.074007,309.811008,0,0,2.526316,0.717949,6,Spring
2,00275bce676303c3bfd7292aefdfa223,delivered,1,279.90,300.0,0.128205,358.851160,0,0,5.000000,0.000000,6,Spring
3,0030d783f979fbc5981e75613b057344,delivered,1,60.60,1000.0,0.031888,694.222979,0,0,3.750000,0.600000,6,Spring
4,0035c0b07126fe9c24a325216fb96064,delivered,1,131.90,2275.0,0.112346,183.726855,0,0,2.400000,0.400000,6,Spring
...,...,...,...,...,...,...,...,...,...,...,...,...,...
38469,89898719573722b59b173f76fb213501,delivered,1,34.90,16300.0,0.603704,109.956042,0,1,1.000000,1.000000,7,Summer
38470,9a3e437ab219133c20a7033be9087edf,delivered,1,149.00,300.0,0.110947,595.871101,0,1,5.000000,0.000000,7,Summer
38471,d809ddde66fee6223df16b11231491f9,delivered,2,798.00,6576.0,0.116907,604.672600,0,1,1.000000,1.000000,7,Summer
38472,bd50a7fe9fd97ea4b7663031a319e150,delivered,1,10.90,200.0,0.568182,10.536861,1,0,5.000000,0.000000,6,Spring


Processing for modelling

In [29]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode `season` into season_* indicator columns.
# The categories are pinned so the resulting columns do not depend on which seasons
# happen to be present in the data being scored; fitting the encoder on the inference
# data alone would let the feature layout change from one run to the next.
# NOTE(review): this list mirrors the columns the cell produced for the current
# feature store — confirm it matches the encoding used when the model was trained.
SEASON_CATEGORIES = ["Autumn", "Spring", "Summer", "Winter"]

# Guard on the column so re-running the cell does not raise a KeyError once
# `season` has already been replaced by its indicator columns.
if "season" in features_sdf.columns:
    encoder = OneHotEncoder(
        categories=[SEASON_CATEGORIES], sparse_output=False, handle_unknown='ignore'
    )
    encoded_feature = encoder.fit_transform(features_sdf[['season']])
    encoded_f = pd.DataFrame(
        encoded_feature,
        columns=encoder.get_feature_names_out(['season']),
        index=features_sdf.index,
    )
    features_sdf = pd.concat([features_sdf.drop(columns=['season']), encoded_f], axis=1)

features_sdf

Unnamed: 0,order_id,order_status,total_qty,total_price,total_weight_g,total_density,delivery_distance,same_city,is_weekend,avg_rating,avg_delay_rate,day_of_week,season_Autumn,season_Spring,season_Summer,season_Winter
0,00137e170939bba5a3134e2386413108,delivered,1,397.00,2650.0,0.147222,641.981369,0,0,4.000000,1.000000,6,0.0,1.0,0.0,0.0
1,001c85b5f68d2be0cb0797afc9e8ce9a,delivered,1,99.00,1550.0,0.074007,309.811008,0,0,2.526316,0.717949,6,0.0,1.0,0.0,0.0
2,00275bce676303c3bfd7292aefdfa223,delivered,1,279.90,300.0,0.128205,358.851160,0,0,5.000000,0.000000,6,0.0,1.0,0.0,0.0
3,0030d783f979fbc5981e75613b057344,delivered,1,60.60,1000.0,0.031888,694.222979,0,0,3.750000,0.600000,6,0.0,1.0,0.0,0.0
4,0035c0b07126fe9c24a325216fb96064,delivered,1,131.90,2275.0,0.112346,183.726855,0,0,2.400000,0.400000,6,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38469,89898719573722b59b173f76fb213501,delivered,1,34.90,16300.0,0.603704,109.956042,0,1,1.000000,1.000000,7,0.0,0.0,1.0,0.0
38470,9a3e437ab219133c20a7033be9087edf,delivered,1,149.00,300.0,0.110947,595.871101,0,1,5.000000,0.000000,7,0.0,0.0,1.0,0.0
38471,d809ddde66fee6223df16b11231491f9,delivered,2,798.00,6576.0,0.116907,604.672600,0,1,1.000000,1.000000,7,0.0,0.0,1.0,0.0
38472,bd50a7fe9fd97ea4b7663031a319e150,delivered,1,10.90,200.0,0.568182,10.536861,1,0,5.000000,0.000000,6,0.0,1.0,0.0,0.0


In [30]:
from pyspark.sql.types import NumericType  # NOTE(review): not used in this cell
if features_sdf.empty:
    # Nothing to score: build an empty prediction frame with the output columns.
    # .copy() gives an independent frame, so the assignments below do not write
    # into a slice of features_sdf (which triggers SettingWithCopyWarning).
    y_inference_pdf = features_sdf[['order_id', 'order_status']].copy()
    y_inference_pdf['model_name'] = ''
    y_inference_pdf['model_predictions'] = ''
    print('y_inference', y_inference_pdf.shape[0])
else:
    # prepare X_inference: keep only the numeric columns as model inputs
    # NOTE(review): assumes these columns match, in number and order, the ones the
    # scaler was fitted on — confirm against the training pipeline.
    features_pdf = features_sdf.select_dtypes(include='number')
    # apply transformer - standard scaler stored with the model artefact
    transformer_stdscaler = model_artefact["preprocessing_transformers"]["stdscaler"]
    X_inference = transformer_stdscaler.transform(features_pdf)
    print('X_inference', X_inference.shape[0])


X_inference 38474




## model prediction inference

In [20]:
# Decision threshold stored alongside the model in the artefact; shown here for reference
threshold = model_artefact['threshold']
threshold

0.25

In [21]:
# load model and its decision threshold from the artefact
model = model_artefact["model"]
threshold = model_artefact['threshold']

# predict model: take column 1 of predict_proba (presumably the positive class)
y_inference = model.predict_proba(X_inference)[:, 1]

# prepare output
y_inference_pdf = features_sdf[["order_id", "order_status"]].copy()
y_inference_pdf["model_name"] = config["model_name"]
# Threshold the raw probabilities. Rounding to 4 decimals first would pull scores
# that sit just above the threshold down onto it and label them 0.
y_inference_pdf["model_predictions"] = (y_inference > threshold).astype(int)

y_inference_pdf

Unnamed: 0,order_id,order_status,model_name,model_predictions
0,03128233e78ed8ade6738f2043f4cf8d,delivered,reg_2017_12_04.pkl,1
1,0a0837a5eee9e7a9ce2b1fa831944d27,delivered,reg_2017_12_04.pkl,0
2,1c4deb98d216d2fda120204a5fcfb57d,delivered,reg_2017_12_04.pkl,1
3,1ff217aa612f6cd7c4255c9bfe931c8b,delivered,reg_2017_12_04.pkl,0
4,22613579f7d11cc59c4347526fc3c79e,delivered,reg_2017_12_04.pkl,1
...,...,...,...,...
261,be5bc2f0da14d8071e2d45451ad119d9,delivered,reg_2017_12_04.pkl,1
262,cd3b8574c82b42fc8129f6d502690c3e,delivered,reg_2017_12_04.pkl,0
263,d207cc272675637bfed0062edffd0818,delivered,reg_2017_12_04.pkl,0
264,ef1b29b591d31d57c0d7337460dd83c9,delivered,reg_2017_12_04.pkl,1


## save model inference to datamart gold table

In [22]:
# create the gold prediction folder for this model (named after the artefact file, minus its extension)
model_stem = os.path.splitext(config["model_name"])[0]
gold_directory = f"datamart/gold/model_predictions/{model_stem}/"
print(gold_directory)

# exist_ok avoids the check-then-create race and keeps the cell re-runnable
os.makedirs(gold_directory, exist_ok=True)

# save gold table - IRL connect to database to write
# The snapshot date comes from config so the file name always matches the run that
# produced the predictions, even if the notebook-level snapshot_date_str is rebound.
partition_name = model_stem + "_predictions_" + config["snapshot_date_str"].replace('-', '_') + '.parquet'
filepath = gold_directory + partition_name
spark.createDataFrame(y_inference_pdf).write.mode("overwrite").parquet(filepath)
print('saved to:', filepath)

datamart/gold/model_predictions/reg_2017_12_04/


[Stage 12:>                                                         (0 + 8) / 8]

saved to: datamart/gold/model_predictions/reg_2017_12_04/reg_2017_12_04_predictions_2016_10_10.parquet


                                                                                

## backfill

In [10]:
# set up config
# NOTE(review): this rebinds the notebook-level snapshot_date_str assigned in the
# single-snapshot config cell; the backfill loop iterates over its own list of dates.
snapshot_date_str = "2023-01-01"

# Inclusive bounds ("YYYY-MM-DD") of the backfill range passed to generate_first_of_month_dates
start_date_str = "2023-01-01"
end_date_str = "2024-12-01"

In [11]:
# generate list of dates to process
def generate_first_of_month_dates(start_date_str, end_date_str):
    """Return the first day of every month between two dates, as strings.

    Args:
        start_date_str: Range start in "YYYY-MM-DD" form; its month is always
            the first candidate, whatever the day component is.
        end_date_str: Range end in "YYYY-MM-DD" form (inclusive).

    Returns:
        List of "YYYY-MM-DD" strings, one per month start that falls on or
        before the end date, in chronological order. Empty when the start
        month begins after the end date.

    Raises:
        ValueError: If either argument does not match "%Y-%m-%d".
    """
    date_format = "%Y-%m-%d"
    range_start = datetime.strptime(start_date_str, date_format)
    range_end = datetime.strptime(end_date_str, date_format)

    month_starts = []
    year, month = range_start.year, range_start.month
    while True:
        month_start = datetime(year, month, 1)
        if month_start > range_end:
            break
        month_starts.append(month_start.strftime(date_format))
        # Advance one calendar month, rolling December over into January
        year, month = (year + 1, 1) if month == 12 else (year, month + 1)

    return month_starts

# First-of-month dates from start_date_str to end_date_str, as "YYYY-MM-DD" strings
dates_str_lst = generate_first_of_month_dates(start_date_str, end_date_str)


In [12]:
# Backfill: call model_inference.main once per monthly snapshot date string.
# NOTE(review): model_name is the notebook-level value from the config cell; the
# recorded output below reports a different model file, so it looks stale —
# re-run from a fresh kernel to confirm which model is actually used.
for snapshot_date in dates_str_lst:
    print(snapshot_date)
    model_inference.main(snapshot_date, model_name)

2023-01-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 1, 1, 0, 0),
 'snapshot_date_str': '2023-01-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl


                                                                                

extracted features_sdf 8974 2023-01-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/
saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_01_01.parquet


---completed job---


2023-02-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 2, 1, 0, 0),
 'snapshot_date_str': '2023-02-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-02-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_02_01.parquet


---completed job---


2023-03-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 3, 1, 0, 0),
 'snapshot_date_str': '2023-03-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-03-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_03_01.parquet


---completed job---


2023-04-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 4, 1, 0, 0),
 'snapshot_date_str': '2023-04-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-04-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_04_01.parquet


---completed job---


2023-05-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 5, 1, 0, 0),
 'snapshot_date_str': '2023-05-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-05-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_05_01.parquet


---completed job---


2023-06-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 6, 1, 0, 0),
 'snapshot_date_str': '2023-06-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-06-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_06_01.parquet


---completed job---


2023-07-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 7, 1, 0, 0),
 'snapshot_date_str': '2023-07-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-07-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_07_01.parquet


---completed job---


2023-08-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 8, 1, 0, 0),
 'snapshot_date_str': '2023-08-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-08-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_08_01.parquet


---completed job---


2023-09-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 9, 1, 0, 0),
 'snapshot_date_str': '2023-09-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-09-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_09_01.parquet


---completed job---


2023-10-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 10, 1, 0, 0),
 'snapshot_date_str': '2023-10-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-10-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_10_01.parquet


---completed job---


2023-11-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 11, 1, 0, 0),
 'snapshot_date_str': '2023-11-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-11-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_11_01.parquet


---completed job---


2023-12-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 12, 1, 0, 0),
 'snapshot_date_str': '2023-12-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-12-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_12_01.parquet


---completed job---


2024-01-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 1, 1, 0, 0),
 'snapshot_date_str': '2024-01-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-01-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_01_01.parquet


---completed job---


2024-02-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 2, 1, 0, 0),
 'snapshot_date_str': '2024-02-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-02-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_02_01.parquet


---completed job---


2024-03-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 3, 1, 0, 0),
 'snapshot_date_str': '2024-03-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-03-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_03_01.parquet


---completed job---


2024-04-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 4, 1, 0, 0),
 'snapshot_date_str': '2024-04-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-04-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_04_01.parquet


---completed job---


2024-05-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 5, 1, 0, 0),
 'snapshot_date_str': '2024-05-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-05-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_05_01.parquet


---completed job---


2024-06-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 6, 1, 0, 0),
 'snapshot_date_str': '2024-06-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-06-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_06_01.parquet


---completed job---


2024-07-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 7, 1, 0, 0),
 'snapshot_date_str': '2024-07-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-07-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_07_01.parquet


---completed job---


2024-08-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 8, 1, 0, 0),
 'snapshot_date_str': '2024-08-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-08-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_08_01.parquet


---completed job---


2024-09-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 9, 1, 0, 0),
 'snapshot_date_str': '2024-09-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-09-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_09_01.parquet


---completed job---


2024-10-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 10, 1, 0, 0),
 'snapshot_date_str': '2024-10-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl


                                                                                

extracted features_sdf 8974 2024-10-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_10_01.parquet


---completed job---


2024-11-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 11, 1, 0, 0),
 'snapshot_date_str': '2024-11-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-11-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_11_01.parquet


---completed job---


2024-12-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 12, 1, 0, 0),
 'snapshot_date_str': '2024-12-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-12-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_12_01.parquet


---completed job---




## Check datamart

In [29]:
# Get the Spark session for the datamart check; getOrCreate() reuses the
# running "dev" session if there is one instead of starting another
spark = (
    pyspark.sql.SparkSession.builder
    .appName("dev")
    .master("local[*]")
    .getOrCreate()
)

# Only let ERROR-level Spark log messages through, hiding warnings
spark.sparkContext.setLogLevel("ERROR")

In [31]:
# NOTE(review): this folder differs from the "datamart/gold/model_predictions/<model>/"
# location the save cell and the backfill output write to — confirm it is the
# datamart that should be inspected.
folder_path = "scripts/datamart/gold/model_predictions/credit_model_2024_06_01/"
# glob already returns each match prefixed with folder_path, so the paths can go to Spark as-is
files_list = glob.glob(os.path.join(folder_path, '*'))
# Parquet files carry their own schema; the CSV-style "header" option is not needed
df = spark.read.parquet(*files_list)
print("row_count:", df.count())

df.show()

row_count: 485
+-----------+-------------+--------------------+--------------------+
|customer_id|snapshot_date|          model_name|   model_predictions|
+-----------+-------------+--------------------+--------------------+
|  CUS_0x405|   2024-01-01|credit_model_2024...| 0.15490520000457764|
| CUS_0x4655|   2024-01-01|credit_model_2024...| 0.07329188287258148|
| CUS_0x4953|   2024-01-01|credit_model_2024...|  0.7080382704734802|
| CUS_0x4a0d|   2024-01-01|credit_model_2024...| 0.05151388421654701|
|  CUS_0x4d3|   2024-01-01|credit_model_2024...|  0.8074637651443481|
| CUS_0x4d40|   2024-01-01|credit_model_2024...|  0.6069353818893433|
| CUS_0x50e9|   2024-01-01|credit_model_2024...|  0.2434036284685135|
| CUS_0x54d7|   2024-01-01|credit_model_2024...|0.033143628388643265|
| CUS_0x57e5|   2024-01-01|credit_model_2024...| 0.21770469844341278|
| CUS_0x58cd|   2024-01-01|credit_model_2024...| 0.21139536798000336|
| CUS_0x5e61|   2024-01-01|credit_model_2024...|0.048095766454935074|
| CUS