In [1]:
import os
import glob
import pandas as pd
import pickle
#import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

import model_inference


In [2]:
# Build a .py script that takes a snapshot date, loads a model artefact and make an inference and save to datamart

## set up pyspark session

In [2]:
# Initialize SparkSession
spark = pyspark.sql.SparkSession.builder \
    .appName("dev") \
    .master("local[*]") \
    .getOrCreate()

# Set log level to ERROR to hide warnings
spark.sparkContext.setLogLevel("ERROR")

25/06/20 11:26:56 WARN Utils: Your hostname, Baohongs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.169.0.196 instead (on interface en0)
25/06/20 11:26:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/20 11:26:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/06/20 11:26:57 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## set up config

In [3]:
snapshot_date_str = "2017-01-01"
model_name = "reg_2017_12_04.pkl"

In [4]:
config = {}
config["snapshot_date_str"] = snapshot_date_str
config["snapshot_date"] = datetime.strptime(config["snapshot_date_str"], "%Y-%m-%d")
config["model_name"] = model_name
config["model_bank_directory"] = "model_bank/"
config["model_artefact_filepath"] = config["model_bank_directory"] + config["model_name"]

## load model artefact from model bank

In [5]:
# Load the model from the pickle file
with open(config["model_artefact_filepath"], 'rb') as file:
    model_artefact = pickle.load(file)

print("Model loaded successfully! " + config["model_artefact_filepath"])

Model loaded successfully! model_bank/reg_2017_12_04.pkl


## load feature store

In [7]:
from functools import reduce
feature_location = "datamart/gold/feature_store/"
# Load CSV into DataFrame - connect to feature store
files_list = glob.glob(os.path.join(feature_location, '*.parquet'))
features_store_sdf = spark.read.option("header", "true").parquet(*files_list)
print("row_count for features:",features_store_sdf.count(),"\n")

# Filter out NA
rows_with_nulls = features_store_sdf.filter(
    reduce(lambda a, b: a | b, (col(c).isNull() for c in features_store_sdf.columns))
)
order_ids_to_drop = [row["order_id"] for row in rows_with_nulls.select("order_id").distinct().collect()]
features_store_sdf = features_store_sdf.filter(~col("order_id").isin(order_ids_to_drop))

#Extract relevant features
features_store_sdf = features_store_sdf.filter(col("order_status") == "delivered")
features_sdf = features_store_sdf.toPandas()
print("extracted features_sdf", features_sdf.count(), config["snapshot_date"])


                                                                                

row_count for features: 39767 

extracted features_sdf order_id               38657
total_qty              38657
total_price            38657
total_freight_value    38657
total_weight_g         38657
total_volume_cm3       38657
order_status           38657
total_density          38657
dtype: int64 2017-01-01 00:00:00


## preprocess data for modeling

In [10]:
# prepare X_inference
features_pdf = features_sdf.drop(columns=['order_id', 'order_status']).values

# apply transformer - standard scaler
transformer_stdscaler = model_artefact["preprocessing_transformers"]["stdscaler"]
X_inference = transformer_stdscaler.transform(features_pdf)

print('X_inference', X_inference.shape[0])
X_inference

X_inference 38657


array([[-2.82649479e-01,  1.27916498e+00,  3.45926693e-01,
         4.62914007e-03, -4.43825417e-02, -1.03102549e-01],
       [-2.82649479e-01, -1.73277184e-01, -4.02506558e-01,
        -1.97166152e-01,  3.07890357e-02, -3.20065876e-01],
       [-2.82649479e-01,  7.08423448e-01, -4.19609695e-01,
        -4.26478984e-01, -4.44242223e-01, -1.59457018e-01],
       ...,
       [ 1.85660059e+00,  3.23362575e+00,  3.19245800e-01,
         7.24854883e-01,  9.32286220e-01, -1.92938411e-01],
       [-2.82649479e-01, -6.02673677e-01, -7.43885162e-01,
        -4.44824011e-01, -4.95003465e-01,  1.14435169e+00],
       [ 3.99585066e+00,  2.03940951e-03, -1.14683506e+00,
         6.88367331e-02, -1.90231743e-01,  1.84101433e-01]])

## model prediction inference

In [11]:
threshold = model_artefact['threshold']
threshold

0.25

In [14]:
# load model
model = model_artefact["model"]
threshold = model_artefact['threshold']

# predict model
y_inference = model.predict_proba(X_inference)[:, 1]

# prepare output
y_inference_pdf = features_sdf[["order_id", "order_status"]].copy()
y_inference_pdf["model_name"] = config["model_name"]
y_inference_pdf["model_predictions"] = y_inference.round(4)
y_inference_pdf["model_predictions"] = (y_inference_pdf["model_predictions"] > threshold).astype(int)

y_inference_pdf

Unnamed: 0,order_id,order_status,model_name,model_predictions
0,00137e170939bba5a3134e2386413108,delivered,reg_2017_12_04.pkl,1
1,001c85b5f68d2be0cb0797afc9e8ce9a,delivered,reg_2017_12_04.pkl,0
2,00275bce676303c3bfd7292aefdfa223,delivered,reg_2017_12_04.pkl,0
3,0030d783f979fbc5981e75613b057344,delivered,reg_2017_12_04.pkl,1
4,0068468c453d28c8ef3fd089e50a5847,delivered,reg_2017_12_04.pkl,1
...,...,...,...,...
38652,4e36d0170c671378f49f701a810360ae,delivered,reg_2017_12_04.pkl,1
38653,9a3e437ab219133c20a7033be9087edf,delivered,reg_2017_12_04.pkl,1
38654,d809ddde66fee6223df16b11231491f9,delivered,reg_2017_12_04.pkl,1
38655,bd50a7fe9fd97ea4b7663031a319e150,delivered,reg_2017_12_04.pkl,0


## save model inference to datamart gold table

In [15]:
# create bronze datalake
gold_directory = f"datamart/gold/model_predictions/{config['model_name'][:-4]}/"
print(gold_directory)

if not os.path.exists(gold_directory):
    os.makedirs(gold_directory)

# save gold table - IRL connect to database to write
partition_name = config["model_name"][:-4] + "_predictions_" + snapshot_date_str.replace('-','_') + '.parquet'
filepath = gold_directory + partition_name
spark.createDataFrame(y_inference_pdf).write.mode("overwrite").parquet(filepath)
# df.toPandas().to_parquet(filepath,
#           compression='gzip')
print('saved to:', filepath)

datamart/gold/model_predictions/reg_2017_12_04/


[Stage 15:>                                                         (0 + 8) / 8]

saved to: datamart/gold/model_predictions/reg_2017_12_04/reg_2017_12_04_predictions_2017_01_01.parquet


                                                                                

## backfill

In [10]:
# set up config
snapshot_date_str = "2023-01-01"

start_date_str = "2023-01-01"
end_date_str = "2024-12-01"

In [11]:
# generate list of dates to process
def generate_first_of_month_dates(start_date_str, end_date_str):
    # Convert the date strings to datetime objects
    start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
    end_date = datetime.strptime(end_date_str, "%Y-%m-%d")
    
    # List to store the first of month dates
    first_of_month_dates = []

    # Start from the first of the month of the start_date
    current_date = datetime(start_date.year, start_date.month, 1)

    while current_date <= end_date:
        # Append the date in yyyy-mm-dd format
        first_of_month_dates.append(current_date.strftime("%Y-%m-%d"))
        
        # Move to the first of the next month
        if current_date.month == 12:
            current_date = datetime(current_date.year + 1, 1, 1)
        else:
            current_date = datetime(current_date.year, current_date.month + 1, 1)

    return first_of_month_dates

dates_str_lst = generate_first_of_month_dates(start_date_str, end_date_str)


In [12]:
for snapshot_date in dates_str_lst:
    print(snapshot_date)
    model_inference.main(snapshot_date, model_name)

2023-01-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 1, 1, 0, 0),
 'snapshot_date_str': '2023-01-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl


                                                                                

extracted features_sdf 8974 2023-01-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/
saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_01_01.parquet


---completed job---


2023-02-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 2, 1, 0, 0),
 'snapshot_date_str': '2023-02-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-02-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_02_01.parquet


---completed job---


2023-03-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 3, 1, 0, 0),
 'snapshot_date_str': '2023-03-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-03-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_03_01.parquet


---completed job---


2023-04-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 4, 1, 0, 0),
 'snapshot_date_str': '2023-04-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-04-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_04_01.parquet


---completed job---


2023-05-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 5, 1, 0, 0),
 'snapshot_date_str': '2023-05-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-05-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_05_01.parquet


---completed job---


2023-06-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 6, 1, 0, 0),
 'snapshot_date_str': '2023-06-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-06-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_06_01.parquet


---completed job---


2023-07-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 7, 1, 0, 0),
 'snapshot_date_str': '2023-07-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-07-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_07_01.parquet


---completed job---


2023-08-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 8, 1, 0, 0),
 'snapshot_date_str': '2023-08-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-08-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_08_01.parquet


---completed job---


2023-09-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 9, 1, 0, 0),
 'snapshot_date_str': '2023-09-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-09-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_09_01.parquet


---completed job---


2023-10-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 10, 1, 0, 0),
 'snapshot_date_str': '2023-10-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-10-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_10_01.parquet


---completed job---


2023-11-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 11, 1, 0, 0),
 'snapshot_date_str': '2023-11-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-11-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_11_01.parquet


---completed job---


2023-12-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 12, 1, 0, 0),
 'snapshot_date_str': '2023-12-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-12-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_12_01.parquet


---completed job---


2024-01-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 1, 1, 0, 0),
 'snapshot_date_str': '2024-01-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-01-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_01_01.parquet


---completed job---


2024-02-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 2, 1, 0, 0),
 'snapshot_date_str': '2024-02-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-02-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_02_01.parquet


---completed job---


2024-03-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 3, 1, 0, 0),
 'snapshot_date_str': '2024-03-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-03-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_03_01.parquet


---completed job---


2024-04-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 4, 1, 0, 0),
 'snapshot_date_str': '2024-04-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-04-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_04_01.parquet


---completed job---


2024-05-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 5, 1, 0, 0),
 'snapshot_date_str': '2024-05-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-05-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_05_01.parquet


---completed job---


2024-06-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 6, 1, 0, 0),
 'snapshot_date_str': '2024-06-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-06-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_06_01.parquet


---completed job---


2024-07-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 7, 1, 0, 0),
 'snapshot_date_str': '2024-07-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-07-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_07_01.parquet


---completed job---


2024-08-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 8, 1, 0, 0),
 'snapshot_date_str': '2024-08-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-08-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_08_01.parquet


---completed job---


2024-09-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 9, 1, 0, 0),
 'snapshot_date_str': '2024-09-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-09-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_09_01.parquet


---completed job---


2024-10-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 10, 1, 0, 0),
 'snapshot_date_str': '2024-10-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl


                                                                                

extracted features_sdf 8974 2024-10-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_10_01.parquet


---completed job---


2024-11-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 11, 1, 0, 0),
 'snapshot_date_str': '2024-11-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-11-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_11_01.parquet


---completed job---


2024-12-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 12, 1, 0, 0),
 'snapshot_date_str': '2024-12-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2024-12-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2024_12_01.parquet


---completed job---




## Check datamart

In [29]:
# Initialize SparkSession
spark = pyspark.sql.SparkSession.builder \
    .appName("dev") \
    .master("local[*]") \
    .getOrCreate()

# Set log level to ERROR to hide warnings
spark.sparkContext.setLogLevel("ERROR")

In [31]:
folder_path = "scripts/datamart/gold/model_predictions/credit_model_2024_06_01/"
files_list = [folder_path+os.path.basename(f) for f in glob.glob(os.path.join(folder_path, '*'))]
df = spark.read.option("header", "true").parquet(*files_list)
print("row_count:",df.count())

df.show()

row_count: 485
+-----------+-------------+--------------------+--------------------+
|customer_id|snapshot_date|          model_name|   model_predictions|
+-----------+-------------+--------------------+--------------------+
|  CUS_0x405|   2024-01-01|credit_model_2024...| 0.15490520000457764|
| CUS_0x4655|   2024-01-01|credit_model_2024...| 0.07329188287258148|
| CUS_0x4953|   2024-01-01|credit_model_2024...|  0.7080382704734802|
| CUS_0x4a0d|   2024-01-01|credit_model_2024...| 0.05151388421654701|
|  CUS_0x4d3|   2024-01-01|credit_model_2024...|  0.8074637651443481|
| CUS_0x4d40|   2024-01-01|credit_model_2024...|  0.6069353818893433|
| CUS_0x50e9|   2024-01-01|credit_model_2024...|  0.2434036284685135|
| CUS_0x54d7|   2024-01-01|credit_model_2024...|0.033143628388643265|
| CUS_0x57e5|   2024-01-01|credit_model_2024...| 0.21770469844341278|
| CUS_0x58cd|   2024-01-01|credit_model_2024...| 0.21139536798000336|
| CUS_0x5e61|   2024-01-01|credit_model_2024...|0.048095766454935074|
| CUS