In [None]:
import subprocess
import re
from datetime import datetime, timedelta
import pandas as pd

# Benchmark window; both endpoints are included in date_list.
start_date_str = '2020-01-01'
end_date_str = '2020-02-01'

start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
end_date = datetime.strptime(end_date_str, '%Y-%m-%d')

# One 'YYYY-MM-DD' string per calendar day from start_date through end_date.
# An end date earlier than the start date yields an empty list.
date_list = [
    (start_date + timedelta(days=day_offset)).strftime('%Y-%m-%d')
    for day_offset in range((end_date - start_date).days + 1)
]


**DIM_CUSTOMER**

In [None]:
# daily snapshot
# Run `dbt build` for dim_customer_daily_snapshot once per pair of consecutive
# days in date_list, and record the row count and runtime that dbt prints in
# its SUCCESS status line.
rows = []

# Matches dbt's green status marker, capturing (rows affected, seconds).
success_pattern = re.compile(r"\[\x1b\[32mSUCCESS (\d+)\x1b\[0m in (\d+\.\d+)s\]")

for window_start, window_end in zip(date_list, date_list[1:]):
    result = subprocess.run(
        [
            'dbt',
            'build',
            '--select',
            'dim_customer_daily_snapshot',
            '--vars',
            f'{{start_date: {window_start}, end_date: {window_end}}}',
        ],
        # The command is an argv list, so it must not go through a shell:
        # with shell=True on POSIX only 'dbt' would run and every other item
        # would be handed to the shell itself instead of to dbt.
        text=True,
        stdout=subprocess.PIPE,
        cwd='./dim_model_scd_test',
    )

    stats = success_pattern.search(result.stdout)
    if stats is None:
        # Surface dbt's own output instead of failing on a None subscript.
        raise RuntimeError(
            f'dbt did not report SUCCESS for {window_start} -> {window_end} '
            f'(exit code {result.returncode}):\n{result.stdout}'
        )

    row_count = int(stats[1])
    seconds = float(stats[2])
    log = {'start_date': window_start, 'end_date': window_end, 'rows_affected': row_count, 'seconds': seconds}

    rows.append(log)


In [None]:
# Tabulate the per-run stats gathered in `rows` and write them to CSV
# (without the index column).
dim_customer_daily_snapshot_stats = pd.DataFrame(data=rows)
dim_customer_daily_snapshot_stats.to_csv(
    path_or_buf='dim_customer_daily_snapshot_stats_small.csv',
    index=False,
)

In [None]:
# SCD type 2
# Run `dbt build` for dim_customer_scd once per pair of consecutive days in
# date_list, and record the row count and runtime that dbt prints in its
# SUCCESS status line.
rows = []

# Matches dbt's green status marker, capturing (rows affected, seconds).
success_pattern = re.compile(r"\[\x1b\[32mSUCCESS (\d+)\x1b\[0m in (\d+\.\d+)s\]")

for window_start, window_end in zip(date_list, date_list[1:]):
    result = subprocess.run(
        [
            'dbt',
            'build',
            '--select',
            'dim_customer_scd',
            '--vars',
            f'{{start_date: {window_start}, end_date: {window_end}}}',
        ],
        # The command is an argv list, so it must not go through a shell:
        # with shell=True on POSIX only 'dbt' would run and every other item
        # would be handed to the shell itself instead of to dbt.
        text=True,
        stdout=subprocess.PIPE,
        cwd='./dim_model_scd_test',
    )

    stats = success_pattern.search(result.stdout)
    if stats is None:
        # Surface dbt's own output instead of failing on a None subscript.
        raise RuntimeError(
            f'dbt did not report SUCCESS for {window_start} -> {window_end} '
            f'(exit code {result.returncode}):\n{result.stdout}'
        )

    row_count = int(stats[1])
    seconds = float(stats[2])
    log = {'start_date': window_start, 'end_date': window_end, 'rows_affected': row_count, 'seconds': seconds}

    rows.append(log)


In [None]:
# Tabulate the per-run stats gathered in `rows` and write them to CSV
# (without the index column).
dim_customer_scd_2 = pd.DataFrame(data=rows)
dim_customer_scd_2.to_csv(
    path_or_buf='dim_customer_scd_2_stats_small.csv',
    index=False,
)

**DIM_PRODUCT**

In [None]:
# daily snapshot
# Run `dbt build` for dim_product_daily_snapshot once per pair of consecutive
# days in date_list, and record (and print) the row count and runtime that dbt
# prints in its SUCCESS status line.
rows = []

# Matches dbt's green status marker, capturing (rows affected, seconds).
success_pattern = re.compile(r"\[\x1b\[32mSUCCESS (\d+)\x1b\[0m in (\d+\.\d+)s\]")

for window_start, window_end in zip(date_list, date_list[1:]):
    result = subprocess.run(
        [
            'dbt',
            'build',
            '--select',
            'dim_product_daily_snapshot',
            '--vars',
            f'{{start_date: {window_start}, end_date: {window_end}}}',
        ],
        # The command is an argv list, so it must not go through a shell:
        # with shell=True on POSIX only 'dbt' would run and every other item
        # would be handed to the shell itself instead of to dbt.
        text=True,
        stdout=subprocess.PIPE,
        cwd='./dim_model_scd_test',
    )

    stats = success_pattern.search(result.stdout)
    if stats is None:
        # Surface dbt's own output instead of failing on a None subscript.
        raise RuntimeError(
            f'dbt did not report SUCCESS for {window_start} -> {window_end} '
            f'(exit code {result.returncode}):\n{result.stdout}'
        )

    row_count = int(stats[1])
    seconds = float(stats[2])
    log = {'start_date': window_start, 'end_date': window_end, 'rows_affected': row_count, 'seconds': seconds}
    print(log)
    rows.append(log)


__________

In [19]:
import snowflake.connector
import os
import json

# Open a Snowflake session with credentials read from environment variables.
conn = snowflake.connector.connect(
    user=os.getenv('SNOWFLAKE_USER'),
    password=os.getenv('SNOWFLAKE_PW'),
    account=os.getenv('SNOWFLAKE_AC'),
)

# Two statements: clustering information for the daily snapshot table first,
# then for the SCD table.
query = """

    SELECT SYSTEM$CLUSTERING_INFORMATION('PRACTICE.DIMENSIONAL_MODELS.DIM_CUSTOMER_DAILY_SNAPSHOT', '(RECORD_DATE)');

    SELECT SYSTEM$CLUSTERING_INFORMATION('PRACTICE.DIMENSIONAL_MODELS.DIM_CUSTOMER_SCD', '(DBT_VALID_FROM, DBT_VALID_TO)');
"""

try:
    cursor_list = conn.execute_string(query)

    # Each result is read from the first cell of its cursor and parsed as JSON.
    # The last cursor belongs to the SCD statement, the one before it to the
    # daily snapshot statement.
    dim_customer_scd_cluster_stats = json.loads(cursor_list[-1].fetch_pandas_all().iloc[0,0])
    dim_customer_daily_snapshot_cluster_stats = json.loads(cursor_list[-2].fetch_pandas_all().iloc[0,0])
finally:
    # Always release the session, even if a statement or fetch fails.
    conn.close()

In [20]:
# Display the parsed clustering information for DIM_CUSTOMER_SCD.
dim_customer_scd_cluster_stats

{'cluster_by_keys': 'LINEAR(DBT_VALID_FROM, DBT_VALID_TO)',
 'notes': 'Clustering key columns contain high cardinality key DBT_VALID_FROM which might result in expensive re-clustering. Please refer to https://docs.snowflake.net/manuals/user-guide/tables-clustering-keys.html for more information.',
 'total_partition_count': 6,
 'total_constant_partition_count': 0,
 'average_overlaps': 5.0,
 'average_depth': 6.0,
 'partition_depth_histogram': {'00000': 0,
  '00001': 0,
  '00002': 0,
  '00003': 0,
  '00004': 0,
  '00005': 0,
  '00006': 6,
  '00007': 0,
  '00008': 0,
  '00009': 0,
  '00010': 0,
  '00011': 0,
  '00012': 0,
  '00013': 0,
  '00014': 0,
  '00015': 0,
  '00016': 0},
 'clustering_errors': []}

In [21]:
# Display the parsed clustering information for DIM_CUSTOMER_DAILY_SNAPSHOT.
dim_customer_daily_snapshot_cluster_stats

{'cluster_by_keys': 'LINEAR(RECORD_DATE)',
 'total_partition_count': 199,
 'total_constant_partition_count': 195,
 'average_overlaps': 0.0201,
 'average_depth': 1.0151,
 'partition_depth_histogram': {'00000': 0,
  '00001': 196,
  '00002': 3,
  '00003': 0,
  '00004': 0,
  '00005': 0,
  '00006': 0,
  '00007': 0,
  '00008': 0,
  '00009': 0,
  '00010': 0,
  '00011': 0,
  '00012': 0,
  '00013': 0,
  '00014': 0,
  '00015': 0,
  '00016': 0},
 'clustering_errors': []}

_________

In [1]:
import pandas as pd

# Load the saved per-run stats for both customer dimension variants.
scd_stats_csv = 'dim_customer_scd_2_stats_small.csv'
snapshot_stats_csv = 'dim_customer_daily_snapshot_stats_small.csv'

dim_customer_daily_snapshot_stats = pd.read_csv(snapshot_stats_csv)
dim_customer_scd_2 = pd.read_csv(scd_stats_csv)

In [2]:
# Keep only the runs whose start_date is after 2020-01-01 in both frames.
dim_customer_daily_snapshot_stats = dim_customer_daily_snapshot_stats.loc[
    dim_customer_daily_snapshot_stats['start_date'].gt('2020-01-01')
]
dim_customer_scd_2 = dim_customer_scd_2.loc[
    dim_customer_scd_2['start_date'].gt('2020-01-01')
]

In [3]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Plot rows affected per run for both models on a shared axis, show the
# figure, and export it as a PNG.
fig = make_subplots()

for trace_name, stats_frame in (
    ('SCD Type 2', dim_customer_scd_2),
    ('Daily snapshot', dim_customer_daily_snapshot_stats),
):
    fig.add_trace(
        go.Scatter(
            x=stats_frame['start_date'],
            y=stats_frame['rows_affected'],
            mode='lines+markers',
            name=trace_name,
        ),
    )

fig.update_layout(xaxis_title="Date", yaxis_title='Rows Affected')

fig.show()
fig.write_image('row_count.png')

In [4]:
# Plot runtime in seconds per run for both models on a shared axis, show the
# figure, and export it as a PNG.
fig = make_subplots()

for trace_name, stats_frame in (
    ('SCD Type 2', dim_customer_scd_2),
    ('Daily snapshot', dim_customer_daily_snapshot_stats),
):
    fig.add_trace(
        go.Scatter(
            x=stats_frame['start_date'],
            y=stats_frame['seconds'],
            mode='lines+markers',
            name=trace_name,
        ),
    )

fig.update_layout(xaxis_title="Date", yaxis_title='Seconds')

fig.show()
fig.write_image('seconds.png')