### Prerequisites
* A Miniconda installation from 2023 or later
* In the Anaconda PowerShell prompt, run `conda activate 2023` to activate the environment named `2023`

In [1]:

import pandas as pd 

from datetime import datetime
import pytz


The source data model is documented in this Google Sheet: https://docs.google.com/spreadsheets/d/1Qh28Lb4dcbw9YMqcXLSj7N8l6Tlr46xNQkV-t1A2txc/edit#gid=0


In [3]:
import json
from urllib.parse import quote

# Read the database connection settings from the JSON file one directory up.
with open('../dbconn.json', 'r') as f:
    di = json.load(f)

# Build the "//user:password@host:port/database" part of a PostgreSQL URL.
# The user name and password are percent-encoded so that characters such as
# '@', ':', '/' or '%' inside the credentials cannot be mistaken for URL
# delimiters. NOTE: dbconn.json must therefore hold the raw (not already
# percent-encoded) credentials.
pg_uri = (
    f"//{quote(str(di['username']), safe='')}:{quote(str(di['password']), safe='')}"
    f"@{di['host']}:{di['port']}/{di['database']}"
)

In [4]:
#!pip install sqlalchemy
#!pip install psycopg2

# using https://pythontic.com/pandas/serialization/postgresql as example
# though https://naysan.ca/2020/05/31/postgresql-to-pandas/ avoids the sqlalchemy layer
# Example python program to read data from a PostgreSQL table
# and load into a pandas DataFrame
import psycopg2
from sqlalchemy import create_engine, text

# Build the engine from the URL fragment assembled earlier. Statements run in
# AUTOCOMMIT isolation and pooled connections are recycled after 3600 seconds.
alchemyEngine = create_engine(
    f'postgresql+psycopg2:{pg_uri}',
    pool_recycle=3600,
).execution_options(isolation_level="AUTOCOMMIT")

# Open the connection that the rest of the notebook shares.
con = alchemyEngine.connect()
# con.execute (text("SET default_tablespace = u02_pgdata"))

# Create the working schema if it is missing, then make it the search path
# for this connection.
con.execute(text("create schema if not exists sppdata authorization current_user"))
con.execute(text("set search_path to sppdata"))

# NOTE(review): this only sets a plain attribute on the SQLAlchemy connection;
# autocommit already comes from the isolation_level option above -- confirm
# whether this line is still needed.
con.autocommit = True
# Read data from PostgreSQL database table and load into a DataFrame instance
#dataFrame       = pd.read_sql(text("select * from information_schema.tables"), con);
#dataFrame

def pgsqldf(query, params=None):
    """Run a SQL query on the shared connection and return a DataFrame.

    Parameters
    ----------
    query : str
        SQL text. It is wrapped in ``sqlalchemy.text()``, so ``:name``
        placeholders are treated as bind parameters.
    params : dict, optional
        Values for the ``:name`` placeholders in ``query``. Leave as ``None``
        (the default) for queries without placeholders, which keeps the
        original one-argument call style working unchanged.

    Returns
    -------
    pandas.DataFrame
        The query result as returned by ``pd.read_sql``.
    """
    return pd.read_sql(text(query), con, params=params)


In [6]:

def space():
    """Report the on-disk size and row statistics of relations in ``sppdata``.

    Returns a DataFrame (via ``pgsqldf``) with one row per non-index relation
    in the ``sppdata`` schema whose total size is greater than zero, largest
    first. Columns: ``relation`` (schema-qualified name), ``total_size`` and
    ``data_size`` (from ``pg_total_relation_size`` / ``pg_relation_size``),
    and ``row_count`` / ``dead_rows`` (``n_live_tup`` / ``n_dead_tup`` from
    ``pg_stat_user_tables``).
    """
    relation_sizes_sql = """
    SELECT
      nspname || '.' || C.relname AS "relation",
      pg_total_relation_size(C.oid) AS "total_size",
      pg_relation_size(C.oid) AS "data_size",
    --  pg_total_relation_size(C.oid) / pg_relation_size(C.oid) AS "bloat_ratio",
      pg_stat_user_tables.n_live_tup AS "row_count",
      pg_stat_user_tables.n_dead_tup AS "dead_rows"
    FROM pg_class C
    LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace)
    LEFT JOIN pg_stat_user_tables ON (pg_stat_user_tables.relid = C.oid)
    WHERE nspname NOT IN ('pg_catalog', 'information_schema')
      AND C.relkind <> 'i'
      AND nspname !~ '^pg_toast'
      AND nspname = 'sppdata'
      and pg_total_relation_size(C.oid) > 0
    ORDER BY pg_total_relation_size(C.oid) DESC
    """
    return pgsqldf(relation_sizes_sql)

space()

Unnamed: 0,relation,total_size,data_size,row_count,dead_rows
0,sppdata.rtbm_lmp_by_location,745046016,545554432,4802551,0
1,sppdata.da_lmp_by_location,65568768,47988736,422464,0
2,sppdata.tie_flows_long,62070784,42737664,591091,71875
3,sppdata.rtbm_binding_constraints,29163520,22790144,156464,2
4,sppdata.area_control_error,1679360,1138688,21688,125
5,sppdata.generation_mix,1105920,958464,4336,125
6,sppdata.stlf_vs_actual,425984,278528,4290,746
7,sppdata.settlement_location,180224,90112,1145,0
8,sppdata.mtlf_vs_actual,114688,49152,527,0


In [7]:
# Issue VACUUM with no table name, then show the relation sizes again.
con.execute(text("vacuum"))

space()

CPU times: total: 0 ns
Wall time: 343 ms


Unnamed: 0,relation,total_size,data_size,row_count,dead_rows
0,sppdata.rtbm_lmp_by_location,745054208,545554432,4802524,0
1,sppdata.da_lmp_by_location,65568768,47988736,422464,0
2,sppdata.tie_flows_long,62070784,42737664,522671,0
3,sppdata.rtbm_binding_constraints,29163520,22790144,156455,0
4,sppdata.area_control_error,1679360,1138688,21688,0
5,sppdata.generation_mix,1105920,958464,4324,0
6,sppdata.stlf_vs_actual,425984,278528,4290,0
7,sppdata.settlement_location,188416,90112,1145,0
8,sppdata.mtlf_vs_actual,114688,49152,527,0


In [8]:
def trim_table(table, timekey, delete_older_than):
    """Delete rows older than a cutoff from ``table``, then vacuum/analyze it.

    Before deleting, prints a one-row summary with the table name, its total
    row count and the number of rows that fall before the cutoff.

    Parameters
    ----------
    table : str
        Name of the table to trim (may be schema-qualified). It is
        interpolated into the SQL text, so pass trusted names only.
    timekey : str
        Name of the timestamp column compared against the cutoff. Also
        interpolated into the SQL text; pass trusted names only.
    delete_older_than : str
        A PostgreSQL interval such as ``'2 weeks'``. Rows whose ``timekey``
        is earlier than ``current_timestamp`` minus this interval are deleted.
        It is sent as a bind parameter rather than spliced into the SQL.

    Returns
    -------
    None
    """
    # Identifiers cannot be bind parameters, so only the column name is
    # interpolated here; the interval value is bound as :older_than. The
    # predicate is built once so the count and the delete always agree.
    stale = f"{timekey} < current_timestamp - cast(:older_than as interval)"
    bind = {"older_than": delete_older_than}

    # pd.read_sql is called directly because the summary query needs bind
    # parameters.
    df = pd.read_sql(
        text(f"""
        select cast(:table_label as text) as table,
        count(*) as rowcount,
        count(case when {stale} then 1 else null end) as old_row_count
        from {table}
        """),
        con,
        params={**bind, "table_label": table},
    )
    print(df)

    con.execute(text(f"delete from {table} where {stale}"), bind)
    # Reclaim the space of the deleted rows and refresh planner statistics.
    con.execute(text(f"vacuum (analyze) {table}"))
    

In [9]:
# Each table is paired with the timestamp column that decides row age;
# rows older than two weeks are trimmed from every table in turn.
for table, timekey in {
    'sppdata.rtbm_lmp_by_location': 'gmtinterval_end',
    'sppdata.tie_flows_long': 'gmttime',
    'sppdata.da_lmp_by_location': 'gmtinterval_end',
    'sppdata.rtbm_binding_constraints': 'gmtinterval_end',
    'sppdata.area_control_error': 'gmttime',
    'sppdata.generation_mix': 'gmt_mkt_interval',
    'sppdata.stlf_vs_actual': 'gmtinterval_end',
    'sppdata.mtlf_vs_actual': 'gmtinterval_end',
}.items():
    trim_table(table, timekey, '2 weeks')

                          table  rowcount  old_row_count
0  sppdata.rtbm_lmp_by_location   4803232         374248
                    table  rowcount  old_row_count
0  sppdata.tie_flows_long    591091          46035
                        table  rowcount  old_row_count
0  sppdata.da_lmp_by_location    422464          32144
                              table  rowcount  old_row_count
0  sppdata.rtbm_binding_constraints    156464          14212
                        table  rowcount  old_row_count
0  sppdata.area_control_error     21688           1705
                    table  rowcount  old_row_count
0  sppdata.generation_mix      4336            341
                    table  rowcount  old_row_count
0  sppdata.stlf_vs_actual      4290            340
                    table  rowcount  old_row_count
0  sppdata.mtlf_vs_actual       527             28


In [10]:
# Show the relation sizes again, following the trim loop in the previous cell.
space()

Unnamed: 0,relation,total_size,data_size,row_count,dead_rows
0,sppdata.rtbm_lmp_by_location,745054208,545554432,4066886,374248
1,sppdata.da_lmp_by_location,65568768,47988736,358176,32144
2,sppdata.tie_flows_long,61972480,42639360,499021,46035
3,sppdata.rtbm_binding_constraints,29163520,22790144,128040,14212
4,sppdata.area_control_error,1679360,1138688,19983,0
5,sppdata.generation_mix,1105920,958464,3995,0
6,sppdata.stlf_vs_actual,425984,278528,3950,0
7,sppdata.settlement_location,188416,90112,1145,0
8,sppdata.mtlf_vs_actual,114688,49152,499,0


In [13]:
# Issue VACUUM with no table name, then show the relation sizes again.
con.execute(text("vacuum"))

# Leave the DataFrame as the cell's last expression so Jupyter renders it as a
# table, instead of the line-wrapped plain text that print() produces.
space()

CPU times: total: 0 ns
Wall time: 91.7 ms
                           relation  total_size  data_size  row_count  \
0      sppdata.rtbm_lmp_by_location   745078784  545554432    4424933   
1        sppdata.da_lmp_by_location    65593344   47988736     390320   
2            sppdata.tie_flows_long    61997056   42639360     545082   
3  sppdata.rtbm_binding_constraints    29188096   22790144     142254   
4        sppdata.area_control_error     1703936    1138688      19984   
5            sppdata.generation_mix     1105920     958464       3995   
6            sppdata.stlf_vs_actual      425984     278528       3950   
7       sppdata.settlement_location      188416      90112       1145   
8            sppdata.mtlf_vs_actual      114688      49152        499   

   dead_rows  
0          0  
1          0  
2          0  
3          0  
4          0  
5          0  
6          0  
7          0  
8          0  
