# pg_server___cron_jobs
Provides information about cron jobs running in the DB. 

Cells: 
- Configure: Connect to a specfic DB
- Current size of each table: in KB, dead rows, bloat, data size, all indexes size,  


PreReq: 
- ```pip install cron-descriptor```

In [1]:
import sys
print(sys.path)

from cron_descriptor import get_description, ExpressionDescriptor

['/Users/itaybraun/Documents/GitHub/db-observability-toolkit/Notebooks', '/Library/Frameworks/Python.framework/Versions/3.10/lib/python310.zip', '/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10', '/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/lib-dynload', '', '/Users/itaybraun/Library/Python/3.10/lib/python/site-packages', '/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages']


In [2]:
import sqlalchemy
import pandas as pd
import configparser
import matplotlib.pyplot as plt 
from IPython.display import HTML


# Read from the Config file
config = configparser.ConfigParser() 

config.read_file(open(r'../ipynb.cfg'))

con_str = config.get('con_str', 'PG_AIRBASES') 
engine = sqlalchemy.create_engine(con_str)

try:
    connection = engine.connect()
    print ("Opened Connection")
except (Exception, sqlalchemy.exc.SQLAlchemyError) as error:
    print("Error while connecting to PostgreSQL database:", error)


Opened Connection


## Configuration
View the configurations related to the cron jobs in the current DB

In [3]:
qry_cron_configuration = """
-- Cron Jobs
SELECT name, setting, short_desc, extra_desc, source, min_val, max_val
FROM pg_settings 
WHERE name like '%%cron%%' 
"""
df = pd.read_sql_query(qry_cron_configuration, connection)
df


Unnamed: 0,name,setting,short_desc,extra_desc,source,min_val,max_val
0,cron.database_name,airbases,Database in which pg_cron metadata is kept.,,configuration file,,
1,cron.enable_superuser_jobs,on,Allow jobs to be scheduled as superuser,,default,,
2,cron.host,localhost,Hostname to connect to postgres.,This setting has no effect when background wor...,default,,
3,cron.log_min_messages,warning,log_min_messages for the launcher bgworker.,,configuration file,,
4,cron.log_run,on,Log all jobs runs into the job_run_details table,,configuration file,,
5,cron.log_statement,on,Log all cron statements prior to execution.,,configuration file,,
6,cron.max_running_jobs,1,Maximum number of jobs that can run concurrently.,,configuration file,0.0,7.0
7,cron.use_background_workers,on,Use background workers instead of client sessi...,,configuration file,,


## Number of jobs and size of the history table

In [4]:
qry_jobs_tables_stats = """ 
SELECT 
	min(j.count_jobs) as count_jobs, 
	count(jobid) as count_history_rows, 
  min(start_time) as oldest_job_history,
  now() -  min(start_time) as oldest_job_history_str,
  max(start_time) as newest_job_history
FROM cron.job_run_details AS rd
cross join ( SELECT count(*) as count_jobs
from cron.job) as j """ 
df = pd.read_sql_query(qry_jobs_tables_stats, connection)
df 



Unnamed: 0,count_jobs,count_history_rows,oldest_job_history,oldest_job_history_str,newest_job_history
0,6,156434,2023-05-21 15:30:00.068394+00:00,114 days 03:46:20.303647,2023-09-12 19:16:00.092220+00:00


## Analyze the size of the job_run_details
If the value is higher than 10,000 (feel free to select any other value) then you should run a daily job to delete data older than 7 days (or any other history length)

TODO: write here the command: 

In [5]:
# Find the value of the 'count_history_rows' column
value = df['count_history_rows'].iloc[0]

# Check if the value is higher than 10000 and display it in red if true
if value > 10000:
    message = f"<font color='pink'>The value of count_history_rows ({value}) is higher than 10000</font>"
else:
    message = f"The value of count_history_rows ({value}) is within the allowed limit."

# Display the message with HTML formatting
display(HTML(message))

## The Details of the Job
For each job show its: 
- jobid and name
- is active

In [6]:
qry_jobs_details = """ 

-- Jobs' details
SELECT 
	jobid,  
  schedule, 
  command, 
  length(command),
  nodename, 
  nodeport, 
  database, 
  username, 
  active
  jobname
FROM cron.job; """ 
print(get_description("* 2 3 * *"))

df = pd.read_sql_query(qry_jobs_details, connection)
df["schedule_text"] = df["schedule"].apply(get_description)
df


Every minute, between 02:00 AM and 02:59 AM, on day 3 of the month


Unnamed: 0,jobid,schedule,command,length,nodename,nodeport,database,username,jobname,schedule_text
0,1,*/10 * * * *,select *\nfrom (select * from postgres_air.boa...,170,localhost,5432,airbases,postgres,True,Every 10 minutes
1,3,*/1 * * * *,\n -- Your SQL query goes here\n ...,738,localhost,5432,airbases,postgres,True,Every minute
2,4,*/1 * * * *,\n INSERT INTO metis.pg_stat_tables_act...,854,localhost,5432,airbases,postgres,True,Every minute
3,5,0 1 * * *,\n SELECT delete_old_data_periodically()\n,46,localhost,5432,airbases,postgres,True,At 01:00 AM
4,6,*/1 * * * *,"INSERT INTO metis.queries (query_id, query_tex...",462,localhost,5432,airbases,postgres,True,Every minute
5,7,*/5 * * * *,\n INSERT INTO metis.query_stats_snapshots ...,516,localhost,5432,airbases,postgres,True,Every 5 minutes


## Job Execution Statistics
Some statistics about the jobs run in the last 7 days (Configurable): 
- Number of times the job run
- How many times it finished succesfully or with a failure. 
- Avg duration. 
- First time and last time it run in the monitored duration. 

In [7]:
qry_job_history_stats = """ 
--
SELECT 
	j.jobid, 
  j.jobname,
	count(runid) as count_job_runs, 
  SUM (case rd.status
  	WHEN 'succeeded' then 1 ELSE 0 END) AS count_success, 
  SUM (case rd.status
  	WHEN 'failed' then 1 ELSE 0 END) AS count_failure, 
  --count(runid) / SUM ((case rd.status WHEN 'failed' then 1 ELSE 0 END)::numeric) as success_pct,  
  min(start_time) AS first_run, 
  max(start_time) AS last_run,
 	SUM( 
    	(EXTRACT(EPOCH FROM end_time - start_time) * 1000)::bigint 
    ) AS total_job_duration_ms, 
  SUM( 
    	(EXTRACT(EPOCH FROM end_time - start_time) * 1000)::bigint 
    )  / 
  SUM (case rd.status WHEN 'succeeded' then 1 ELSE 0 END) AS avg_duration_succesful_runs
FROM 
	cron.job_run_details AS rd
JOIN 
	cron.job AS j 
  	ON rd.jobid = j.jobid
WHERE j.active = true
GROUP BY 
	j.jobid, 
  j.jobname

"""
df = pd.read_sql_query(qry_job_history_stats, connection)
df 

Unnamed: 0,jobid,jobname,count_job_runs,count_success,count_failure,first_run,last_run,total_job_duration_ms,avg_duration_succesful_runs
0,1,pmc data reliability,9890,9889,1,2023-05-21 15:30:00.068394+00:00,2023-09-12 19:10:00.104756+00:00,3474517.0,351.351704
1,3,my_cron_job,49374,49362,12,2023-08-09 12:23:00.036840+00:00,2023-09-12 19:16:00.092220+00:00,10240713.0,207.461468
2,4,pg_stat_tables_activity_snapshots,44904,44893,11,2023-08-12 14:53:00.010841+00:00,2023-09-12 19:16:00.082653+00:00,12763087.0,284.300158
3,5,pg_monitor__delete_old_data,31,31,0,2023-08-13 01:00:00.062161+00:00,2023-09-12 01:00:00.124788+00:00,90607.0,2922.806452
4,6,pg_stat_statements_queries,43538,43515,23,2023-08-13 13:39:00.006730+00:00,2023-09-12 19:16:00.005400+00:00,3020044.0,69.402367
5,7,pg_stat_statements_queries_measures,8695,8693,2,2023-08-13 14:45:02.118176+00:00,2023-09-12 19:15:00.124214+00:00,1660535.0,191.019786


## Run Details of a specific Job 

In [8]:
qry_job_run_details = """ 
-- Last 100 runs
SELECT * 
FROM cron.job_run_details 
WHERE jobid = 3
ORDER BY start_time desc
LIMIT 100;   """ 

df = pd.read_sql_query(qry_job_run_details, connection)
df 

Unnamed: 0,jobid,runid,job_pid,database,username,command,status,return_message,start_time,end_time
0,3,5131,,airbases,postgres,\n -- Your SQL query goes here\n ...,failed,could not start background process; more detai...,NaT,NaT
1,3,64315,,airbases,postgres,\n -- Your SQL query goes here\n ...,failed,could not start background process; more detai...,NaT,NaT
2,3,59351,,airbases,postgres,\n -- Your SQL query goes here\n ...,failed,could not start background process; more detai...,NaT,NaT
3,3,63922,,airbases,postgres,\n -- Your SQL query goes here\n ...,failed,could not start background process; more detai...,NaT,NaT
4,3,59568,,airbases,postgres,\n -- Your SQL query goes here\n ...,failed,could not start background process; more detai...,NaT,NaT
...,...,...,...,...,...,...,...,...,...,...
95,3,156160,8712.0,airbases,postgres,\n -- Your SQL query goes here\n ...,succeeded,INSERT 0 1,2023-09-12 17:53:00.098347+00:00,2023-09-12 17:53:00.103351+00:00
96,3,156157,8601.0,airbases,postgres,\n -- Your SQL query goes here\n ...,succeeded,INSERT 0 1,2023-09-12 17:52:00.099001+00:00,2023-09-12 17:52:00.104016+00:00
97,3,156154,8487.0,airbases,postgres,\n -- Your SQL query goes here\n ...,succeeded,INSERT 0 1,2023-09-12 17:51:00.085706+00:00,2023-09-12 17:51:00.090562+00:00
98,3,156150,8369.0,airbases,postgres,\n -- Your SQL query goes here\n ...,succeeded,INSERT 0 1,2023-09-12 17:50:00.218380+00:00,2023-09-12 17:50:00.224459+00:00


## Testing Mini series - Experimental
Aug 21 - test failed. Can't show it all in one object. 

In [9]:
import pandas as pd
import random
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

# Create empty DataFrame
data = {
    'product_name': [],
    'orderdate': [],
    'order_amount': []
}

# List of products
products = ['Product A', 'Product B', 'Product C', 'Product D', 'Product E']

# Generate 10 random dates for each product
for product in products:
    for _ in range(10):
        random_date = datetime(2023, random.randint(1, 12), random.randint(1, 28))
        data['product_name'].append(product)
        data['orderdate'].append(random_date)
        data['order_amount'].append(random.randint(100, 1000))

# Create DataFrame
df = pd.DataFrame(data)

# Print the DataFrame
# print(df)
# Create subplots for each product
fig = plt.figure(figsize=(10, 8))
gs = GridSpec(5, 1, figure=fig)
"""
for i, product in enumerate(products):
    ax = fig.add_subplot(gs[i, 0])
    product_data = df[df['product_name'] == product]
    ax.bar(product_data['orderdate'], product_data['order_amount'])
    ax.set_title(product)
    ax.set_xlabel('Order Date')
    ax.set_ylabel('Order Amount')
"""
# plt.tight_layout()
#  plt.show()

"\nfor i, product in enumerate(products):\n    ax = fig.add_subplot(gs[i, 0])\n    product_data = df[df['product_name'] == product]\n    ax.bar(product_data['orderdate'], product_data['order_amount'])\n    ax.set_title(product)\n    ax.set_xlabel('Order Date')\n    ax.set_ylabel('Order Amount')\n"

<Figure size 1000x800 with 0 Axes>