# Objective 
Run test cases for different thread counts and different database sizes.
The initial run assumes that the 16-year data set has already been downloaded and converted to Parquet files.

In [None]:
# import required modules
import datetime
import json
import platform
import time
import warnings
from pathlib import Path
import os
import glob
import shutil
import inspect

import click
import duckdb
import ibis
import psutil
from jinja2 import Template
from memory_profiler import memory_usage

In [None]:
def setGlobals(data="data", perf="perf", acq="acq", threads=6, years=2, mode="sql", db="mortgage.db"):
    """Build the configuration dictionary shared by the benchmark helpers.

    Args:
        data: name of the data sub-folder, relative to the current working directory.
        perf: sub-folder of ``data`` that holds the performance parquet files.
        acq: sub-folder of ``data`` that holds the acquisition parquet files.
        threads: thread count, stored under ``env["threads"]``.
        years: number of years of data to select, counted from 2000.
        mode: stored under ``env["mode"]``; not interpreted here.
        db: database file name, stored under ``env["db"]``.

    Returns:
        A dict with the path entries, the run parameters, the glob patterns
        (``perf_regex``/``acq_regex`` and their ``_2`` variants, which are
        ``None`` when ``years`` is 10 or less) and the result of
        ``platform_info()`` under ``"platform"``.
    """

    def quoted_pattern(prefix, decade, year_count):
        # Spell the character class out digit by digit instead of formatting a
        # Python list into the pattern, which also put ',' and ' ' in the class.
        # The surrounding single quotes are part of the value on purpose: the
        # pattern is stored as a quoted string literal.
        digits = "".join(str(digit) for digit in range(year_count))
        return "'{0}_{1}[{2}]*.parquet'".format(prefix, decade, digits)

    env = {}
    # specify the name of the 'data' sub-folder
    env["data_folder"] = data
    # enable this to run on Linux, OSX, Windows
    env["path_separator"] = os.path.sep
    # define path information for source data sets.
    env["base_path"] = os.getcwd()
    # data_path stays relative; the full_* entries are anchored at base_path
    env["data_path"] = env["data_folder"]
    env["perf_path"] = env["data_path"] + env["path_separator"] + perf
    env["acq_path"] = env["data_path"] + env["path_separator"] + acq
    env["full_perf"] = env["base_path"] + env["path_separator"] + env["perf_path"]
    env["full_acq"] = env["base_path"] + env["path_separator"] + env["acq_path"]
    env["threads"] = threads
    env["years"] = years
    env["mode"] = mode
    env["db"] = db

    # set up the select criteria for finding the correct performance and acquisition files
    # 200x files: at most the first ten years
    first_decade_years = min(years, 10)
    env["perf_regex"] = quoted_pattern("Performance", "200", first_decade_years)
    env["acq_regex"] = quoted_pattern("Acquisition", "200", first_decade_years)
    if years > 10:
        # 201x files: everything beyond the first ten years
        env["perf_regex_2"] = quoted_pattern("Performance", "201", years - 10)
        env["acq_regex_2"] = quoted_pattern("Acquisition", "201", years - 10)
    else:
        env["perf_regex_2"] = None
        env["acq_regex_2"] = None
    env["platform"] = platform_info()

    return env

In [None]:
def getMortgageFiles(env_in):
    """Find the performance and acquisition parquet files selected by ``env_in``.

    Reads ``full_perf``/``full_acq`` (folders to search), ``perf_regex``/
    ``acq_regex`` and the optional ``perf_regex_2``/``acq_regex_2`` (glob
    patterns stored as quoted string literals), plus ``perf_path``,
    ``acq_path`` and ``path_separator``.

    Adds ``perf_list``/``acq_list`` (bare file names) and ``perf_files``/
    ``acq_files`` (file names prefixed with the relative data path) to the
    dict, which is modified in place and also returned.

    The current working directory is never changed, so a failure cannot leave
    the process stranded inside a data folder.

    Raises:
        FileNotFoundError: if one of the data folders does not exist.
    """
    env = env_in

    def matching_names(folder, quoted_pattern):
        # Fail loudly on a missing folder rather than silently matching nothing.
        if not os.path.isdir(folder):
            raise FileNotFoundError(f"data folder not found: {folder}")
        # The patterns carry their own surrounding quotes; strip them instead of
        # running eval() on the text.
        pattern = quoted_pattern.strip("'\"")
        # Escape the folder so glob metacharacters in the path are taken literally.
        hits = glob.glob(os.path.join(glob.escape(folder), pattern))
        return [os.path.basename(hit) for hit in hits]

    perf_list = matching_names(env["full_perf"], env["perf_regex"])
    if env["perf_regex_2"] is not None:
        perf_list = perf_list + matching_names(env["full_perf"], env["perf_regex_2"])

    acq_list = matching_names(env["full_acq"], env["acq_regex"])
    if env["acq_regex_2"] is not None:
        acq_list = acq_list + matching_names(env["full_acq"], env["acq_regex_2"])

    separator = env["path_separator"]

    env["perf_list"] = perf_list
    env["perf_files"] = [env["perf_path"] + separator + name for name in perf_list]

    env["acq_list"] = acq_list
    env["acq_files"] = [env["acq_path"] + separator + name for name in acq_list]

    return env

In [None]:
def create_db(env_in):
    """Create (or replace) the ``perf`` and ``acq`` views in the DuckDB database.

    Opens the database named by ``env_in["db"]`` and defines each view as a
    ``SELECT *`` over ``read_parquet`` of ``env_in["perf_files"]`` and
    ``env_in["acq_files"]`` respectively.

    Returns:
        The same ``env_in`` dict, unchanged.
    """
    env = env_in

    # The Python list is formatted straight into the SQL text, e.g.
    # read_parquet(['a.parquet', 'b.parquet']).
    perf_sql_string = f"CREATE OR REPLACE VIEW perf AS SELECT * FROM read_parquet({env['perf_files']})"
    acq_sql_string = f"CREATE OR REPLACE VIEW acq AS SELECT * FROM read_parquet({env['acq_files']})"

    conn = duckdb.connect(env["db"])
    try:
        conn.execute(perf_sql_string)
        conn.execute(acq_sql_string)
    finally:
        # close the connection even when a statement fails
        conn.close()

    return env

In [None]:
def platform_info():
    """Collect a description of the host machine.

    Prints a trace line containing this function's name, then returns a dict
    with the values reported by ``platform`` (machine, version, platform,
    system, processor) and ``psutil`` (cpu_count, total virtual memory).
    """
    current_function = inspect.stack()[0].function
    print(f"{current_function}  entered")

    info = {}
    info["machine"] = platform.machine()
    info["version"] = platform.version()
    info["platform"] = platform.platform()
    info["system"] = platform.system()
    info["cpu_count"] = psutil.cpu_count()
    info["memory"] = psutil.virtual_memory().total
    info["processor"] = platform.processor()
    return info


In [None]:
def window_sql(env_in):
    """Return the window-function query text over the performance files.

    Prints a trace line containing this function's name. The query counts the
    rows of a sub-select that ranks rows per ``loan_id`` ordered by
    ``monthly_reporting_period``, reading from ``read_parquet`` of
    ``env_in["perf_files"]``.
    """
    print(f"{inspect.stack()[0].function}  entered")
    parquet_source = "read_parquet({0})".format(env_in["perf_files"])
    ranked_rows = (
        "select RANK() OVER (PARTITION BY loan_id ORDER BY monthly_reporting_period)"
        " as number from " + parquet_source
    )
    return "select count(*) from (" + ranked_rows + ")"


In [None]:
def summary_sql(env_in):
    """Render the summary query from the ``summary.sql`` Jinja2 template.

    The template is read from ``summary.sql`` in the current working directory
    and rendered with ``perf="perf"`` and ``acq="acq"``.

    Args:
        env_in: accepted for a uniform helper signature; not read here.

    Returns:
        The rendered SQL text.
    """
    with open("summary.sql") as f:
        template = Template(f.read())
    return template.render(perf="perf", acq="acq")

In [None]:
# Benchmark driver: for 2, 4, ... 16 years of data, rebuild the database views,
# report the row counts, then time the summary query once per thread count.
for y in range(2, 17, 2):
    env = setGlobals(years=y, threads=10)
    env = getMortgageFiles(env)

    # get rid of any existing database file before the timing run
    os.chdir(env["base_path"])
    filename = env["base_path"] + env["path_separator"] + env["db"]
    try:
        os.remove(filename)
    except OSError:
        pass  # best effort: there may be no database file yet

    perf_sql_string = f"CREATE OR REPLACE VIEW perf AS SELECT * FROM read_parquet({env['perf_files']})"
    acq_sql_string = f"CREATE OR REPLACE VIEW acq AS SELECT * FROM read_parquet({env['acq_files']})"
    window_sql_string = window_sql(env)  # built but not executed in this run
    summary_sql_string = summary_sql(env)

    # time the creation of each view separately
    conn = duckdb.connect(env["db"])
    try:
        # perf_counter is monotonic and high resolution, unlike time.time()
        t0 = time.perf_counter()
        conn.execute(perf_sql_string)
        t1 = time.perf_counter()
        conn.execute(acq_sql_string)
        t2 = time.perf_counter()
    finally:
        conn.close()

    # reopen the same database that was just built
    conn = duckdb.connect(env["db"])
    try:
        conn.execute(f"PRAGMA threads={env['threads']}")

        perf_rows = conn.execute("SELECT count(*) FROM perf").fetchall()[0][0]
        print(f"years: {y} time: {t1-t0} perf rows: {perf_rows}")

        acq_rows = conn.execute("SELECT count(*) FROM acq").fetchall()[0][0]
        print(f"years: {y} time: {t2-t1}  acq rows: {acq_rows}")

        print("summary SQL timing results")
        print("years\tthreads\ttime\t\t\trows/second")
        # sweep from one thread up to the configured maximum
        for i in range(1, env["threads"] + 1):
            conn.execute(f"PRAGMA threads={i}")
            started = time.perf_counter()
            conn.execute(summary_sql_string).fetchall()
            elapsed = time.perf_counter() - started
            # guard against a zero interval on coarse clocks
            rows_per_second = perf_rows / elapsed if elapsed > 0 else float("inf")
            print(f"{env['years']}\t{i}\t{elapsed}\t{rows_per_second:,.2f}")
    finally:
        conn.close()

