# Objective 
Run benchmark test cases across different thread counts and different database sizes.
The initial run assumes that the 16-year data set has already been downloaded and converted to Parquet files.

In [1]:
# import required services
import datetime
import json
import platform
import time
import warnings
from pathlib import Path
import os
import glob
import shutil
import inspect

import click
import duckdb
import ibis
import psutil
from jinja2 import Template
from memory_profiler import memory_usage

In [2]:
def setGlobals(data="data", perf="perf", acq="acq", threads=6, years=2, mode="sql", db="mortgage.db"):
    """Assemble the settings dictionary shared by the benchmark helpers.

    Args:
        data: Name of the data sub-folder, relative to the current directory.
        perf: Sub-folder of ``data`` holding the performance parquet files.
        acq: Sub-folder of ``data`` holding the acquisition parquet files.
        threads: Thread count stored under ``"threads"``.
        years: Number of years of data to select, counted from 2000.
        mode: Stored unchanged under ``"mode"``.
        db: Database file name stored under ``"db"``.

    Returns:
        A dict with folder paths, the run parameters, the quoted glob
        patterns used to pick parquet files, and the ``platform_info()``
        result under ``"platform"``.
    """
    # os.path.sep keeps the path building portable across Linux, OSX, Windows
    sep = os.path.sep
    cwd = os.getcwd()
    perf_rel = data + sep + perf
    acq_rel = data + sep + acq

    env = {
        "data_folder": data,
        "path_separator": sep,
        "base_path": cwd,
        # the data path is kept relative to the base path
        "data_path": data,
        "perf_path": perf_rel,
        "acq_path": acq_rel,
        "full_perf": cwd + sep + perf_rel,
        "full_acq": cwd + sep + acq_rel,
        "threads": threads,
        "years": years,
        "mode": mode,
        "db": db,
    }

    # Quoted glob templates: the list of year digits is formatted in as a
    # bracketed character class, e.g. 200[0, 1].
    perf_2000s = "'Performance_200{0}*.parquet'"
    acq_2000s = "'Acquisition_200{0}*.parquet'"

    if years > 10:
        # 2000-2009 are always fully selected ...
        first_decade = list(range(0, 10))
        env["perf_regex"] = perf_2000s.format(first_decade)
        env["acq_regex"] = acq_2000s.format(first_decade)
        # ... and the remaining years come from the 201x files
        later_years = list(range(0, years - 10))
        env["perf_regex_2"] = "'Performance_201{0}*.parquet'".format(later_years)
        env["acq_regex_2"] = "'Acquisition_201{0}*.parquet'".format(later_years)
    else:
        # ten years or fewer: no second pattern is needed
        env["acq_regex_2"] = None
        env["perf_regex_2"] = None
        selected = list(range(0, years))
        env["perf_regex"] = perf_2000s.format(selected)
        env["acq_regex"] = acq_2000s.format(selected)

    env["platform"] = platform_info()
    return env

In [3]:
def _glob_quoted(quoted_pattern):
    """Glob the current directory with a pattern stored as a quoted literal.

    ``quoted_pattern`` is a string such as ``"'Performance_200[0, 1]*.parquet'"``
    (the quotes are part of the value). ``None`` yields an empty list.
    """
    import ast  # local import: the file's import block does not provide ast

    if quoted_pattern is None:
        return []
    # literal_eval unwraps the quoted string without executing arbitrary code
    return glob.glob(ast.literal_eval(quoted_pattern))


def getMortgageFiles(env_in):
    """Find the performance and acquisition parquet files selected by ``env_in``.

    Globs ``env_in["full_perf"]`` and ``env_in["full_acq"]`` with the quoted
    patterns under ``"perf_regex"``/``"perf_regex_2"`` and
    ``"acq_regex"``/``"acq_regex_2"`` (the ``_2`` patterns may be ``None``).

    Args:
        env_in: Settings dict; it is updated in place.

    Returns:
        The same dict with four added keys: ``"perf_list"`` / ``"acq_list"``
        (bare file names) and ``"perf_files"`` / ``"acq_files"`` (the names
        prefixed with ``"perf_path"`` / ``"acq_path"``).

    Raises:
        OSError: If one of the data folders cannot be entered.
    """
    env = env_in
    sep = env["path_separator"]

    try:
        # glob runs relative to the current directory so that bare file
        # names are returned
        os.chdir(env["full_perf"])
        perf_list = _glob_quoted(env["perf_regex"]) + _glob_quoted(env["perf_regex_2"])

        os.chdir(env["full_acq"])
        acq_list = _glob_quoted(env["acq_regex"]) + _glob_quoted(env["acq_regex_2"])
    finally:
        # always return to the root folder for this repo, even on failure
        os.chdir(env["base_path"])

    env["perf_list"] = perf_list
    env["perf_files"] = [env["perf_path"] + sep + name for name in perf_list]

    env["acq_list"] = acq_list
    env["acq_files"] = [env["acq_path"] + sep + name for name in acq_list]

    return env

In [4]:
def create_db(env_in):
    """Recreate the DuckDB file and define the ``perf`` and ``acq`` views.

    Changes the working directory to ``env_in["base_path"]``, deletes any
    existing database file named ``env_in["db"]`` there, then creates views
    that read the parquet files listed in ``env_in["perf_files"]`` and
    ``env_in["acq_files"]``.

    Args:
        env_in: Settings dict providing ``base_path``, ``path_separator``,
            ``db``, ``perf_files`` and ``acq_files``.

    Returns:
        The same dict, unchanged.
    """
    env = env_in

    # get rid of any existing database file before a timing run
    os.chdir(env["base_path"])
    filename = env["base_path"] + env["path_separator"] + env["db"]
    try:
        os.remove(filename)
    except OSError:
        # best effort: typically there is simply no previous file to delete
        pass

    conn = duckdb.connect(env["db"])
    try:
        # the Python list of file names is formatted straight into the SQL
        # as the argument of read_parquet
        perf_sql_string = f"CREATE OR REPLACE VIEW perf AS SELECT * FROM read_parquet({env['perf_files']})"
        acq_sql_string = f"CREATE OR REPLACE VIEW acq AS SELECT * FROM read_parquet({env['acq_files']})"
        conn.execute(perf_sql_string)
        conn.execute(acq_sql_string)
    finally:
        # close even if a statement fails so the database file is released
        conn.close()

    return env

In [5]:
def platform_info():
    """Return a dict describing the host machine.

    The keys are ``machine``, ``version``, ``platform``, ``system`` and
    ``processor`` (from the ``platform`` module) plus ``cpu_count`` and
    ``memory`` (from ``psutil``; ``memory`` is the total virtual memory).
    """
    total_memory = None
    info = dict(
        machine=platform.machine(),
        version=platform.version(),
        platform=platform.platform(),
        system=platform.system(),
        cpu_count=psutil.cpu_count(),
        memory=total_memory,
        processor=None,
    )
    # filled in afterwards so the calls still run in the original key order
    info["memory"] = psutil.virtual_memory().total
    info["processor"] = platform.processor()
    return info


In [6]:
def window_sql(env_in):
    """Build the SQL text of the window-function query.

    The query ranks rows within each ``loan_id`` by
    ``monthly_reporting_period`` over the parquet files listed in
    ``env_in["perf_files"]`` and counts the resulting rows.

    Args:
        env_in: Settings dict providing ``perf_files``.

    Returns:
        The SQL statement as a string.
    """
    parquet_files = env_in['perf_files']
    ranked_rows = (
        "select RANK() OVER (PARTITION BY loan_id ORDER BY monthly_reporting_period) as number "
        f"from read_parquet({parquet_files})"
    )
    return f"select count(*) from ({ranked_rows})"


In [7]:
def summary_sql(env_in):
    """Render the ``summary.sql`` Jinja2 template into a SQL string.

    The template is read from ``summary.sql`` in the current working
    directory and rendered with the template variables ``perf="perf"`` and
    ``acq="acq"``.

    Args:
        env_in: Settings dict. It is not read here; the parameter is kept so
            existing callers keep working.

    Returns:
        The rendered SQL text.
    """
    # The original also built quoted, comma-joined strings of the parquet
    # file lists, but never used them; that dead code has been dropped.
    with open("summary.sql") as f:
        template = Template(f.read())
    return template.render(
        perf="perf", acq="acq"
    )

In [8]:
def runAllCombos():
    """Time the summary query for every data-set size and thread count.

    For 2, 4, ..., 16 years of data the database is rebuilt, then the summary
    query is run once for each thread count from 1 up to the machine's CPU
    count. Each run prints a tab-separated row (years, threads, elapsed
    seconds, performance rows per second).

    Returns:
        The settings dict of the last data-set size only. Its ``"combos"``
        list holds ``[years, threads, seconds, rows_per_second]`` entries for
        that size; results for earlier sizes are only printed.
    """
    print("years\tthreads\ttime\t\t\trows/second")
    for y in range(2, 17, 2):
        env = setGlobals(years=y, threads=10)
        env = getMortgageFiles(env)
        env = create_db(env)
        env['combos'] = []
        summary_sql_string = summary_sql(env)

        # open the database that create_db just built, not a hard-coded name
        conn = duckdb.connect(env['db'])
        try:
            conn.execute(f"PRAGMA threads={env['threads']}")

            # row count of the perf view, used to report throughput
            perf_rows = conn.execute("SELECT count(*) FROM perf").fetchall()[0][0]

            max_threads = env['platform']['cpu_count']
            for i in range(1, max_threads + 1):
                conn.execute(f"PRAGMA threads={i}")
                # perf_counter is monotonic, so clock adjustments cannot
                # distort the measured interval
                t0 = time.perf_counter()
                conn.execute(summary_sql_string).fetchall()
                elapsed = time.perf_counter() - t0
                rows_per_second = perf_rows / elapsed
                print(f"{env['years']}\t{i}\t{elapsed}\t{rows_per_second:,.2f}")
                env['combos'].append([env['years'], i, elapsed, rows_per_second])
        finally:
            # release the database file even if a query fails
            conn.close()
    return env

In [9]:
def runForVariance(y, loops):
    """Repeat the summary query to observe run-to-run timing variance.

    Builds the database for ``y`` years of data, then for each thread count
    from 5 up to the machine's CPU count runs the summary query ``loops``
    times. Each run prints a tab-separated row (years, threads, elapsed
    seconds, performance rows per second). No query is timed when the CPU
    count is below 5.

    Args:
        y: Number of years of data to load.
        loops: Number of timed repetitions per thread count.

    Returns:
        The settings dict, with ``"combos"`` holding one
        ``[years, threads, seconds, rows_per_second]`` entry per timed run.
    """
    print("years\tthreads\ttime\t\t\trows/second")
    env = setGlobals(years=y, threads=10)
    env = getMortgageFiles(env)
    env = create_db(env)
    env['combos'] = []
    summary_sql_string = summary_sql(env)

    # open the database that create_db just built, not a hard-coded name
    conn = duckdb.connect(env['db'])
    try:
        conn.execute(f"PRAGMA threads={env['threads']}")

        # row count of the perf view, used to report throughput
        perf_rows = conn.execute("SELECT count(*) FROM perf").fetchall()[0][0]

        max_threads = env['platform']['cpu_count']
        for i in range(5, max_threads + 1):
            conn.execute(f"PRAGMA threads={i}")
            for _ in range(loops):
                # perf_counter is monotonic, so clock adjustments cannot
                # distort the measured interval
                t0 = time.perf_counter()
                conn.execute(summary_sql_string).fetchall()
                elapsed = time.perf_counter() - t0
                rows_per_second = perf_rows / elapsed
                print(f"{env['years']}\t{i}\t{elapsed}\t{rows_per_second:,.2f}")
                env['combos'].append([env['years'], i, elapsed, rows_per_second])
    finally:
        # release the database file even if a query fails
        conn.close()
    return env

In [None]:
# Run the full sweep over data-set sizes and thread counts. The returned
# settings dict (with its 'combos' timing list) is kept in `env`.
env = runAllCombos()

years	threads	time			rows/second
2	1	21.609627962112427	6,871,228.98
2	2	11.514751195907593	12,895,172.42
2	3	7.692288160324097	19,303,060.27
2	4	6.0490944385528564	24,546,600.08
2	5	5.2666826248168945	28,193,212.42
2	6	4.728961944580078	31,399,005.48
2	7	4.537384033203125	32,724,737.63
2	8	4.607366323471069	32,227,674.46
2	9	4.424649000167847	33,558,526.79
2	10	4.432695388793945	33,497,610.14
2	11	3.6459431648254395	40,726,005.67
2	12	3.527742624282837	42,090,571.17
2	13	3.6899991035461426	40,239,766.42
2	14	3.691091537475586	40,227,856.85
2	15	3.6691246032714844	40,468,699.77
2	16	3.795954704284668	39,116,563.18
2	17	3.6591200828552246	40,579,346.57
2	18	3.7133543491363525	39,986,677.28
2	19	3.7397689819335938	39,704,244.49
2	20	3.693448066711426	40,202,190.29
2	21	3.679440975189209	40,355,234.12
2	22	3.657092571258545	40,601,843.98
2	23	3.738163709640503	39,721,294.61
2	24	3.7231719493865967	39,881,236.76
4	1	91.18525815010071	8,151,366.10
4	2	47.10191559791565	15,780,343.80
4	3	32.

In [None]:
#var_res = []
#for i in range(16,17,2):
#    var_res.append(runForVariance(i, 20))