# set up to load postgres with "n" years of mortgage data

PostgreSQL is very fussy during bulk loads. In this instance, the bulk load would only work if all columns were defined as varchar. When numeric column types were used, the load failed whenever an empty field was encountered in a numeric column.

In [1]:
# standard imports
# import required services
import datetime
import json
import platform
import time
import warnings
from pathlib import Path
import os
import glob
import shutil
import inspect

import click
import duckdb
import ibis
import psutil
from jinja2 import Template
from memory_profiler import memory_usage

import psycopg2

from getpass import getpass


In [2]:
def setGlobals(data="data", perf="perf", acq="acq", threads=6, years=2, mode="sql", db="mortgage.db"):
    """Build the configuration dictionary that the other steps pass around.

    Parameters:
        data: name of the data folder, relative to the current directory.
        perf: sub-folder of ``data`` holding the performance files.
        acq: sub-folder of ``data`` holding the acquisition files.
        threads, mode, db: stored unchanged under the same-named keys.
        years: number of years of data to select, counted from 2000.

    Returns:
        dict with path entries (``base_path``, ``full_perf``, ``full_acq``,
        ...), the SQL file names, the PostgreSQL user, the file-selection
        patterns and the ``platform`` description from ``platform_info()``.

    The ``*_regex`` entries are *quoted* glob patterns (the quotes are part
    of the string). The year digits are produced by formatting a Python
    list, e.g. ``[0, 1]``, which glob reads as a character class; that class
    therefore also contains ``,`` and a space. ``*_regex_2`` covers the
    2010s and is ``None`` unless ``years`` is greater than 10.
    """
    # Local import: the file-level import only brings in the getpass()
    # function, not the module.
    import getpass as _getpass

    sep = os.path.sep

    env = {}
    # name of the 'data' sub-folder
    env["data_folder"] = data
    # keeps path building portable across Linux, OSX and Windows
    env["path_separator"] = sep
    env["base_path"] = os.getcwd()
    try:
        env["pg_uid"] = os.getlogin()
    except OSError:
        # os.getlogin() needs a controlling terminal, which a notebook
        # kernel or a service may not have; fall back to the environment /
        # password database lookup.
        env["pg_uid"] = _getpass.getuser()

    # relative locations first, then the absolute ones derived from them
    env["data_path"] = env["data_folder"]
    env["perf_path"] = env["data_path"] + sep + perf
    env["acq_path"] = env["data_path"] + sep + acq
    env["full_perf"] = env["base_path"] + sep + env["perf_path"]
    env["full_acq"] = env["base_path"] + sep + env["acq_path"]

    env["threads"] = threads
    env["years"] = years
    env["mode"] = mode
    env["db"] = db
    env["db_base"] = "postgres"
    env["create_SQL"] = "postgresql_create.sql"
    env["acq_load_template"] = "postgresql_acq_load_template.sql"
    env["perf_load_template"] = "postgresql_perf_load_template.sql"
    env["summary_sql"] = "postgresql_summary.sql"
    env["postgres"] = {"user": env["pg_uid"], }

    # Select criteria for the performance and acquisition files: the first
    # ten years live in 200x, anything beyond that in 201x.
    if years > 10:
        first_decade = list(range(0, 10))
        second_decade = list(range(0, years - 10))
    else:
        first_decade = list(range(0, years))
        second_decade = None

    env["perf_regex"] = "'Performance_200{0}*.txt*'".format(first_decade)
    env["acq_regex"] = "'Acquisition_200{0}*.txt*'".format(first_decade)
    if second_decade is None:
        env["perf_regex_2"] = None
        env["acq_regex_2"] = None
    else:
        env["perf_regex_2"] = "'Performance_201{0}*.txt*'".format(second_decade)
        env["acq_regex_2"] = "'Acquisition_201{0}*.txt*'".format(second_decade)

    env["platform"] = platform_info()

    return env

In [3]:
def platform_info():
    """Describe the host: OS identification, CPU count and total memory.

    Prints a one-line trace with this function's name, then returns a dict
    with the keys ``machine``, ``version``, ``platform``, ``system``,
    ``cpu_count``, ``memory`` and ``processor``.
    """
    print(f"{inspect.stack()[0][3]}  entered")

    details = {}
    # values reported by the standard-library platform module
    details["machine"] = platform.machine()
    details["version"] = platform.version()
    details["platform"] = platform.platform()
    details["system"] = platform.system()
    # values reported by psutil
    details["cpu_count"] = psutil.cpu_count()
    details["memory"] = psutil.virtual_memory().total
    details["processor"] = platform.processor()
    return details

In [4]:
def _matchingFileNames(directory, quoted_pattern):
    """Return the names of the files in ``directory`` matching the pattern.

    ``quoted_pattern`` is a glob pattern written as a Python string literal
    (quotes included), e.g. ``"'Performance_200[0]*.txt*'"``.
    """
    # Local import: only this helper needs it.
    import ast

    if not os.path.isdir(directory):
        raise FileNotFoundError(f"data folder not found: {directory}")
    # literal_eval only accepts literals, so the quoted pattern is unwrapped
    # without executing arbitrary code.
    pattern = ast.literal_eval(quoted_pattern)
    # Escape the folder so glob metacharacters in its path are taken literally.
    matches = glob.glob(os.path.join(glob.escape(directory), pattern))
    return [os.path.basename(match) for match in matches]


def getMortgageFiles(env_in):
    """Find the performance and acquisition files selected by the patterns.

    Reads ``full_perf``, ``full_acq``, ``path_separator`` and the four
    ``*_regex`` / ``*_regex_2`` entries from ``env_in``. A ``*_regex_2``
    entry of ``None`` is skipped.

    Adds four entries to the same dict and returns it:
        ``perf_list`` / ``acq_list``: matching file names.
        ``perf_files`` / ``acq_files``: the same names prefixed with the
        folder they were found in.

    The process working directory is not changed.

    Raises:
        FileNotFoundError: if either data folder does not exist.
    """
    env = env_in

    perf_list = _matchingFileNames(env["full_perf"], env["perf_regex"])
    if env["perf_regex_2"] is not None:
        perf_list = perf_list + _matchingFileNames(env["full_perf"], env["perf_regex_2"])

    acq_list = _matchingFileNames(env["full_acq"], env["acq_regex"])
    if env["acq_regex_2"] is not None:
        acq_list = acq_list + _matchingFileNames(env["full_acq"], env["acq_regex_2"])

    sep = env["path_separator"]

    env["perf_list"] = perf_list
    env["perf_files"] = [env["full_perf"] + sep + name for name in perf_list]

    env["acq_list"] = acq_list
    env["acq_files"] = [env["full_acq"] + sep + name for name in acq_list]

    return env

In [5]:
def createLocalDB(env_in):
    """Run the database/table creation script against PostgreSQL.

    Connects to the database named by ``env_in["db_base"]`` as
    ``env_in["postgres"]["user"]`` and executes the file named by
    ``env_in["create_SQL"]`` (looked up under ``base_path``) one line at a
    time, so every statement in that file must fit on a single line.

    Statements that fail with ``psycopg2.OperationalError`` are reported and
    skipped; any other error propagates. The connection is always closed.

    Returns:
        The same ``env_in`` dict, unchanged.

    Raises:
        psycopg2.OperationalError: if the connection cannot be established.
    """
    env = env_in

    connection_request = f"dbname={env['db_base']} user={env['postgres']['user']}"
    print(connection_request)
    try:
        conn = psycopg2.connect(connection_request)
    except psycopg2.OperationalError as e:
        # Nothing below can run without a connection; report and stop here
        # instead of failing later on an unbound variable.
        print(e)
        raise

    try:
        if conn.status == psycopg2.extensions.STATUS_READY:
            print(f"Connected to {env['db_base']} database")
        # Autocommit makes each statement take effect on its own.
        # NOTE(review): presumably required because the script issues
        # DROP/CREATE DATABASE, which PostgreSQL refuses inside a
        # transaction block - confirm against the SQL file.
        conn.set_session(autocommit=True)
        cursor = conn.cursor()

        create_path = env["base_path"] + env["path_separator"] + env["create_SQL"]
        with open(create_path, "r") as create_file:
            create_sql = create_file.readlines()

        for line in create_sql:
            if not line.strip():
                # an empty query is an error for the server; skip spacer lines
                continue
            print(f"Executing: {line}")
            try:
                cursor.execute(line)
            except psycopg2.OperationalError as e:
                print(f"{line} \n{e}")
    finally:
        conn.close()

    print(f"{env['db_base']} tables created")
    return env

In [6]:
def loadTable(env_in, template, file_list):
    """Bulk load every file in ``file_list`` using a one-line SQL template.

    Parameters:
        env_in: configuration dict; ``base_path``, ``path_separator``,
            ``db_base``, ``postgres`` and ``years`` are read.
        template: name of the template file under ``base_path``. Only its
            first line is used; the file path is substituted into it with
            ``str.format``.
        file_list: paths of the data files to load, in order.

    All statements run in one transaction that is committed after the last
    file. A statement failing with ``psycopg2.OperationalError`` is reported
    and the loop continues; any other error propagates and, because the
    connection is then closed without a commit, nothing is kept.

    Returns:
        The same ``env_in`` dict, unchanged.

    Raises:
        ValueError: if the template file is empty.
        psycopg2.OperationalError: if the connection cannot be established.
    """
    env = env_in

    template_path = env["base_path"] + env["path_separator"] + template
    with open(template_path, "r") as template_file:
        template_lines = template_file.readlines()
    if not template_lines:
        raise ValueError(f"load template is empty: {template_path}")
    load_statement = template_lines[0]

    connection_request = f"dbname={env['db_base']} user={env['postgres']['user']}"
    print(connection_request)
    try:
        conn = psycopg2.connect(connection_request)
    except psycopg2.OperationalError as e:
        # Without a connection there is nothing to load into.
        print(e)
        raise

    try:
        if conn.status == psycopg2.extensions.STATUS_READY:
            print(f"Connected to {env['db_base']} database")
        else:
            print(f"connection error: {conn.status}")
        cursor = conn.cursor()

        print("starting table load")
        t0 = time.time()
        for data_file in file_list:
            print(f"{data_file}")
            # NOTE(review): the path is interpolated straight into the SQL
            # text; this is only safe while file_list comes from the local
            # file scan and not from untrusted input.
            _sql = load_statement.format(data_file)
            try:
                cursor.execute(_sql)
            except psycopg2.OperationalError as e:
                print(f"{_sql} \n{e}")
        t1 = time.time()
        print(f"Years: {env['years']} Table load time: {t1-t0}")
        conn.commit()
    finally:
        conn.close()

    return env

In [7]:
def runSummarySQL(env_in):
    """Execute the summary query, time it and print the rows it returns.

    The file named by ``env_in["summary_sql"]`` (under ``base_path``) is
    read, each line is stripped and the lines are joined with single spaces
    into one statement before it is sent to the server.

    A query failing with ``psycopg2.OperationalError`` is reported and
    ``None`` is printed in place of the rows. The connection is always
    closed.

    Returns:
        The same ``env_in`` dict, unchanged.

    Raises:
        psycopg2.OperationalError: if the connection cannot be established.
    """
    env = env_in

    summary_path = env["base_path"] + env["path_separator"] + env["summary_sql"]
    with open(summary_path, "r") as sql_file:
        raw_sql = sql_file.read()
    summary_sql = " ".join(line.strip() for line in raw_sql.split("\n"))

    connection_request = f"dbname={env['db_base']} user={env['postgres']['user']}"
    print(connection_request)
    try:
        conn = psycopg2.connect(connection_request)
    except psycopg2.OperationalError as e:
        # Nothing below can run without a connection.
        print(e)
        raise

    rows = None
    try:
        print(f"conn: {conn}")
        if conn.status == psycopg2.extensions.STATUS_READY:
            print(f"Connected to {env['db_base']} database")
        cursor = conn.cursor()

        print("starting summary sql run")
        t0 = time.time()
        try:
            # execute() itself returns None in psycopg2; the result set is
            # what fetchall() hands back.
            cursor.execute(summary_sql)
            rows = cursor.fetchall()
        except psycopg2.OperationalError as e:
            print(f"summary sql error: \n{e}")
        t1 = time.time()
    finally:
        conn.close()

    print(f"Years: {env['years']} summary sql execution time: {t1-t0}")
    print(rows)
    return env

In [8]:
def countRows(query, env_in=None):
    """Run a counting query and print how long it took and the count.

    Parameters:
        query: SQL whose first column of the first row is the count,
            e.g. ``SELECT count(*) FROM perf``.
        env_in: configuration dict to use. When omitted, the module-level
            ``env`` is used, which is how existing callers invoke this.

    A query failing with ``psycopg2.OperationalError`` is reported and the
    count is printed as ``None``. The connection is always closed.

    Raises:
        psycopg2.OperationalError: if the connection cannot be established.
    """
    cfg = env if env_in is None else env_in

    connection_request = f"dbname={cfg['db_base']} user={cfg['postgres']['user']}"
    print(connection_request)
    try:
        conn = psycopg2.connect(connection_request)
    except psycopg2.OperationalError as e:
        # Nothing below can run without a connection.
        print(e)
        raise

    row_count = None
    try:
        if conn.status == psycopg2.extensions.STATUS_READY:
            print(f"Connected to {cfg['db_base']} database")
        cursor = conn.cursor()

        print("starting countRows sql run")
        t0 = time.time()
        try:
            cursor.execute(query)
            row_count = cursor.fetchall()
        except psycopg2.OperationalError as e:
            print(f"count sql error: \n{e}")
        t1 = time.time()
    finally:
        conn.close()

    # row_count stays None when the query failed, and may be empty
    count = row_count[0][0] if row_count else None
    print(f"Years: {cfg['years']} count sql execution time: {t1-t0} rows: {count}")

In [9]:
# Driver cell: configure a one-year run that reads from data/text, then
# locate the input files selected by that configuration.
env = setGlobals(data="data/text", years=1)
env = getMortgageFiles(env)

# Execute the creation script named in env["create_SQL"].
env = createLocalDB(env)

# Load the acquisition files first, then the performance files, each with
# its own load template.
env = loadTable(env, env["acq_load_template"], env["acq_files"])
env = loadTable(env, env["perf_load_template"], env["perf_files"])

p_count = "SELECT count(*) FROM perf"
a_count = "SELECT count(*) FROM acq"

# countRows takes only the query; it reads the module-level `env` set above.
countRows(a_count)
countRows(p_count)

# Time and run the summary query.
env = runSummarySQL(env)

platform_info  entered
{'dbname=postgres user=robertdill'}
Connected to postgres database
Executing: DROP DATABASE IF EXISTS mortgage_analysis;

Executing: CREATE DATABASE mortgage_analysis;

Executing: DROP TABLE IF EXISTS acq;

Executing: CREATE TABLE acq (id SERIAL PRIMARY KEY, loan_id VARCHAR(255), orig_channel VARCHAR(255), seller_name VARCHAR(255), orig_interest_rate VARCHAR(255), orig_upb VARCHAR(255), orig_loan_term VARCHAR(255), orig_date VARCHAR(255), first_pay_date VARCHAR(255), orig_ltv VARCHAR(255), orig_cltv VARCHAR(255), num_borrowers VARCHAR(255), dti VARCHAR(255), borrower_credit_score INTEGER, first_home_buyer VARCHAR(255), loan_purpose VARCHAR(255), property_type VARCHAR(255), num_units VARCHAR(255), occupancy_status VARCHAR(255), property_state VARCHAR(255), zip VARCHAR(255), mortgage_insurance_percent VARCHAR(255), product_type VARCHAR(255), coborrow_credit_score VARCHAR(255), mortgage_insurance_type VARCHAR(255), relocation_mortgage_indicator VARCHAR(255), dummy V

InvalidTextRepresentation: invalid input syntax for type integer: ""
CONTEXT:  COPY acq, line 7, column borrower_credit_score: ""


In [None]:
# Standalone run of the summary query; requires the `env` dict built by the
# driver cell above.
env = runSummarySQL(env)