# Upload sqlite3 database dumb from Hive to IBM Postgres PaaS

Inputs required, but not included in notebook for security:
- [connection string](/home/matt/etl/production/connection_str)__[*]__
- [ssl certificate](/home/matt/etl/production/root.crt)__[*]__
- [sqlite database dump](/home/matt/etl/production/os.db)

__[*]__ = required by web app for connection

In [1]:
import sqlite3
import psycopg2
from psycopg2.extras import execute_values
import pandas as pd
import base64
import os
import re
import sys
from IPython.display import display, HTML
import time
import tqdm

  """)


In [2]:
pg_conn_str = open('connection_str').read().strip()

In [3]:
sqlite_db_file = 'os.db'

## Connect to sqlite3

In [4]:
sqlite_db_con = sqlite3.connect(sqlite_db_file)

In [5]:
sqlite_db_schema = pd.read_sql('SELECT * FROM sqlite_master', sqlite_db_con)

In [6]:
sqlite_db_schema

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,expends,expends,2,"CREATE TABLE expends (\n cycle TEXT(4),..."
1,table,cands,cands,692616,"CREATE TABLE cands (\n cycle TEXT(4), \..."
2,table,indivs,indivs,692748,"CREATE TABLE indivs (\n cycle TEXT(4), ..."
3,table,pacs2cands,pacs2cands,1230481,CREATE TABLE pacs2cands (\n cycle TEXT(...
4,table,cmtes,cmtes,1234645,"CREATE TABLE cmtes (\n cycle TEXT(4), \..."
5,table,pacs2pacs,pacs2pacs,1230480,CREATE TABLE pacs2pacs (\n cycle TEXT(4...
6,table,industry,industry,1240755,CREATE TABLE industry (\n code TEXT(50)...
7,table,biggest_donors,biggest_donors,1240765,"CREATE TABLE biggest_donors (donor text, donor..."
8,table,pacs2pacs_recip,pacs2pacs_recip,1240782,"CREATE TABLE pacs2pacs_recip (donor_id text, d..."
9,table,simple_pacs2pacs,simple_pacs2pacs,1240783,CREATE TABLE simple_pacs2pacs (donor_id text(9...


## Connect to IBM PostgreSQL

#### Decode SSL cert, store in ~/.postgres/root.crt

In [7]:
if 'HOME' not in os.environ:
    whoiam = !whoami
    whoiam = whoiam[0]
    home = f'/home/{whoiam}' if whoiam != 'root' else '/root'
else:
    home = os.environ['HOME']

In [8]:
pgdir = os.path.join(home, '.postgresql')

In [9]:
print(pgdir)

/home/matt/.postgresql


In [10]:
os.makedirs(pgdir, exist_ok=True)

In [11]:
# translate IBM base64-encoded root cert info to regular string, store in ~/.postgres/root.crt
with open(os.path.join(pgdir, 'root.crt'), 'wb') as f:
    with open('root.crt', 'r') as r:
        f.write(base64.standard_b64decode(r.read()))

#### Test connection

In [12]:
pg = psycopg2.connect(pg_conn_str)

In [13]:
pd.read_sql("SELECT table_name FROM information_schema.tables WHERE table_schema = 'public';", pg)

Unnamed: 0,table_name
0,expends
1,cands
2,indivs
3,pacs2cands
4,cmtes
5,pacs2pacs
6,industry
7,biggest_donors
8,pacs2pacs_recip
9,simple_pacs2pacs


## Sqlite3 schema -> Postgres schema

In [14]:
for _, row in sqlite_db_schema.iterrows():
    display(HTML(f'<h4>{row.tbl_name}</h2>'))
    print(row['sql'])

CREATE TABLE expends (
        cycle TEXT(4), 
        id INT, 
        trans_id TEXT(20), 
        filer_id TEXT(9), 
        recip_code TEXT(2), 
        pac_name TEXT(50), 
        recip_name TEXT(90),
        exp_code TEXT(3),
        amount REAL,
        date TEXT(12),
        city TEXT(30),
        state TEXT(2),
        zip TEXT(5),
        alt_cmte_id TEXT(9),
        cand_id TEXT(9),
        trans_type TEXT(3),
        description TEXT(100),
        pg TEXT(5),
        pg_other TEXT(20),
        recip_type TEXT(3),
        source TEXT(5)
        )


CREATE TABLE cands (
        cycle TEXT(4), 
        fec_cand_id TEXT(9), 
        cand_id TEXT(9), 
        name TEXT(50), 
        party TEXT(1), 
        sought_office TEXT(4), 
        current_office TEXT(4), 
        curr_cand TEXT(1),
        cycle_cand TEXT(1),
        cpr_ico TEXT(1),
        recip_code TEXT(2),
        no_pacs TEXT(1))


CREATE TABLE indivs (
        cycle TEXT(4), 
        fec_trans_id TEXT(19), 
        contrib_id TEXT(12), 
        name TEXT(50), 
        recip_id TEXT(9), 
        org TEXT(50), 
        parent_org TEXT(50), 
        industry TEXT(5),
        date TEXT(12),
        amount INT,
        street TEXT(40),
        city TEXT(30),
        state TEXT(2),
        zip TEXT(5),
        recip_code TEXT(2),
        trans_type TEXT(3),
        committee_id TEXT(9),
        other_id TEXT(9),
        gender TEXT(1),
        microfilm TEXT(11),
        occupation TEXT(38),
        employer TEXT(38),
        source TEXT(5)
        )


CREATE TABLE pacs2cands (
        cycle TEXT(4), 
        fec_rec_id TEXT(19), 
        pac_id TEXT(9), 
        cand_id TEXT(9), 
        amount INT,
        date TEXT(12), 
        industry TEXT(5), 
        trans_type TEXT(3), 
        contrib_type TEXT(1),
        fec_cand_id TEXT(9))


CREATE TABLE cmtes (
        cycle TEXT(4), 
        cmte_id TEXT(9), 
        name TEXT(50), 
        affiliate TEXT(50),
        parent TEXT(50), 
        recip_id TEXT(9), 
        recip_code TEXT(2), 
        fec_cand_id TEXT(9), 
        party TEXT(1),
        industry TEXT(5),
        source TEXT(5), 
        sensitive TEXT(1),
        foreign_owner INTEGER,
        active_now INTEGER
        )


CREATE TABLE pacs2pacs (
        cycle TEXT(4), 
        fec_rec_id TEXT(19), 
        filer_id TEXT(9), 
        donor_cmte TEXT(50), 
        name TEXT(50), 
        city TEXT(30),
        state TEXT(2),
        zip TEXT(5),
        occupation TEXT(38),
        donor_industry TEXT(5),
        date TEXT(12),
        amount REAL, 
        recipient_id TEXT(9), 
        party TEXT(1),
        cmte_id TEXT(9),
        recip_code TEXT(2),
        recip_industry TEXT(5), 
        ammended TEXT(1), 
        report_type TEXT(3), 
        pg TEXT(1),
        microfilm TEXT(11),
        trans_type TEXT(3),
        donor_industry2 TEXT(5),
        source TEXT(5))


CREATE TABLE industry (
        code TEXT(50), 
        name TEXT(50), 
        catorder TEXT(50), 
        sector TEXT(50), 
        sector_long TEXT(100)
        )


CREATE TABLE biggest_donors (donor text, donor_total int, recipient text, donation int)


CREATE TABLE pacs2pacs_recip (donor_id text, donor_name text, state text, zip text, occupation text, donor_industry text, amount int, recip_id text, party text, recip_industry text )


CREATE TABLE simple_pacs2pacs (donor_id text(9), donor_name text(50), state text, zip text, occupation text, donor_industry text, amount int, recip_id text, party text, recip_industry text, type text )


In [15]:
pg_schema = sqlite_db_schema

In [16]:
pg_schema['pg_sql'] = pg_schema['sql'].map(lambda x: re.sub(r'[Tt][Ee][Xx][Tt]\(([0-9]+)\)',
                                                            r'VARCHAR(\1)',
                                                            x,
                                                            flags=re.IGNORECASE))

In [17]:
pg_schema['pg_sql'] = pg_schema['pg_sql'].map(lambda x: re.sub(r'(\s)real([,\)])',
                                                               r'\1DOUBLE PRECISION\2',
                                                               x,
                                                               flags=re.IGNORECASE))

In [18]:
pg_schema['pg_sql'] = pg_schema['pg_sql'].map(lambda x: re.sub(r'(\s)int([,\)])',
                                                               r'\1INTEGER\2',
                                                               x,
                                                               flags=re.IGNORECASE))

In [19]:
# fix zip
pg_schema['pg_sql'] = pg_schema['pg_sql'].map(lambda x: re.sub(r'zip VARCHAR\(5\)',
                                                               r'zip VARCHAR(9)',
                                                               x,
                                                               flags=re.IGNORECASE))

In [20]:
# fix microfilm varchar width
pg_schema['pg_sql'] = pg_schema['pg_sql'].map(lambda x: re.sub(r'microfilm VARCHAR\([0-9]+\)',
                                                               r'microfilm VARCHAR(21)',
                                                               x,
                                                               flags=re.IGNORECASE))

In [21]:
# fix source varchar width
pg_schema['pg_sql'] = pg_schema['pg_sql'].map(lambda x: re.sub(r'source VARCHAR\([0-9]+\)',
                                                               r'source VARCHAR(9)',
                                                               x,
                                                               flags=re.IGNORECASE))

In [22]:
for _, row in sqlite_db_schema.iterrows():
    display(HTML(f'<h4>{row.tbl_name}</h2>'))
    print(row['pg_sql'])

CREATE TABLE expends (
        cycle VARCHAR(4), 
        id INTEGER, 
        trans_id VARCHAR(20), 
        filer_id VARCHAR(9), 
        recip_code VARCHAR(2), 
        pac_name VARCHAR(50), 
        recip_name VARCHAR(90),
        exp_code VARCHAR(3),
        amount DOUBLE PRECISION,
        date VARCHAR(12),
        city VARCHAR(30),
        state VARCHAR(2),
        zip VARCHAR(9),
        alt_cmte_id VARCHAR(9),
        cand_id VARCHAR(9),
        trans_type VARCHAR(3),
        description VARCHAR(100),
        pg VARCHAR(5),
        pg_other VARCHAR(20),
        recip_type VARCHAR(3),
        source VARCHAR(9)
        )


CREATE TABLE cands (
        cycle VARCHAR(4), 
        fec_cand_id VARCHAR(9), 
        cand_id VARCHAR(9), 
        name VARCHAR(50), 
        party VARCHAR(1), 
        sought_office VARCHAR(4), 
        current_office VARCHAR(4), 
        curr_cand VARCHAR(1),
        cycle_cand VARCHAR(1),
        cpr_ico VARCHAR(1),
        recip_code VARCHAR(2),
        no_pacs VARCHAR(1))


CREATE TABLE indivs (
        cycle VARCHAR(4), 
        fec_trans_id VARCHAR(19), 
        contrib_id VARCHAR(12), 
        name VARCHAR(50), 
        recip_id VARCHAR(9), 
        org VARCHAR(50), 
        parent_org VARCHAR(50), 
        industry VARCHAR(5),
        date VARCHAR(12),
        amount INTEGER,
        street VARCHAR(40),
        city VARCHAR(30),
        state VARCHAR(2),
        zip VARCHAR(9),
        recip_code VARCHAR(2),
        trans_type VARCHAR(3),
        committee_id VARCHAR(9),
        other_id VARCHAR(9),
        gender VARCHAR(1),
        microfilm VARCHAR(21),
        occupation VARCHAR(38),
        employer VARCHAR(38),
        source VARCHAR(9)
        )


CREATE TABLE pacs2cands (
        cycle VARCHAR(4), 
        fec_rec_id VARCHAR(19), 
        pac_id VARCHAR(9), 
        cand_id VARCHAR(9), 
        amount INTEGER,
        date VARCHAR(12), 
        industry VARCHAR(5), 
        trans_type VARCHAR(3), 
        contrib_type VARCHAR(1),
        fec_cand_id VARCHAR(9))


CREATE TABLE cmtes (
        cycle VARCHAR(4), 
        cmte_id VARCHAR(9), 
        name VARCHAR(50), 
        affiliate VARCHAR(50),
        parent VARCHAR(50), 
        recip_id VARCHAR(9), 
        recip_code VARCHAR(2), 
        fec_cand_id VARCHAR(9), 
        party VARCHAR(1),
        industry VARCHAR(5),
        source VARCHAR(9), 
        sensitive VARCHAR(1),
        foreign_owner INTEGER,
        active_now INTEGER
        )


CREATE TABLE pacs2pacs (
        cycle VARCHAR(4), 
        fec_rec_id VARCHAR(19), 
        filer_id VARCHAR(9), 
        donor_cmte VARCHAR(50), 
        name VARCHAR(50), 
        city VARCHAR(30),
        state VARCHAR(2),
        zip VARCHAR(9),
        occupation VARCHAR(38),
        donor_industry VARCHAR(5),
        date VARCHAR(12),
        amount DOUBLE PRECISION, 
        recipient_id VARCHAR(9), 
        party VARCHAR(1),
        cmte_id VARCHAR(9),
        recip_code VARCHAR(2),
        recip_industry VARCHAR(5), 
        ammended VARCHAR(1), 
        report_type VARCHAR(3), 
        pg VARCHAR(1),
        microfilm VARCHAR(21),
        trans_type VARCHAR(3),
        donor_industry2 VARCHAR(5),
        source VARCHAR(9))


CREATE TABLE industry (
        code VARCHAR(50), 
        name VARCHAR(50), 
        catorder VARCHAR(50), 
        sector VARCHAR(50), 
        sector_long VARCHAR(100)
        )


CREATE TABLE biggest_donors (donor text, donor_total INTEGER, recipient text, donation INTEGER)


CREATE TABLE pacs2pacs_recip (donor_id text, donor_name text, state text, zip text, occupation text, donor_industry text, amount INTEGER, recip_id text, party text, recip_industry text )


CREATE TABLE simple_pacs2pacs (donor_id VARCHAR(9), donor_name VARCHAR(50), state text, zip text, occupation text, donor_industry text, amount INTEGER, recip_id text, party text, recip_industry text, type text )


## Perform upload

In [24]:
for _, row in tqdm.tqdm_notebook(sqlite_db_schema.iterrows(), total=len(sqlite_db_schema)):
    tbl = row['tbl_name']
    display(HTML(f'<h3>{tbl}</h3>'))
    
    pg_create_table = row['pg_sql']
    print(pg_create_table)
    
    with pg.cursor() as pgcur:
        pgcur.execute(f'DROP TABLE IF EXISTS {tbl}')
        pgcur.execute(pg_create_table)

        tbl_row_df = pd.read_sql(f'SELECT * FROM {tbl} LIMIT 1', sqlite_db_con)
        cols = ','.join(tbl_row_df.columns)
        print(cols)

        vals = sqlite_db_con.execute(f'SELECT * FROM {tbl}')
        start_time = time.time()
        try:
            execute_values(pgcur,
                           f"""INSERT INTO {tbl}({cols}) VALUES %s""",
                           vals,
                           page_size=1000)
        except Exception as e:
            print(e, file=sys.stderr)
            break

        end_time = time.time()
        print(f'inserted rows in {round((end_time - start_time)*1000, 2)}ms')
    pg.commit()

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

CREATE TABLE expends (
        cycle VARCHAR(4), 
        id INTEGER, 
        trans_id VARCHAR(20), 
        filer_id VARCHAR(9), 
        recip_code VARCHAR(2), 
        pac_name VARCHAR(50), 
        recip_name VARCHAR(90),
        exp_code VARCHAR(3),
        amount DOUBLE PRECISION,
        date VARCHAR(12),
        city VARCHAR(30),
        state VARCHAR(2),
        zip VARCHAR(9),
        alt_cmte_id VARCHAR(9),
        cand_id VARCHAR(9),
        trans_type VARCHAR(3),
        description VARCHAR(100),
        pg VARCHAR(5),
        pg_other VARCHAR(20),
        recip_type VARCHAR(3),
        source VARCHAR(9)
        )
cycle,id,trans_id,filer_id,recip_code,pac_name,recip_name,exp_code,amount,date,city,state,zip,alt_cmte_id,cand_id,trans_type,description,pg,pg_other,recip_type,source
inserted rows in 2059897.09ms


CREATE TABLE cands (
        cycle VARCHAR(4), 
        fec_cand_id VARCHAR(9), 
        cand_id VARCHAR(9), 
        name VARCHAR(50), 
        party VARCHAR(1), 
        sought_office VARCHAR(4), 
        current_office VARCHAR(4), 
        curr_cand VARCHAR(1),
        cycle_cand VARCHAR(1),
        cpr_ico VARCHAR(1),
        recip_code VARCHAR(2),
        no_pacs VARCHAR(1))
cycle,fec_cand_id,cand_id,name,party,sought_office,current_office,curr_cand,cycle_cand,cpr_ico,recip_code,no_pacs
inserted rows in 527.04ms


CREATE TABLE indivs (
        cycle VARCHAR(4), 
        fec_trans_id VARCHAR(19), 
        contrib_id VARCHAR(12), 
        name VARCHAR(50), 
        recip_id VARCHAR(9), 
        org VARCHAR(50), 
        parent_org VARCHAR(50), 
        industry VARCHAR(5),
        date VARCHAR(12),
        amount INTEGER,
        street VARCHAR(40),
        city VARCHAR(30),
        state VARCHAR(2),
        zip VARCHAR(9),
        recip_code VARCHAR(2),
        trans_type VARCHAR(3),
        committee_id VARCHAR(9),
        other_id VARCHAR(9),
        gender VARCHAR(1),
        microfilm VARCHAR(21),
        occupation VARCHAR(38),
        employer VARCHAR(38),
        source VARCHAR(9)
        )
cycle,fec_trans_id,contrib_id,name,recip_id,org,parent_org,industry,date,amount,street,city,state,zip,recip_code,trans_type,committee_id,other_id,gender,microfilm,occupation,employer,source
inserted rows in 1482608.18ms


CREATE TABLE pacs2cands (
        cycle VARCHAR(4), 
        fec_rec_id VARCHAR(19), 
        pac_id VARCHAR(9), 
        cand_id VARCHAR(9), 
        amount INTEGER,
        date VARCHAR(12), 
        industry VARCHAR(5), 
        trans_type VARCHAR(3), 
        contrib_type VARCHAR(1),
        fec_cand_id VARCHAR(9))
cycle,fec_rec_id,pac_id,cand_id,amount,date,industry,trans_type,contrib_type,fec_cand_id
inserted rows in 10987.77ms


CREATE TABLE cmtes (
        cycle VARCHAR(4), 
        cmte_id VARCHAR(9), 
        name VARCHAR(50), 
        affiliate VARCHAR(50),
        parent VARCHAR(50), 
        recip_id VARCHAR(9), 
        recip_code VARCHAR(2), 
        fec_cand_id VARCHAR(9), 
        party VARCHAR(1),
        industry VARCHAR(5),
        source VARCHAR(9), 
        sensitive VARCHAR(1),
        foreign_owner INTEGER,
        active_now INTEGER
        )
cycle,cmte_id,name,affiliate,parent,recip_id,recip_code,fec_cand_id,party,industry,source,sensitive,foreign_owner,active_now
inserted rows in 1487.51ms


CREATE TABLE pacs2pacs (
        cycle VARCHAR(4), 
        fec_rec_id VARCHAR(19), 
        filer_id VARCHAR(9), 
        donor_cmte VARCHAR(50), 
        name VARCHAR(50), 
        city VARCHAR(30),
        state VARCHAR(2),
        zip VARCHAR(9),
        occupation VARCHAR(38),
        donor_industry VARCHAR(5),
        date VARCHAR(12),
        amount DOUBLE PRECISION, 
        recipient_id VARCHAR(9), 
        party VARCHAR(1),
        cmte_id VARCHAR(9),
        recip_code VARCHAR(2),
        recip_industry VARCHAR(5), 
        ammended VARCHAR(1), 
        report_type VARCHAR(3), 
        pg VARCHAR(1),
        microfilm VARCHAR(21),
        trans_type VARCHAR(3),
        donor_industry2 VARCHAR(5),
        source VARCHAR(9))
cycle,fec_rec_id,filer_id,donor_cmte,name,city,state,zip,occupation,donor_industry,date,amount,recipient_id,party,cmte_id,recip_code,recip_industry,ammended,report_type,pg,microfilm,trans_type,donor_industry2,source
inserted rows in 15527.09ms


CREATE TABLE industry (
        code VARCHAR(50), 
        name VARCHAR(50), 
        catorder VARCHAR(50), 
        sector VARCHAR(50), 
        sector_long VARCHAR(100)
        )
code,name,catorder,sector,sector_long
inserted rows in 16.67ms


CREATE TABLE biggest_donors (donor text, donor_total INTEGER, recipient text, donation INTEGER)
donor,donor_total,recipient,donation
inserted rows in 29.28ms


CREATE TABLE pacs2pacs_recip (donor_id text, donor_name text, state text, zip text, occupation text, donor_industry text, amount INTEGER, recip_id text, party text, recip_industry text )
donor_id,donor_name,state,zip,occupation,donor_industry,amount,recip_id,party,recip_industry
inserted rows in 0.03ms


CREATE TABLE simple_pacs2pacs (donor_id VARCHAR(9), donor_name VARCHAR(50), state text, zip text, occupation text, donor_industry text, amount INTEGER, recip_id text, party text, recip_industry text, type text )
donor_id,donor_name,state,zip,occupation,donor_industry,amount,recip_id,party,recip_industry,type
inserted rows in 3928.4ms



## Create indices

In [26]:
indices = {
    'expends': ['trans_id'],
    'cands': ['cand_id', 'name'],
    'expends': ['amount'],
    'indivs': ['recip_id', 'industry'],
    'cmtes': ['cmte_id'],
    'pacs2cands': ['cand_id', 'pac_id', 'industry'],
    'industry': ['code'],
    'pacs2pacs': ['filer_id', 'donor_industry']
}

In [29]:
for key, val in indices.items():
    for v in val:
        with pg.cursor() as cur:
            index_create_stmt = f'CREATE INDEX {key}_{v}_index ON {key} ({v});'
            print(index_create_stmt)
            cur.execute(index_create_stmt)
pg.commit()

CREATE INDEX expends_amount_index ON expends (amount);
CREATE INDEX cands_cand_id_index ON cands (cand_id);
CREATE INDEX cands_name_index ON cands (name);
CREATE INDEX indivs_recip_id_index ON indivs (recip_id);
CREATE INDEX indivs_industry_index ON indivs (industry);
CREATE INDEX cmtes_cmte_id_index ON cmtes (cmte_id);
CREATE INDEX pacs2cands_cand_id_index ON pacs2cands (cand_id);
CREATE INDEX pacs2cands_pac_id_index ON pacs2cands (pac_id);
CREATE INDEX pacs2cands_industry_index ON pacs2cands (industry);
CREATE INDEX industry_code_index ON industry (code);
CREATE INDEX pacs2pacs_filer_id_index ON pacs2pacs (filer_id);
CREATE INDEX pacs2pacs_donor_industry_index ON pacs2pacs (donor_industry);
