In [1]:
import json
import os

import pandas as pd
from pandas import DataFrame as DF, Series
import numpy as np
import psycopg2 as pg

In [2]:
def insert_df(df, name):
    """ Inserts rows from a dataframe into a postgres table.
    
        df : pandas dataframe
        name : name of the table in postgres
    """
    global cur
    
    cols = to_df('select * from ' + name).columns.tolist()
    rows = [tuple(r)[1:] for r in df[cols].itertuples()]
    n = len(rows[0])
    row_values = ','.join([cur.mogrify('({})'.format(
        ', '.join(n*['%s'])), r).decode('utf-8') for r in rows])
    q = 'INSERT INTO {} VALUES '.format(name.upper()) + row_values
    cur.execute(q)
#     print('Inserted {} rows into {}'.format(len(df), name))

    
def to_df(q):
    """ Return dataframe containing query results.
    
        q : query string that will be executed by postgres
    """
    global cur
    try:
        cur.execute(q)
    except Exception as e:
        db.rollback()
        print(e)
        return None
    return DF(cur.fetchall(), 
              columns=[d[0] for d in cur.description])

def nan_to_null(f,
        _NULL=pg.extensions.AsIs('NULL'),
        _NaN=np.NaN,
        _Float=pg.extensions.Float):
    if f is not _NaN:
        return _Float(f)
    return _NULL

pg.extensions.register_adapter(float, nan_to_null)

In [3]:
host = '54.202.102.40'
# host = '0.0.0.0'
dbname = 'local-elections-finance'
port = '5432'
password = 'mqPbgkRoQXognWRz2tMQ'
# password = 'local-elections'
db = pg.connect(host=host, dbname=dbname, user='local-elections', password=password)
cur = db.cursor()

In [4]:
files = os.listdir('cleaned_scrapes')
files

['transactions_cleaned.csv',
 'trans_details_cleaned.csv',
 'statement_cleaned.csv',
 'election_activity_cleaned.csv',
 'comm_history_cleaned.csv']

In [5]:
all_dfs = {}
for f in files:
    name = f.split('.')[0]
    all_dfs[name] = pd.read_csv(os.path.join('cleaned_scrapes', f))

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
list(all_dfs)

['election_activity_cleaned',
 'trans_details_cleaned',
 'transactions_cleaned',
 'comm_history_cleaned',
 'statement_cleaned']

### Replace all records in table

In [7]:
def del_all_records(table):
    q = "DELETE FROM {}".format(table)
    try:
        cur.execute(q)
        db.commit()
    except Exception as e:
        db.rollback()
        print(e)

In [8]:
to_df("select table_name from information_schema.tables where table_schema='public'")

Unnamed: 0,table_name
0,ballots
1,committee_history
2,transaction_details
3,committees_list
4,donor
5,election_activity
6,payee
7,statement_of_org
8,transactions


In [29]:
import re
import sys

for name, df in all_dfs.items():
    name = re.sub('_cleaned', '', name)
    name = re.sub('comm_', 'committee_', name)
    name = re.sub('trans_', 'transaction_', name)
    name = re.sub('statement', 'statement_of_org', name)
    print(name.upper())
    del_all_records(name)
    chunksize = 5000
    try:
        for i in range(0, len(df), chunksize):
            chunk = df.iloc[i:i+chunksize]
            insert_df(chunk, name)
            n = min(i+chunksize, len(df))
            message = '\rInserted {:>7}/{} rows into {}'.format(n, len(df), name)
            sys.stdout.write(message)
            sys.stdout.flush()
        print()
    except Exception as e:
        db.rollback()
        print(name)
        print(e)
        print()
db.commit()

TRANSACTION_DETAILS
Inserted  500000/541368 rows into transaction_detailstransaction_details
duplicate key value violates unique constraint "transaction_details_pkey"
DETAIL:  Key (transaction_id)=(1034447) already exists.


COMMITTEE_HISTORY
Inserted    9525/9525 rows into committee_history
ELECTION_ACTIVITY
Inserted   15347/15347 rows into election_activity
TRANSACTIONS
Inserted 1022561/1022561 rows into transactions
STATEMENT_OF_ORG
Inserted    1596/1596 rows into statement_of_org


For some reason there are ~30k records that are duplicated except for the value in Agg (e.g. 0 vs nan)

In [9]:
all_dfs['trans_details_cleaned'].shape

(541368, 24)

In [10]:
all_dfs['trans_details_cleaned'].dropna(subset=['aggregate']).shape

(491200, 24)

In [11]:
duplicate_trans_ids = all_dfs['trans_details_cleaned'][
    all_dfs['trans_details_cleaned'].duplicated(subset=['transaction_id'])
    ].transaction_id.tolist()

In [12]:
diffs = {}
for tx in duplicate_trans_ids:
    dfx = all_dfs['trans_details_cleaned'][
            all_dfs['trans_details_cleaned'].transaction_id == tx
            ].fillna(-1)
    x = dfx.iloc[0] == dfx.iloc[1]
    colmask = x[x == False].index
    dfx = dfx[colmask]
    diffs[tx] = dfx

In [33]:
diffcols = {}
difftups = set()
for k,df in diffs.items():
    difftups.update([tuple(df)])
    for c in df:
        diffcols.setdefault(c, set())
        diffcols[c].update([tuple(df.loc[:, c])])

In [34]:
difftups

{('aggregate',),
 ('aggregate', 'amount'),
 ('aggregate', 'associations'),
 ('aggregate', 'process_status'),
 ('associations',),
 ('process_status',)}

In [35]:
list(diffcols)

['aggregate', 'amount', 'process_status', 'associations']

In [44]:
for k in diffcols:
#     if k == 'associations':
#         continue
    print(20*'-')
    print(k.upper(), '\n')
    print(diffcols[k], '\n')

--------------------
AGGREGATE 

{(281.79, 170.79), (0.0, -1.0), (8727.0, 5818.0), (146.11, 119.54)} 

--------------------
AMOUNT 

{(45.31, 14.5), (-3.5, -1.0), (-16.23, -1.0), (-0.01, -1.0), (2.4, 1.2), (-180.29, -1.0), (9.92, 4.96), (-3649.65, -1.0), (-640.94, -1.0), (-5.0, -1.0), (-1249.82, -1.0), (-74.62, -1.0), (-43771.05, -1.0), (-74.0, -1.0), (-80.0, -1.0), (-47.33, -1.0), (-2013.45, -1.0), (-24.48, -1.0), (-96.0, -1.0), (-3552.16, -1.0), (-14.0, -1.0), (-460.85, -1.0), (-4474.09, -1.0), (-217.07, -1.0), (-473.43, -1.0), (-1241.68, -1.0), (-371.0, -1.0), (-0.36, -1.0)} 

--------------------
PROCESS_STATUS 

{('Insufficient/Statutory', 'Filed'), ('Complete', 'Filed')} 

--------------------
ASSOCIATIONS 

{('Miscellaneous Account Receivable for $85.00 from Bobbi Jo Annal on 09/24/2013 (1584242) Complete', 'Miscellaneous Account Receivable for $85.00 from Bobbi Jo Annal on 09/24/2013 (1584242) Complete, '), ('Account Payable for $588.00 from Western Communications, Inc. on 06/0

In [23]:
all_dfs['trans_details_cleaned'].process_status.unique()

array(['Complete', 'Insufficient/Statutory', 'Insufficient/General',
       'Filed'], dtype=object)

In [18]:
all_dfs['trans_details_cleaned'].select_dtypes(['O']).apply(lambda x: x.str.len()).max()

address                           101.0
address_book_type                  28.0
agent                              32.0
associations                     1216.0
description                       199.0
due_date                           22.0
employer_name                      92.0
filed_date                         22.0
name                              115.0
occupation                         91.0
occupation_letter_date             10.0
payer_of_personal_expenditure      54.0
payment_method                     25.0
process_status                     22.0
purpose                           271.0
repayment_schedule                 89.0
transaction_date                   10.0
transaction_sub_type               39.0
transaction_type                   24.0
dtype: float64

In [28]:
cur.execute("alter table transaction_details alter column purpose type varchar(512)")
db.commit()

In [51]:
def fix_assoc(x):
    if x != x:
        return x
    return x.strip(', ')

all_dfs['trans_details_cleaned'][all_dfs['trans_details_cleaned'].associations.notnull()].associations.apply(lambda x: fix_assoc(x)).values[3]

'Loan Received (Non-Exempt) for $100.00 from Ken Humberston on 07/01/2015 (2037771) Complete, Loan Received (Non-Exempt) for $100.00 from Ken Humberston on 05/01/2015 (2037594) Complete, Loan Received (Non-Exempt) for $100.00 from Ken Humberston on 06/01/2015 (2037769) Complete, Loan Received (Non-Exempt) for $100.00 from Ken Humberston on 01/02/2015 (2037589) Complete, Loan Received (Non-Exempt) for $100.00 from Ken Humberston on 04/01/2015 (2037592) Complete, Loan Received (Non-Exempt) for $100.00 from Ken Humberston on 03/02/2015 (2037591) Complete, Loan Received (Non-Exempt) for $100.00 from Ken Humberston on 02/02/2015 (2037590) Complete'

In [54]:
s = all_dfs['trans_details_cleaned'][all_dfs['trans_details_cleaned'].associations.notnull()].associations.values[3]

In [61]:
all_dfs['trans_details_cleaned'][all_dfs['trans_details_cleaned'].associations.notnull()].iloc[3]

address                                 14287 S Firethorne Ct Oregon City OR 97045
address_book_type                                     Candidate's Immediate Family
agent                                                                          NaN
aggregate                                                                     1300
amount                                                                         700
associations                     Loan Received (Non-Exempt) for $100.00 from Ke...
description                                                                    NaN
due_date                                                    01/05/2017 11:59:00 PM
employer_name                                                         Not Employed
filed_date                                                  12/14/2016 08:24:40 PM
name                                                                Ken Humberston
occupation                                                                     NaN
occu

In [56]:
s = s.split(', ')

In [57]:
s

['Loan Received (Non-Exempt) for $100.00 from Ken Humberston on 07/01/2015 (2037771) Complete',
 'Loan Received (Non-Exempt) for $100.00 from Ken Humberston on 05/01/2015 (2037594) Complete',
 'Loan Received (Non-Exempt) for $100.00 from Ken Humberston on 06/01/2015 (2037769) Complete',
 'Loan Received (Non-Exempt) for $100.00 from Ken Humberston on 01/02/2015 (2037589) Complete',
 'Loan Received (Non-Exempt) for $100.00 from Ken Humberston on 04/01/2015 (2037592) Complete',
 'Loan Received (Non-Exempt) for $100.00 from Ken Humberston on 03/02/2015 (2037591) Complete',
 'Loan Received (Non-Exempt) for $100.00 from Ken Humberston on 02/02/2015 (2037590) Complete']

In [25]:
to_df('select * from transaction_details')

Unnamed: 0,transaction_id,payee_id,donor_id,address,address_book_type,agent,aggregate,amount,associations,description,...,occupation,occupation_letter_date,payer_of_personal_expenditure,payment_method,process_status,purpose,repayment_schedule,transaction_date,transaction_sub_type,transaction_type
