# Cleaning of All Files

In [1]:
import io
import json
import re
import os

import pandas as pd
from pandas import DataFrame as DF, Series
import numpy as np

import requests
import missingno as msn
import psycopg2 as pg

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')

In [2]:
# Directory that holds the scraped CSV files.
# expanduser('~') replaces os.getenv('HOME'): getenv returns None when HOME is
# unset, which would make os.path.join raise a TypeError.
PATH = os.path.join(os.path.expanduser('~'), 'Projects/HackOregon/elections-2018/scrape_files')

In [3]:
from glob import glob

# Enter the scrape directory so glob returns bare file names (no directory part).
os.chdir(PATH)
files = [f for f in glob('*.csv')]
# Step back up two levels; the loader cell reads the files through the relative
# path 'elections-2018/scrape_files/<name>' from this directory.
os.chdir('../../')

In [4]:
files

['transactions_gaps33_4.csv',
 'transaction_detail12.csv',
 'transaction_detail3.csv',
 'transaction_detail22.csv',
 'election_activity3.csv',
 'statement_of_organization3.csv',
 'all_transactions195.csv',
 'all_transactions4.csv',
 'transaction_detail18.csv',
 'all_transactions6.csv',
 'transaction_detail195.csv',
 'election_activity_first_batch.csv',
 'transaction_detail17.csv',
 'all_transactions9.csv',
 'transactions_gaps33_7.csv',
 'transaction_detail13.csv',
 'all_transactions10.csv',
 'transaction_detail_first_batch.csv',
 'all_transactions18.csv',
 'all_transactions8.csv',
 'all_transactions115.csv',
 'transactions_gaps275_2.csv',
 'trans_detail_first_batch_clean.csv',
 'transactions_gaps348_3.csv',
 'all_transactions14.csv',
 'all_transactions1152.csv',
 'transactions_gaps33_11.csv',
 'all_transactions15.csv',
 'transactions_gaps348_2.csv',
 'all_transactions11.csv',
 'all_transactions13.csv',
 'transaction_detail11.csv',
 'transactions_gaps33_12.csv',
 'transaction_detail14.c

In [5]:
import warnings
from ast import literal_eval

# Maps a file-name prefix (the text before the first '_') to the key under
# which that family of scrape files is combined in `dfs`.
map_names = {
    'all': 'transactions',
    'transactions': 'transactions',
    'transaction': 'trans_details',
    'statement': 'statement',
    'committee': 'comm_history',
    'election': 'election_activity',
    }

# Collect the per-file frames first and concatenate once per family at the end.
# Growing a DataFrame with .append inside the loop copies the data on every
# iteration, and DataFrame.append no longer exists in pandas 2.x.
frames = {}
for f in files:
    head = f.split('_')[0]
    if head == 'trans':
        # Files whose name starts with 'trans_' are skipped.
        continue
    head = map_names[head]
    path = 'elections-2018/scrape_files/{}'.format(f)

    # Record warnings for THIS file only. A recorder shared by the whole loop
    # keeps warnings from earlier files, so a stale message would decide how
    # later files are re-read. 'always' keeps repeated warnings from being
    # suppressed by the once-per-location default filter.
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter('always')
        df = pd.read_csv(path)

    mixed = [str(u.message) for u in w if 'have mixed types' in str(u.message)]
    if mixed:
        # The message starts with "Columns (3,5) have mixed types"; pull the
        # column positions out of the parentheses. This also covers the
        # single-column form "(3)".
        found = re.search(r'Columns \(([\d,\s]+)\)', mixed[-1])
        positions = [int(p) for p in found.group(1).split(',') if p.strip()] if found else []
        if positions:
            # Re-read with the mixed-type columns forced to str.
            df = pd.read_csv(path, dtype={k: str for k in positions})

    frames.setdefault(head, []).append(df)

# One combined frame per family; original row labels are kept (no re-indexing).
dfs = {head: pd.concat(parts) for head, parts in frames.items()}

In [6]:
list(dfs)

['comm_history',
 'transactions',
 'statement',
 'election_activity',
 'trans_details']

## Routines

### Visually Inspect Column Values

In [7]:
def inspect_cols(df):
    """Print the distinct values of every column of ``df`` for eyeballing.

    Columns with fewer than 100 distinct non-null values are printed in full;
    for the others only the first 25 distinct values are shown and the column
    name is listed again in a summary at the end.
    """
    many_valued = []

    for name in df:
        column = df.loc[:, name]
        show_all = column.nunique() < 100
        print('-' * 40)
        print(name.upper(), '\n')
        distinct = column.unique()
        if not show_all:
            distinct = distinct[:25]
        print(distinct.tolist(), '\n')
        if not show_all:
            many_valued.append(name)

    print('=' * 40)
    print('Columns with many unique values:\n')
    print(many_valued)

### Find Column Values That Mismatch Pattern

In [8]:
def check_values(df, col_pattern, val_pattern):
    """Print the distinct values that fail ``val_pattern`` in selected columns.

    A column is selected when ``col_pattern`` is found anywhere in its name.
    Nulls are ignored. Each remaining value is converted with ``str`` and
    tested with ``re.match``, so only the start of the text has to fit the
    pattern.
    """
    def mismatches(value):
        return re.match(val_pattern, str(value)) is None

    selected = [name for name in df if re.search(col_pattern, name)]

    for name in selected:
        present = df.loc[:, name].dropna()
        flagged = present.apply(mismatches)
        bad = present[flagged].drop_duplicates()
        print('-' * 40)
        print(name.upper(), '\n')
        print(bad, '\n')

## Statement DF

In [9]:
dfs['statement'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1617 entries, 0 to 124
Data columns (total 26 columns):
Candidate Address                  960 non-null object
Committee                          0 non-null float64
Committee Acronym                  295 non-null object
Committee Address                  1596 non-null object
Committee Campaign Phone           1384 non-null object
Committee Filing Effective From    1596 non-null object
Committee Filing Type              1596 non-null object
Committee ID                       1596 non-null float64
Committee Name                     1596 non-null object
Committee PAC Type                 632 non-null object
Election/Office                    960 non-null object
Email Address                      958 non-null object
Employer                           859 non-null object
Fax                                150 non-null object
Home Phone                         663 non-null object
Mailing Address                    960 non-null object
Name    

### Inspect Columns

In [10]:
inspect_cols(dfs['statement'])

----------------------------------------
CANDIDATE ADDRESS 

[nan, '1907 NW Mill Pond Road, Portland,           OR      97229         - 7553', '3393 Arlington Avenue, Eugene,           OR      97408', '23695 SW Brittany Ln, Sherwood,           OR      97140', '1003 Terrace Dr, Lake Oswego,           OR      97034', '1163 State St, Salem,           OR      97301', '6771 SW 162nd Drive, Beaverton,           OR      97007         - 4894', '7811 Old Stage Rd, Central Point,           OR      97502', '17915 NW Lonerock Dr, Portland,           OR      97229', '9026 SW 36th Ave, Portland,           OR      97219', '2138 Lois Drive, Roseburg,           OR      97470', '18340 SW Monte Verdi Blvd, Aloha,           OR      97007', '2248 Potter St, Eugene,           OR      97405', '410 Jefferson Street, Oregon City,           OR      97045', '22760 SW 87th Pl., Tualatin,           OR      97062', '660 Morgan Ave, Ontario,           OR      97914', '805 Kingwood Dr NW, Salem,           OR      973

### Phone

In [11]:
# Expected raw layout: "(ddd) ddd-dddd", optionally followed by a space, an
# optional "x" and up to four digits.
ph_pattern = '\(\d{3}\)\s*\d{3}-\d{4}( x{0,1}\d{0,4}){0,1}'
# Selects every column whose name contains "Fax" or "Phone".
c = '(Fax|Phone)'
check_values(dfs['statement'], c, ph_pattern)

----------------------------------------
COMMITTEE CAMPAIGN PHONE 

Series([], Name: Committee Campaign Phone, dtype: object) 

----------------------------------------
FAX 

717     5038514351
1280    5038126773
Name: Fax, dtype: object 

----------------------------------------
HOME PHONE 

31      Exempt from public record
448                  541-536-7444
617                    5418060837
728                    5034345365
816                  503 621-6316
1184                   5415799022
Name: Home Phone, dtype: object 

----------------------------------------
TREASURER FAX 

688     5037417574
717     5038514351
1251    5032706995
1280    5038126773
Name: Treasurer Fax, dtype: object 

----------------------------------------
TREASURER HOME PHONE 

210     Exempt from public record
688                    5037417574
1003                   5417586842
1090                   5419332256
1184                   5415799022
1218                   9712664290
1251                   5032706

Phone/Fax numbers all appear to be correct, but the format needs to be standardized.

### Check Dates

In [12]:
# MM/DD/YYYY with the year limited to 1900-2018. The month and day alternatives
# also accept "00", and check_values only tests the start of each value.
date_pattern = '(0[0-9]|1[0-2])/([0-2][0-9]|3[01])/(19\d{2}|200[0-9]|201[0-8])'
c = 'Committee Filing Effective From'
check_values(dfs['statement'], c, date_pattern)

----------------------------------------
COMMITTEE FILING EFFECTIVE FROM 

Series([], Name: Committee Filing Effective From, dtype: object) 



All dates are good.

### Check Committee ID

In [13]:
# An ID passes when its text starts with at least one digit.
id_pattern = '\d+'
c = 'Committee ID'
check_values(dfs['statement'], c, id_pattern)

----------------------------------------
COMMITTEE ID 

Series([], Name: Committee ID, dtype: float64) 



ID values are good

### Clean Statement Columns

#### Standardize Phone/Fax

In [14]:
def std_phone(x):
    """Normalize one phone/fax value to the layout ``ddd ddd dddd[ xNNNN]``.

    Parameters
    ----------
    x : str, None or NaN
        Raw cell value.

    Returns
    -------
    str or None
        ``None`` for missing input (None or NaN); the standardized number when
        ``x`` starts with one of the known layouts; the fixed text
        ``'Exempt from public record'`` when ``x`` starts with "exempt" or
        "Exempt"; otherwise ``x`` unchanged.
    """
    # NaN is the only value that differs from itself; treat None the same way.
    if x is None or x != x:
        return None

    # (pattern, converter) pairs, tried in order; converters run only for the
    # pattern that matched. The parenthesised layout is spelled out here rather
    # than read from the module-level `ph_pattern`, because that name is
    # rebound to the standardized layout by a later cell.
    rules = (
        # 5038514351
        (r'\d{10}',
            lambda s: ' '.join([s[:3], s[3:6], s[6:]])),
        # (503) 851-4351
        (r'\(\d{3}\)\s*\d{3}-\d{4}( x{0,1}\d{0,4}){0,1}',
            lambda s: re.sub(r'(\)\s*|-)', ' ', s.strip('('))),
        # 541-536-7444
        (r'\d{3}-\d{3}-\d{4}( x{0,1}\d{0,4}){0,1}',
            lambda s: s.replace('-', ' ')),
        # 503 621-6316
        (r'\d{3} \d{3}-\d{4}( x{0,1}\d{0,4}){0,1}',
            lambda s: s.replace('-', ' ')),
        (r'(exempt|Exempt)',
            lambda s: 'Exempt from public record'),
    )
    for pattern, convert in rules:
        if re.match(pattern, x):
            return convert(x)
    return x

In [15]:
# Every statement column whose name mentions Fax or Phone gets standardized.
cols = [name for name in dfs['statement'] if re.search('(Fax|Phone)', name)]

for c in cols:
    dfs['statement'].loc[:, c] = dfs['statement'].loc[:, c].apply(std_phone)

In [16]:
# Standardized layout: "ddd ddd dddd", optionally followed by a space, an
# optional "x" and up to four digits.
# NOTE: this rebinds the module-level `ph_pattern`; any code that reads that
# global after this cell has run sees the new pattern.
ph_pattern = '\d{3} \d{3} \d{4}( x{0,1}\d{0,4}){0,1}'
c = '(Fax|Phone)'
check_values(dfs['statement'], c, ph_pattern)

----------------------------------------
COMMITTEE CAMPAIGN PHONE 

Series([], Name: Committee Campaign Phone, dtype: object) 

----------------------------------------
FAX 

Series([], Name: Fax, dtype: object) 

----------------------------------------
HOME PHONE 

31    Exempt from public record
Name: Home Phone, dtype: object 

----------------------------------------
TREASURER FAX 

Series([], Name: Treasurer Fax, dtype: object) 

----------------------------------------
TREASURER HOME PHONE 

210    Exempt from public record
Name: Treasurer Home Phone, dtype: object 

----------------------------------------
TREASURER WORK PHONE 

Series([], Name: Treasurer Work Phone, dtype: object) 

----------------------------------------
WORK PHONE 

1013    Exempt from public record
Name: Work Phone, dtype: object 



In [39]:
dfs['statement'].to_csv('statement_cleaned.csv', index=False)

## Trans Details DF

In [17]:
dfs['trans_details'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714391 entries, 0 to 5468
Data columns (total 37 columns):
Address                             532482 non-null object
Address Book Type                   536535 non-null object
Agent                               1910 non-null object
Aggregate                           664223 non-null object
Amount                              714391 non-null object
Associations                        10682 non-null object
Check                               152195 non-null float64
City, State, Zip Code               0 non-null float64
CoSigner Obligations                0 non-null float64
Description                         209571 non-null object
Due Date                            714376 non-null object
Employer                            0 non-null float64
Employer City, State                0 non-null float64
Employer Name                       201455 non-null object
Exam Letter Date                    560583 non-null object
Filed Date              

### Inspect Columns

In [18]:
inspect_cols(dfs['trans_details'])

----------------------------------------
ADDRESS 

['Business Services Division 255 Capitol St NE Ste 180 Salem OR 97310', nan, '3133 NE Prescott Street Corbett OR 97019', '37003 NE Reed Rd Corbett OR 97019', '18 NE Evans Rd Corbett OR 97019', '11726 SW 29th Place Portland OR 97219', '1619 NE 366th Ave Corbett OR 97019', 'Corbett OR 97019', '6900 SW Atlanta St Portland OR 97223', '9738 SE Washington St Portland OR 97216', '2500-116th Avenue NE Bellevue WA 98004', '7421 SW Barbur Blvd Portland OR 97219', '35800 E Historic Columbia River Hwy Corbett OR 97019', '1040 SE Morrison St Portland OR 97214', '1732 NW Quimby Street Suite 200 Portland OR 97209', '1788 SW Harvey Way Aloha OR 97006', '4675 NW Owyhee Court Portland OR 97229', 'PO BOX 10005 Department 415 Palo Alto CA 94303', '2380 NE 10th Ave Hillsboro OR 97124', '5301 W. Baseline Hillsboro OR 97123', '3700 SW Murray Blvd Suite 101 Beaverton OR 97005', '4824 NE 42nd Avenue Portland OR 97218', '500 NE Multnomah Street Suite 100 Portla

### Check Dates

In [19]:
# MM/DD/YYYY with the year limited to 1900-2018; only the start of each value
# is tested, so trailing text such as a time of day is not checked.
date_pattern = '(0[0-9]|1[0-2])/([0-2][0-9]|3[01])/(19\d{2}|200[0-9]|201[0-8])'
# Selects every column whose name contains "Date".
c = '\w*\s*Date'
check_values(dfs['trans_details'], c, date_pattern)

----------------------------------------
DUE DATE 

19119    03/13/2023 11:59:00 PM
Name: Due Date, dtype: object 

----------------------------------------
EXAM LETTER DATE 

Series([], Name: Exam Letter Date, dtype: object) 

----------------------------------------
FILED DATE 

Series([], Name: Filed Date, dtype: object) 

----------------------------------------
OCCUPATION LETTER DATE 

Series([], Name: Occupation Letter Date, dtype: object) 

----------------------------------------
TRANSACTION DATE 

17042    02/23/0007
54035    01/14/0029
82622    01/05/0007
93749    12/08/0006
68927    05/03/0007
81289    11/03/0209
81290    02/23/0009
81291    03/14/0008
81292    02/19/0007
11087    06/10/0007
19119    02/11/2023
21287    06/25/0207
Name: Transaction Date, dtype: object 

----------------------------------------
TRANSACTION FILED DATE 

Series([], Name: Transaction Filed Date, dtype: float64) 



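**TODO:** Row 19119 has a transaction date of 02/11/2023 (and a due date of 03/13/2023), which lies in the future and is probably a data-entry error. The other mismatches have garbled years (e.g. 0007, 0209) and are repaired in the cleaning step below.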

### Check Trans ID

In [20]:
# An ID passes when its text starts with at least one digit.
id_pattern = '\d+'
# Selects every column whose name contains "ID".
c = '\w*\s*ID'
check_values(dfs['trans_details'], c, id_pattern)

----------------------------------------
TRANSACTION ID 

Series([], Name: Transaction ID, dtype: object) 



### Check Aggregate and Amount

In [21]:
# Optional minus sign, digits, optional decimal point, up to two decimals.
# Values that start with "$" therefore fail this check.
float_pattern = '[-]{0,1}\d+[.]{0,1}\d{0,2}'
# Selects columns whose name contains "Agg" or "Amount".
c = '(Agg\w*|Amount)'
check_values(dfs['trans_details'], c, float_pattern)

----------------------------------------
AGGREGATE 

1           $648.00
2           $250.00
4           $200.00
7           $400.00
19          $595.00
20          $415.00
22          $210.00
27        $1,000.00
28          $500.00
34          $150.00
43          $300.00
44          $550.00
45        $3,000.00
47          $125.00
53        $4,720.51
55        $4,500.00
57       $24,991.61
59        $2,000.00
62       $18,991.61
64       $10,412.09
66        $7,275.93
67        $2,994.08
70       $15,000.00
71        $1,500.00
72        $5,672.81
73        $5,535.31
80          $272.27
87          $350.00
88          $179.71
90          $750.00
            ...    
99554       $846.30
99555       $786.30
99557       $686.25
99559       $586.25
99568       $848.75
99580       $611.50
99583     $4,369.75
99584     $4,732.00
99585       $506.50
99589       $401.50
99596    $10,950.67
99597    $11,975.47
99598    $12,250.47
99599    $12,389.56
99601     $9,096.87
99736       $224.16
99898  

### Clean Trans Detail

#### Dates

In [22]:
def fix_date(d):
    """Repair a garbled four-digit year at the end of a date string.

    Only the last four characters of ``d`` are examined:

    * ``02dd`` / ``91dd`` -> first two digits swapped (``0209`` -> ``2009``)
    * ``002d``            -> ``200d`` (``0029`` -> ``2009``)
    * ``000d``            -> ``200d`` (``0007`` -> ``2007``)

    Parameters
    ----------
    d : str, None or NaN
        Date text such as ``'02/23/0007'``.

    Returns
    -------
    str or None
        ``None`` for missing input (None or NaN), the repaired string when one
        of the rules applies, otherwise ``d`` unchanged. Strings shorter than
        four characters are returned unchanged instead of raising IndexError.
    """
    # NaN is the only value that differs from itself; treat None the same way.
    if d is None or d != d:
        return None

    prefix, year = d[:-4], d[-4:]
    # Each replacement is built only after its pattern matched, which
    # guarantees that `year` really has four characters to index.
    if re.match(r'(02|91)\d{2}', year):
        return prefix + year[:2][::-1] + year[2:]
    if re.match(r'002\d', year):
        return prefix + year[2] + year[:2] + year[3]
    if re.match(r'000\d', year):
        return prefix + '200' + year[3]
    return d

# Repair the garbled years in the transaction dates, writing the result back
# into the same column.
dfs['trans_details'].loc[:, 'Transaction Date'] = (
    dfs['trans_details']['Transaction Date'].apply(fix_date)
)

In [23]:
# Re-run the date check after the Transaction Date repair above.
date_pattern = '(0[0-9]|1[0-2])/([0-2][0-9]|3[01])/(19\d{2}|200[0-9]|201[0-8])'
# Selects every column whose name contains "Date".
c = '\w*\s*Date'
check_values(dfs['trans_details'], c, date_pattern)

----------------------------------------
DUE DATE 

19119    03/13/2023 11:59:00 PM
Name: Due Date, dtype: object 

----------------------------------------
EXAM LETTER DATE 

Series([], Name: Exam Letter Date, dtype: object) 

----------------------------------------
FILED DATE 

Series([], Name: Filed Date, dtype: object) 

----------------------------------------
OCCUPATION LETTER DATE 

Series([], Name: Occupation Letter Date, dtype: object) 

----------------------------------------
TRANSACTION DATE 

19119    02/11/2023
Name: Transaction Date, dtype: object 

----------------------------------------
TRANSACTION FILED DATE 

Series([], Name: Transaction Filed Date, dtype: float64) 



#### Agg & Amount

In [34]:
def tofloat(x):
    """Convert a currency-like cell value to ``float``.

    Parameters
    ----------
    x : str, int, float or other
        Raw cell value, e.g. ``'$1,000.00'``, ``'-$50.25'``, ``12`` or NaN.

    Returns
    -------
    float or None
        Numeric input (Python or numpy int/float, including NaN) is returned
        as a float. For strings, ``$`` signs and thousands separators are
        removed before conversion. ``None`` is returned for anything that
        cannot be converted: unparsable text, None, bools and other types.
    """
    # bool is a subclass of int; it is not a monetary amount.
    if isinstance(x, bool):
        return None
    # Ints and numpy scalars used to fall into the except branch and were
    # silently turned into None.
    if isinstance(x, (int, float, np.integer, np.floating)):
        return float(x)
    if isinstance(x, str):
        # Strip the currency symbol wherever it appears so that '-$50.25' and
        # '1,000.00' convert as well, not only strings that start with '$'.
        cleaned = x.strip().replace('$', '').replace(',', '')
        try:
            return float(cleaned)
        except ValueError:
            return None
    return None
# Convert both currency columns to floats, writing back into the same columns.
for c in ('Aggregate', 'Amount'):
    dfs['trans_details'].loc[:, c] = dfs['trans_details'].loc[:, c].apply(tofloat)

In [36]:
# Re-run the numeric check after the float conversion above.
float_pattern = '[-]{0,1}\d+[.]{0,1}\d{0,2}'
# Selects columns whose name contains "Agg" or "Amount".
c = '(Agg\w*|Amount)'
check_values(dfs['trans_details'], c, float_pattern)

----------------------------------------
AGGREGATE 

Series([], Name: Aggregate, dtype: float64) 

----------------------------------------
AMOUNT 

Series([], Name: Amount, dtype: float64) 

----------------------------------------
TRANSACTION AMOUNT 

Series([], Name: Transaction Amount, dtype: float64) 



In [38]:
dfs['trans_details'].to_csv('trans_details_cleaned.csv', index=False)

## Run Scrapers

In [39]:
# PATH points at <project>/scrape_files, so its parent is the project root.
# Changing to the absolute path keeps this cell idempotent: the relative
# os.chdir('elections-2018') fails when the cell is run a second time.
os.chdir(os.path.dirname(PATH))
os.getcwd()

'/home/michaelcrown/Projects/HackOregon/elections-2018'

In [32]:
def parse_ostar(cid, start, stop):
    """Run the date-window scraper for ``cid``, paging backwards in time.

    Each pass runs ``scrape_files/transactions_scrape_date.py`` (resolved
    against the current working directory) with the arguments
    ``cid start stop i`` and then reads ``transactions_gaps<cid>_<i>.csv``
    from the current working directory. While a pass returns 5000 rows or
    more, ``stop`` is moved to the earliest 'Tran Date' in that file and the
    scraper runs again with the next ``i``.
    (5000 is presumably the scraper's per-request row cap - TODO confirm.)

    Parameters
    ----------
    cid : int
        Identifier handed to the scraper as its first argument.
    start, stop : str
        Window bounds as date text; leading zeros are stripped before they are
        passed on.

    Raises
    ------
    subprocess.CalledProcessError
        If the scraper exits with a non-zero status. The exit status used to
        be ignored, which could leave a stale or missing CSV to be read.
    """
    import subprocess

    print('Parsing cid {}'.format(cid))
    pypath = os.path.join(os.getcwd(), 'scrape_files/transactions_scrape_date.py')
    i = 1
    while True:
        print(start, stop)
        # Argument list instead of a shell string: no quoting problems when
        # the path contains spaces, and nothing is interpreted by a shell.
        cmd = ['python', pypath, str(cid), start.lstrip('0'), stop.lstrip('0'), str(i)]
        subprocess.run(cmd, check=True)

        file = 'transactions_gaps{}_{}.csv'.format(cid, i)
        dfpath = os.path.join(os.getcwd(), file)
        df = pd.read_csv(dfpath, parse_dates=['Tran Date'])
        if len(df) < 5000:
            break

        earliest = df['Tran Date'].min().date().strftime('%m/%d/%Y')
        if earliest == stop:
            # A full page whose earliest date equals the current stop date
            # cannot be narrowed further; looping again would request the
            # same window forever.
            print('Window for cid {} no longer shrinks at {}; stopping.'.format(cid, stop))
            break
        stop = earliest
        i += 1

In [None]:
# One (cid, start, stop) tuple per scraper run; each is passed to parse_ostar.
parser_args = [
#     (4572, '01/1/1900', '12/7/17'),
    (4572, '01/1/1900', '09/28/2016'),
    (3396, '10/15/11', '9/15/14'),
    (191, '5/27/2011', '4/28/13'),
    (39, '12/23/2009', '10/25/16'),
    ]

for cid, start, stop in parser_args:
    parse_ostar(cid, start, stop)