# Cleaning of All Files

In [1]:
import io
import json
import re
import os

import pandas as pd
from pandas import DataFrame as DF, Series
import numpy as np

import requests
import missingno as msn
import psycopg2 as pg

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')

### Load All Data

In [2]:
from glob import glob

# Directory that holds every scraped CSV file.
PATH = os.path.join(os.getenv('HOME'), 'Projects/HackOregon/elections-2018/scrape_files')

# Collect bare file names (no directory part) by globbing from inside PATH,
# then move two levels up so that later relative paths of the form
# 'elections-2018/scrape_files/<name>' resolve from the new working directory.
os.chdir(PATH)
files = list(glob('*.csv'))
os.chdir('../../')

In [3]:
files

['transactions_gaps33_4.csv',
 'transactions_gaps4572_30.csv',
 'transaction_detail12.csv',
 'transaction_detail3.csv',
 'transaction_detail22.csv',
 'election_activity3.csv',
 'statement_of_organization3.csv',
 'all_transactions195.csv',
 'all_transactions4.csv',
 'transactions_gaps4572_5.csv',
 'transaction_detail18.csv',
 'all_transactions6.csv',
 'transactions_gaps4572_28.csv',
 'transaction_detail195.csv',
 'election_activity_first_batch.csv',
 'transaction_detail17.csv',
 'all_transactions9.csv',
 'transactions_gaps33_7.csv',
 'transactions_gaps4572_15.csv',
 'transactions_gaps4572_11.csv',
 'transaction_detail13.csv',
 'transactions_gaps4572_25.csv',
 'all_transactions10.csv',
 'transactions_gaps39_1.csv',
 'transaction_detail_first_batch.csv',
 'all_transactions18.csv',
 'all_transactions8.csv',
 'all_transactions115.csv',
 'transactions_gaps275_2.csv',
 'trans_detail_first_batch_clean.csv',
 'transactions_gaps348_3.csv',
 'transactions_gaps4572_24.csv',
 'all_transactions14.cs

In [4]:
import warnings
from ast import literal_eval

# Maps the first '_'-separated token of a scrape file name to its key in `dfs`.
map_names = {
    'all': 'transactions',
    'transactions': 'transactions',
    'transaction': 'trans_details',
    'statement': 'statement',
    'committee': 'comm_history',
    'election': 'election_activity',
    }


def _mixed_type_columns(caught):
    """Return the column positions named in pandas mixed-type warnings.

    `caught` is the list produced by `warnings.catch_warnings(record=True)`.
    Only messages of the form "Columns (1,2) have mixed types..." are used;
    any other recorded warning is ignored.
    """
    positions = []
    for record in caught:
        found = re.match(r'Columns (\([\d,\s]+\))', str(record.message))
        if found is None:
            continue
        cols = literal_eval(found.group(1))
        # "(3)" evaluates to a bare int, "(3,5)" to a tuple.
        positions.extend(cols if isinstance(cols, tuple) else (cols,))
    return positions


def _read_scrape_csv(path):
    """Read one CSV, re-reading mixed-type columns as strings when pandas warns.

    Warnings are captured per file, so a warning raised for one file can never
    influence how a later file is parsed.
    """
    with warnings.catch_warnings(record=True) as caught:
        # Without 'always', a repeated identical warning is not recorded again.
        warnings.simplefilter('always')
        frame = pd.read_csv(path)
    mixed = _mixed_type_columns(caught)
    if mixed:
        frame = pd.read_csv(path, dtype={frame.columns[pos]: str for pos in mixed})
    return frame


# Gather the frames per target name first and concatenate once at the end;
# appending inside the loop copies the accumulated data on every iteration.
_frames = {}
for f in files:
    head = f.split('_')[0]
    if head == 'trans':
        # Files starting with 'trans_' are deliberately not loaded.
        continue
    _frames.setdefault(map_names[head], []).append(
        _read_scrape_csv('elections-2018/scrape_files/{}'.format(f)))

dfs = {name: pd.concat(parts, ignore_index=True) for name, parts in _frames.items()}

In [5]:
list(dfs)

['election_activity',
 'statement',
 'trans_details',
 'transactions',
 'comm_history']

In [6]:
dfs['comm_history'].shape

(9611, 6)

### Load Schemas Dict

In [7]:
os.chdir('../Work/HackOregon')

with open('schemas_dict.json', 'r') as f:
    schemas = json.load(f)

In [8]:
list(schemas)

['statement_of_org',
 'payee',
 'ballots',
 'donor',
 'election_activity',
 'transaction_details',
 'transactions',
 'committees_list',
 'committee_history']

## Routines

#### Visually Inspect Column Values

In [9]:
def inspect_cols(df):
    """Print the distinct values of every column in `df` for eyeballing.

    Columns with fewer than 100 distinct non-null values are printed in full.
    For the others only the first 25 distinct values are shown, and their
    names are listed again in a summary at the end. Returns nothing.
    """
    thin_rule = 40 * '-'
    thick_rule = 40 * '='
    many_valued = []

    for col in df:
        series = df.loc[:, col]
        is_long = not series.nunique() < 100
        distinct = series.unique()
        if is_long:
            distinct = distinct[:25]
        print(thin_rule)
        print(col.upper(), '\n')
        print(distinct.tolist(), '\n')
        if is_long:
            many_valued.append(col)

    print(thick_rule)
    print('Columns with many unique values:\n')
    print(many_valued)

#### Find Column Values That Mismatch Pattern

In [10]:
def check_values(df, col_pattern, val_pattern):
    """Print, per matching column, the distinct values that fail a pattern.

    df:          DataFrame to inspect.
    col_pattern: regex searched for in each column name to select columns.
    val_pattern: regex that `str(value)` must match from its start
                 (`re.match`); null values are skipped.

    Nothing is returned; for each selected column a header and the
    de-duplicated offending values are printed.
    """
    for col in df:
        if not re.search(col_pattern, col):
            continue
        present = df.loc[:, col].dropna()
        fails = present.apply(lambda v: re.match(val_pattern, str(v)) is None)
        offenders = present[fails].drop_duplicates()
        print(40*'-')
        print(col.upper(), '\n')
        print(offenders, '\n')

#### Reformat Column Names & Drop Cols Not in Schema

In [11]:
def correct_columns(name, verbose=False):
    """Rename the columns of ``dfs[name]`` to schema style and drop the extras.

    name:    key into the module-level `dfs` dict; must be one of the keys of
             `nmap` below (a `KeyError` is raised otherwise).
    verbose: when true, also print the schema columns, the renamed DataFrame
             columns and their symmetric difference before dropping.

    ``dfs[name]`` is modified in place: columns are renamed, and every column
    whose new name is not in the schema is dropped. The dropped names and
    whether the remaining columns equal the schema columns are printed.
    Returns nothing.
    """
    # DataFrame key -> key in the module-level `schemas` dict.
    nmap = {
        'statement': 'statement_of_org',
        'trans_details': 'transaction_details',
        'comm_history': 'committee_history',
        'transactions': 'transactions',
        'election_activity': 'election_activity',
        }
    schema = nmap[name]
    # Each schema entry starts with the column name (e.g. 'amount numeric').
    keep = [c.split()[0] for c in schemas[schema]]

    def rename(c):
        # Abbreviated headers used by the transactions files; these run before
        # lower-casing, so they only match the exact capitalisation shown.
        c = re.sub('Tran ID', 'transaction_id', c)
        c = re.sub('Tran Date', 'transaction_date', c)
        c = re.sub('^Sub Type', 'transaction_subtype', c)
        # All data: lower-case, trim, whitespace runs and '/' become '_'.
        return re.sub(r'(\s+|/)', '_', c.lower().strip())

    dfs[name].columns = [rename(c) for c in dfs[name]]
    if verbose:
        print('Schema Columns:')
        print(keep, '\n')
        print('Renamed DF Columns:')
        print(list(dfs[name]), '\n')
        print('Symmetric Difference:')
        print(set(keep).symmetric_difference(set(dfs[name])))
    drop = set(dfs[name]).difference(set(keep))
    print(drop, '\n')
    # Hand pandas a list: sets are unordered and not a documented label type.
    dfs[name].drop(list(drop), axis=1, inplace=True)
    print('Dropped {} columns from {}'.format(len(drop), name))
    match = set(dfs[name]) == set(keep)
    print('Columns match schema:', match)
    if not match:
        print(set(dfs[name]).symmetric_difference(set(keep)))

#### Standardize Phone/Fax

In [12]:
# Accepted raw phone layouts, each with an optional trailing extension:
#   1: (503) 555-1234    2: 503-555-1234    3: 503 555-1234
ph_pattern1 = r'^\(\d{3}\)\s*\d{3}-\d{4}(\s+x{0,1}(\d*|\w*)){0,1}$'
ph_pattern2 = r'^\d{3}-\d{3}-\d{4}(\s+x{0,1}(\d*|\w*)){0,1}$'
ph_pattern3 = r'^\d{3}\s+\d{3}-\d{4}(\s+x{0,1}(\d*|\w*)){0,1}$'


def std_phone(x):
    """Normalise a phone/fax string to the 'DDD DDD DDDD[ ext]' layout.

    x: a string, ``None`` or NaN.

    Returns ``None`` for ``None``/NaN. Values starting with 'exempt'/'Exempt'
    become 'Exempt from public record'. Values in none of the known layouts
    are returned with surrounding whitespace removed.

    Surrounding whitespace is stripped before both matching and reformatting,
    so padded values are normalised the same way as unpadded ones.
    """
    if x is None or x != x:  # NaN is the only value that differs from itself
        return None
    x = x.strip()
    # Patterns are tried in this fixed order; the first match wins.
    if re.match(r'\d{10}', x):
        return ' '.join([x[:3], x[3:6], x[6:]])
    if re.match(ph_pattern1, x):
        return re.sub(r'(\)\s*|-)', ' ', x.strip('('))
    if re.match(ph_pattern2, x) or re.match(ph_pattern3, x):
        return re.sub(r'\s{2,}', ' ', x.replace('-', ' '))
    if re.match('(exempt|Exempt)', x):
        return 'Exempt from public record'
    return x

#### Check For All Null Records

In [13]:
def allnull(name):
    """Print how many rows of ``dfs[name]`` are null in every column."""
    frame = dfs[name]
    nulls_per_row = frame.isnull().sum(axis=1)
    # A row is entirely null when its null count equals the column count.
    fully_null = frame[nulls_per_row == frame.shape[1]]
    print('There are {} all null records'.format(len(fully_null)))

## Statement DF

### Rename & Drop Unwanted Columns

In [14]:
correct_columns('statement')

{'committee'} 

Dropped 1 columns from statement
Columns match schema: True


In [15]:
dfs['statement'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1617 entries, 0 to 1616
Data columns (total 25 columns):
candidate_address                  960 non-null object
committee_acronym                  295 non-null object
committee_address                  1596 non-null object
committee_campaign_phone           1384 non-null object
committee_filing_effective_from    1596 non-null object
committee_filing_type              1596 non-null object
committee_id                       1596 non-null float64
committee_name                     1596 non-null object
committee_pac_type                 632 non-null object
election_office                    960 non-null object
email_address                      958 non-null object
employer                           859 non-null object
fax                                150 non-null object
home_phone                         663 non-null object
mailing_address                    960 non-null object
name                               960 non-null object
occupa

In [16]:
# any all null records?
allnull('statement')

There are 21 all null records


There are 21 completely null records in this DF, and the null values are forcing ints to floats.

In [17]:
dfs['statement'].shape

(1617, 25)

In [18]:
dfs['statement'].dropna(how='all', inplace=True)
dfs['statement'].shape

(1596, 25)

### Inspect Columns

In [19]:
inspect_cols(dfs['statement'])

----------------------------------------
CANDIDATE_ADDRESS 

[nan, '1907 NW Mill Pond Road, Portland,           OR      97229         - 7553', '3393 Arlington Avenue, Eugene,           OR      97408', '23695 SW Brittany Ln, Sherwood,           OR      97140', '1003 Terrace Dr, Lake Oswego,           OR      97034', '1163 State St, Salem,           OR      97301', '6771 SW 162nd Drive, Beaverton,           OR      97007         - 4894', '7811 Old Stage Rd, Central Point,           OR      97502', '17915 NW Lonerock Dr, Portland,           OR      97229', '9026 SW 36th Ave, Portland,           OR      97219', '2138 Lois Drive, Roseburg,           OR      97470', '18340 SW Monte Verdi Blvd, Aloha,           OR      97007', '2248 Potter St, Eugene,           OR      97405', '410 Jefferson Street, Oregon City,           OR      97045', '22760 SW 87th Pl., Tualatin,           OR      97062', '660 Morgan Ave, Ontario,           OR      97914', '805 Kingwood Dr NW, Salem,           OR      973

### Phone

In [20]:
ph_pattern = '^\(\d{3}\)\s*\d{3}-\d{4}(\s+x{0,1}(\d*|\w*)){0,1}$'
c = '(fax|phone)'
check_values(dfs['statement'], c, ph_pattern)

----------------------------------------
COMMITTEE_CAMPAIGN_PHONE 

Series([], Name: committee_campaign_phone, dtype: object) 

----------------------------------------
FAX 

717     5038514351
1280    5038126773
Name: fax, dtype: object 

----------------------------------------
HOME_PHONE 

31      Exempt from public record
448                  541-536-7444
617                    5418060837
728                    5034345365
816                  503 621-6316
1184                   5415799022
Name: home_phone, dtype: object 

----------------------------------------
TREASURER_FAX 

688     5037417574
717     5038514351
1251    5032706995
1280    5038126773
Name: treasurer_fax, dtype: object 

----------------------------------------
TREASURER_HOME_PHONE 

210     Exempt from public record
688                    5037417574
1003                   5417586842
1090                   5419332256
1184                   5415799022
1218                   9712664290
1251                   5032706

Phone/Fax numbers all appear to be correct, but the format needs to be standardized.

### Check Dates

In [21]:
date_pattern = '^(0[0-9]|1[0-2])/([0-2][0-9]|3[01])/(19\d{2}|200[0-9]|201[0-8])$'
c = 'committee_filing_effective_from'
check_values(dfs['statement'], c, date_pattern)

----------------------------------------
COMMITTEE_FILING_EFFECTIVE_FROM 

Series([], Name: committee_filing_effective_from, dtype: object) 



All dates are good.

### Check Committee ID

In [22]:
id_pattern = '^\d+$'
c = 'committee_id'
check_values(dfs['statement'], c, id_pattern)

----------------------------------------
COMMITTEE_ID 

0         252.0
1        2069.0
2        2793.0
3        1579.0
4        1869.0
5         765.0
6         216.0
7         167.0
8         221.0
9         337.0
10        260.0
11        223.0
12       7305.0
13       4213.0
14       5207.0
15       3889.0
16       4676.0
17       5528.0
18        350.0
19       4015.0
20      12510.0
21          3.0
22          4.0
23      12878.0
24       4797.0
25       5203.0
26       3094.0
27       4016.0
28       2998.0
29        113.0
         ...   
1587      470.0
1588      353.0
1589       54.0
1590     8667.0
1591     1670.0
1592     5493.0
1593     2109.0
1594     5514.0
1595     1945.0
1596       80.0
1597      319.0
1598     4167.0
1599     4792.0
1600       60.0
1601     5328.0
1602     9927.0
1603     2891.0
1604      328.0
1605      229.0
1606     4572.0
1607     2070.0
1608      152.0
1609      108.0
1610     3597.0
1611     2717.0
1612     2352.0
1613      171.0
1614     3428.0


The committee IDs were read as floats (e.g. 252.0), so they fail the integer pattern and need to be converted to int.

### Clean Statement Columns

In [23]:
cols = [c for c in dfs['statement'] if re.search('(fax|phone)', c)]

for c in cols:
    dfs['statement'].loc[:, c] = dfs['statement'].loc[:, c].apply(lambda x: std_phone(x))

In [24]:
ph_pattern = '^\d{3} \d{3} \d{4}(\s+x{0,1}(\d*|\w*)){0,1}$'
c = '(fax|phone)'
check_values(dfs['statement'], c, ph_pattern)

----------------------------------------
COMMITTEE_CAMPAIGN_PHONE 

Series([], Name: committee_campaign_phone, dtype: object) 

----------------------------------------
FAX 

Series([], Name: fax, dtype: object) 

----------------------------------------
HOME_PHONE 

31    Exempt from public record
Name: home_phone, dtype: object 

----------------------------------------
TREASURER_FAX 

Series([], Name: treasurer_fax, dtype: object) 

----------------------------------------
TREASURER_HOME_PHONE 

210    Exempt from public record
Name: treasurer_home_phone, dtype: object 

----------------------------------------
TREASURER_WORK_PHONE 

Series([], Name: treasurer_work_phone, dtype: object) 

----------------------------------------
WORK_PHONE 

1013    Exempt from public record
Name: work_phone, dtype: object 



#### Convert Committee ID to int

In [25]:
dfs['statement'].loc[:, 'committee_id'] = dfs['statement']['committee_id'].astype(int)
# check
c = 'committee_id'
check_values(dfs['statement'], c, id_pattern)

----------------------------------------
COMMITTEE_ID 

Series([], Name: committee_id, dtype: int64) 



In [26]:
dfs['statement'].drop_duplicates().to_csv('statement_cleaned.csv', index=False)

## Trans Details DF

In [27]:
dfs['trans_details']['payee_id'] = 0
dfs['trans_details']['donor_id'] = 0

In [28]:
correct_columns('trans_details')

{'transaction_purpose', 'type', 'transaction_subtype', 'transaction_filed_date', 'exam_letter_date', 'interest_rate', 'in-kind_independent_expenditures', 'transaction_id_:', 'employer', 'cosigner_obligations', 'check', 'employer_city,_state', 'filer_committee_name', 'city,_state,_zip_code', 'transaction_amount'} 

Dropped 15 columns from trans_details
Columns match schema: True


In [29]:
dfs['trans_details'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 714391 entries, 0 to 714390
Data columns (total 24 columns):
address                          532482 non-null object
address_book_type                536535 non-null object
agent                            1910 non-null object
aggregate                        664223 non-null object
amount                           714391 non-null object
associations                     10682 non-null object
description                      209571 non-null object
due_date                         714376 non-null object
employer_name                    201455 non-null object
filed_date                       714384 non-null object
name                             704485 non-null object
occupation                       157294 non-null object
occupation_letter_date           4128 non-null object
payer_of_personal_expenditure    12652 non-null object
payment_method                   206232 non-null object
process_status                   714384 non-null object

In [30]:
# any all null records?
allnull('trans_details')

There are 0 all null records


### Inspect Columns

In [31]:
inspect_cols(dfs['trans_details'])

----------------------------------------
ADDRESS 

['Business Services Division 255 Capitol St NE Ste 180 Salem OR 97310', nan, '3133 NE Prescott Street Corbett OR 97019', '37003 NE Reed Rd Corbett OR 97019', '18 NE Evans Rd Corbett OR 97019', '11726 SW 29th Place Portland OR 97219', '1619 NE 366th Ave Corbett OR 97019', 'Corbett OR 97019', '6900 SW Atlanta St Portland OR 97223', '9738 SE Washington St Portland OR 97216', '2500-116th Avenue NE Bellevue WA 98004', '7421 SW Barbur Blvd Portland OR 97219', '35800 E Historic Columbia River Hwy Corbett OR 97019', '1040 SE Morrison St Portland OR 97214', '1732 NW Quimby Street Suite 200 Portland OR 97209', '1788 SW Harvey Way Aloha OR 97006', '4675 NW Owyhee Court Portland OR 97229', 'PO BOX 10005 Department 415 Palo Alto CA 94303', '2380 NE 10th Ave Hillsboro OR 97124', '5301 W. Baseline Hillsboro OR 97123', '3700 SW Murray Blvd Suite 101 Beaverton OR 97005', '4824 NE 42nd Avenue Portland OR 97218', '500 NE Multnomah Street Suite 100 Portla

### Check Dates

In [32]:
date_pattern = '^(0[0-9]|1[0-2])/([0-2][0-9]|3[01])/(19\d{2}|200[0-9]|201[0-8])'
time_pattern = '(\s+(0[0-9]|1[0-2]):[0-5][0-9]:[0-5][0-9]\s+(AM|PM))*$'
c = '\w*_date'
check_values(dfs['trans_details'], c, date_pattern+time_pattern)

----------------------------------------
DUE_DATE 

426907    03/13/2023 11:59:00 PM
Name: due_date, dtype: object 

----------------------------------------
FILED_DATE 

Series([], Name: filed_date, dtype: object) 

----------------------------------------
OCCUPATION_LETTER_DATE 

Series([], Name: occupation_letter_date, dtype: object) 

----------------------------------------
TRANSACTION_DATE 

20894     02/23/0007
57887     01/14/0029
86474     01/05/0007
97601     12/08/0006
189248    05/03/0007
201610    11/03/0209
201611    02/23/0009
201612    03/14/0008
201613    02/19/0007
263716    06/10/0007
426907    02/11/2023
566158    06/25/0207
Name: transaction_date, dtype: object 



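Most mismatches are garbled years (e.g. 0007, 0029, 0207) that can be repaired. One record (index 426907) has a transaction date of 02/11/2023 and a due date of 03/13/2023, which fall outside the expected range and are probably data-entry errors.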

### Check Trans ID

In [33]:
dfs['trans_details'][dfs['trans_details'].transaction_id.isnull()].isnull().sum(axis=1)

75665     20
80680     20
103871    20
333323    20
343320    20
500678    20
505693    20
dtype: int64

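Each of these records is missing 20 of its 22 scraped columns (the remaining two of the 24 columns are payee_id and donor_id, which were added above and are always 0).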

In [34]:
dfs['trans_details'].shape

(714391, 24)

In [35]:
dfs['trans_details'].dropna(thresh=3, inplace=True)

In [36]:
id_pattern = '^\d+$'
c = '\w*_id'
check_values(dfs['trans_details'], c, id_pattern)

----------------------------------------
TRANSACTION_ID 

3852      1.03445e+06
3853      2.75318e+06
3854      2.74302e+06
3855      2.74302e+06
3856      2.74303e+06
3857      2.73916e+06
3858      2.73926e+06
3859       2.7264e+06
3860      2.72643e+06
3861      2.72008e+06
3862      2.72009e+06
3863      2.70552e+06
3864      2.70552e+06
3865      2.70553e+06
3866      2.70555e+06
3867      2.70555e+06
3868      2.70555e+06
3869      2.70555e+06
3870      2.70556e+06
3871      2.70556e+06
3872       2.6787e+06
3873      2.67871e+06
3874      2.67871e+06
3875      2.67871e+06
3876      2.67871e+06
3877      2.67871e+06
3878      2.67916e+06
3879      2.66193e+06
3880      2.65194e+06
3881      2.65194e+06
             ...     
708336       538294, 
708337       538296, 
708338       538298, 
708339       538300, 
708340       538302, 
708341       538305, 
708342       538307, 
708343       538310, 
708345       538280, 
708347       538194, 
708348       729636, 
708350       49454

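Some transaction IDs were read as floats and others carry a trailing comma; both need to be normalized to plain integers.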

### Check Aggregate and Amount

In [37]:
float_pattern = '[-]{0,1}\d+[.]{0,1}\d{0,2}'
c = '(agg\w*|amount)'
check_values(dfs['trans_details'], c, float_pattern)

----------------------------------------
AGGREGATE 

120322       $648.00
120323       $250.00
120325       $200.00
120328       $400.00
120340       $595.00
120341       $415.00
120343       $210.00
120348     $1,000.00
120349       $500.00
120355       $150.00
120364       $300.00
120365       $550.00
120366     $3,000.00
120368       $125.00
120374     $4,720.51
120376     $4,500.00
120378    $24,991.61
120380     $2,000.00
120383    $18,991.61
120385    $10,412.09
120387     $7,275.93
120388     $2,994.08
120391    $15,000.00
120392     $1,500.00
120393     $5,672.81
120394     $5,535.31
120401       $272.27
120408       $350.00
120409       $179.71
120411       $750.00
             ...    
708349       $846.30
708350       $786.30
708352       $686.25
708354       $586.25
708363       $848.75
708375       $611.50
708378     $4,369.75
708379     $4,732.00
708380       $506.50
708384       $401.50
708391    $10,950.67
708392    $11,975.47
708393    $12,250.47
708394    $12,389.56
70

### Clean Trans Detail

#### Dates

In [38]:
def fix_date(d):
    """Repair a garbled four-digit year at the end of a date string.

    d: a date string whose last four characters are the year, or ``None``/NaN.

    Returns ``None`` for ``None``/NaN. Otherwise returns `d` with the year
    rewritten when it matches one of these shapes, and `d` unchanged if not:
      * '02xx' / '91xx' -> first two digits swapped ('0207' -> '2007')
      * '002x'          -> the '2' moved to the front ('0029' -> '2009')
      * '000x'          -> '200x' ('0007' -> '2007')

    The replacement is only built after a pattern has matched, so strings
    shorter than four characters are returned unchanged.
    """
    if d is None or d != d:  # NaN is the only value that differs from itself
        return None
    x = d[-4:]
    if re.match(r'(02|91)\d{2}', x):
        year = x[:2][::-1] + x[2:]
    elif re.match(r'002\d', x):
        year = x[2] + x[:2] + x[3]
    elif re.match(r'000\d', x):
        year = '200' + x[3]
    else:
        return d
    return d[:-4] + year

dfs['trans_details'].loc[:, 'transaction_date'] = \
    dfs['trans_details']['transaction_date'].apply(lambda x: fix_date(x))

In [39]:
c = '\w*_date'
check_values(dfs['trans_details'], c, date_pattern+time_pattern)

----------------------------------------
DUE_DATE 

426907    03/13/2023 11:59:00 PM
Name: due_date, dtype: object 

----------------------------------------
FILED_DATE 

Series([], Name: filed_date, dtype: object) 

----------------------------------------
OCCUPATION_LETTER_DATE 

Series([], Name: occupation_letter_date, dtype: object) 

----------------------------------------
TRANSACTION_DATE 

426907    02/11/2023
Name: transaction_date, dtype: object 



#### Agg & Amount

In [40]:
def tofloat(x):
    """Convert a scraped money value to ``float``; ``None`` if unparseable.

    x: a number, a string such as '$1,234.56', or a null value.

    Non-string inputs go straight through ``float()``, so integers are
    converted and floats (NaN included) come back as they are. Strings are
    trimmed, '$' signs and thousands separators are removed, and a value
    wrapped in parentheses is read as negative. Anything that still cannot
    be parsed yields ``None`` instead of raising.
    """
    if not isinstance(x, str):
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    text = x.strip()
    # Accounting notation: "($5.00)" means -5.00.
    negative = text.startswith('(') and text.endswith(')')
    if negative:
        text = text[1:-1]
    text = text.replace('$', '').replace(',', '')
    try:
        value = float(text)
    except ValueError:
        return None
    return -value if negative else value
for c in ['aggregate','amount']:
    dfs['trans_details'].loc[:, c] = dfs['trans_details'].loc[:, c].apply(lambda x: tofloat(x))

In [41]:
float_pattern = '[-]{0,1}\d+[.]{0,1}\d{0,2}'
c = '(agg\w*|amount)'
check_values(dfs['trans_details'], c, float_pattern)

----------------------------------------
AGGREGATE 

Series([], Name: aggregate, dtype: float64) 

----------------------------------------
AMOUNT 

Series([], Name: amount, dtype: float64) 



#### Transaction ID

In [42]:
dfs['trans_details'].isnull().sum()

address                          181909
address_book_type                177856
agent                            712481
aggregate                         50168
amount                               64
associations                     703709
description                      504820
due_date                             15
employer_name                    512936
filed_date                            7
name                               9906
occupation                       557097
occupation_letter_date           710263
payer_of_personal_expenditure    701739
payment_method                   508159
process_status                        7
purpose                          491216
repayment_schedule               713524
transaction_date                      7
transaction_id                        7
transaction_sub_type                  7
transaction_type                      7
payee_id                              0
donor_id                              0
dtype: int64

In [43]:
dfs['trans_details'].dropna(subset=['transaction_id'], inplace=True)

In [44]:
dfs['trans_details'].loc[:, 'transaction_id'] = [
    int(float(re.sub(',\s*','', x)))
    for x in dfs['trans_details'].transaction_id.astype(str)
    ]

In [45]:
c = '\w*_id'
check_values(dfs['trans_details'], c, id_pattern)

----------------------------------------
TRANSACTION_ID 

Series([], Name: transaction_id, dtype: int64) 

----------------------------------------
PAYEE_ID 

Series([], Name: payee_id, dtype: int64) 

----------------------------------------
DONOR_ID 

Series([], Name: donor_id, dtype: int64) 



In [46]:
dfs['trans_details'].drop_duplicates().to_csv('trans_details_cleaned.csv', index=False)

## Transactions DF

In [47]:
correct_columns('transactions')

set() 

Dropped 0 columns from transactions
Columns match schema: True


In [48]:
schemas['transactions']

['transaction_id int primary key',
 'committee_id int',
 'transaction_date date',
 'status varchar(32)',
 'filer_committee varchar(255)',
 'contributor_payee varchar(255)',
 'transaction_subtype varchar(255)',
 'amount numeric']

In [49]:
dfs['transactions'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1112406 entries, 0 to 1112405
Data columns (total 8 columns):
transaction_id         1112406 non-null object
transaction_date       1112406 non-null object
status                 1112406 non-null object
filer_committee        1112406 non-null object
contributor_payee      1103192 non-null object
transaction_subtype    1112406 non-null object
amount                 1112406 non-null object
committee_id           1112406 non-null object
dtypes: object(8)
memory usage: 67.9+ MB


### Inspect Columns

In [50]:
inspect_cols(dfs['transactions'])

----------------------------------------
TRANSACTION_ID 

[2469752, 2469753, 2469754, 2469755, 2469756, 2469757, 2469758, 2469759, 2469760, 2469761, 2469762, 2469763, 2469764, 2469765, 2469766, 2469767, 2469768, 2469769, 2469770, 2469771, 2469772, 2469773, 2469774, 2469775, 2469776] 

----------------------------------------
TRANSACTION_DATE 

['12/14/2016', '12/13/2016', '12/09/2016', '12/07/2016', '12/06/2016', '12/05/2016', '12/02/2016', '12/01/2016', '11/30/2016', '11/29/2016', '11/28/2016', '11/23/2016', '11/22/2016', '11/21/2016', '11/18/2016', '11/17/2016', '11/16/2016', '11/15/2016', '11/14/2016', '11/10/2016', '11/09/2016', '11/08/2016', '11/07/2016', '11/04/2016', '11/03/2016'] 

----------------------------------------
STATUS 

['Original', 'Amended'] 

----------------------------------------
FILER_COMMITTEE 

['Citizen Action for Political Education', 'Local 48 Electricians PAC', 'No Pot Ontario', 'Northwest Ideas PAC', "Frank O'Donnell for Oregon City Commissioner", 'Vote

### Check Tran ID and Committee ID

In [51]:
id_pattern = '^\d+$'
c = '(\w*_id)'
check_values(dfs['transactions'], c, id_pattern)

----------------------------------------
TRANSACTION_ID 

Series([], Name: transaction_id, dtype: object) 

----------------------------------------
COMMITTEE_ID 

Series([], Name: committee_id, dtype: object) 



### Check Dates

In [52]:
date_pattern = '^(0[0-9]|1[0-2])/([0-2][0-9]|3[01])/(19\d{2}|200[0-9]|201[0-8])$'
c = '\w*_date'
check_values(dfs['transactions'], c, date_pattern)

----------------------------------------
TRANSACTION_DATE 

36378      06/25/0207
183651     04/23/0607
238112     11/17/0006
328210     02/11/2023
383168     06/10/0007
790405     06/04/0200
852630     12/26/0006
897267     12/13/0006
1058145    05/03/0007
1070507    11/03/0209
1070508    02/23/0009
1070509    03/14/0008
1070510    02/19/0007
Name: transaction_date, dtype: object 



### Check Amount

In [53]:
float_pattern = '[-]{0,1}\d+[.]{0,1}\d{0,2}'
c = '(agg\w*|amount)'
check_values(dfs['transactions'], c, float_pattern)

----------------------------------------
AMOUNT 

989218        $200.00
989219         $53.00
989220        $250.00
989225        $400.00
989237        $180.00
989238        $205.00
989240        $210.00
989241         $30.00
989242         $90.00
989243         $20.00
989244        $275.00
989245      $1,000.00
989246        $500.00
989250        $100.00
989252        $150.00
989254        $300.00
989262        $550.00
989263      $3,000.00
989265        $125.00
989268         $82.75
989270        $320.00
989271        $220.51
989272          $3.95
989273      $4,500.00
989275      $6,000.00
989277      $2,000.00
989278          $5.93
989280      $4,193.52
989281      $4,386.00
989282      $3,136.16
              ...    
1077176       $138.98
1077187    $17,126.64
1077210       $298.72
1077211       $220.30
1077218       $185.64
1077236        $17.03
1077238    $11,250.00
1077241     $1,935.48
1077290        $53.61
1077291       $124.25
1077310       $478.50
1077316       $509.28
1077

### Clean Transactions

#### ID to int

In [54]:
dfs['transactions'].loc[:, 'committee_id'] = dfs['transactions'].committee_id.astype(int)

#### Dates

In [55]:
dfs['transactions'].loc[:, 'transaction_date'] = \
    dfs['transactions']['transaction_date'].apply(lambda x: fix_date(x))

In [56]:
# date_pattern = '(0[0-9]|1[0-2])/([0-2][0-9]|3[01])/(19\d{2}|200[0-9]|201[0-8])'
c = '\w*_date'
check_values(dfs['transactions'], c, date_pattern)

----------------------------------------
TRANSACTION_DATE 

183651    04/23/0607
328210    02/11/2023
Name: transaction_date, dtype: object 



In [57]:
dfs['transactions'].loc[183651, 'transaction_date'] = '04/23/2007'

In [58]:
check_values(dfs['transactions'], c, date_pattern)

----------------------------------------
TRANSACTION_DATE 

328210    02/11/2023
Name: transaction_date, dtype: object 



#### Amount

In [59]:
c = 'amount'
dfs['transactions'].loc[:, c] = dfs['transactions'].loc[:, c].apply(lambda x: tofloat(x))

In [60]:
c = '(agg\w*|amount)'
check_values(dfs['transactions'], c, float_pattern)

----------------------------------------
AMOUNT 

Series([], Name: amount, dtype: float64) 



In [61]:
dfs['transactions'].drop_duplicates().to_csv('transactions_cleaned.csv', index=False)

## Election Activity DF

In [62]:
correct_columns('election_activity')

set() 

Dropped 0 columns from election_activity
Columns match schema: True


In [63]:
dfs['election_activity'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15347 entries, 0 to 15346
Data columns (total 5 columns):
election         15347 non-null object
active_date      11395 non-null object
status           15347 non-null object
active_reason    11642 non-null object
committee_id     15347 non-null object
dtypes: object(5)
memory usage: 599.6+ KB


In [64]:
allnull('election_activity')

There are 0 all null records


### Inspect Columns

In [65]:
inspect_cols(dfs['election_activity'])

----------------------------------------
ELECTION 

['2018 Primary Election', '2016 General Election', '2016 Primary Election', '2014 General Election', '2014 Primary Election', '2012 General Election', '2012 Primary Election', '2010 General Election', '2010 Primary Election', '2010 Statewide Special Election', '2008 General Election', '2008 Primary Election', '2007 November Election', '2006 General Election', '2006 Primary Election', '2004 General Election', '2004 Primary Election', '2002 General Election', '2002 Primary Election', '2000 General Election', '1998 General Election', '1998 Biennial Primary Election', '1996 General Election', '1996 Primary Election', '1994 General Election', '1994 Primary', '1992 General Election', '1992 Primary Election', '1990 General Election', '2018 General Election', '2017 May Election', '2015 May Election', '2013 November Election', '2013 May Election', '2011 November Election', '2011 May Election', '2011 March Election', '2009 May Election', '2000 

### Check Committee ID

In [66]:
id_pattern = '^\d+$'
c = '(\w*_id)'
check_values(dfs['election_activity'], c, id_pattern)

----------------------------------------
COMMITTEE_ID 

Series([], Name: committee_id, dtype: object) 



#### ID to int

In [67]:
dfs['election_activity'].loc[:, 'committee_id'] = dfs['election_activity'].committee_id.astype(int)

### Check Dates

In [68]:
# date_pattern = '(0[0-9]|1[0-2])/([0-2][0-9]|3[01])/(19\d{2}|200[0-9]|201[0-8])'
c = '\w*_[dD]ate'
check_values(dfs['election_activity'], c, date_pattern)

----------------------------------------
ACTIVE_DATE 

Series([], Name: active_date, dtype: object) 



In [69]:
dfs['election_activity'].drop_duplicates().to_csv('election_activity_cleaned.csv', index=False)

## Committee History DF

In [70]:
correct_columns('comm_history')

set() 

Dropped 0 columns from comm_history
Columns match schema: True


In [71]:
dfs['comm_history'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9611 entries, 0 to 9610
Data columns (total 6 columns):
committee_name           9611 non-null object
committee_description    9611 non-null object
effective                9611 non-null object
expiration               8012 non-null object
filing_type              9611 non-null object
committee_id             9611 non-null object
dtypes: object(6)
memory usage: 450.6+ KB


In [72]:
allnull('comm_history')

There are 0 all null records


In [73]:
inspect_cols(dfs['comm_history'])

----------------------------------------
COMMITTEE_NAME 

['Committee for Dan Ivancie', 'Oregon Sierra Club PAC', 'Oregon Sierra Club Political Action Committee', 'Oregon Chapter, Sierra Club, Political Committee', 'Oregon Chapter Sierra Club Political Committee', 'Oregon Chapter Sierra Club P C', 'Sierra Club Political Committee-Oregon', 'Sierra Club Committee on Political Education-Oregon Fund', 'Sierra Club Committee on Political Education', 'Oregonians for Affordable Housing', 'Oregonians in Action PAC', 'Oregonians in Action Political Action Committee', 'Oregonians in Action - Political Action Committee', 'Oregonians to Maintain Community Standards', 'PAC 483', 'Pacific Green Party of Oregon', 'Pacific Party', 'Pacific Party Pac', 'Pacific Party-Pac', "Parent's Education Association P.A.C.", "Parent's Education Association", 'Parents Education Association', 'Parents Education Association PAC', 'Parents Education Association Political Action Committee', 'People for Oregon Libraries

### Check Committee ID

In [74]:
id_pattern = '^\d+$'
c = '\w*_id'
check_values(dfs['comm_history'], c, id_pattern)

----------------------------------------
COMMITTEE_ID 

Series([], Name: committee_id, dtype: object) 



In [75]:
dfs['comm_history'].loc[:, 'committee_id'] = dfs['comm_history'].committee_id.astype(int)

### Check Dates

In [76]:
# date_pattern = '(0[0-9]|1[0-2])/([0-2][0-9]|3[01])/(19\d{2}|200[0-9]|201[0-8])'
c = '(effective|expiration)'
check_values(dfs['comm_history'], c, date_pattern)

----------------------------------------
EFFECTIVE 

Series([], Name: effective, dtype: object) 

----------------------------------------
EXPIRATION 

Series([], Name: expiration, dtype: object) 



In [77]:
dfs['comm_history'].drop_duplicates().to_csv('comm_history_cleaned.csv', index=False)

In [78]:
dfs['trans_details'].select_dtypes(['O']).apply(lambda x: x.str.len()).max()

address                           101.0
address_book_type                  28.0
agent                              32.0
associations                     1216.0
description                       199.0
due_date                           22.0
employer_name                      92.0
filed_date                         22.0
name                              115.0
occupation                         91.0
occupation_letter_date             10.0
payer_of_personal_expenditure      54.0
payment_method                     25.0
process_status                     22.0
purpose                           271.0
repayment_schedule                 89.0
transaction_date                   10.0
transaction_sub_type               39.0
transaction_type                   24.0
dtype: float64

## Run Scrapers

In [39]:
os.chdir('elections-2018')
os.getcwd()

'/home/michaelcrown/Projects/HackOregon/elections-2018'

In [32]:
def parse_ostar(cid, start, stop, first_index=15, page_limit=5000):
    """Run the date-range transaction scraper for `cid` until a short page.

    cid:         committee id passed to the scraper.
    start, stop: date strings bounding the range; leading zeros are stripped
                 before they are passed on the command line.
    first_index: number used for the first output file
                 ('transactions_gaps<cid>_<index>.csv'); incremented per run.
    page_limit:  a result with fewer rows than this ends the loop
                 (presumably the scraper's per-query row cap -- confirm).

    Each pass runs 'scrape_files/transactions_scrape_date.py' (resolved
    against the current working directory), then reads the CSV it is expected
    to have written in the working directory. If the page is full, `stop` is
    moved back to the earliest 'Tran Date' in that page and the scraper is run
    again. Returns nothing.
    """
    import subprocess

    print('Parsing cid {}'.format(cid))
    pypath = os.path.join(os.getcwd(), 'scrape_files/transactions_scrape_date.py')
    i = first_index
    while True:
        print(start, stop)
        # An argument list (no shell) keeps paths with spaces intact.
        cmd = ['python', pypath, str(cid), start.lstrip('0'), stop.lstrip('0'), str(i)]
        status = subprocess.call(cmd)
        if status != 0:
            print('Scraper exited with status {} for cid {}'.format(status, cid))
        out_name = 'transactions_gaps{}_{}.csv'.format(cid, i)
        dfpath = os.path.join(os.getcwd(), out_name)
        df = pd.read_csv(dfpath, parse_dates=['Tran Date'])
        if len(df) < page_limit:
            break
        next_stop = df['Tran Date'].min().date().strftime('%m/%d/%Y')
        if next_stop == stop:
            # The next query would be identical to this one; looping again
            # could never make progress.
            print('Earliest date {} equals stop; giving up on cid {}'.format(next_stop, cid))
            break
        stop = next_stop
        i += 1

In [None]:
parser_args = [
#     (4572, '01/1/1900', '12/7/17'),
    (4572, '01/1/1900', '11/12/2014'),
    (3396, '10/15/11', '9/15/14'),
    (191, '5/27/2011', '4/28/13'),
    (39, '12/23/2009', '10/25/16'),
    ]

for args in parser_args:
    parse_ostar(*args)