# Cleaning of All Files

In [1]:
import io
import json
import re
import os

import pandas as pd
from pandas import DataFrame as DF, Series
import numpy as np

import requests
import missingno as msn
import psycopg2 as pg

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')

In [2]:
PATH = os.path.join(os.getenv('HOME'), 'Projects/HackOregon/elections-2018/scrape_files')

In [3]:
from glob import glob

os.chdir(PATH)
files = [f for f in glob('*.csv')]
os.chdir('../../')

In [4]:
files

['all_transactions1152.csv',
 'election_activity_first_batch.csv',
 'transaction_detail852.csv',
 'transactions_first_batch.csv',
 'all_transactions8.csv',
 'transaction_detail8.csv',
 'committee_history_first_batch.csv',
 'transaction_detail9.csv',
 'transaction_detail41.csv',
 'transaction_detail_missing.csv',
 'all_transactions9.csv',
 'transaction_detail80.csv',
 'transaction_detail115.csv',
 'transactions_gaps15192_1.csv',
 'transaction_detail42.csv',
 'all_transactions12.csv',
 'all_transactions852.csv',
 'transaction_detail19.csv',
 'transaction_detail25.csv',
 'transactions_gaps1524_1.csv',
 'transactions_gaps4572_1.csv',
 'transaction_detail24.csv',
 'transaction_detail18.csv',
 'all_transactions13.csv',
 'all_transactions11.csv',
 'transaction_detail_first_batch.csv',
 'transactions_gaps4572_2.csv',
 'transactions_gaps4572_3.csv',
 'transactions_gaps2690_1.csv',
 'all_transactions10.csv',
 'transactions_gaps33_8.csv',
 'all_transactions14.csv',
 'all_transactions115.csv',
 't

In [5]:
import warnings
from ast import literal_eval

map_names = {
    'all': 'transactions',
    'transactions': 'transactions',
    'transaction': 'trans_details',
    'statement': 'statement',
    'committee': 'comm_history',
    'election': 'election_activity',
    }
dfs = {}
with warnings.catch_warnings(record=True) as w:
    for i,f in enumerate(files):
        head = f.split('_')[0]
        if head == 'trans':
            continue
        else:
            head = map_names[head]
        df = pd.read_csv('elections-2018/scrape_files/{}'.format(f))
        if w:
            u = w[-1]
            u.message.args[0]
            tup = literal_eval(u.message.args[0].strip('Columns ').split(' ')[0])
            try:
                df = pd.read_csv('elections-2018/scrape_files/{}'.format(f), dtype={k:str for k in tup})
            except:
                df = pd.read_csv('elections-2018/scrape_files/{}'.format(f), dtype={tup: str})
        dfs.setdefault(head, DF([], columns=df.columns))
        dfs[head] = dfs[head].append(df, ignore_index=True)

In [6]:
list(dfs)

['statement',
 'trans_details',
 'election_activity',
 'comm_history',
 'transactions']

## Routines

### Visually Inspect Column Values

In [7]:
def inspect_cols(df):
    long_cols = []

    def print_message(c, head=False):
        print(40*'-')
        print(c.upper(), '\n')
        if head:
            print(df.loc[:, c].unique()[:25].tolist(), '\n')
        else:
            print(df.loc[:, c].unique().tolist(), '\n')

    for c in df:
        if df.loc[:, c].nunique() < 100:
            print_message(c)
        else:
            print_message(c, True)
            long_cols.append(c)
    print(40*'=')
    print('Columns with many unique values:\n')
    print(long_cols)

### Find Column Values That Mismatch Pattern

In [8]:
def check_values(df, col_pattern, val_pattern):
    cols = [c for c in df if re.search(col_pattern, c)]

    for c in cols:
        mask = df.loc[:, c].dropna() \
            .apply(lambda x: True if not re.match(val_pattern, str(x)) else False)
        bad = df.loc[:, c].dropna()[mask].drop_duplicates()
        print(40*'-')
        print(c.upper(), '\n')
        print(bad, '\n')

## Statement DF

In [9]:
dfs['statement'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1617 entries, 0 to 1616
Data columns (total 26 columns):
Candidate Address                  960 non-null object
Committee                          0 non-null float64
Committee Acronym                  295 non-null object
Committee Address                  1596 non-null object
Committee Campaign Phone           1384 non-null object
Committee Filing Effective From    1596 non-null object
Committee Filing Type              1596 non-null object
Committee ID                       1596 non-null float64
Committee Name                     1596 non-null object
Committee PAC Type                 632 non-null object
Election/Office                    960 non-null object
Email Address                      958 non-null object
Employer                           859 non-null object
Fax                                150 non-null object
Home Phone                         663 non-null object
Mailing Address                    960 non-null object
Name   

### Inspect Columns

In [10]:
inspect_cols(dfs['statement'])

----------------------------------------
CANDIDATE ADDRESS 

[nan, '1907 NW Mill Pond Road, Portland,           OR      97229         - 7553', '3393 Arlington Avenue, Eugene,           OR      97408', '23695 SW Brittany Ln, Sherwood,           OR      97140', '1003 Terrace Dr, Lake Oswego,           OR      97034', '1163 State St, Salem,           OR      97301', '6771 SW 162nd Drive, Beaverton,           OR      97007         - 4894', '7811 Old Stage Rd, Central Point,           OR      97502', '17915 NW Lonerock Dr, Portland,           OR      97229', '9026 SW 36th Ave, Portland,           OR      97219', '2138 Lois Drive, Roseburg,           OR      97470', '18340 SW Monte Verdi Blvd, Aloha,           OR      97007', '2248 Potter St, Eugene,           OR      97405', '410 Jefferson Street, Oregon City,           OR      97045', '22760 SW 87th Pl., Tualatin,           OR      97062', '660 Morgan Ave, Ontario,           OR      97914', '805 Kingwood Dr NW, Salem,           OR      973

### Phone

In [11]:
ph_pattern = '\(\d{3}\)\s*\d{3}-\d{4}( x{0,1}\d{0,4}){0,1}'
c = '(Fax|Phone)'
check_values(dfs['statement'], c, ph_pattern)

----------------------------------------
COMMITTEE CAMPAIGN PHONE 

Series([], Name: Committee Campaign Phone, dtype: object) 

----------------------------------------
FAX 

717     5038514351
1280    5038126773
Name: Fax, dtype: object 

----------------------------------------
HOME PHONE 

31      Exempt from public record
448                  541-536-7444
617                    5418060837
728                    5034345365
816                  503 621-6316
1184                   5415799022
Name: Home Phone, dtype: object 

----------------------------------------
TREASURER FAX 

688     5037417574
717     5038514351
1251    5032706995
1280    5038126773
Name: Treasurer Fax, dtype: object 

----------------------------------------
TREASURER HOME PHONE 

210     Exempt from public record
688                    5037417574
1003                   5417586842
1090                   5419332256
1184                   5415799022
1218                   9712664290
1251                   5032706

Phone/Fax numbers all appear to be correct, but the format needs to be standardized.

### Check Dates

In [12]:
date_pattern = '(0[0-9]|1[0-2])/([0-2][0-9]|3[01])/(19\d{2}|200[0-9]|201[0-8])'
c = 'Committee Filing Effective From'
check_values(dfs['statement'], c, date_pattern)

----------------------------------------
COMMITTEE FILING EFFECTIVE FROM 

Series([], Name: Committee Filing Effective From, dtype: object) 



All dates are good.

### Check Committee ID

In [13]:
id_pattern = '\d+'
c = 'Committee ID'
check_values(dfs['statement'], c, id_pattern)

----------------------------------------
COMMITTEE ID 

Series([], Name: Committee ID, dtype: float64) 



ID values are good

### Clean Statement Columns

#### Standardize Phone/Fax

In [14]:
def std_phone(x):
    if x != x:
        return None
    patterns = {
        '\d{10}': 
            ' '.join([x[:3], x[3:6], x[6:]]),
        ph_pattern: 
            re.sub('(\)\s*|-)', ' ', x.strip('(')),
        '\d{3}-\d{3}-\d{4}( x{0,1}\d{0,4}){0,1}':
            x.replace('-', ' '),
        '\d{3} \d{3}-\d{4}( x{0,1}\d{0,4}){0,1}':
            x.replace('-', ' '),
        '(exempt|Exempt)':
            'Exempt from public record',
    }
    for p in patterns:
        if re.match(p, x):
            return patterns[p]
    return x

In [15]:
cols = [c for c in dfs['statement'] if re.search('(Fax|Phone)', c)]

for c in cols:
    dfs['statement'].loc[:, c] = dfs['statement'].loc[:, c].apply(lambda x: std_phone(x))

In [16]:
ph_pattern = '\d{3} \d{3} \d{4}( x{0,1}\d{0,4}){0,1}'
c = '(Fax|Phone)'
check_values(dfs['statement'], c, ph_pattern)

----------------------------------------
COMMITTEE CAMPAIGN PHONE 

Series([], Name: Committee Campaign Phone, dtype: object) 

----------------------------------------
FAX 

Series([], Name: Fax, dtype: object) 

----------------------------------------
HOME PHONE 

31    Exempt from public record
Name: Home Phone, dtype: object 

----------------------------------------
TREASURER FAX 

Series([], Name: Treasurer Fax, dtype: object) 

----------------------------------------
TREASURER HOME PHONE 

210    Exempt from public record
Name: Treasurer Home Phone, dtype: object 

----------------------------------------
TREASURER WORK PHONE 

Series([], Name: Treasurer Work Phone, dtype: object) 

----------------------------------------
WORK PHONE 

1013    Exempt from public record
Name: Work Phone, dtype: object 



In [17]:
dfs['statement'].to_csv('statement_cleaned.csv', index=False)

## Trans Details DF

In [18]:
dfs['trans_details'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 714391 entries, 0 to 714390
Data columns (total 37 columns):
Address                             532482 non-null object
Address Book Type                   536535 non-null object
Agent                               1910 non-null object
Aggregate                           664223 non-null object
Amount                              714391 non-null object
Associations                        10682 non-null object
Check                               152195 non-null float64
City, State, Zip Code               0 non-null float64
CoSigner Obligations                0 non-null float64
Description                         209571 non-null object
Due Date                            714376 non-null object
Employer                            0 non-null float64
Employer City, State                0 non-null float64
Employer Name                       201455 non-null object
Exam Letter Date                    560583 non-null object
Filed Date            

### Inspect Columns

In [19]:
inspect_cols(dfs['trans_details'])

----------------------------------------
ADDRESS 

['1968 Carriage Way West Linn OR 97068', '7095 Mc Ewan Lake Oswego OR 97035', nan, '25900 Heather Place Wilsonville OR 97070', 'PO Box 4005 Acworth GA 30101-9006', '1010 NW 22nd Avenue #144 Portland OR 97210', '316 North Point Rd Lake Oswego OR 97034', '255 Capital Street, NE Salem OR 97310', '89286 Cranberry Lane Bandon OR 97411', '25920 SW Heather Place Wilsonville OR 97070', '2130 SW 21st Avenue Portland OR 97201', '39848 NW Chalmers Ln Forest Grove OR 97116', '19363 Willamette Drive #251 West Linn OR 97068', '1211 SW Fifth Ave, Ste 1000 Portland OR 97204-3710', '11735 SW Queen Elizabeth St., #101 King City OR 97224', '1300 SW Hoffman Rd West Linn OR 97068', '2110 Mission ST SE Suite 310 Salem OR 97302', '12060 SW Garden Place Tigard OR 97223', '20427 N 27th Ave Phoenix AZ 85027-3241', '121 SW Salmon 1wtc0301 Portland OR 97204', '18912 North Creek Pkwy Ste 201 Bothell WA 98011', '867 Liberty St. NE Salem OR 97301', '11095 W Avery St

### Check Dates

In [20]:
date_pattern = '(0[0-9]|1[0-2])/([0-2][0-9]|3[01])/(19\d{2}|200[0-9]|201[0-8])'
c = '\w*\s*Date'
check_values(dfs['trans_details'], c, date_pattern)

----------------------------------------
DUE DATE 

87203    03/13/2023 11:59:00 PM
Name: Due Date, dtype: object 

----------------------------------------
EXAM LETTER DATE 

Series([], Name: Exam Letter Date, dtype: object) 

----------------------------------------
FILED DATE 

Series([], Name: Filed Date, dtype: object) 

----------------------------------------
OCCUPATION LETTER DATE 

Series([], Name: Occupation Letter Date, dtype: object) 

----------------------------------------
TRANSACTION DATE 

33527     06/10/0007
87203     02/11/2023
110448    06/25/0207
293255    05/03/0007
305617    11/03/0209
305618    02/23/0009
305619    03/14/0008
305620    02/19/0007
352842    02/23/0007
389835    01/14/0029
418422    01/05/0007
429549    12/08/0006
Name: Transaction Date, dtype: object 

----------------------------------------
TRANSACTION FILED DATE 

Series([], Name: Transaction Filed Date, dtype: float64) 



Transaction date of 2023?

### Check Trans ID

In [21]:
id_pattern = '\d+'
c = '\w*\s*ID'
check_values(dfs['trans_details'], c, id_pattern)

----------------------------------------
TRANSACTION ID 

Series([], Name: Transaction ID, dtype: object) 



### Check Aggregate and Amount

In [22]:
float_pattern = '[-]{0,1}\d+[.]{0,1}\d{0,2}'
c = '(Agg\w*|Amount)'
check_values(dfs['trans_details'], c, float_pattern)

----------------------------------------
AGGREGATE 

224329       $648.00
224330       $250.00
224332       $200.00
224335       $400.00
224347       $595.00
224348       $415.00
224350       $210.00
224355     $1,000.00
224356       $500.00
224362       $150.00
224371       $300.00
224372       $550.00
224373     $3,000.00
224375       $125.00
224381     $4,720.51
224383     $4,500.00
224385    $24,991.61
224387     $2,000.00
224390    $18,991.61
224392    $10,412.09
224394     $7,275.93
224395     $2,994.08
224398    $15,000.00
224399     $1,500.00
224400     $5,672.81
224401     $5,535.31
224408       $272.27
224415       $350.00
224416       $179.71
224418       $750.00
             ...    
613985       $846.30
613986       $786.30
613988       $686.25
613990       $586.25
613999       $848.75
614011       $611.50
614014     $4,369.75
614015     $4,732.00
614016       $506.50
614020       $401.50
614027    $10,950.67
614028    $11,975.47
614029    $12,250.47
614030    $12,389.56
61

### Clean Trans Detail

#### Dates

In [23]:
def fix_date(d):
    if d != d:
        return None
    x = d[-4:]
    patterns = {
        '(02|91)\d{2}': x[:2][::-1]+x[2:],
        '002\d': x[2] + x[:2] + x[3],
        '000\d': '200' + x[3],
        }
    for p in patterns:
        if re.match(p, x):
            return d[:-4] + patterns[p]
    return d

dfs['trans_details'].loc[:, 'Transaction Date'] = \
    dfs['trans_details']['Transaction Date'].apply(lambda x: fix_date(x))

In [24]:
date_pattern = '(0[0-9]|1[0-2])/([0-2][0-9]|3[01])/(19\d{2}|200[0-9]|201[0-8])'
c = '\w*\s*Date'
check_values(dfs['trans_details'], c, date_pattern)

----------------------------------------
DUE DATE 

87203    03/13/2023 11:59:00 PM
Name: Due Date, dtype: object 

----------------------------------------
EXAM LETTER DATE 

Series([], Name: Exam Letter Date, dtype: object) 

----------------------------------------
FILED DATE 

Series([], Name: Filed Date, dtype: object) 

----------------------------------------
OCCUPATION LETTER DATE 

Series([], Name: Occupation Letter Date, dtype: object) 

----------------------------------------
TRANSACTION DATE 

87203    02/11/2023
Name: Transaction Date, dtype: object 

----------------------------------------
TRANSACTION FILED DATE 

Series([], Name: Transaction Filed Date, dtype: float64) 



#### Agg & Amount

In [25]:
def tofloat(x):
    try:
        if type(x) == float:
            return x
        if x.startswith('$'):
            x = x.strip('$').replace(',', '')
        return float(x)
    except:
        return None
for c in ['Aggregate','Amount']:
    dfs['trans_details'].loc[:, c] = dfs['trans_details'].loc[:, c].apply(lambda x: tofloat(x))

In [26]:
float_pattern = '[-]{0,1}\d+[.]{0,1}\d{0,2}'
c = '(Agg\w*|Amount)'
check_values(dfs['trans_details'], c, float_pattern)

----------------------------------------
AGGREGATE 

Series([], Name: Aggregate, dtype: float64) 

----------------------------------------
AMOUNT 

Series([], Name: Amount, dtype: float64) 

----------------------------------------
TRANSACTION AMOUNT 

Series([], Name: Transaction Amount, dtype: float64) 



In [27]:
dfs['trans_details'].to_csv('trans_details_cleaned.csv', index=False)

## Transactions DF

In [28]:
dfs['transactions'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 996347 entries, 0 to 996346
Data columns (total 8 columns):
Tran ID              996347 non-null object
Tran Date            996347 non-null object
Status               996347 non-null object
Filer/Committee      996347 non-null object
Contributor/Payee    987133 non-null object
Sub Type             996347 non-null object
Amount               996347 non-null object
committee_id         996347 non-null object
dtypes: object(8)
memory usage: 60.8+ MB


### Inspect Columns

In [29]:
inspect_cols(dfs['transactions'])

----------------------------------------
TRAN ID 

[2766154, 2766158, 2766153, 2555590, 2452419, 2439203, 2439212, 2437641, 2437643, 2433266, 2432392, 2432401, 2428655, 2428687, 2428710, 2428713, 2441118, 2441119, 2448120, 2425065, 2425080, 2438974, 2439213, 2439513, 2439511] 

----------------------------------------
TRAN DATE 

['03/05/2018', '05/24/2017', '05/22/2017', '12/02/2016', '11/07/2016', '11/04/2016', '11/03/2016', '11/02/2016', '11/01/2016', '10/31/2016', '10/30/2016', '10/28/2016', '10/27/2016', '10/26/2016', '10/25/2016', '10/23/2016', '10/21/2016', '10/20/2016', '10/18/2016', '10/17/2016', '10/16/2016', '10/13/2016', '10/12/2016', '10/11/2016', '10/10/2016'] 

----------------------------------------
STATUS 

['Original', 'Amended'] 

----------------------------------------
FILER/COMMITTEE 

['Studebaker for Mayor', 'Support City Schools Political Action Committee', 'Support Hood River County Schools', 'Sustainable Forestry Network', 'Friends of Ross', 'Committee to El

### Check Tran ID and Committee ID

In [30]:
id_pattern = '\d+'
c = '(\w*\s*ID|\w*_id)'
check_values(dfs['transactions'], c, id_pattern)

----------------------------------------
TRAN ID 

Series([], Name: Tran ID, dtype: object) 

----------------------------------------
COMMITTEE_ID 

Series([], Name: committee_id, dtype: object) 



### Check Dates

In [31]:
date_pattern = '(0[0-9]|1[0-2])/([0-2][0-9]|3[01])/(19\d{2}|200[0-9]|201[0-8])'
c = '\w*\s*Date'
check_values(dfs['transactions'], c, date_pattern)

----------------------------------------
TRAN DATE 

79939     05/03/0007
92301     11/03/0209
92302     02/23/0009
92303     03/14/0008
92304     02/19/0007
110409    06/10/0007
164085    02/11/2023
530791    04/23/0607
585252    11/17/0006
672518    06/25/0207
783246    06/04/0200
845471    12/26/0006
890108    12/13/0006
Name: Tran Date, dtype: object 



### Check Amount

In [32]:
float_pattern = '[-]{0,1}\d+[.]{0,1}\d{0,2}'
c = '(Agg\w*|Amount)'
check_values(dfs['trans_details'], c, float_pattern)

----------------------------------------
AGGREGATE 

Series([], Name: Aggregate, dtype: float64) 

----------------------------------------
AMOUNT 

Series([], Name: Amount, dtype: float64) 

----------------------------------------
TRANSACTION AMOUNT 

Series([], Name: Transaction Amount, dtype: float64) 



### Clean Transactions

#### Dates

In [33]:
dfs['transactions'].loc[:, 'Tran Date'] = \
    dfs['transactions']['Tran Date'].apply(lambda x: fix_date(x))

In [34]:
date_pattern = '(0[0-9]|1[0-2])/([0-2][0-9]|3[01])/(19\d{2}|200[0-9]|201[0-8])'
c = '\w*\s*Date'
check_values(dfs['transactions'], c, date_pattern)

----------------------------------------
TRAN DATE 

164085    02/11/2023
530791    04/23/0607
Name: Tran Date, dtype: object 



In [38]:
dfs['transactions'].loc[530791, 'Tran Date'] = None

In [39]:
dfs['transactions'].to_csv('transactions_cleaned.csv', index=False)

## Election Activity DF

In [40]:
dfs['election_activity'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15347 entries, 0 to 15346
Data columns (total 5 columns):
election         15347 non-null object
active_date      11395 non-null object
status           15347 non-null object
active_reason    11642 non-null object
committee_id     15347 non-null object
dtypes: object(5)
memory usage: 599.6+ KB


### Inspect Columns

In [42]:
inspect_cols(dfs['election_activity'])

----------------------------------------
ELECTION 

['2016 General Election', '2016 Primary Election', '2014 General Election', '2014 Primary Election', '2012 General Election', '2012 Primary Election', '2010 General Election', '2010 Primary Election', '2008 General Election', '2008 Primary Election', '1998 General Election', '2006 General Election', '2006 Primary Election', '2004 General Election', '2004 Primary Election', '2002 General Election', '2002 Primary Election', '2000 General Election', '2000 Primary Election', '1998 Biennial Primary Election', '1996 General Election', '1996 Primary Election', '1994 General Election', '1994 Primary', '1992 General Election', '1992 Primary Election', '1990 General Election', '2011 May Election', '2018 Primary Election', '2013 November Election', '2017 May Election', '2017 March Election', '2015 May Election', '2010 Statewide Special Election', '2015 November Election', '2004 February Special Election', '2003 January Special Election', '1997 N

### Check Committee ID

In [43]:
id_pattern = '\d+'
c = '(\w*\s*ID|\w*_id)'
check_values(dfs['election_activity'], c, id_pattern)

----------------------------------------
COMMITTEE_ID 

Series([], Name: committee_id, dtype: object) 



### Check Dates

In [48]:
date_pattern = '(0[0-9]|1[0-2])/([0-2][0-9]|3[01])/(19\d{2}|200[0-9]|201[0-8])'
c = '\w*_[dD]ate'
check_values(dfs['election_activity'], c, date_pattern)

----------------------------------------
ACTIVE_DATE 

Series([], Name: active_date, dtype: object) 



In [54]:
dfs['election_activity'].to_csv('election_activity_cleaned.csv', index=False)

## Committee History DF

In [49]:
dfs['comm_history'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9611 entries, 0 to 9610
Data columns (total 6 columns):
committee_name           9611 non-null object
committee_description    9611 non-null object
effective                9611 non-null object
expiration               8012 non-null object
filing_type              9611 non-null object
committee_id             9611 non-null object
dtypes: object(6)
memory usage: 450.6+ KB


In [50]:
inspect_cols(dfs['comm_history'])

----------------------------------------
COMMITTEE_NAME 

['American Federation of Teachers-Oregon Candidate PAC', 'Amercian Federation of Teachers-Oregon Candidate PAC', 'American Federation of Teachers-Oregon Comm on Political Education PAC', 'American Fed of Teachers-Oregon Comm on Political Education PAC', 'AFT-Oregon Cope PAC', 'AFT-Oregon Legislative PAC', 'OFTEHP Legislative PAC', 'OFTEHP Cope', 'OFT Cope Sl', 'Friends of Jessica Adamson', 'Alliance - PAC', 'Speech Hearing Action Committee', 'Speech & Hearing Action Committee', 'Citizens for Ken Strobeck', 'Strobeck, Ken, Citizens for', 'Citizens for Centennial Schools', 'Friends of Clackamas Community College', 'Committee to Elect Carolyn Oakley', 'Bartel for Eugene Water & Electric Board', 'Constitution Party Of Columbia County', 'Committee to Re-Elect Steve Druckenmiller', 'Jones, Dick, Citizens for', 'Jones, Dick, for State Senate', 'Committee to Elect Scot Langton Deschutes County Assessor', 'Yes for Student Success'] 

---

### Check Committee ID

In [None]:
id_pattern = '\d+'
c = '(\w*\s*ID|\w*_id)'
check_values(dfs['comm_history'], c, id_pattern)

### Check Dates

In [53]:
date_pattern = '(0[0-9]|1[0-2])/([0-2][0-9]|3[01])/(19\d{2}|200[0-9]|201[0-8])'
c = '(effective|expiration)'
check_values(dfs['comm_history'], c, date_pattern)

----------------------------------------
EFFECTIVE 

Series([], Name: effective, dtype: object) 

----------------------------------------
EXPIRATION 

Series([], Name: expiration, dtype: object) 



In [55]:
dfs['comm_history'].to_csv('comm_history_cleaned.csv', index=False)

## Run Scrapers

In [39]:
os.chdir('elections-2018')
os.getcwd()

'/home/michaelcrown/Projects/HackOregon/elections-2018'

In [32]:
def parse_ostar(cid, start, stop):
    print('Parsing cid {}'.format(cid))
    i = 1
    while True:
        args = ' '.join([
            str(cid), 
            start.lstrip('0'), 
            stop.lstrip('0'), 
            str(i),
            ])
        print(start, stop)
        pypath = os.path.join(os.getcwd(), 'scrape_files/transactions_scrape_date.py')
        cmd = 'python {} {}'.format(pypath, args)
        os.system(cmd)
        file = 'transactions_gaps{}_{}.csv'.format(cid, i)
        dfpath = os.path.join(os.getcwd(), file)
        df = pd.read_csv(dfpath, parse_dates=['Tran Date'])
        if len(df) < 5000:
            break
        stop = df['Tran Date'].min().date().strftime('%m/%d/%Y')
        i += 1

In [None]:
parser_args = [
#     (4572, '01/1/1900', '12/7/17'),
    (4572, '01/1/1900', '09/28/2016'),
    (3396, '10/15/11', '9/15/14'),
    (191, '5/27/2011', '4/28/13'),
    (39, '12/23/2009', '10/25/16'),
    ]

for args in parser_args:
    parse_ostar(*args)