In [1]:
import zipfile
import os
import glob
import csv
import urllib
import datetime as dt
import pandas as pd

# Root folder: the other cells read the downloaded archives from sub-folders of
# it and write the generated CSV files into it.
# NOTE(review): hard-coded absolute local path -- adjust for your machine.
datafolder = 'C:/Berkeley/W209/Final/data'
try:
    os.makedirs(datafolder)
except OSError:
    # The folder already existing is the expected case on a re-run. Any other
    # failure (permissions, missing drive, ...) is re-raised here instead of
    # being hidden until a later cell fails to write its output.
    if not os.path.isdir(datafolder):
        raise
# Election days as text, one every four years from 2016 back to 1976,
# most recent first.
electionsStr = [
    'November 8, 2016', 'November 6, 2012', 'November 4, 2008',
    'November 2, 2004', 'November 7, 2000', 'November 5, 1996',
    'November 3, 1992', 'November 8, 1988', 'November 6, 1984',
    'November 4, 1980', 'November 2, 1976',
]

# The same dates parsed into datetime objects, in the same order.
elections = []
for electionText in electionsStr:
    elections.append(dt.datetime.strptime(electionText, '%B %d, %Y'))

# Multiplier applied to a transaction amount, keyed by the transaction year as
# a string (getAdjTransaction looks the year up here and multiplies by it).
# 2015 has factor 1, so amounts are presumably expressed in 2015 dollars --
# TODO confirm the source of these figures.
# NOTE(review): there is no entry for years after 2015.
inflation_rates = {
    '1978': 3.65, '1979': 3.28,
    '1980': 2.89, '1981': 2.62, '1982': 2.47, '1983': 2.39, '1984': 2.29,
    '1985': 2.21, '1986': 2.17, '1987': 2.09, '1988': 2.01, '1989': 1.92,
    '1990': 1.82, '1991': 1.75, '1992': 1.7, '1993': 1.65, '1994': 1.61,
    '1995': 1.56, '1996': 1.52, '1997': 1.48, '1998': 1.46, '1999': 1.43,
    '2000': 1.38, '2001': 1.34, '2002': 1.32, '2003': 1.29, '2004': 1.26,
    '2005': 1.22, '2006': 1.18, '2007': 1.15, '2008': 1.11, '2009': 1.11,
    '2010': 1.09, '2011': 1.06, '2012': 1.04, '2013': 1.02, '2014': 1.01,
    '2015': 1,
}

In [18]:
def candType(row):
    """Return the first character of the row's candidate identification number.

    row: mapping-like record (e.g. a DataFrame row passed by
         ``df.apply(..., axis=1)``) with a 'Candidate Identification Number'
         entry.

    Returns '' when the entry is missing, empty, or not indexable (e.g. NaN).
    """
    try:
        return row['Candidate Identification Number'][0]
    # `except Exception` instead of a bare `except`: a kernel interrupt
    # (KeyboardInterrupt) or SystemExit raised while this runs inside a long
    # DataFrame.apply must propagate rather than be turned into ''.
    except Exception:
        return ''

def getMonth(row):
    """Return the first day of the month of the row's transaction date.

    row: mapping-like record with a 'Transaction Date(MMDDYYYY)' entry that
         exposes ``.year`` and ``.month`` (e.g. a datetime / Timestamp).

    Returns a ``datetime`` set to day 1 of that month, or '' when the entry is
    missing or is not a usable date (e.g. an unparsed string or NaN).
    """
    try:
        transDate = row['Transaction Date(MMDDYYYY)']
        return dt.datetime(transDate.year, transDate.month, 1)
    # `except Exception` instead of a bare `except`, so that interrupting the
    # kernel during a long DataFrame.apply is not swallowed.
    except Exception:
        return ''
    
def getNextElection(row):
    """Return the earliest date in ``elections`` on or after the row's month.

    row: mapping-like record with a 'Month' entry, as produced by getMonth
         (a datetime, or '' when the transaction date was unusable).

    Returns the matching election datetime, or None when 'Month' is not a
    datetime or when no listed election falls on or after it.
    """
    transDate = row['Month']
    # getMonth yields '' for rows without a usable date; comparing that with a
    # datetime raises TypeError, so such rows simply get no election.
    if not isinstance(transDate, dt.datetime):
        return None
    upcoming = [date for date in elections if transDate <= date]
    if not upcoming:
        return None
    # The nearest upcoming election is the smallest date that is not in the past.
    return min(upcoming)

def getAdjTransaction(row):
    """Return the row's transaction amount as a positive, year-adjusted value.

    The amount in 'Transaction Amount' is converted to float, multiplied by
    the ``inflation_rates`` factor for the year of
    'Transaction Date(MMDDYYYY)', and returned as an absolute value.

    Returns '' if anything goes wrong (no factor for that year, unusable
    date, non-numeric amount, missing entry, ...).
    """
    try:
        yearKey = str(row['Transaction Date(MMDDYYYY)'].year)
        amount = float(row['Transaction Amount'])
        return abs(amount * inflation_rates[yearKey])
    except Exception:
        return ''
#print getAdjTransaction(dfp[:1])

In [23]:
#libraries = ['ccl','cm','cn','indiv','oppexp','oth','pas2']
libraries = ['pas2']
# Only the pas2 files are handled below: the 'itpas' member name, the column
# names and the output file name are all pas2-specific. `lib` was previously
# never assigned in this cell (it only existed as a leftover kernel variable),
# which raises NameError on a fresh kernel, so it is set explicitly.
lib = 'pas2'
years = [str(year) for year in range(2016,1978,-2)]
dateparse = lambda x: dt.datetime.strptime(str(x), '%m%d%Y')
dateColumn = 'Transaction Date(MMDDYYYY)'

# Folder holding the zip archives; only "already exists" is tolerated.
libfolder = os.path.join(datafolder, lib)
try:
    os.makedirs(libfolder)
except OSError:
    if not os.path.isdir(libfolder):
        raise

# Column names for the header-less data files, read from a one-line CSV.
with open(os.path.join(os.getcwd(),'data','FEC','%s_header_file.csv' % lib)) as headers:
    header = headers.read().strip().split(',')

#header.append('cycle')

# The zip archives are expected to already be in <datafolder>/<lib>/.
# The disabled download step (Python 2 syntax) is kept here for reference:
#
#     for year in years:
#         url = 'ftp://ftp.fec.gov/FEC/%s/' % str(year)
#         filename = '%s%s.zip' % (lib,str(year)[2:])
#         outfile = os.path.join(datafolder, lib, filename)
#         try:
#             urllib.urlretrieve(url + filename,outfile)
#         except Exception as e:
#             print e

for filename in glob.glob(os.path.join(libfolder,'*.zip')):
    # Last two characters of the archive's base name, e.g. 'pas216.zip' -> '16'.
    year = os.path.basename(filename).split('.')[0][-2:]

    # Skip anything that is not a valid zip file (a failed download could
    # leave an empty file behind).
    if not (os.path.isfile(filename) and zipfile.is_zipfile(filename)):
        continue

    # Reset per archive so a zip without an 'itpas' member cannot silently
    # re-use the previous archive's data; the archive is closed on exit.
    df = None
    with zipfile.ZipFile(filename) as z:
        for f in z.namelist():
            if f.find('itpas') > -1:
                #get observation info
                with z.open(f, 'r') as psvfile:
                    df = pd.read_csv(psvfile,sep='|',names=header,dtype=str)
    if df is None or df.empty:
        continue

    # errors='coerce' turns individual malformed dates into NaT; with
    # errors='ignore' a single bad value leaves the whole column as strings.
    df[dateColumn] = pd.to_datetime(df[dateColumn],format='%m%d%Y',errors='coerce')
    df['CandidateType'] = df.apply(candType,axis=1)

    # Keep only candidate type 'P' rows that have a usable date, *before* the
    # remaining row-wise passes, so they run on far fewer rows and never see
    # a missing date.
    keep = (df['CandidateType']=='P') & df[dateColumn].notnull()
    df = df[keep].copy()
    if df.empty:
        continue

    df['Month'] = df.apply(getMonth,axis=1)
    df['NextElection'] = df.apply(getNextElection,axis=1)
    # getAdjTransaction returns '' when it cannot adjust an amount (e.g. no
    # factor for that year); coerce those to NaN and drop them so the sum
    # below is purely numeric.
    df['Contribution'] = pd.to_numeric(df.apply(getAdjTransaction,axis=1),errors='coerce')
    df = df.dropna(subset=['Contribution'])
    df_aggregated = df.groupby(['Month','NextElection'])['Contribution'].aggregate('sum').to_frame()

    df_aggregated.to_csv(os.path.join(datafolder,'%s_%s.csv' % (lib, year)))
print("Done collecting data!")


Done collecting data!


In [25]:
# Combine the per-cycle files written by the previous cell into one table.
# Only 'pas2_*.csv' is matched: a plain '*.csv' would also pick up this cell's
# own output file on a re-run and count every contribution twice.
cycleFiles = sorted(glob.glob(os.path.join(datafolder,'pas2_*.csv')))
frames = [pd.read_csv(cycleFile) for cycleFile in cycleFiles]
if frames:
    # One concat instead of DataFrame.append inside the loop (quadratic, and
    # removed in pandas 2.x).
    df = pd.concat(frames, ignore_index=True)
else:
    df = pd.DataFrame(columns = ['Month','NextElection','Contribution'])
# The same month can appear in more than one cycle file, so sum again.
df = df.groupby(['Month','NextElection'])['Contribution'].aggregate('sum').to_frame()
df.to_csv(os.path.join(datafolder,'all_contributions_1978-2015.csv'))