In [1]:
%matplotlib inline
import os
from zipfile import ZipFile
from multiprocessing import Pool
import requests
from io import BytesIO
from itertools import repeat
import datetime

import pandas as pd

In [2]:
# Release dates (YYYYMMDD) of the usaspending.gov archive snapshots to pull.
# `next_latest` is the older snapshot that load_and_sift falls back to when a
# file is missing (HTTP 404) from the `latest_update` snapshot.
latest_update = '20170415'
next_latest = '20170115'

# NOTE(review): not read at module level by any visible cell; load_and_sift
# takes its own `year` argument.
year = 2017

# Department token embedded in the archive file name.
dep = 'All'

In [3]:
# Download-URL template for the latest archive release. The month folder is the
# release date without its day part. The doubled braces leave a literal {YEAR}
# placeholder in the result, which is filled in once per request.
url = (
    'http://download.usaspending.gov/data_archives/{UP_MONTH}/'
    'tsv/{{YEAR}}_{DEP}_Contracts_Full_{UP_DATE}.tsv.zip'
).format(
    UP_MONTH=latest_update[:-2],
    DEP=dep,
    UP_DATE=latest_update,
)
url

'http://download.usaspending.gov/data_archives/201704/tsv/{YEAR}_All_Contracts_Full_20170415.tsv.zip'

In [4]:
# Same URL template, but pointing at the previous archive release. It is used
# as a fallback when the latest release answers with 404 for a given year.
url_legacy = (
    'http://download.usaspending.gov/data_archives/{UP_MONTH}/'
    'tsv/{{YEAR}}_{DEP}_Contracts_Full_{UP_DATE}.tsv.zip'
).format(
    UP_MONTH=next_latest[:-2],
    DEP=dep,
    UP_DATE=next_latest,
)

In [5]:
# Path prefix of the output file; the save cell appends '_<first company>.tsv.gz'.
data_in = 'data_in/spending'

In [6]:
# Fiscal years to download: 2000 through 2017 inclusive.
years = list(range(2000, 2018))

In [7]:
# Vendor-name search terms. They are joined with '|' into a single regex that
# is matched case-insensitively against the `vendorname` column.
companies = [
    'Corrections Corporation of America',
    'CoreCivic',
    'TransCor',
]

In [36]:
# NOTE(review): this cell carries execution count In [36], i.e. it was run out
# of order, and it replaces the company list defined in the cell above.
# The saved outputs further down (the search_terms column, the output file
# name) reflect the earlier list, so a top-to-bottom re-run would search for
# 'palantir' instead. Confirm which list is intended.
companies = ['palantir']

In [8]:
def search_company(row):
    '''Add one record's obligated dollars to its vendor's running total.

    Totals are kept in the module-level mapping ``company_meta``, keyed by
    vendor name. NOTE(review): ``company_meta`` is not defined in the visible
    cells; it must exist before this function is called.

    Args:
       row: a mapping-like record with 'vendorname' and 'dollarsobligated'.
    '''
    vendor = row['vendorname']

    # A missing vendor and a falsy total (e.g. 0) both start from this row.
    if not company_meta.get(vendor):
        company_meta[vendor] = row['dollarsobligated']
        return

    company_meta[vendor] += row['dollarsobligated']

In [9]:
def load_and_sift(year, regex):
    '''Download one fiscal year of contract records and keep matching vendors.

    The zipped TSV archive for ``year`` is fetched from the usaspending.gov
    data archives
    (https://www.usaspending.gov/DownloadCenter/Pages/dataarchives.aspx)
    using the module-level ``url`` template. If that request answers with
    404, the ``url_legacy`` template (the previous release) is tried instead.

    Every file in the archive is read chunk-wise into pandas DataFrames, and
    each chunk is reduced to the rows whose ``vendorname`` matches ``regex``
    (case-insensitive). Rows without a vendor name are dropped.

    Args:
       year (int):  The fiscal year of records to load.
       regex (string): A regex expression of company name(s).

    Returns:
       df: a pandas DataFrame with the matching records of the given year,
       plus four bookkeeping columns: ``lastupdate`` (release date of the
       archive that was used), ``contract_year``, ``filename`` (archive
       member the row came from) and ``search_terms`` (the regex). When
       nothing matches, the frame has zero rows.

    Raises:
       RuntimeError: if the download does not end with HTTP status 200.
    '''
    print(year)
    r = requests.get(url.format(YEAR=year))
    last_update = datetime.datetime.strptime(latest_update, '%Y%m%d')

    if r.status_code == 404:  # if url doesn't work, use the legacy url.
        r = requests.get(url_legacy.format(YEAR=year))
        last_update = datetime.datetime.strptime(next_latest, '%Y%m%d')

    if r.status_code != 200:  # make sure the download was successful.
        # Raising a bare string is itself a TypeError in Python 3, which hid
        # the real cause; raise a proper exception carrying the status.
        raise RuntimeError(
            'bad request: HTTP {} for year {}'.format(r.status_code, year))

    matches = []   # non-empty filtered chunks, concatenated once at the end
    schema = None  # first zero-row chunk, kept so an empty result has columns

    # the downloaded stream is a zip archive
    with ZipFile(BytesIO(r.content)) as archive:
        # for each file in the zip archive
        for member in archive.namelist():
            with archive.open(member) as fh:
                # process the file in dataframe chunks!
                for chunk in pd.read_csv(fh, sep='\t',
                                         chunksize=100000, low_memory=False):
                    # na=False makes rows without a vendor name non-matches.
                    is_match = chunk['vendorname'].str.contains(
                        regex, case=False, na=False)

                    # .assign returns a new frame, so the date tags are not
                    # written onto a slice of `chunk`.
                    hits = chunk[is_match].assign(
                        lastupdate=last_update,
                        contract_year=year,
                        filename=member,
                        search_terms=regex)

                    if hits.empty:
                        if schema is None:
                            schema = hits
                    else:
                        matches.append(hits)

    if matches:
        return pd.concat(matches, ignore_index=True)
    return schema if schema is not None else pd.DataFrame()

In [10]:
# Fetch and filter every year in parallel (one task per year, each with the
# same OR-ed company regex), then stack the per-year frames into one.
with Pool() as pool:
    df_list = pool.starmap(
        load_and_sift,
        zip(years, repeat('|'.join(companies))),
    )

df = pd.concat(df_list, ignore_index=True)

2004
2000
2003
2006
2005
2008
2001
2009
2010
2012
2011
2016
2002
2017
2014
2013
2007
2015


In [11]:
# Eyeball a handful of randomly chosen rows from the combined result.
df.sample(n=5)

Unnamed: 0,unique_transaction_id,transaction_status,dollarsobligated,baseandexercisedoptionsvalue,baseandalloptionsvalue,maj_agency_cat,mod_agency,maj_fund_agency_cat,contractingofficeagencyid,contractingofficeid,...,prime_awardee_executive4,prime_awardee_executive4_compensation,prime_awardee_executive5,prime_awardee_executive5_compensation,interagencycontractingauthority,last_modified_date,lastupdate,contract_year,filename,search_terms
181,ae4c451d26b4d35ac362fca5d1db2330,active,2500.78,2500.78,2500.78,4700: GENERAL SERVICES ADMINISTRATION,4740: PUBLIC BUILDINGS SERVICE,:,4740: PUBLIC BUILDINGS SERVICE,HV000: GSA/PUBLIC BUILDINGS SERVICE,...,,0.0,,0.0,X: Not Applicable,05/19/2011,2017-04-15,2006,datafeeds\2006_All_Contracts_Full_20170415.tsv,Corrections Corporation of America|CoreCivic|T...
1360,1c6b0509968f4426a45b5d22b46042d5,active,0.0,0.0,0.0,1500: Department of Justice,1544: U.S. MARSHALS SERVICE,1500: Department of Justice,1544: U.S. MARSHALS SERVICE,"15M400: U.S. DEPT OF JUSTICE, USMS",...,,0.0,,0.0,X: NOT APPLICABLE,05/02/2016,2017-04-15,2016,datafeeds\2016_All_Contracts_Full_20170415.tsv,Corrections Corporation of America|CoreCivic|T...
1337,21c28772a0af41ea8427e56eee947af4,active,-15286.56,-15286.56,-15286.56,1500: Department of Justice,1540: FEDERAL PRISON SYSTEM,1500: Department of Justice,1540: FEDERAL PRISON SYSTEM,70000: CENTRAL OFFICE,...,,0.0,,0.0,X: NOT APPLICABLE,02/18/2016,2017-04-15,2016,datafeeds\2016_All_Contracts_Full_20170415.tsv,Corrections Corporation of America|CoreCivic|T...
1054,b74a05fb720e4af87ffe3019a8128426,active,0.0,0.0,0.0,"1500: JUSTICE, DEPARTMENT OF","1501: OFFICES, BOARDS AND DIVISIONS","1500: JUSTICE, DEPARTMENT OF","1501: OFFICES, BOARDS AND DIVISIONS",OFDT: OFFICE OF THE FEDERAL DETENTION TRUSTEE,...,,0.0,,0.0,X: Not Applicable,02/24/2014,2017-04-15,2014,datafeeds\2014_All_Contracts_Full_20170415.tsv,Corrections Corporation of America|CoreCivic|T...
1091,09fcbccb42cf7430a38791c2638d8fa7,active,13183928.58,13183928.58,13183928.58,"7000: HOMELAND SECURITY, DEPARTMENT OF",7012: U.S. IMMIGRATION AND CUSTOMS ENFORCEMENT,"7000: HOMELAND SECURITY, DEPARTMENT OF",7012: U.S. IMMIGRATION AND CUSTOMS ENFORCEMENT,DMDC0: DETENTION MANAGEMENT - DC OFFICE,...,,0.0,,0.0,X: Not Applicable,05/01/2014,2017-04-15,2014,datafeeds\2014_All_Contracts_Full_20170415.tsv,Corrections Corporation of America|CoreCivic|T...


In [12]:
# Number of columns in the combined frame.
df.shape[1]

229

In [13]:
# Which columns mention 'state' anywhere in their name?
[column for column in df.columns if 'state' in column]

['state',
 'vendor_state_code',
 'statecode',
 'pop_state_code',
 'stategovernmentflag',
 'isstatecontrolledinstitutionofhigherlearning',
 'isinterstateentity']

In [14]:
# The file is named after the FIRST search term only, lower-cased with spaces
# replaced by underscores, e.g.
# 'data_in/spending_corrections_corporation_of_america.tsv.gz'.
outfile = data_in + '_' + companies[0].replace(' ', '_').lower() + '.tsv.gz'

# to_csv does not create missing directories; without this a fresh checkout
# would fail here, after the whole download has already been done.
out_dir = os.path.dirname(outfile)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

df.to_csv(outfile, sep='\t', compression='gzip', index=False)
print("Data saved to {}".format(outfile))

Data saved to data_in/spending_corrections_corporation_of_america.tsv.gz


In [32]:
# Sanity check: which fiscal years actually made it into the result?
pd.unique(df['contract_year'])

array([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       2011, 2012, 2013, 2014, 2015, 2016, 2017])