In [34]:
import pandas as pd
import pickle

In [7]:
import_years = [2019, 2020, 2021, 2022, 2023]
import_cols = [
    'CO_BUS_SIZE_DETERMINATION',
    'CAGE_CODE',
    'FUNDING_AGENCY_NAME',
    'FUNDING_AGENCY_ID',
    # 'FUNDING_OFFICE_NAME',
    # 'FUNDING_OFFICE_ID',
    'VENDOR_ADDRESS_COUNTRY_NAME',
    'EXTENT_COMPETED',
    'DOLLARS_OBLIGATED',
    'SOLICITATION_ID'
]

In [8]:
def import_dataset(import_cols, years, sam=True):
    """
    Imports, cleans, and joins our data with specified columns and years
    Inputs: 
        import_cols (list [str] of column names)
        years (list [int] of years to import)
        sam (bool of whether to merge with sam dataset)
    Output:
        Cleaned, filtered, and joined dataframe
    """
    if sam:
        SAM = pd.read_csv('SAM.CSV') #imports SAM df
    
    year_dfs = []
    for year in years:
        temp_df = pd.read_parquet(str(year) + '.parquet', columns=import_cols) #import year's data
        
        temp_df = temp_df[temp_df['CO_BUS_SIZE_DETERMINATION'] == "SMALL BUSINESS"] #filter for small business
        temp_df = temp_df[temp_df['VENDOR_ADDRESS_COUNTRY_NAME'] == "UNITED STATES"] #filter for US
        temp_df = temp_df[temp_df['EXTENT_COMPETED'].isin(["A", "D", "E", "CDO"])] #filter for competition
        
        # temp_df['DOLLARS_OBLIGATED'] = pd.to_numeric(temp_df['DOLLARS_OBLIGATED'], errors='coerce') #make numeric
        
        if sam:
            temp_m = pd.merge(temp_df, SAM, on="CAGE_CODE", how="inner") #merge with SAM
        else:
            temp_m = temp_df
        
        # idx = temp_m.groupby(['SOLICITATION_ID','CAGE_CODE'])['DOLLARS_OBLIGATED'].idxmax() #find initial contract win
        # temp_m = temp_m.loc[idx] #filter to initial contract win
        
        temp_m = temp_m[temp_m['DOLLARS_OBLIGATED'] > 0] #filter DOLLARS_OBLIGATED
        
        print(f'{year} shape: {temp_m.shape}')
        year_dfs.append(temp_m) #append year dataset to list of year datasets
    
    merged_df = pd.concat(year_dfs, ignore_index=True) #merge all years
    
    for df in year_dfs:
        del df
    del year_dfs #delete the individual dfs from memory
    
    idx = merged_df.groupby(['SOLICITATION_ID','CAGE_CODE'])['DOLLARS_OBLIGATED'].idxmax() #find initial contracts
    filtered_merged_df = merged_df.loc[idx] #filter to initial contract
    
    print(f'total shape: {filtered_merged_df.shape}')
    
    #place of manufacture conversion
    def convert_place_of_manufacture(value):
        if value == 'D':
            return 'YES' #manufactured in US
        elif value == 'C':
            return 'NO' #not manufactured in US
        elif value in ['N/A', 'A', 'G', 'E', 'H', 'L', 'J', 'F', 'K', 'B', 'I']:
            return 'NONE'
        else:
            return 'NONE' #N/A (provides a service or doesn't qualify fully)
    
    #clean up individual columns
    filtered_merged_df['FUNDING_AGENCY_NAME'] = filtered_merged_df['FUNDING_AGENCY_NAME'].str.strip()
    filtered_merged_df = filtered_merged_df.dropna(subset=filtered_merged_df.columns.difference(['PLACE_OF_MANUFACTURE'])) #remove rows with NAs here
    
    print(f'total filtered shape: {filtered_merged_df.shape}')
    
    return filtered_merged_df

In [9]:
df0 = import_dataset(import_cols, import_years, sam=False)

2019 shape: (1772611, 8)
2020 shape: (1659942, 8)
2021 shape: (1916684, 8)
2022 shape: (1979453, 8)
2023 shape: (1866515, 8)
total shape: (125915, 8)
total filtered shape: (125915, 8)


In [35]:
id_name_list = list(df0.groupby(['FUNDING_AGENCY_ID', 'FUNDING_AGENCY_NAME']).size().index)
agency_id_to_name = {i[0]: i[1] for i in id_name_list}


In [43]:
agency_id_to_name['0061']

KeyError: '0061'

In [36]:
pickle.dump(agency_id_to_name, open('Agency_ID_to_Name.pkl', 'wb'))

In [41]:
agency_id_to_name['0400']

'UNITED STATES GOVERNMENT PUBLISHING OFFICE'