In [125]:
import pandas as pd
import numpy as np

After merging: 
- filter only for unique contracts over the years - do we assume MODIFICATION == 0 shows up only once for each contract

### Cols for Classification Model 

In [126]:
cols = [# RESPONSE VARIABLE
        #'FUNDING_AGENCY_NAME',
        'FUNDING_OFFICE_NAME', # use for main model 
        #'FUNDING_DEPARTMENT_NAME',
    
        # IDENTIFIERS
        'PIID',
        'IDV_PIID', # Indefinite delivery vehicle procurement instrument identifiers
        'FUNDING_DEPARTMENT_ID',
        # Funding Department ID + IDV PIID + PIID concatenated should give unique ID for a contract
        # null IDV PIID + PIID gives the non-IDV contracts
        'CAGE_CODE', # Code for the entity
        # CAGE Code has been used for the entire dataset so can act as a unique entity ID
        # Unique Entity ID
        # 'VENDOR_UEI', # these two don't match about half the time 
        # 'ULTIMATE_UEI', # check previous years - DUNS before 2022

        # OTHER
        'AWARD_FISCAL_YEAR',
        'VENDOR_ADDRESS_ZIP_CODE', #use only first 5 digits, drop NAs, least amount of NAs 
        'TYPE_OF_SET_ASIDE', # Type of Set Aside determined for the contract action 10N in dictionary 
        'DATE_SIGNED', # The date that a mutually binding agreement was reached. "2022-09-15"
        'EVALUATED_PREFERENCE', #------------
        # Evaluated Preference makes all non-HUBZone entities’ bid prices up 10% when evaluating
    
        # Number of Offers
        'NUMBER_OF_OFFERS_RECEIVED', #a lot of NaNs
        # The number of actual offers/bids received in response to the solicitation.
        'IDV_NUMBER_OF_OFFERS',
        'AWARD_OR_IDV', # award or IDV - mostly Award 


        # FILTERING
        'CO_BUS_SIZE_DETERMINATION', # =="SMALL BUSSINES" / "OTHER THAN SMALL BUSINESS"
        'VENDOR_ADDRESS_COUNTRY_NAME', # == USA filter 
        'MODIFICATION_NUMBER', # filtering == 0 
        'EXTENT_COMPETED', # see notes, possibly filter by A,D,E,CDO

        # CONTRACT VALUE
        'ULTIMATE_CONTRACT_VALUE', # Mutually agreed upon TOTAL contract value including all options 
        #(also includes estimated value of all potential orders for IDVs)
    
        # NAICS
        'PRINCIPAL_NAICS_CODE', # 'number' (NAICS codes are updated every 5 years(2012,2017,2022), incosistencies? 
        #The first two digits of the code designate the sector, the third digit designates the subsector, 
        #the fourth digit designates the industry group, the fifth digit designates the NAICS industry, 
        #and the sixth digit designates the national industry. 


        ## GENERAL BUSINESS QUESTIONS + add Indian
        'CORP_ENTITY_NOT_TAX_EXEMPT', #YES/NO, no nan
        'CORP_ENTITY_TAX_EXEMPT', #YES/NO, no nan 
        'LIMITED_LIABILITY_CORPORATION', #YES/NO, no nan
        'PARTNERSHIP_OR_LLP', #YES/NO, no nan
        'SOLE_PROPREITORSHIP', #YES/NO, no nan
        'SMALL_AGRICULTURAL_COOPERATIVE', #YES/NO, no nan
        'INTERNATIONAL_ORGANIZATION',  #YES/NO, no nan
        'ARCHITECTURE_AND_ENGINEERING', #YES/NO, no nan
        'COMMUNITY_CORP_OWNED_FIRM', #YES/NO, no nan
        'CONSTRUCTION_FIRM', #YES/NO, no nan
        'DOMESTIC_SHELTER', #YES/NO, no nan
        'FOUNDATION', #YES/NO, no nan
        'RESEARCH_AND_DEVELOPMENT', #YES/NO, no nan
        'VETERINARY_HOSPITAL', #YES/NO, no nan
        'HISPANIC_SERVICING_INSTITUTION', #YES/NO, no nan
        'FOR_PROFIT_ORGANIZATION', #YES/NO, no nan
        'EDUCATIONAL_INSTITUTION_FLAG', #YES/NO, no nan
        'MANUFACTURER_OF_GOODS', #YES/NO, no nan
        'SERVICE_PROVIDER', #YES/NO, no nan
        'INDIAN_TRIBE', #YES/NO, no nan # Buy Indian

        # MANUFACTURE - is the product you're selling made in the US 
        # YES, NO, Not a manufactured end product 
        # 'PLACE_OF_MANUFACTURE_CLASS'
        'PLACE_OF_MANUFACTURE', 
        

        # NON-CERTIFIED 
        #Veteran-Owned Business
        'VETERAN_OWNED_FLAG',
        #Service-Disabled Veteran-Owned Business
        'SRDVOB_FLAG',
        # Women-Owned Business
        'WOMEN_OWNED_FLAG']

In [None]:
### From SAM DataSet 

        # NON-CERTIFIED

        [#Minority-Owned Business
        'SELF_8A',
        #Women-Owned Small Business
        'SELF_WOSB',
        # Economically Disadvantaged Women-Owned Small Business
        'SELF_EDWOSB',
        #Self-Certified Small Disadvantaged Business
        'SELF_SDB',

        # CERTIFIED 
        
        #SBA Certified 8(a) Program Participant
        'CERT_8A',
        #SBA Certified Small Disadvantaged Business
        'CERT_SDB',
        #SBA-Certified Women-Owned Small Business
        'CERT_WOSB',
        #SBA-Certified Economically Disadvantaged Women-Owned Small Business
        'CERT_EDWOSB',
        
        #SBA Certified HUBZone Firm - How can we use this?
        # Evaluated Preference makes all non-HUBZone entities’ bid prices up 10% when evaluating
        'CERT_HUBZone']

In [127]:
test_2022 = pd.read_parquet('2022.parquet', engine='pyarrow',columns=cols)

In [137]:
test_2020 = pd.read_parquet('2020.parquet', engine='pyarrow',columns=cols)

In [134]:
filtered_data.shape

(4692742, 43)

In [138]:
test_2020[['VENDOR_UEI','ULTIMATE_UEI','CAGE_CODE']][0:5]

Unnamed: 0,VENDOR_UEI,ULTIMATE_UEI,CAGE_CODE
0,,,1XUY5
1,,,1R074
2,,,6D0X3
3,,,8ATX9
4,,,1CR65


In [131]:
same_values_count = (filtered_data['VENDOR_UEI'] == filtered_data['ULTIMATE_UEI']).sum()
same_values_count

2355971

In [124]:
filtered_data['TYPE_OF_SET_ASIDE'].value_counts()

TYPE_OF_SET_ASIDE
NONE        3842742
SBA          748902
SDVOSBC       31359
8A            28879
HZC           20196
SBP           11634
WOSB           5384
8AN            1619
ISBEE           549
SDVOSBS         426
VSA             350
EDWOSB          232
BI              169
WOSBSS          109
IEE              86
HZS              85
EDWOSBSS         12
RSB               7
HMT               1
HMP               1
Name: count, dtype: int64

In [37]:
new_data['EXTENT_COMPETED'].value_counts()

EXTENT_COMPETED
A    3274938
D     701452
E          6
Name: count, dtype: int64

In [56]:
cleaned_data['NUMBER_OF_OFFERS_RECEIVED'].value_counts()

NUMBER_OF_OFFERS_RECEIVED
22.0     188162
5.0       59647
1.0       49425
999.0     42675
3.0       30677
          ...  
150.0         1
130.0         1
173.0         1
53.0          1
48.0          1
Name: count, Length: 79, dtype: int64

In [57]:
cleaned_data['IDV_NUMBER_OF_OFFERS'].value_counts()

IDV_NUMBER_OF_OFFERS
22.0     188498
999.0     83746
5.0       57450
29.0      28375
10.0      23929
          ...  
111.0         1
106.0         1
600.0         1
120.0         1
94.0          1
Name: count, Length: 156, dtype: int64

In [13]:
pd.set_option('display.max_rows', None)

In [141]:
filtered_data['EVALUATED_PREFERENCE'].value_counts()

EVALUATED_PREFERENCE
NONE    4682257
HZE         642
SPS           3
Name: count, dtype: int64

### Notes:
**EXTENT_COMPETED**
1. A - **Full and Open Competition**
   - Report this code if the action resulted from an award pursuant to FAR 6.102(a) - sealed bid, FAR 6.102(b) - competitive proposal, FAR 6.102(c) - Combination, or any other competitive method that did not exclude sources of any type
2. B - **Not Available for Competition**
   - Select this code when the contract is not available for competition
3. C - **Not Competed**
   - Select this code when the contract is not competed.
4. D - **Full and Open Competition after exclusion of sources**
   - Select this code when some sources are excluded before competition
5. E - **Follow On to Competed Action**
   - Select this code when the action is a follow on to an existing competed contract. FAR 6.302-1. (Note: This is not applicable to Version 1.4/1.5 documents.)
6. F - **Competed under SAP**
   - Select this code when the action is competed under the Simplified Acquisition Procedures.
7. G - **Not Competed under SAP**
   - Select this code when the action is NOT competed under the Simplified Acquisition Procedures.
8. CDO - **Competitive Delivery Order**
   - Apply to Full and Open Competition pursuant to FAR 6.1 and only apply to Delivery Orders) Report this code if the IDV Type is a Federal Schedule. Report this code when the Order delivery/task order award was made pursuant to a process that permitted each contract awardee a fair opportunity to be considered. See FAR Part 16.505(b)(1). Report this code if the action is for the award of a multiple award schedule or an order against a multiple award schedule pursuant to FAR 6.102(d)(3) and the applicable provisions referenced there under. (Note: This is not applicable to Version 1.4/1.5 documents.)
9. NDO - **Non-Competitive Delivery Order**
    - Report this code when competitive procedures are not used in awarding the delivery order for a reason not included above (when the action was non-competitive). (Note: This is not applicable to Version 1.4/1.5 documents.)


**TYPE_OF_SET_ASIDE**
1. NONE - **No set aside used**
   - Report this code if the contract award was not a socio-economic program set-aside.
3. SBA - **Small Business Set Aside - Total**
   - Report this code for a small business set aside.      
5. SDVOSBC - **Service-Disabled Veteran-Owned Small Business Set-Aside**
   - Report this code for contract awards exceeding the micro purchase threshold set-aside for competition restricted to service-disabled
     veteran-owned small business concerns. 
7. 8A - **8A Competed**
   - Report this code for an 8a award pursuant to            
9. HZC - **HUBZone Set-Aside**
    - Report this code if the award was made to a HUBZone small business concern after being competed among only HUBZone small business
      concerns      
11. SBP - **Small Business Set-Aside -- Partial**
    - Report this code for a partial small business set aside made pursuant to FAR 19.502-3.
12. ESB - **Emerging Small Business Set-Aside**
    - Enter this code if the action was an emerging small business reserve award (set-aside) within a designated industry group under the
      Small Business Competitiveness Demonstration Program
13. WOSB - **Women-Owned Small Business**
    - Report this code for a Women-Owned Small Business Set-Aside made pursuant to FAR 19.15. This code is only valid for actions signed on
      or after 4/01/2011.
15. 8AN - **8(a) Sole Source**
    - Report this code for actions with the Small Business Administration pursuant to FAR 19.8, or directly with an 8(a) contractor pursuant
      to a memorandum of understanding between the SBA and the reporting agency per 219.811   
16. ISBEE - **Indian Small Business Economic Enterprise**
    - Report this code if applicable. This value can only be used by the Department of Interior. The start date is 7/8/2013.      
17. SDVOSBS - **Service-Disabled Veteran-Owned Small Business Set-Aside**
    - Report this code for contract awards exceeding the micro purchase threshold set-aside for competition restricted to service-disabled
      veteran-owned small business concerns.
18. VSA - **Veteran Set Aside**
    - Report this code for actions with Veteran-Owned Businesses             
20. EDWOSB - **Economically Disadvantaged Women-Owned Small Business**
    - Report this code for an Economically Disadvantaged Women-Owned Small Business Set-Aside made pursuant to FAR 19.15. This code is only
      valid for actions signed on or after 4/01/2011.       
22. BI - **Buy Indian**
    - Report this code if applicable. As of 7/8/2013 this value can no longer be used by the Department of Interior. This code can still be
      used by the Indian Health Service of the Department of Health and Human Services.           
23. WOSBSS - **Women-Owned Small Business Sole Source**
    - Report this code for a Women-Owned Small Business Sole Source made pursuant to FAR 19.15. This code is only valid for actions signed
      on or after 11/13/2015.    
25. IEE - **Indian Economic Enterprise**
    - Report this code if applicable. This value can only be used by the Department of Interior. The start date is 7/8/2013.           
26. HZS - **HUBZone Sole Source**
    - Report this code if the award was made through noncompetitive procedures to a HUBZone small business concern under the authority of
      FAR 19.1306.            
27. EDWOSBSS - **Economically Disadvantaged Women-Owned Small Business Sole Source**
    - Report this code for an Economically Disadvantaged Women-Owned Small Business Sole Source made pursuant to FAR 19.15. This code is
      only valid for actions signed on or after 11/13/2015.       
29. RSB - **Reserved for Small Business**
    - Report this code if awards exceeds the micro-purchase threshold but does not exceed the simplified acquisition threshold. FAR 13. This
      code is valid until 10/31/2009               
30. HMT - **HBCU or MI Set-Aside -- Total**
    - Report this code for an award directed to a Historically Black College/University or Minority Institution          
31. HMP - **HBCU or MI Set-Aside -- Partial**
    - Report this code for contract awards directed to Historically Black College/University of Minority Institution per DFARS 235.016
32. SDVOSBS - **SDVOSB Sole Source**
    - Report this code for contract awards exceeding the micro purchase threshold set-aside for non competitive award to service-disabled
      veteran-owned small business concerns.
33. VSS - **Veteran Sole Source**
    - Report this code for actions with Veteran-Owned Businesses.


**EVALUATED_PREFERENCE**

1. NONE - **No Preference used**
2. HZE - **HUBZone Price Evaluation**
   - Report this code if the award was made through full and open competition with award to a HUBZone small business concern as a result of
     the HUBZone price evaluation preference. See FAR 19.1307
3. SPS - **SDB Preferential Consideration Partial SB Set Aside**
   - Report this code if the award was made through Full and Open Competition after Exclusion of Sources pursuant to FAR 6.2. Enter this
     code if the action was a partial set-aside for small business and - Partial SB Set-Aside preferential consideration resulted in an
     award to an SDB.
4. SDA - **SDB Price Evaluation**
   - Report this code for an award made through full and open competition with award to a small disadvantaged business as a result of the
     application of a price evaluation adjustment pursuant to FAR 19.11.
5. HSD - **Combined HUB/SDB Preference**
   - Report this code if the award was made through full and open competition after the application of both the HUBZone Price Evaluation
     Adjustment Preference (FAR 19.13) and the Small Disadvantaged Business Price Evaluation Adjustment per FAR 19.11

### Feature engineering

In [95]:
# PLACE_OF_MANUFACTURE
# PLACE_OF_MANUFACTURE_CLASS
# Make NaN into N/A
filtered_data['PLACE_OF_MANUFACTURE'] = filtered_data['PLACE_OF_MANUFACTURE'].fillna('N/A')
# create new  column 
# YES - made in US
# NO - not made in US
# NONE - Not selling a product 

def convert_place_of_manufacture(value):
    if value == 'D':
        return 'YES'
    elif value == 'C':
        return 'NO'
    elif value in ['N/A', 'A', 'G', 'E', 'H', 'L', 'J', 'F', 'K', 'B', 'I']:
        return 'NONE'
    else:
        return 'NONE'

filtered_data['PLACE_OF_MANUFACTURE_CLASS'] = filtered_data['PLACE_OF_MANUFACTURE'].apply(convert_place_of_manufacture)


### Transform 

In [31]:
# zip code was in long format, change to standard 5 digits 
test_2022['VENDOR_ADDRESS_ZIP_CODE'] = test_2022['VENDOR_ADDRESS_ZIP_CODE'].astype(str).str[:5]

In [114]:
# assume missing values (not that many) are NONE
test_2022['TYPE_OF_SET_ASIDE'] = test_2022['TYPE_OF_SET_ASIDE'].fillna('NONE')

### Filter

In [129]:
# filter for only small businesses 
filtered_data = test_2022[test_2022['CO_BUS_SIZE_DETERMINATION'] == "SMALL BUSINESS"]
# first time the contract showed up 
filtered_data = test_2022[test_2022['MODIFICATION_NUMBER'] == "0"]
# only US businesses 
filtered_data = test_2022[test_2022['VENDOR_ADDRESS_COUNTRY_NAME'] == "UNITED STATES"]
# contracts only open to competition
filtered_data = filtered_data[filtered_data['EXTENT_COMPETED'].isin(["A", "D", "E", "CDO"])]

### Drop 

In [116]:
filtered_data = filtered_data.drop('CO_BUS_SIZE_DETERMINATION', axis=1)
filtered_data = filtered_data.drop('MODIFICATION_NUMBER', axis=1)
filtered_data = filtered_data.drop('VENDOR_ADDRESS_COUNTRY_NAME', axis=1)

### Drop NA - after everything else is done 

- clean first -> check for NAs

In [87]:
test_2022 = test_2022.dropna(subset=['VENDOR_ADDRESS_ZIP_CODE'])