In [1]:
import polars as pl
import pandas as pd
import plotly.express as px
import numpy as np

# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this hides chained-assignment mistakes instead of fixing them — confirm it is still needed.
pd.options.mode.chained_assignment = None

In [2]:
# Candidate columns, grouped by the role they play in the analysis.
# NOTE(review): `cols` is reassigned further down from the de-duplicated `columns` list —
# confirm which of the two lists is meant to drive the parquet read.
cols = [# RESPONSE VARIABLE
        #'FUNDING_AGENCY_NAME',
        'FUNDING_OFFICE_NAME', # use for main model
        #'FUNDING_DEPARTMENT_NAME',

        # IDENTIFIERS
        'PIID',
        'IDV_PIID', # Indefinite delivery vehicle procurement instrument identifiers
        'FUNDING_DEPARTMENT_ID',
        # Funding Department ID + IDV PIID + PIID concatenated should give a unique ID for a contract
        # null IDV PIID + PIID gives the non-IDV contracts
        'CAGE_CODE', # Code for the entity
        # CAGE Code has been used for the entire dataset so it can act as a unique entity ID
        # Unique Entity ID
        #'VENDOR_UEI', # these two don't match about half the time
        #'ULTIMATE_UEI', # check previous years - DUNS before 2022
        'SOLICITATION_ID',

        # OTHER
        'AWARD_FISCAL_YEAR',
        'VENDOR_ADDRESS_ZIP_CODE', # use only first 5 digits, drop NAs; column with the fewest NAs
        'TYPE_OF_SET_ASIDE', # Type of Set Aside determined for the contract action (10N in dictionary)
        'DATE_SIGNED', # The date that a mutually binding agreement was reached, e.g. "2022-09-15"
        'EVALUATED_PREFERENCE', #------------
        # Evaluated Preference makes all non-HUBZone entities' bid prices up 10% when evaluating

        # Number of Offers
        'NUMBER_OF_OFFERS_RECEIVED', # a lot of NaNs
        # The number of actual offers/bids received in response to the solicitation.
        'IDV_NUMBER_OF_OFFERS',
        'AWARD_OR_IDV', # award or IDV - mostly Award

        # FILTERING
        'CO_BUS_SIZE_DETERMINATION', # == "SMALL BUSINESS" / "OTHER THAN SMALL BUSINESS"
        'VENDOR_ADDRESS_COUNTRY_NAME', # == USA filter
        'MODIFICATION_NUMBER', # filtering == 0
        'EXTENT_COMPETED', # see notes, possibly filter by A,D,E,CDO

        # CONTRACT VALUE
        #'ULTIMATE_CONTRACT_VALUE', # Mutually agreed upon TOTAL contract value including all options
        #(also includes estimated value of all potential orders for IDVs)
        'DOLLARS_OBLIGATED',

        # NAICS
        'PRINCIPAL_NAICS_CODE', # 'number'; NAICS codes are updated every 5 years (2012, 2017, 2022) - inconsistencies?
        # The first two digits of the code designate the sector, the third digit designates the subsector,
        # the fourth digit designates the industry group, the fifth digit designates the NAICS industry,
        # and the sixth digit designates the national industry.

        ## GENERAL BUSINESS QUESTIONS
        'CORP_ENTITY_NOT_TAX_EXEMPT', #YES/NO, no nan
        'CORP_ENTITY_TAX_EXEMPT', #YES/NO, no nan
        'LIMITED_LIABILITY_CORPORATION', #YES/NO, no nan
        'PARTNERSHIP_OR_LLP', #YES/NO, no nan
        'SOLE_PROPREITORSHIP', #YES/NO, no nan
        'SMALL_AGRICULTURAL_COOPERATIVE', #YES/NO, no nan
        'INTERNATIONAL_ORGANIZATION',  #YES/NO, no nan
        'ARCHITECTURE_AND_ENGINEERING', #YES/NO, no nan
        'COMMUNITY_CORP_OWNED_FIRM', #YES/NO, no nan
        'CONSTRUCTION_FIRM', #YES/NO, no nan
        'DOMESTIC_SHELTER', #YES/NO, no nan
        'FOUNDATION', #YES/NO, no nan
        'RESEARCH_AND_DEVELOPMENT', #YES/NO, no nan
        'VETERINARY_HOSPITAL', #YES/NO, no nan
        'HISPANIC_SERVICING_INSTITUTION', #YES/NO, no nan
        'FOR_PROFIT_ORGANIZATION', #YES/NO, no nan
        'EDUCATIONAL_INSTITUTION_FLAG', #YES/NO, no nan
        'MANUFACTURER_OF_GOODS', #YES/NO, no nan
        'SERVICE_PROVIDER', #YES/NO, no nan
        'INDIAN_TRIBE', #YES/NO, no nan # Buy Indian

        # MANUFACTURE - is the product you're selling made in the US
        # YES, NO, Not a manufactured end product
        # 'PLACE_OF_MANUFACTURE_CLASS'
        'PLACE_OF_MANUFACTURE',

        # NON-CERTIFIED
        # Veteran-Owned Business
        'VETERAN_OWNED_FLAG',
        # Service-Disabled Veteran-Owned Business
        'SRDVOB_FLAG',
        # Women-Owned Business
        'WOMEN_OWNED_FLAG',
        'ANNUAL_REVENUE',
        'NUMBER_OF_EMPLOYEES']

In [23]:
# Full list of columns to read. It was assembled by concatenating two selections,
# so several names appear more than once; duplicates are removed below.
columns = [
    'CONTRACTING_OFFICE_NAME', 'SOLICITATION_ID', 'EVALUATED_PREFERENCE', 'PLACE_OF_MANUFACTURE', 
    'IDV_PIID', 'CAGE_CODE', 'FUNDING_DEPARTMENT_ID', 'DOLLARS_OBLIGATED', 'EXTENT_COMPETED', 
    'VENDOR_UEI', 'TYPE_OF_SET_ASIDE', 'VENDOR_ADDRESS_ZIP_CODE', 'VENDOR_ADDRESS_COUNTRY_NAME', 
    'EDUCATIONAL_INSTITUTION_FLAG', 'FIRM_8A_FLAG', 'WOMEN_OWNED_FLAG', 'FIRM8A_JOINT_VENTURE', 
    'FEDERALLY_FUNDED_R_AND_D_CORP', 'CORP_ENTITY_NOT_TAX_EXEMPT', 'CORP_ENTITY_TAX_EXEMPT', 
    'PARTNERSHIP_OR_LLP', 'SOLE_PROPREITORSHIP', 'SMALL_AGRICULTURAL_COOPERATIVE', 
    'INTERNATIONAL_ORGANIZATION', 'ARCHITECTURE_AND_ENGINEERING', 'COMMUNITY_CORP_OWNED_FIRM', 
    'CONSTRUCTION_FIRM', 'DOMESTIC_SHELTER', 'FOUNDATION', 'MANUFACTURER_OF_GOODS', 
    'RESEARCH_AND_DEVELOPMENT', 'SERVICE_PROVIDER', 'VETERINARY_HOSPITAL', 
    'HISPANIC_SERVICING_INSTITUTION', 'LIMITED_LIABILITY_CORPORATION', 
    'IDV_CONTRACTING_AGENCY_NAME', 'IDV_EXTENT_COMPETED', 'IDV_EVALUATED_PREFERENCE', 
    'IDV_SIGNED_DATE', 'PART8_OR_PART13', 'AWARD_FISCAL_YEAR', 'PRODUCT_OR_SERVICE_TYPE', 
    'TOTAL_ESTIMATED_ORDER_VALUE', 'FOR_PROFIT_ORGANIZATION', 'DOT_CERTIFIED_DISADV_BUS', 'SDB', 
    'CO_BUS_SIZE_DETERMINATION', 'PRINCIPAL_NAICS_CODE', 'PIID', 'FUNDING_OFFICE_NAME', 'PIID', 
    'IDV_PIID', 'FUNDING_DEPARTMENT_ID', 'CAGE_CODE', 'SOLICITATION_ID', 'AWARD_FISCAL_YEAR', 
    'VENDOR_ADDRESS_ZIP_CODE', 'TYPE_OF_SET_ASIDE', 'DATE_SIGNED', 'EVALUATED_PREFERENCE', 
    'NUMBER_OF_OFFERS_RECEIVED', 'IDV_NUMBER_OF_OFFERS', 'AWARD_OR_IDV', 'CO_BUS_SIZE_DETERMINATION', 
    'VENDOR_ADDRESS_COUNTRY_NAME', 'MODIFICATION_NUMBER', 'EXTENT_COMPETED', 'DOLLARS_OBLIGATED', 
    'PRINCIPAL_NAICS_CODE', 'CORP_ENTITY_NOT_TAX_EXEMPT', 'CORP_ENTITY_TAX_EXEMPT', 
    'LIMITED_LIABILITY_CORPORATION', 'PARTNERSHIP_OR_LLP', 'SOLE_PROPREITORSHIP', 
    'SMALL_AGRICULTURAL_COOPERATIVE', 'INTERNATIONAL_ORGANIZATION', 'ARCHITECTURE_AND_ENGINEERING', 
    'COMMUNITY_CORP_OWNED_FIRM', 'CONSTRUCTION_FIRM', 'DOMESTIC_SHELTER', 'FOUNDATION', 
    'MANUFACTURER_OF_GOODS', 'RESEARCH_AND_DEVELOPMENT', 'SERVICE_PROVIDER', 'VETERINARY_HOSPITAL', 
    'HISPANIC_SERVICING_INSTITUTION', 'FOR_PROFIT_ORGANIZATION', 'PLACE_OF_MANUFACTURE', 
    'VETERAN_OWNED_FLAG', 'SRDVOB_FLAG', 'WOMEN_OWNED_FLAG', 'ANNUAL_REVENUE', 'NUMBER_OF_EMPLOYEES'
]

# Eliminate duplicates while keeping first-occurrence order.
# dict.fromkeys preserves insertion order, whereas list(set(...)) yields an order that
# depends on string hashing and can differ between kernel sessions, which would make
# the DataFrame column order (and everything derived from it) non-reproducible.
cols = list(dict.fromkeys(columns))


In [4]:
%%time
# Read only the columns listed in `cols` from fy2023.parquet (path is relative to the working directory).
df0 = pd.read_parquet('fy2023.parquet', columns=cols)

CPU times: user 11.3 s, sys: 1.46 s, total: 12.7 s
Wall time: 7.62 s


In [6]:
df0.shape

(5988988, 59)

In [7]:
# Read the SAM extract from a path relative to the working directory.
# NOTE(review): in the visible cells SAM is only used by a .shape call and a commented-out merge —
# confirm it is still needed, and consider a configurable data directory instead of this
# machine-specific relative path.
SAM = pd.read_csv('../../Documents/College/Capstone/SBA-Capstone/SAM.csv')

In [8]:
SAM.shape

(1194092, 15)

In [34]:
# Row filter: small businesses, US vendor address, EXTENT_COMPETED in A / D / E / CDO.
# The three conditions are combined into one mask and only the surviving rows are
# copied, instead of copying every row of df0 up front and filtering three times.
keep = (
    (df0['CO_BUS_SIZE_DETERMINATION'] == "SMALL BUSINESS")
    & (df0['VENDOR_ADDRESS_COUNTRY_NAME'] == "UNITED STATES")
    & df0['EXTENT_COMPETED'].isin(["A", "D", "E", "CDO"])
)
df = df0.loc[keep].copy()

# Strip surrounding whitespace from the identifier columns.
for id_col in ['FUNDING_DEPARTMENT_ID', 'IDV_PIID', 'PIID']:
    df[id_col] = df[id_col].str.strip()

# Unparsable amounts become NaN.
df['DOLLARS_OBLIGATED'] = pd.to_numeric(df['DOLLARS_OBLIGATED'], errors='coerce')
# m_2023 = pd.merge(df, SAM, on="CAGE_CODE", how="inner")

# Keep one row per (SOLICITATION_ID, CAGE_CODE): the row with the largest DOLLARS_OBLIGATED.
# Rows without a usable amount are excluded before idxmax, because a group whose amounts
# are all NaN would otherwise produce a NaN label (or raise, depending on the pandas
# version) and break the .loc lookup below.
# Note: groupby uses dropna=True by default, so rows with a missing SOLICITATION_ID or
# CAGE_CODE do not form groups and are not carried forward.
has_amount = df['DOLLARS_OBLIGATED'].notna()
idx = df[has_amount].groupby(['SOLICITATION_ID', 'CAGE_CODE'])['DOLLARS_OBLIGATED'].idxmax()
df = df.loc[idx]
#df = df[df['DOLLARS_OBLIGATED'] > 0]

In [35]:
df.shape

(79829, 59)

## Feature Engineering

In [None]:
# NOTE(review): this cell has no execution count in the saved notebook, and the row counts
# recorded in later cells look consistent with it not having been run — confirm whether
# these caps are meant to apply before doing a full re-run.
revenue_ok = df['ANNUAL_REVENUE'] < 100000000      # annual revenue below 100,000,000
headcount_ok = df['NUMBER_OF_EMPLOYEES'] < 1000    # fewer than 1,000 employees
df = df[revenue_ok & headcount_ok]
df.shape

In [None]:
# Scatter plot of NUMBER_OF_EMPLOYEES (x) against ANNUAL_REVENUE (y) for the rows currently in df.
px.scatter(df, x='NUMBER_OF_EMPLOYEES', y='ANNUAL_REVENUE')

## Model

In [36]:
# PLACE_OF_MANUFACTURE -> PLACE_OF_MANUFACTURE_CLASS
# Replace missing PLACE_OF_MANUFACTURE values with the placeholder string 'N/A'.
df['PLACE_OF_MANUFACTURE'] = df['PLACE_OF_MANUFACTURE'].fillna('N/A')
# The derived column PLACE_OF_MANUFACTURE_CLASS takes one of three values:
#   YES  - made in US
#   NO   - not made in US
#   NONE - not selling a product

def convert_place_of_manufacture(value):
    """Collapse a PLACE_OF_MANUFACTURE code into 'YES', 'NO' or 'NONE'.

    'D' maps to 'YES' and 'C' maps to 'NO'. Every other value, including the
    'N/A' placeholder and any code not listed explicitly, maps to 'NONE'.
    """
    if value == 'D':
        return 'YES'
    return 'NO' if value == 'C' else 'NONE'

# Derive the YES / NO / NONE class for every row from its PLACE_OF_MANUFACTURE code.
df['PLACE_OF_MANUFACTURE_CLASS'] = df['PLACE_OF_MANUFACTURE'].apply(convert_place_of_manufacture)


In [37]:
# ZIP codes arrive in long format; keep the standard 5-digit prefix.
# Missing values are kept as NaN: astype(str) alone would turn them into the literal
# string 'nan', which later dropna() calls cannot see and which would then show up as a
# bogus ZIP value.
zip_raw = df['VENDOR_ADDRESS_ZIP_CODE']
df['VENDOR_ADDRESS_ZIP_CODE'] = zip_raw.astype(str).str[:5].where(zip_raw.notna())

In [38]:
# Assume a missing TYPE_OF_SET_ASIDE (there are not many) means no set-aside: fill with 'NONE'.
df['TYPE_OF_SET_ASIDE'] = df['TYPE_OF_SET_ASIDE'].fillna('NONE')

In [39]:
# Fill missing EVALUATED_PREFERENCE values with the string 'NONE'.
df['EVALUATED_PREFERENCE'] = df['EVALUATED_PREFERENCE'].fillna('NONE')

In [40]:
# Drop rows where CORP_ENTITY_NOT_TAX_EXEMPT is missing.
df = df.dropna(subset=['CORP_ENTITY_NOT_TAX_EXEMPT'])

In [41]:
# Drop rows where FUNDING_OFFICE_NAME (the response variable used below) is missing.
df = df.dropna(subset=['FUNDING_OFFICE_NAME'])

In [57]:
# The raw code column is no longer needed once PLACE_OF_MANUFACTURE_CLASS exists.
final_df = df.drop(columns=['PLACE_OF_MANUFACTURE'])
final_df.shape

(78846, 59)

In [58]:
# Keep only funding offices that occur more than once in final_df.
office_counts = final_df['FUNDING_OFFICE_NAME'].value_counts()
offices_bool = office_counts.gt(1)
offices = offices_bool.index[offices_bool.to_numpy()]
in_frequent_office = final_df['FUNDING_OFFICE_NAME'].isin(offices)
final_df = final_df[in_frequent_office]
final_df.shape

(77309, 59)

In [59]:
# #final_df = final_df[final_df['DOLLARS_OBLIGATED']>10000]
# final_df['DOLLARS_OBLIGATED'] = np.log(final_df['DOLLARS_OBLIGATED'])  # TODO: replace with the hyperbolic transform Terry suggested (possibly inverse hyperbolic sine, np.arcsinh — confirm)
# final_df.shape

In [74]:
# frac=1 keeps every row, so this is a shuffle of final_df; random_state fixes the row order.
final_df_sample = final_df.sample(frac=1, random_state=67)
final_df_sample.shape

(77309, 59)

In [75]:
# Coarsen two high-cardinality categorical columns before one-hot encoding:
# keep only the first character of the ZIP code ...
final_df_sample['VENDOR_ADDRESS_ZIP_CODE'] = final_df_sample['VENDOR_ADDRESS_ZIP_CODE'].str[:1]
# ... and the first two characters of the NAICS code (the sector, per the NAICS notes near the top of the notebook).
final_df_sample['PRINCIPAL_NAICS_CODE'] = final_df_sample['PRINCIPAL_NAICS_CODE'].str[:2]

In [76]:
from sklearn.preprocessing import StandardScaler

# Response (FUNDING_OFFICE_NAME) plus candidate predictors; any row with a missing value
# in one of these columns is dropped.
Xy = final_df_sample[['FUNDING_OFFICE_NAME',
 'EVALUATED_PREFERENCE',
 'PLACE_OF_MANUFACTURE_CLASS', #instead of PLACE_OF_MANUFACTURE
 #'DOLLARS_OBLIGATED',
 'EXTENT_COMPETED',
 'TYPE_OF_SET_ASIDE',
 'VENDOR_ADDRESS_ZIP_CODE',
 'EDUCATIONAL_INSTITUTION_FLAG',
 'FIRM_8A_FLAG',
 'WOMEN_OWNED_FLAG',
 'FIRM8A_JOINT_VENTURE',
 'FEDERALLY_FUNDED_R_AND_D_CORP',
 'CORP_ENTITY_NOT_TAX_EXEMPT',
 'CORP_ENTITY_TAX_EXEMPT',
 'PARTNERSHIP_OR_LLP',
 'SOLE_PROPREITORSHIP',
 'SMALL_AGRICULTURAL_COOPERATIVE',
 'INTERNATIONAL_ORGANIZATION',
 'ARCHITECTURE_AND_ENGINEERING',
 'COMMUNITY_CORP_OWNED_FIRM',
 'CONSTRUCTION_FIRM',
 'DOMESTIC_SHELTER',
 'FOUNDATION',
 'MANUFACTURER_OF_GOODS',
 'RESEARCH_AND_DEVELOPMENT',
 'SERVICE_PROVIDER',
 'VETERINARY_HOSPITAL',
 'HISPANIC_SERVICING_INSTITUTION',
 'LIMITED_LIABILITY_CORPORATION',
 'PRODUCT_OR_SERVICE_TYPE',
 'FOR_PROFIT_ORGANIZATION',
 'DOT_CERTIFIED_DISADV_BUS',
 'SDB',
 'PRINCIPAL_NAICS_CODE',
    'ANNUAL_REVENUE', 'NUMBER_OF_EMPLOYEES'
]].dropna()
X = Xy.drop('FUNDING_OFFICE_NAME', axis=1)

# Standardise the two numeric predictors, each with its OWN scaler so that both fitted
# objects stay usable afterwards (previously `scaler` was refitted on ANNUAL_REVENUE,
# discarding the employee statistics and leaving `scaler2` unfitted).
# NOTE(review): both scalers are fitted on all rows, i.e. before the train/test split
# performed further down — confirm this is acceptable for the reported accuracy.
scaler = StandardScaler()
X['NUMBER_OF_EMPLOYEES'] = scaler.fit_transform(
    np.asarray(X['NUMBER_OF_EMPLOYEES']).reshape(-1, 1)
).ravel()
scaler2 = StandardScaler()
X['ANNUAL_REVENUE'] = scaler2.fit_transform(
    np.asarray(X['ANNUAL_REVENUE']).reshape(-1, 1)
).ravel()

In [77]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Response variable: the funding office name for each row kept in Xy (still as raw labels here).
y = Xy['FUNDING_OFFICE_NAME']
# Label encoding is left disabled in this cell:
# enc = LabelEncoder()
# y = enc.fit_transform(y)


In [78]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import LabelEncoder

# Predictors are one-hot encoded (every level kept); the response is turned into integer codes.
X = pd.get_dummies(X, drop_first=False)
enc = LabelEncoder()
y = enc.fit_transform(y)

# Fit an L1-penalised linear model and keep the features whose coefficient is not exactly zero.
# NOTE(review): Lasso is a regression model, so it treats the integer class codes produced by
# LabelEncoder as a numeric target — confirm this is an intended heuristic for feature selection.
lasso = Lasso(alpha=0.9)  # alpha controls the strength of the L1 penalty; adjust as needed
lasso.fit(X, y)

coefficients = lasso.coef_
non_zero_indices = np.flatnonzero(coefficients).tolist()
non_zero_columns = X.columns[non_zero_indices]

# Report the surviving feature names
print("Columns not pushed to zero by Lasso regression:")
print(non_zero_columns)

Columns not pushed to zero by Lasso regression:
Index(['ANNUAL_REVENUE', 'NUMBER_OF_EMPLOYEES',
       'PLACE_OF_MANUFACTURE_CLASS_NONE', 'PLACE_OF_MANUFACTURE_CLASS_YES',
       'EXTENT_COMPETED_A', 'EXTENT_COMPETED_D', 'TYPE_OF_SET_ASIDE_HZC',
       'TYPE_OF_SET_ASIDE_ISBEE', 'TYPE_OF_SET_ASIDE_SBA',
       'TYPE_OF_SET_ASIDE_SBP', 'TYPE_OF_SET_ASIDE_SDVOSBC',
       'VENDOR_ADDRESS_ZIP_CODE_0', 'VENDOR_ADDRESS_ZIP_CODE_1',
       'VENDOR_ADDRESS_ZIP_CODE_2', 'VENDOR_ADDRESS_ZIP_CODE_3',
       'VENDOR_ADDRESS_ZIP_CODE_4', 'VENDOR_ADDRESS_ZIP_CODE_6',
       'VENDOR_ADDRESS_ZIP_CODE_7', 'VENDOR_ADDRESS_ZIP_CODE_8',
       'VENDOR_ADDRESS_ZIP_CODE_9', 'WOMEN_OWNED_FLAG_NO',
       'MANUFACTURER_OF_GOODS_NO', 'MANUFACTURER_OF_GOODS_YES',
       'LIMITED_LIABILITY_CORPORATION_NO', 'LIMITED_LIABILITY_CORPORATION_YES',
       'PRODUCT_OR_SERVICE_TYPE_P', 'PRODUCT_OR_SERVICE_TYPE_S',
       'FOR_PROFIT_ORGANIZATION_NO', 'FOR_PROFIT_ORGANIZATION_YES', 'SDB_NO',
       'SDB_YES', 'PRINCIPAL

In [79]:
# X was already one-hot encoded in the Lasso cell, so a second get_dummies pass has nothing
# left to encode; the frame is reused as-is.
X_dum = X

# Select the features the Lasso fit kept (non-zero coefficients) directly from its result,
# rather than from a list copied by hand out of the printed output, which silently goes
# stale whenever the data, the response variable or alpha changes.
xfinal = X_dum.loc[:, non_zero_columns]

# xfinal = X[['PLACE_OF_MANUFACTURE', 'DOLLARS_OBLIGATED', 'EXTENT_COMPETED',
#        'TYPE_OF_SET_ASIDE', 'VENDOR_ADDRESS_ZIP_CODE', 'FIRM_8A_FLAG',
#        'WOMEN_OWNED_FLAG', 'CORP_ENTITY_NOT_TAX_EXEMPT', 'PARTNERSHIP_OR_LLP',
#        'MANUFACTURER_OF_GOODS', 'LIMITED_LIABILITY_CORPORATION',
#        'IDV_EXTENT_COMPETED', 'IDV_EVALUATED_PREFERENCE', 'PART8_OR_PART13',
#        'TOTAL_ESTIMATED_ORDER_VALUE', 'DOT_CERTIFIED_DISADV_BUS', 'SDB',
#        'PRINCIPAL_NAICS_CODE', #'ANNUAL_REVENUE', 'NUMBER_OF_EMPLOYEES'
#            ]]

In [80]:
%%time
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# A prediction counts as a hit when the true class is among the TOP_K most probable classes.
TOP_K = 5

# 70/30 train/test split. The seed is fixed so that the split, and therefore the reported
# accuracy, is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    xfinal, y, train_size=0.7, random_state=67
)

# Multinomial (softmax) logistic regression.
# NOTE(review): the recorded output shows lbfgs stopping at max_iter without converging —
# consider raising max_iter if the runtime allows.
log_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=200)
log_reg.fit(X_train, y_train)

# Class probabilities for the held-out rows; columns follow log_reg.classes_.
probabilities = log_reg.predict_proba(X_test)

# argsort is ascending, so the last TOP_K columns are the most probable classes per row.
top_k_indices = np.argsort(probabilities, axis=1)[:, -TOP_K:]
predicted_labels = log_reg.classes_[top_k_indices]

# Row-wise check: does the true label appear among the TOP_K predicted labels?
accurate_predictions = np.any(predicted_labels == np.asarray(y_test)[:, np.newaxis], axis=1)

# Top-k accuracy = share of test rows whose true label was among the TOP_K predictions.
accuracy = np.mean(accurate_predictions)
print("Accuracy:", accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.42890027797081304
CPU times: user 17min 3s, sys: 7min, total: 24min 3s
Wall time: 5min 51s


**With Funding Office**

43.29% top-5 accuracy

**With Contracting Office**

43.21% top-5 accuracy

52.23% top-10 accuracy

73.61% top-50 accuracy