In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle

In [2]:
# ---- Notebook configuration -------------------------------------------------
# seed: random_state used for the train/test split and for every random forest.
seed = 2000
# protected: when True the two protected columns (ANNUAL_REVENUE, NUMBER_OF_EMPLOYEES)
# are added to the import/model column lists and import_dataset() reads
# 'fy<year>.parquet' instead of '<year>.parquet'.
protected = False
# sam: whether to merge the contract data with the SAM dataset
# (see the `sam` argument of import_dataset()).
sam = False
# Columns read from each yearly parquet file. Commented-out names are columns
# that are currently not imported.
import_cols = [
    'RESEARCH_AND_DEVELOPMENT',
    'DOMESTIC_SHELTER',
    'TYPE_OF_SET_ASIDE',
    'SOLICITATION_ID',
    'CONSTRUCTION_FIRM',
    'CO_BUS_SIZE_DETERMINATION',
    'CAGE_CODE',
    'VETERAN_OWNED_FLAG',
    'CORP_ENTITY_NOT_TAX_EXEMPT',
    'FUNDING_DEPARTMENT_ID',
    # 'FUNDING_AGENCY_NAME',
    'FUNDING_AGENCY_ID',
    # 'FUNDING_OFFICE_NAME',
    'FUNDING_OFFICE_ID',
    'SERVICE_PROVIDER',
    'PRODUCT_OR_SERVICE_TYPE',
    # 'MODIFICATION_NUMBER',
    # 'PIID',
    'FOUNDATION',
    # 'EVALUATED_PREFERENCE',
    'SRDVOB_FLAG',
    'CORP_ENTITY_TAX_EXEMPT',
    'MANUFACTURER_OF_GOODS',
    'VENDOR_ADDRESS_COUNTRY_NAME',
    'VENDOR_ADDRESS_ZIP_CODE',
    'SDB',
    'VETERINARY_HOSPITAL',
    'COMMUNITY_CORP_OWNED_FIRM',
    'DOT_CERTIFIED_DISADV_BUS',
    'PRINCIPAL_NAICS_CODE',
    'EDUCATIONAL_INSTITUTION_FLAG',
    'LIMITED_LIABILITY_CORPORATION',
    'EXTENT_COMPETED',
    'FEDERALLY_FUNDED_R_AND_D_CORP',
    'SOLE_PROPREITORSHIP',
    'WOMEN_OWNED_FLAG',
    'ARCHITECTURE_AND_ENGINEERING',
    'HISPANIC_SERVICING_INSTITUTION',
    # 'IDV_PIID',
    'PLACE_OF_MANUFACTURE',
    # 'IDV_EXTENT_COMPETED',
    'AWARD_FISCAL_YEAR',
    # 'IDV_SIGNED_DATE',
    'FIRM_8A_FLAG',
    'SMALL_AGRICULTURAL_COOPERATIVE',
    'PARTNERSHIP_OR_LLP',
    'DOLLARS_OBLIGATED',
    # 'IDV_NUMBER_OF_OFFERS',
    'FOR_PROFIT_ORGANIZATION',
    # 'AWARD_OR_IDV',
    'FIRM8A_JOINT_VENTURE',
    # 'IDV_CONTRACTING_AGENCY_ID',
]
if protected:
    import_cols = import_cols + ['ANNUAL_REVENUE', 'NUMBER_OF_EMPLOYEES']


# Fiscal years to load; the additional years are currently disabled.
import_years = [2019]#, 2020, 2021, 2022, 2023]
# Minimum number of rows a response class needs to be kept by response_var().
min_class_size = 8

# Columns handed to preprocess_data(): the RESPONSE target plus the features.
# PLACE_OF_MANUFACTURE_CLASS is created in import_dataset() and
# CONTRACTS_PER_YEAR in the feature-engineering cells, so both must exist
# before preprocessing runs.
model_cols = [
    'RESPONSE',
    # 'FUNDING_AGENCY_ID',
    'RESEARCH_AND_DEVELOPMENT',
    'DOMESTIC_SHELTER',
    'TYPE_OF_SET_ASIDE',
    'CONSTRUCTION_FIRM',
    'VETERAN_OWNED_FLAG',
    'CORP_ENTITY_NOT_TAX_EXEMPT',
    'SERVICE_PROVIDER',
    'PRODUCT_OR_SERVICE_TYPE',
    # 'MODIFICATION_NUMBER',
    # 'PIID',
    'FOUNDATION',
    # 'EVALUATED_PREFERENCE',
    'SRDVOB_FLAG',
    'CORP_ENTITY_TAX_EXEMPT',
    'MANUFACTURER_OF_GOODS',
    'VENDOR_ADDRESS_ZIP_CODE',
    #'VENDOR_STATE', #FEATURE ENGINEERED BELOW
    'SDB',
    'VETERINARY_HOSPITAL',
    'COMMUNITY_CORP_OWNED_FIRM',
    'DOT_CERTIFIED_DISADV_BUS',
    'PRINCIPAL_NAICS_CODE',
    'EDUCATIONAL_INSTITUTION_FLAG',
    'LIMITED_LIABILITY_CORPORATION',
    'FEDERALLY_FUNDED_R_AND_D_CORP',
    'SOLE_PROPREITORSHIP',
    'WOMEN_OWNED_FLAG',
    'ARCHITECTURE_AND_ENGINEERING',
    'HISPANIC_SERVICING_INSTITUTION',
    # 'IDV_PIID',
    'PLACE_OF_MANUFACTURE_CLASS',
    # 'IDV_EXTENT_COMPETED',
    # 'AWARD_FISCAL_YEAR',
    # 'IDV_SIGNED_DATE',
    'FIRM_8A_FLAG',
    'SMALL_AGRICULTURAL_COOPERATIVE',
    'PARTNERSHIP_OR_LLP',
    # 'IDV_NUMBER_OF_OFFERS',
    'FOR_PROFIT_ORGANIZATION',
    # 'AWARD_OR_IDV',
    'FIRM8A_JOINT_VENTURE',
    # 'IDV_CONTRACTING_AGENCY_ID',
    'CONTRACTS_PER_YEAR'
]
if protected:
    model_cols = model_cols + ['ANNUAL_REVENUE', 'NUMBER_OF_EMPLOYEES']

# NOTE(review): min_importance, col_tree_depth and hard are not referenced in the
# visible part of this notebook; presumably they belong to the column-selection
# step whose result is loaded from a pickle further down - TODO confirm or remove.
min_importance = 0.015 #at 0.015 we finally get zip
col_tree_depth = 4
hard = False

# Random-forest hyperparameters and the "top n" cut-off used when scoring a
# single agency with train_forest().
n_trees = 100
max_depth = 7
top_n_classes = 20

## Import Data

In [3]:
def import_dataset(import_cols, years, sam=True):
    """
    Imports, cleans, and joins our data with specified columns and years.

    For every year one parquet file is read ('fy<year>.parquet' when the
    module-level `protected` flag is True, '<year>.parquet' otherwise) and
    filtered to rows where CO_BUS_SIZE_DETERMINATION is "SMALL BUSINESS",
    VENDOR_ADDRESS_COUNTRY_NAME is "UNITED STATES", EXTENT_COMPETED is one of
    A / D / E / CDO and DOLLARS_OBLIGATED is positive. The yearly frames are
    concatenated and reduced to one row per (SOLICITATION_ID, CAGE_CODE): the
    row with the largest DOLLARS_OBLIGATED, which this notebook treats as the
    initial contract win.

    Inputs:
        import_cols (list [str] of column names)
        years (list [int] of years to import)
        sam (bool of whether to merge with sam dataset)
    Output:
        Cleaned, filtered, and joined dataframe; it carries one extra column,
        PLACE_OF_MANUFACTURE_CLASS ('YES' / 'NO' / 'NONE')
    Raises:
        ValueError if `years` is empty
    """
    years = list(years)
    if not years:
        # pd.concat([]) would otherwise fail with "No objects to concatenate"
        raise ValueError('years must contain at least one year to import')

    if sam:
        SAM = pd.read_csv('SAM.CSV') #imports SAM df

    year_dfs = []
    for year in years:
        # protected data lives in 'fy<year>.parquet', public data in '<year>.parquet'
        file_name = ('fy' if protected else '') + str(year) + '.parquet'
        temp_df = pd.read_parquet(file_name, columns=import_cols) #import year's data

        # single pass: small business, located in the US, competed award
        keep = (
            (temp_df['CO_BUS_SIZE_DETERMINATION'] == "SMALL BUSINESS")
            & (temp_df['VENDOR_ADDRESS_COUNTRY_NAME'] == "UNITED STATES")
            & temp_df['EXTENT_COMPETED'].isin(["A", "D", "E", "CDO"])
        )
        temp_df = temp_df[keep]

        if sam:
            temp_m = pd.merge(temp_df, SAM, on="CAGE_CODE", how="inner") #merge with SAM
        else:
            temp_m = temp_df

        temp_m = temp_m[temp_m['DOLLARS_OBLIGATED'] > 0] #filter DOLLARS_OBLIGATED

        print(f'{year} shape: {temp_m.shape}')
        year_dfs.append(temp_m) #append year dataset to list of year datasets

    merged_df = pd.concat(year_dfs, ignore_index=True) #merge all years
    del year_dfs #drop the list so the per-year frames can be garbage collected

    #keep the largest obligation per (solicitation, vendor) pair
    idx = merged_df.groupby(['SOLICITATION_ID','CAGE_CODE'])['DOLLARS_OBLIGATED'].idxmax()
    filtered_merged_df = merged_df.loc[idx].copy() #own copy: columns are assigned below

    print(f'total shape: {filtered_merged_df.shape}')

    #place of manufacture conversion
    def convert_place_of_manufacture(value):
        if value == 'D':
            return 'YES' #manufactured in US
        if value == 'C':
            return 'NO' #not manufactured in US
        return 'NONE' #every other code or missing value (provides a service or doesn't qualify fully)

    #clean up individual columns
    filtered_merged_df['FUNDING_DEPARTMENT_ID'] = filtered_merged_df['FUNDING_DEPARTMENT_ID'].str.strip() #clean dept ID
    if protected:
        filtered_merged_df['ANNUAL_REVENUE'] = filtered_merged_df['ANNUAL_REVENUE'].astype(float)
    filtered_merged_df['PLACE_OF_MANUFACTURE_CLASS'] = filtered_merged_df['PLACE_OF_MANUFACTURE'].apply(convert_place_of_manufacture) #clean PLACE_OF_MANUFACTURE
    # NOTE(review): if ZIP codes arrive as integers, leading zeros are already lost and
    # those rows are dropped by the length check below - confirm the parquet dtype.
    filtered_merged_df['VENDOR_ADDRESS_ZIP_CODE'] = filtered_merged_df['VENDOR_ADDRESS_ZIP_CODE'].astype(str).str[:5] #clean ZIP to 5-digit
    filtered_merged_df = filtered_merged_df[filtered_merged_df['VENDOR_ADDRESS_ZIP_CODE'].str.len()==5].copy()
    filtered_merged_df['TYPE_OF_SET_ASIDE'] = filtered_merged_df['TYPE_OF_SET_ASIDE'].fillna('NONE') #assume NA = NONE

    #remove rows with NAs in any column except the raw PLACE_OF_MANUFACTURE
    filtered_merged_df = filtered_merged_df.dropna(subset=filtered_merged_df.columns.difference(['PLACE_OF_MANUFACTURE']))

    print(f'total filtered shape: {filtered_merged_df.shape}')

    return filtered_merged_df

In [4]:
# Load the raw contract data; the SAM merge is controlled by the `sam` flag from
# the configuration cell instead of a second, hard-coded literal.
df0 = import_dataset(import_cols, import_years, sam=sam)

2019 shape: (1772611, 41)
total shape: (35321, 41)
total filtered shape: (35321, 42)


## Create Response Variable

In [5]:
def response_var(df, min_class_size=1):
    """
    Creates the RESPONSE column (a copy of FUNDING_OFFICE_ID) and keeps only the
    rows whose response class has at least min_class_size rows.

    The input dataframe is left untouched; a new, independent dataframe is
    returned so later cells can modify it without affecting (or warning about)
    the original.

    Inputs:
        df (dataframe output from import_dataset())
        min_class_size (integer of minimum # of observations a class can have to be a valid response)
    Outputs:
        dataframe with additional RESPONSE column, filtered
    """
    labelled = df.assign(RESPONSE=df['FUNDING_OFFICE_ID'])

    class_sizes = labelled['RESPONSE'].value_counts()
    classes_to_keep = class_sizes[class_sizes >= min_class_size].index
    labelled = labelled[labelled['RESPONSE'].isin(classes_to_keep)].copy()

    print(f"{labelled['RESPONSE'].nunique()} unique response classes")

    return labelled

In [6]:
# Add the RESPONSE target (the funding office) and drop offices with fewer than
# min_class_size rows.
df = response_var(df0, min_class_size)

870 unique response classes


## Perform any Feature Engineering Here

In [7]:
# Coarsen the NAICS code to its first two characters to cut down the number of
# one-hot columns created during preprocessing.
df.loc[:,'PRINCIPAL_NAICS_CODE'] = df['PRINCIPAL_NAICS_CODE'].str[:2]

In [8]:
# zip_df = pd.read_csv('zip_code_database.csv', converters={'zip': str})
# zcdb = pd.Series(zip_df['state'].values,index=zip_df['zip']).to_dict()
# df.loc[:,'VENDOR_STATE'] = df["VENDOR_ADDRESS_ZIP_CODE"].map(zcdb, na_action='ignore')
# df = df[~df['VENDOR_STATE'].isna()]

In [9]:
# Replace the 5-character ZIP code with its first character only. This overwrites
# the column in place, so the full ZIP is no longer available after this cell.
df.loc[:,'VENDOR_ADDRESS_ZIP_CODE'] = df['VENDOR_ADDRESS_ZIP_CODE'].str[:1]

In [10]:
# CONTRACTS_PER_YEAR: number of rows each vendor (CAGE_CODE) has in a fiscal year.
contracts_per_year = (
    df.groupby(['CAGE_CODE', 'AWARD_FISCAL_YEAR'])
    .size()
    .reset_index(name='CONTRACTS_PER_YEAR')
)
# Drop a feature column left over from an earlier run of this cell first; otherwise
# re-running would produce CONTRACTS_PER_YEAR_x / CONTRACTS_PER_YEAR_y instead.
# The merge also leaves df with a fresh 0..n-1 index.
df = (
    df.drop(columns='CONTRACTS_PER_YEAR', errors='ignore')
    .merge(contracts_per_year, on=['CAGE_CODE', 'AWARD_FISCAL_YEAR'])
)

## Model Preprocessing

In [11]:
def preprocess_data(df, model_cols, train_test=True, train_size=0.8, scale_quant=True):
    """
    Preprocesses the dataframe appropriately after creating the response variable and after feature engineering

    The RESPONSE column is label encoded, object/category features are one-hot
    encoded and the remaining features are optionally standardized.
    FUNDING_AGENCY_ID is carried along in both X and y so the per-agency models
    can filter on it later; it has to be removed from X before fitting.

    Side effect: stores the label encoder's classes in the module-level
    `class_ids`, which predict() uses to turn encoded labels back into office IDs.

    NOTE: the dummy encoding and the scaler are fit on the full frame before the
    train/test split, so test rows influence the scaling statistics. This is kept
    as is because changing it would alter the feature values other artifacts may
    depend on.

    Input:
        df (dataframe output from response_var())
        model_cols (list [str] of cols to use to model)
        train_test (bool of whether to split the data into train/test groups)
        train_size (float [0, 1] of proportion of training data to whole data)
        scale_quant (bool of whether to scale quantitative variables)

    Output:
        train_test=True:  X_train, X_test, y_train, y_test
        train_test=False: X, y
        X (df of X values to be used in modeling, dummy encoded & scaled, plus FUNDING_AGENCY_ID)
        y (df with the label encoded RESPONSE and FUNDING_AGENCY_ID)
    """
    global class_ids

    Xy = df[model_cols] #select only the modeling columns
    X = Xy.drop('RESPONSE', axis=1) #initialize X

    enc = LabelEncoder()
    y = pd.DataFrame({'RESPONSE': enc.fit_transform(Xy['RESPONSE']), #label encoded target
                      'FUNDING_AGENCY_ID': df['FUNDING_AGENCY_ID']})
    class_ids = enc.classes_

    categoricals = X.select_dtypes(include=['object', 'category']).columns.tolist()
    X_cat = pd.get_dummies(X[categoricals]) #one-hot encode categorical columns

    quantitatives = X.columns.difference(categoricals)
    if scale_quant and len(quantitatives) > 0:
        scaler = StandardScaler()
        # keep the original row index so the pieces line up in the concat below,
        # whatever index df happens to have
        X_quant = pd.DataFrame(scaler.fit_transform(X[quantitatives]),
                               columns=quantitatives, index=X.index)
    else:
        X_quant = X[quantitatives]

    #combine quant & cat subsets back into one df
    X = pd.concat([X_cat, X_quant, df[['FUNDING_AGENCY_ID']]], axis=1)

    if train_test:
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, random_state=seed)
        return X_train, X_test, y_train, y_test
    else:
        return X, y

In [12]:
# Encode/scale the modeling columns and split 80/20 into train and test sets.
# X_* and y_* both still contain FUNDING_AGENCY_ID for per-agency filtering.
X_train, X_test, y_train, y_test = preprocess_data(df, model_cols, train_test=True, train_size=0.8, scale_quant=True)

## Column Selection for Each Agency

In [13]:
# Load the feature columns used by every model below (passed as `columns` to the
# training functions and used to subset rows in predict()).
# SECURITY: pickle.load can execute arbitrary code - only load files you produced.
with open('Dashboard_Columns.pkl', 'rb') as f:
        final_cols = pickle.load(f)

## Train Random Forest & Assess for Each Agency

In [14]:
from sklearn.ensemble import GradientBoostingClassifier

In [15]:
def train_forest(X_train, X_test, y_train, y_test, agency_id, columns, top_n_classes, n_trees=1000, max_depth=2):
    """
    Fits a random forest that predicts the office (RESPONSE) within one agency
    and scores it with top-n accuracy on that agency's test rows.

    Inputs:
        X_train, X_test (feature dataframes that include FUNDING_AGENCY_ID)
        y_train, y_test (dataframes with RESPONSE and FUNDING_AGENCY_ID)
        agency_id (value of FUNDING_AGENCY_ID to train for)
        columns (list [str] of feature columns to fit on)
        top_n_classes (int, a prediction counts as correct when the true label is
                       among this many most probable classes)
        n_trees, max_depth (random forest hyperparameters)
    Output (always a 3-tuple starting with agency_id):
        (agency_id, None, None)             no rows at all for the agency
        (agency_id, pd.Series, None)        train or test side is empty: relative
                                            label frequencies over all available rows
        (agency_id, fitted forest, float)   fitted model and its top-n accuracy
    Uses the module-level `seed` as random_state.
    """
    def _labels_for_agency(y_frame):
        # RESPONSE values of this agency as a flat numpy array
        subset = y_frame[y_frame['FUNDING_AGENCY_ID'] == agency_id]
        return subset.drop('FUNDING_AGENCY_ID', axis=1).to_numpy().ravel()

    train_features = X_train.loc[X_train['FUNDING_AGENCY_ID'] == agency_id, columns]
    test_features = X_test.loc[X_test['FUNDING_AGENCY_ID'] == agency_id, columns]
    train_labels = _labels_for_agency(y_train)
    test_labels = _labels_for_agency(y_test)

    # Without rows on both sides no model can be fit and scored; fall back to the
    # empirical label distribution (or nothing when the agency has no rows).
    if len(train_features) == 0 or len(test_features) == 0:
        observed = np.concatenate((train_labels, test_labels))
        if len(observed) == 0:
            return agency_id, None, None
        return agency_id, pd.Series(observed).value_counts(normalize=True), None

    forest = RandomForestClassifier(n_estimators=n_trees, max_depth=max_depth,
                                    random_state=seed, criterion='log_loss')
    forest.fit(train_features, train_labels)

    # argsort is ascending, so the last top_n_classes columns are the most probable classes
    class_probs = forest.predict_proba(test_features)
    ranked = np.argsort(class_probs, axis=1)
    top_labels = forest.classes_[ranked[:, -top_n_classes:]]

    # a test row is a hit when its true label is one of its top-n labels
    hits = (top_labels == test_labels.reshape(-1, 1)).any(axis=1)

    return agency_id, forest, hits.mean()

In [16]:
# Smoke test: fit and score the office model of a single agency ('7008') with the
# hyperparameters from the configuration cell, then display its top-n accuracy.
# NOTE(review): when an agency has no more than top_n_classes offices, every test
# label that was seen in training is automatically inside the top n, so a score of
# 1.0 may be trivial - confirm the number of classes before reading much into it.
agency_id, model, acc = train_forest(X_train, X_test, y_train, y_test, 
                                     agency_id='7008',
                                     columns=final_cols, top_n_classes=top_n_classes,
                                     n_trees=n_trees, max_depth=max_depth)
acc

1.0

In [17]:
def train_office_models(X_train, X_test, y_train, y_test, agencies, columns, top_n_classes, n_trees=1000, max_depth=2):
    """
    Trains one office-level model per agency by calling train_forest().

    Inputs:
        X_train, X_test, y_train, y_test (output of preprocess_data())
        agencies (iterable of agency IDs; each is converted with str() before use)
        columns (list [str] of feature columns every model is fit on)
        top_n_classes (int passed to train_forest() for top-n accuracy)
        n_trees, max_depth (random forest hyperparameters)
    Output:
        dict mapping the agency ID (str) to [model, accuracy], i.e. the second and
        third element of train_forest()'s return value
    """
    model_dic = {}
    for agency in agencies:
        # use the caller's `columns`, not the notebook-level final_cols
        agency_id, model, acc = train_forest(X_train, X_test, y_train, y_test,
                                             agency_id=str(agency),
                                             columns=columns, top_n_classes=top_n_classes,
                                             n_trees=n_trees, max_depth=max_depth)
        model_dic[agency_id] = [model, acc]
    return model_dic

In [18]:
# Same single-agency check as above but with a shallower forest (max_depth=3);
# the returned (agency_id, model, accuracy) tuple is displayed as the cell output.
train_forest(X_train, X_test, y_train, y_test, 
                                             agency_id='7008',
                                             columns=final_cols, top_n_classes=top_n_classes,
                                             n_trees=n_trees, max_depth=3)

('7008',
 RandomForestClassifier(criterion='log_loss', max_depth=3, random_state=2000),
 1.0)

In [19]:
# Load the agency lookup; its keys are used below as the list of agency IDs to
# train office models for.
# SECURITY: pickle.load can execute arbitrary code - only load files you produced.
with open('Agency_ID_to_Name.pkl', 'rb') as f:
        agency_id_to_name = pickle.load(f)

In [20]:
# Train one office model per known agency.
agencies = list(agency_id_to_name.keys())
office_models = train_office_models(
    X_train, X_test, y_train, y_test,
    agencies=agencies,
    columns=final_cols,
    top_n_classes=5,
    n_trees=100,
    max_depth=2,
)
# True for agencies that produced a model or an empirical distribution,
# False where train_forest() returned None because the agency has no rows.
agencies_bool = [entry[0] is not None for entry in office_models.values()]
def filter_dict_by_bools(dictionary, bool_list):
    """
    Returns a new dict with the entries of `dictionary` whose positional flag in
    `bool_list` is truthy. Keys and flags are paired in iteration order; keys
    beyond the end of `bool_list` are dropped.
    """
    return {
        key: dictionary[key]
        for key, keep in zip(dictionary.keys(), bool_list)
        if keep
    }
# Keep only the agencies flagged True in agencies_bool; predict() treats an agency
# that is missing from this dict as having no office model.
office_models = filter_dict_by_bools(office_models, agencies_bool)

In [21]:
# Load the agency-level classifier; predict() calls its predict_proba().
# NOTE(review): presumably trained in a separate notebook on the same feature
# columns - confirm it matches final_cols.
# SECURITY: pickle.load can execute arbitrary code - only load files you produced.
with open('Agency_for_Offices.pkl', 'rb') as f:
        agency_clf = pickle.load(f)

In [22]:
# Load the lookup that predict() uses to map the agency classifier's class
# position to an agency ID.
# SECURITY: pickle.load can execute arbitrary code - only load files you produced.
with open('Class_IDs.pkl', 'rb') as f:
        agency_class_to_id = pickle.load(f)

In [57]:
def predict(row, y_row, agency_clf, offices_clf, top_n_results, top_n_offices, top_n_agencies=20):
    """
    Ranks (agency, office) pairs for one contract by
    P(agency) * P(office | agency) and checks whether the true agency/office
    is among the best top_n_results pairs.

    Inputs:
        row (single-row dataframe that contains the final_cols features)
        y_row (row of y with 'RESPONSE' (encoded office label) and 'FUNDING_AGENCY_ID')
        agency_clf (fitted agency classifier exposing predict_proba)
        offices_clf (dict agency ID -> [model, accuracy] as built by train_office_models();
                     the first element is either a fitted classifier or, when the
                     accuracy is None, a pd.Series of empirical office frequencies)
        top_n_results (int, number of best agency/office pairs that count as a hit)
        top_n_offices (currently unused; kept so existing calls keep working)
        top_n_agencies (int, only this many most probable agencies are expanded
                        into offices; default 20)
    Output:
        office_result (bool, true office label is among the top pairs)
        agency_result (bool, true agency ID is among the top pairs)
        merged_prob (dataframe with one row per candidate agency/office pair)
    Relies on the module-level final_cols, agency_class_to_id and class_ids.
    """
    row = row[final_cols]

    # One row per agency class with its predicted probability.
    # NOTE(review): the column position of predict_proba is used as the agency label;
    # this presumes agency_class_to_id is keyed by that position - confirm.
    agency_probs = pd.DataFrame(agency_clf.predict_proba(row)).T
    agency_probs = agency_probs.reset_index()
    agency_probs['Agency ID'] = agency_probs['index'].map(agency_class_to_id)
    agency_probs.columns = ['Agency Label', 'Agency Probability', 'Agency ID']

    # Attach each agency's [model, accuracy] pair; [None, 0] marks "no office model".
    agency_probs['Office Model'] = agency_probs.apply(
        lambda x: offices_clf[x['Agency ID']] if x['Agency ID'] in offices_clf else [None, 0],
        axis=1)

    agency_probs = agency_probs.sort_values('Agency Probability', ascending=False).head(top_n_agencies)

    office_dicts_new = []
    for office_model, office_acc in agency_probs['Office Model']:
        if office_acc is None:
            # no fitted model: office_model is a Series of empirical frequencies
            office_prob = office_model.to_list()
            office_label = list(office_model.index)
        elif office_model is None:
            # agency without any office information
            office_prob = [0]
            office_label = ["ERROR"]
        else:
            office_prob = office_model.predict_proba(row)[0].tolist()
            office_label = office_model.classes_.tolist()

        # one single-entry dict per office so explode() yields one row per office
        office_dicts_new.append([{k: v} for k, v in dict(zip(office_label, office_prob)).items()])

    agency_probs['Office Dict'] = office_dicts_new
    total_prob = agency_probs.explode('Office Dict')
    total_prob['Office Label'] = total_prob['Office Dict'].apply(lambda x: list(x.keys())[0])
    office_label_to_id = dict(enumerate(class_ids)) #encoded label -> office ID
    total_prob['Office ID'] = total_prob['Office Label'].map(office_label_to_id)
    total_prob['Office Probability'] = total_prob['Office Dict'].apply(lambda x: list(x.values())[0])
    total_prob['Agency_Office Probability'] = total_prob['Agency Probability'] * total_prob['Office Probability']

    # uniform baseline per agency: 1 / number of candidate offices (stored in column 0)
    uniform_prob = 1 / total_prob.groupby('Agency Label').size().to_frame()
    merged_prob = total_prob.merge(uniform_prob, left_on='Agency Label', right_on=uniform_prob.index)
    merged_prob = merged_prob[merged_prob['Office Probability'] >= merged_prob[0]] #gotta be at least better than uniform to make it

    top_n = merged_prob.sort_values(['Agency_Office Probability'], ascending=False).head(top_n_results)
    office_result = np.any(top_n['Office Label'] == y_row['RESPONSE'])
    agency_result = np.any(top_n['Agency ID'] == y_row['FUNDING_AGENCY_ID'])

    return office_result, agency_result, merged_prob

In [41]:
# %%timeit
# office_result, agency_result = predict(pd.DataFrame(X_test.iloc[5]).T, y_test.iloc[5], agency_clf, office_models, top_n_results=5)

In [42]:
## 24 minutes to run through all of them (not limiting agency probabilities)
## 9 minutes for all (top 40 agencies only)

In [58]:
# Score every test row: was the true office / agency among the 20 best
# agency-office pairs returned by predict()?
office_results = []
agency_results = []
for i in range(len(X_test)):
    office_result, agency_result, merged_prob = predict(
        pd.DataFrame(X_test.iloc[i]).T, y_test.iloc[i],
        agency_clf, office_models, top_n_results=20, top_n_offices=20,
    )
    office_results.append(office_result)
    agency_results.append(agency_result)

In [59]:
# Share of test rows whose true office is among the top 20 agency-office pairs.
sum(office_results) / len(office_results)

0.5199858507251504

In [60]:
# Share of test rows whose true agency is among the top 20 agency-office pairs.
sum(agency_results) / len(agency_results)

0.8142907675981605

In [None]:
# better than uniform cutoff & now min number of contracts per year is 8
# 0.5199858507251504
# 0.8142907675981605
# better than uniform cutoff & now min number of contracts per year is 8 (top 10 offices per agency)
# 0.5129112133003184
# 0.8135833038556773
# same as above w/ 20 offices per agency

In [None]:
# added the how many times have you contracted this year variable:
# top 20 office when agency limited to top 20: 0.5008798592225244
# top 20 agency when agency limited to top 20: 0.818109102543593
# require better than uniform office prob to make it:
# top 20 office when agency limited to top 20: 0.5018397056470965
# top 20 agency when agency limited to top 20: 0.8246680531115022
# when you do only top 10 offices per agency:
# 0.47528395456726924
# 0.8027515597504399

In [None]:
# xgboosted agency model
# top 20 office when agency limited to top 20: 0.3469844824828027
# top 20 agency when agency limited to top 20: 0.5155975043992961
# when you add the filtering for only better than uniform offices:
# 0.3519436890097584
# 0.5519116941289394

In [None]:
#0.001, log_loss on agency & office
# top 20 office when agency limited to top 20: 0.40889457686770114
# top 20 agency when agency limited to top 20: 0.7398816189409695

In [None]:
# 0.001 better for column selection
# this cell uses 0.001 and does weighted averages instead of probabilities (80% agency, 20% office)
# top 20 office when agency limited to top 40: 0.3258678611422172
# top 20 agency when agency limited to top 40: 0.48616221404575266

In [None]:
# same as above but (20% agency, 80% office)
# top 20 office when agency limited to top 20: 0.2839545672692369
# top 20 agency when agency limited to top 20: 0.46168613021916494

In [None]:
# these cells use a cutoff of 0.001 for column selection
# top 20 office when agency limited to top 40: 0.4053751399776036
# top 20 agency when agency limited to top 40: 0.7306031035034395
# top 20 office when agency limited to top 20: 0.40457526795712684
# top 20 agency when agency limited to top 20: 0.7358822588385858

In [None]:
# the below cells all use a cutoff of 0.015 for column selection

In [None]:
# top 20 office when agency limited to top 40: 0.39529675251959684
# top 20 agency when agency limited to top 40: 0.710926251799712
# top 20 office when agency limited to top 20: 0.3922572388417853
# top 20 agency when agency limited to top 20: 0.7149256119020957

In [None]:
# top 20: 0.39529675251959684
# top 20 when I limit agency to top 40: 0.39529675251959684
# top 20 when I limit agency to top 20: 0.3922572388417853
# top 10 when I limit agency to top 20: 0.3092305231163014
# top 20 when I limit agency to top 10: 0.3725803871380579