# Grouping Companies for non-downtown buildings

In [1]:
from utils import owners
from utils import geo

In [2]:
import pandas as pd
import numpy as np
import requests
import json
import os
import re
import geopandas as gp
import urllib.parse

## Step 1: Map owner names to entries in the CCFS database

In [None]:
# Load the council-district geometries and the 2020 benchmarking records, then
# attach a point geometry to each benchmarking row from its Longitude/Latitude.
df_districts = gp.read_file("../../../../data/Council_Districts.geojson")
df = pd.read_csv('../../../../data/2020_Building_Energy_Benchmarking.csv')
df = gp.GeoDataFrame(df, geometry=gp.points_from_xy(df.Longitude, df.Latitude))
# NOTE(review): the return value is discarded, so this presumably updates `df`
# in place — confirm against utils.geo.clean_districts.
geo.clean_districts(df, df_districts)

# Keep only the buildings whose Neighborhood is not DOWNTOWN. Take an explicit
# copy so that adding the 'Landlord' column below writes to an independent
# frame instead of a slice of `df` (avoids pandas' SettingWithCopyWarning).
df_filtered = df.loc[df['Neighborhood'] != "DOWNTOWN"].copy()

building_owners = pd.read_csv('../../../../experiments/worst_offenders/updated_owners_2_15_23.csv')
# Map tax ids to landlord name
d = pd.Series(building_owners.Owner.values, index=building_owners.TaxParcelIdentificationNumber).to_dict()
# Parcels with no entry in the owners file get an empty-string landlord.
df_filtered['Landlord'] = df_filtered['TaxParcelIdentificationNumber'].map(lambda parcel_id: d.get(parcel_id, ""))

df_filtered.head()

In [5]:
# Build a one-column table ('owner_name') of the distinct landlord names found
# on the non-downtown buildings, dropping the placeholder values that do not
# name a real owner.
placeholder_names = ['NOT FOUND', 'UNDEFINED']
landlord_names = pd.DataFrame(df_filtered['Landlord'].unique(), columns=['owner_name'])
is_placeholder = landlord_names['owner_name'].isin(placeholder_names)
unique_not_downtown_landlords = landlord_names[~is_placeholder]

# Save the cleaned list to disk and keep a plain Python list for the lookup step.
unique_not_downtown_landlords.to_csv('unique_not_downtown_landlords.csv')
owner_search_list = list(unique_not_downtown_landlords['owner_name'])

In [None]:
# Create the lookup helper rooted at the current working directory.
working_dir = os.getcwd()
lookup_helper = owners.LookupCompaniesHelper(working_dir)

# Only the first ten owner names are looked up here.
# NOTE(review): the second argument (1) is presumably the batch number used in
# the exported file names (cf. exact_matches_1.csv) — confirm against utils.owners.
first_owner_names = owner_search_list[:10]
lookup_helper.get_company_matches_and_export(first_owner_names, 1)

## Step 2: Get all companies and their principals
Now that we've mapped company names to their entries in the CCFS database, we build a list of all the matched companies together with all the principals registered to each company. This combined list (all matches with all of their principals) is used in Step 3 to group companies by shared principals.

In [None]:
# Create the grouping helper from the current working directory and the name
# of the companies/potential-matches CSV.
# NOTE(review): presumably the helper resolves the CSV relative to that
# directory — confirm against utils.owners.GroupCompaniesHelper.
matches_csv_name = "companies_and_potential_matches_ntd.csv"
helper_working_dir = os.getcwd()
group_helper = owners.GroupCompaniesHelper(helper_working_dir, matches_csv_name)

In [None]:
# Single-batch run: read the first exact-matches file and fetch the principals
# for the companies it lists.
first_batch_csv = "exact_matches_1.csv"
exact_matches_1 = pd.read_csv(first_batch_csv)
exact_matches_1_principals = group_helper.get_companies_principals(
    exact_matches_1
)

In [None]:
# Gather the principals for every matched company across all batch files.
# Each batch's result is collected in a list and concatenated once at the end
# (pd.concat), rather than repeatedly growing a DataFrame inside the loops.
# NOTE(review): assumes get_companies_principals returns a DataFrame — confirm.
principal_frames = []

# Exact matches: every row in each file is used as-is.
for i in range(1, 11):
    print(f"Getting principals for exact_matches_{i}")
    exact_matches = pd.read_csv(f"exact_matches_{i}.csv")
    exact_matches_principals = group_helper.get_companies_principals(exact_matches)
    principal_frames.append(exact_matches_principals)

# Potential matches: keep only the rows flagged with isMatch == 1.
# NOTE(review): this loop reads files 1..9 while the exact-match loop reads
# 1..10 — confirm the differing batch counts are intended.
for i in range(1, 10):
    print(f"Getting principals for potential_matches_{i}")
    potential_matches = pd.read_csv(f"potential_matches_{i}.csv")
    potential_matches = potential_matches[potential_matches['isMatch'] == 1]
    potential_matches_principals = group_helper.get_companies_principals(potential_matches)
    principal_frames.append(potential_matches_principals)

all_matches = pd.concat(principal_frames)

## Step 3: Group companies by shared principals
Now that we have all of the matched companies from the CCFS database and all of the principals registered to each company, we can group the results by shared principals.

In [None]:
# Pass the combined companies/principals table built in Step 2 to the grouping
# helper and keep its result.
# NOTE(review): the shape of the returned object is not visible here — see
# utils.owners.GroupCompaniesHelper.group_companies_by_principals.
companies_and_matches = group_helper.group_companies_by_principals(all_matches)