# Find Owners of Seattle Buildings: Step 1

In [1]:
# This is a locally defined package.
# If you have trouble with this import, go to the root directory and run `pip install .`

from utils import geo

In [2]:
import pandas as pd
import numpy as np
import requests
import json
import os
import re
import geopandas as gp
import urllib.parse

## Clean up input list

We need to start with a list of landlords for our buildings. We use the parcel owner from King County's eRealProperty site.

This code is just to clean up the data we already have. If you already have a dataframe with a list of owner names, renamed the column `owner_name` and proceed to the next header.

In [3]:
df_districts = gp.read_file("../../../data/Council_Districts.geojson")
df = pd.read_csv('../../../data/2020_Building_Energy_Benchmarking.csv')
df = gp.GeoDataFrame(df, geometry=gp.points_from_xy(df.Longitude, df.Latitude))
geo.clean_districts(df, df_districts)

Building UNION HARBOR CONDOMINIUM 454/ 8807200000 doesn't have a district POINT (-122.33003 47.6401) 
	 Found district 4 for UNION HARBOR CONDOMINIUM
Building WATERWORKS OFFICE & MARINA 1494/ 4088803975 doesn't have a district POINT (-122.33895 47.63575) 
	 Found district 7 for WATERWORKS OFFICE & MARINA
Building NAUTICAL LANDING 1742/ 4088804350 doesn't have a district POINT (-122.34219 47.64306) 
	 Found district 7 for NAUTICAL LANDING
Building THE PIER AT LESCHI 3453/ 6780900000 doesn't have a district POINT (-122.28563 47.59926) 
	 Found district 3 for THE PIER AT LESCHI
Building EDUCARE 3496/ 2895800030 doesn't have a district POINT EMPTY 
Building THE LAKESHORE 3506/ 1180001715 doesn't have a district POINT EMPTY 


In [6]:
# filter to non-downtown neighborhoods
df_filtered = df.loc[df['Neighborhood'] != "DOWNTOWN"]
df_filtered.head()

Unnamed: 0,OSEBuildingID,TaxParcelIdentificationNumber,DataYear,BuildingType,BuildingName,Owner,CouncilDistrictCode,Neighborhood,Units,YearBuilt,...,SecondLargestPropertyUseTypeGFA,ThirdLargestPropertyUseType,ThirdLargestPropertyUseTypeGFA,Outlier,ComplianceIssue,ComplianceStatus,Comments,DefaultData,LegislationPropertyType,geometry
0,50148,7733600135,2020,NonResidential,RESIDENCE INN SEATTLE U,TMUD GSL LLC,4.0,NORTHEAST,,2016,...,54695.0,,,,No Issue,Compliant,,,Hotel,POINT (-122.31569 47.66148)
1,50150,1991200090,2020,NonResidential,HYATT HOUSE,HH SEATTLE LLC,7.0,LAKE UNION,,2016,...,28069.0,,,,No Issue,Compliant,,,Hotel,POINT (-122.34799 47.62009)
3,50166,4083306985,2020,NonResidential,TABLEAU (NORTH EDGE),BRE-BMR 34TH LLC,4.0,LAKE UNION,,2016,...,117684.0,Restaurant,2839.0,,No Issue,Compliant,,,Office,POINT (-122.33814 47.64790)
5,50194,276770-3010,2020,NonResidential,BALLARD SPACE,NOT FOUND,6.0,BALLARD,,2016,...,,,,,No Issue,Compliant,,,Office,POINT (-122.38309 47.66673)
6,50306,1959701250,2020,NonResidential,2701 EASTLAKE,SEATTLE CITY OF,4.0,LAKE UNION,,1970,...,,,,,No Issue,Compliant,,,Office,POINT (-122.32609 47.64464)


In [7]:
# this CSV uses consolidated owner names
building_owners = pd.read_csv('../../../experiments/worst_offenders/updated_owners_2_15_23.csv')
# Map tax ids to landlord name
d = pd.Series(building_owners.Owner.values, index=building_owners.TaxParcelIdentificationNumber).to_dict()
df_filtered['Landlord'] = df_filtered['TaxParcelIdentificationNumber'].map(lambda row: d.get(row, ""))

df_filtered.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Unnamed: 0,OSEBuildingID,TaxParcelIdentificationNumber,DataYear,BuildingType,BuildingName,Owner,CouncilDistrictCode,Neighborhood,Units,YearBuilt,...,ThirdLargestPropertyUseType,ThirdLargestPropertyUseTypeGFA,Outlier,ComplianceIssue,ComplianceStatus,Comments,DefaultData,LegislationPropertyType,geometry,Landlord
0,50148,7733600135,2020,NonResidential,RESIDENCE INN SEATTLE U,TMUD GSL LLC,4.0,NORTHEAST,,2016,...,,,,No Issue,Compliant,,,Hotel,POINT (-122.31569 47.66148),TMUD GSL LLC
1,50150,1991200090,2020,NonResidential,HYATT HOUSE,HH SEATTLE LLC,7.0,LAKE UNION,,2016,...,,,,No Issue,Compliant,,,Hotel,POINT (-122.34799 47.62009),HH SEATTLE LLC
3,50166,4083306985,2020,NonResidential,TABLEAU (NORTH EDGE),BRE-BMR 34TH LLC,4.0,LAKE UNION,,2016,...,Restaurant,2839.0,,No Issue,Compliant,,,Office,POINT (-122.33814 47.64790),BRE-BMR 34TH LLC
5,50194,276770-3010,2020,NonResidential,BALLARD SPACE,NOT FOUND,6.0,BALLARD,,2016,...,,,,No Issue,Compliant,,,Office,POINT (-122.38309 47.66673),NOT FOUND
6,50306,1959701250,2020,NonResidential,2701 EASTLAKE,SEATTLE CITY OF,4.0,LAKE UNION,,1970,...,,,,No Issue,Compliant,,,Office,POINT (-122.32609 47.64464),CITY OF SEATTLE


In [17]:
unique_not_downtown_landlords = df_filtered['Landlord'].unique()

unique_not_downtown_landlords = pd.DataFrame(unique_not_downtown_landlords, columns=['owner_name'])

unique_not_downtown_landlords = unique_not_downtown_landlords[~unique_not_downtown_landlords['owner_name'].isin(['NOT FOUND', 'UNDEFINED'])]

unique_not_downtown_landlords.to_csv('unique_not_downtown_landlords.csv')

## Get potential company matches from CCFS

The only input we need here is a list of company names from the parcel search. This is the dataframe we produced in the prior step.

**TODO**: move these utils to a module and import

In [38]:
# Utils for finding principals

search_for_business_url = 'https://cfda.sos.wa.gov/api/BusinessSearch/GetBusinessSearchList'

def get_business_search_payload(business_name, page_count, page_num):
    return {
        'Type': 'BusinessName',
        'SearchEntityName': business_name,
        'SearchType': 'BusinessName',
        'SortType': 'ASC',
        'SortBy': 'Entity Name',
        'SearchValue': business_name,
        'SearchCriteria': 'Contains',
        'IsSearch': 'true',
        'PageID': page_num,
        'PageCount': page_count,
    }


def get_business_search_results(business_name, page_num):
    r = requests.post(search_for_business_url, get_business_search_payload(business_name, 100, page_num))
    try:
        result = json.loads(r.text)
    #return json.loads(r.text)
    except:
        result = {}
    return result

# given one JSON element `result` in the list of search results, standardize
# the business name and address to collapse results into one 
def standardize_result(search_term, result):
    # Don't care about the result if it doesn't have an "active" status
    if(result["Status"] != "ACTIVE"): return

    # LLC, LLP, L L C, L.L.C., L.L.C. L.L.P., L.L.P
    # Limited Partnership, Limited liability company
    # Comma before any of the above
    # Just map all the results to be standardized to this name, then drop duplicates based on name? 
    p = re.compile("L[\s.]?L[\s,.]?[PC]" ,flags=re.IGNORECASE)

    result['BusinessName']=result["BusinessName"].replace(",", "")
    result['BusinessName']= re.sub(p, "LLC", result['BusinessName'])
    result['BusinessName']=result["BusinessName"].replace("LIMITED LIABILITY COMPANY", "LLC") 
    result['BusinessName']=result["BusinessName"].replace("LIMITED PARTNERSHIP", "LLC") 

    # Do the same for the search term, so that we have more exact matches
    # TODO: need to add this as an element of the dict
    result["SearchTerm"]=search_term.replace(",", "")
    result["SearchTerm"]=re.sub(p, "LLC", search_term)
    result["SearchTerm"]=search_term.replace("LIMITED PARTNERSHIP", "LLC") 
    result["SearchTerm"]=search_term.replace("LIMITED LIABILITY COMPANY", "LLC") 
    
    # Strip addressses of all commas
    result['Address'].replace(",", "")
    return result

def extract_search_results(search_term, search_req_response):
    # TODO: add all the columns, or at least filing status 
    # TODO: collapse all listings with L.L.P or L.L.C or LLC LLP
    # res_list = [standardize_result(res) for res in search_req_response]
    res_list = [[search_term, res['BusinessName'], res['UBINumber'], res['BusinessID'], res['PrincipalOffice']['PrincipalStreetAddress']['FullAddress'], res["BusinessStatus"]] for res in search_req_response]
    res_df = pd.DataFrame(res_list, columns=['SearchTerm', 'BusinessName', 'UBINumber', 'BusinessId', 'Address', "Status"])
    # print(res_df)
    # res_df = res_df[res_df['Status']`=="Active"]#res_df.drop(res_df[res_df["Status"]=="Terminated"].index)
    # TODO: If there's an exact match, keep only that business 
    # Basically keep a list of exact matches, and build a list of potential matches that we give to human verifiers
    exact_match = res_df.index[res_df['BusinessName'] == search_term].tolist()
    if exact_match:
        # print(res_df)
        # print(exact_match)
        res_df = pd.concat([res_df.iloc[[exact_match[0]],:], res_df.drop(exact_match[0], axis=0)], axis=0)
    return res_df
    

# Mark row as potential match: UBI number is a duplicate, or Address is the same
# df.duplicated just sees if that address is already in the dataframe, NOT that the serach term
# and result have the same address. Could add search terms as a subset for duplicated call
def determine_search_matches(search_results_df):
    search_results_df['address_match'] = search_results_df.duplicated(subset=['Address'], keep=False) 
    search_results_df['ubi_match'] = search_results_df.duplicated(subset=['UBINumber'], keep=False)
    search_results_df['id_match'] = search_results_df.duplicated(subset=['BusinessId'], keep=False)

def get_business_details(business_id):
    url = 'https://cfda.sos.wa.gov/api/BusinessSearch/BusinessInformation?businessID={business_id}'.format(business_id=business_id)
    r = requests.get(url)
    return json.loads(r.text)

def get_empty_df():
    return pd.DataFrame([], columns = ['SearchTerm', 'BusinessName', 'UBINumber', 'BusinessId', 'Address', 'Status', 'address_match', 'ubi_match', 'id_match'])

In [20]:
def get_all_company_name_match_search_results(owner_name):
    n = 1
    res_length = 100
    search_results = []
    
    while res_length == 100:
        res = get_business_search_results(owner_name, n)
        search_results += (res)
        n += 1
        res_length = len(res)
    
    return search_results

In [21]:
def get_potential_company_name_matches(owner_name):
    all_search_results = get_all_company_name_match_search_results(owner_name)
    extracted_results = extract_search_results(owner_name, all_search_results)
    determine_search_matches(extracted_results)
    return extracted_results

In [22]:
# utils to separate search results into exact match, potential match (where no exact match was found), and additional matches (extra matches if there was an exact match and additional matches)

def separate_search_results(results):
    exact_matches = get_empty_df()
    exact_matches.columns
    potential_matches = get_empty_df()
    additional_matches = get_empty_df()
    
    exact_match = results[results['SearchTerm'] == results['BusinessName']]
    if len(exact_match) > 0:
        exact_matches = pd.concat([exact_matches, exact_match], ignore_index=True)
        additional_matches = pd.concat([additional_matches, results[results['SearchTerm'] != results['BusinessName']]], ignore_index=True)
    else:
        potential_matches = pd.concat([potential_matches, results], ignore_index=True)
    
    return exact_matches, potential_matches, additional_matches

In [23]:
def get_company_list_name_matches(owner_list):
    exact_matches = get_empty_df()
    potential_matches = get_empty_df()
    additional_matches = get_empty_df()
    
    for owner in owner_list:
        matches = get_potential_company_name_matches(owner)
        temp_exact, temp_potential, temp_add = separate_search_results(matches)
        exact_matches = pd.concat([temp_exact, exact_matches], ignore_index=True)
        potential_matches = pd.concat([temp_potential, potential_matches], ignore_index=True)
        additional_matches = pd.concat([temp_add, additional_matches], ignore_index=True)
    
    return exact_matches, potential_matches, additional_matches

In [24]:
owner_search_list = list(unique_not_downtown_landlords['owner_name'])
owner_search_list[:10]

['TMUD GSL LLC',
 'HH SEATTLE LLC',
 'BRE-BMR 34TH LLC',
 'CITY OF SEATTLE',
 'CENTRAL PUGET SOUND REGIONAL TRANSIT AUTHORITY',
 'ACADEMIA PLACE LLC',
 'ARIMOCC LLC',
 'NOVO ROOSEVELT LLC',
 'BEACON HILL VENTURES LLC',
 'CONAM HOLMAN 13 OWNER LLC']

In [31]:
owner_search_list_chunks = owner_search_list[::200]
len(owner_search_list_chunks)

9

In [35]:
def get_company_matches_and_export(owner_list, x):
    exact_matches, potential_matches, additional_matches = get_company_list_name_matches(owner_list)
    
    exact_matches.to_csv('not_downtown/exact_matches_' + str(x) + '.csv')
    potential_matches.to_csv('not_downtown/potential_matches_' + str(x) + '.csv')
    additional_matches.to_csv('not_downtown/additional_matches_' + str(x) + '.csv')

In [37]:
get_company_matches_and_export(owner_search_list[:10], 0)

In [39]:
get_company_matches_and_export(owner_search_list[10:30], 0.1)

In [40]:
get_company_matches_and_export(owner_search_list[:100], 1)

In [41]:
get_company_matches_and_export(owner_search_list[100:300], 2)

In [42]:
get_company_matches_and_export(owner_search_list[300:500], 3)

In [43]:
get_company_matches_and_export(owner_search_list[500:700], 4)

In [44]:
get_company_matches_and_export(owner_search_list[700:900], 5)

In [46]:
get_company_matches_and_export(owner_search_list[900:1100], 6)

In [47]:
get_company_matches_and_export(owner_search_list[1100:1300], 7)

In [48]:
get_company_matches_and_export(owner_search_list[1300:1500], 8)

In [49]:
get_company_matches_and_export(owner_search_list[1500:], 9)