# Who Owns the Large Buildings in Seattle?

## Problem

The GHGE dataset does not include buildings' owners. Scraping the eRealProperty website for building owners has two limitations:

1. The data quality is poor and many buildings don't have an owner listed.
1. Many corporations with multiple properties set up a separate LLC for each building. There is no straightforward way to trace a child corporation to its parent corporation. This obfuscates the portfolio size of each company.

In [123]:
import pandas as pd
import numpy as np
import requests
from fuzzywuzzy import fuzz
import json
import os

1. Find business record
1. Find business' governors
1. Find other businesses with those governors
1. Match the businesses

In [13]:
# Step 1: find the business record.
# A name search comes first because its results carry the BusinessID that the
# detail lookup needs.

# Endpoint that the business-name search payload is POSTed to.
search_for_business_url = 'https://cfda.sos.wa.gov/api/BusinessSearch/GetBusinessSearchList'

def get_business_search_payload(business_name):
    """Build the form fields for a "Contains" search on a business name.

    The name is sent both as ``SearchEntityName`` and ``SearchValue``. Results
    are requested sorted ascending by entity name, first page, 25 per page.
    """
    payload = {'Type': 'BusinessName', 'SearchType': 'BusinessName'}
    payload['SearchEntityName'] = business_name
    payload.update({'SortType': 'ASC', 'SortBy': 'Entity Name'})
    payload['SearchValue'] = business_name
    payload.update({
        'SearchCriteria': 'Contains',
        'IsSearch': 'true',
        'PageID': 1,
        'PageCount': 25,
    })
    return payload

def get_business_search_results(business_name):
    """POST a business-name search and return the decoded JSON response."""
    form_fields = get_business_search_payload(business_name)
    response = requests.post(search_for_business_url, form_fields)
    return json.loads(response.text)

In [14]:
# Sample LLC name used to exercise the search helpers.
test_name = 'TMUD GSL LLC'

# Reuse test_name instead of repeating the literal so the two cannot drift apart.
test_search_req = get_business_search_results(test_name)
test_search_req

[{'IsAvailable': False,
  'PrincipalOffice': {'PrincipalID': 0,
   'SequenceNo': 0,
   'FirstName': None,
   'LastName': None,
   'FullName': None,
   'Title': None,
   'Name': None,
   'MiddleName': None,
   'PhoneNumber': None,
   'EmailAddress': None,
   'TypeID': None,
   'PrincipalBaseType': None,
   'PrincipalMailingAddress': {'Attention': None,
    'NotificationAttention': None,
    'CorrespondenceEmailAddress': None,
    'ConsolidationCorrespondenceEmailAddress': None,
    'ZipExtension': None,
    'AddressEntityType': None,
    'IsAddressSame': False,
    'isUserNonCommercialRegisteredAgent': False,
    'baseEntity': {'FilerID': 0,
     'UserID': 0,
     'CreatedBy': 0,
     'IPAddress': None,
     'ModifiedBy': 0,
     'ModifiedIPAddress': None},
    'IsInvalidState': False,
    'IsAgentInWA': None,
    'isRAStreetAddressValid': False,
    'IsAddressReturnedMail': False,
    'FullAddress': 'BELLEVUE, WA, USA, 4408, 98004',
    'ID': 0,
    'StreetAddress1': 'BELLEVUE',
    'S

In [28]:
def extract_search_results(search_term, search_req_response):
    """Flatten raw search hits into a DataFrame with one row per hit.

    Each row pairs ``search_term`` with the hit's BusinessName, UBINumber and
    BusinessID, plus the FullAddress of its principal office street address.
    """
    column_names = ['SearchTerm', 'BusinessName', 'UBINumber', 'BusinessId', 'Address']
    rows = []
    for hit in search_req_response:
        rows.append([
            search_term,
            hit['BusinessName'],
            hit['UBINumber'],
            hit['BusinessID'],
            hit['PrincipalOffice']['PrincipalStreetAddress']['FullAddress'],
        ])
    return pd.DataFrame(rows, columns=column_names)

# Flatten the sample search response into a DataFrame and display it.
test_search_results_df = extract_search_results('TMUD GSL LLC', test_search_req)
test_search_results_df

Unnamed: 0,SearchTerm,BusinessName,UBINumber,BusinessId,Address
0,TMUD GSL LLC,"TMUD GSL, LLC",604 874 529,1537697,"11010 NE 8TH ST STE 465, BELLEVUE, WA, 98004-4..."
1,TMUD GSL LLC,"TMUD GSL, LLC",604 848 437,1518300,
2,TMUD GSL LLC,"TMUD-IDMGT, LLC",604 848 437,1518300,"11010 NE 8TH ST STE 465, BELLEVUE, WA, 98004-4..."


In [87]:
# Mark row as potential match if: name is similar, UBI number is a duplicate, or Address is the same
def determine_search_matches(search_results_df):
    """Add a boolean ``IsMatch`` column to ``search_results_df`` in place.

    A row is a potential match when any of the following holds:

    * its BusinessName is a close fuzzy match to its SearchTerm
      (``fuzz.ratio`` above 90);
    * it shares a non-blank Address with another row;
    * it shares a UBINumber or BusinessId with another row.

    Returns None; the frame passed in is modified.
    """
    df = search_results_df

    # Built from a plain list so the result is a boolean Series even when the
    # frame is empty (row-wise ``apply`` returns a DataFrame in that case).
    name_match = pd.Series(
        [
            fuzz.ratio(term, name) > 90
            for term, name in zip(df['SearchTerm'], df['BusinessName'])
        ],
        index=df.index,
        dtype=bool,
    )

    # ``duplicated`` treats missing values as equal to each other, so rows
    # without an address would all be flagged as sharing one. Only real
    # addresses may count as a shared address.
    address = df['Address']
    has_address = address.notna() & (address.astype(str).str.strip() != '')
    address_match = has_address & address.where(has_address).duplicated(keep=False)

    meta_match = (
        address_match
        | df.duplicated(subset=['UBINumber'], keep=False)
        | df.duplicated(subset=['BusinessId'], keep=False)
    )
    df['IsMatch'] = name_match | meta_match

In [88]:
# Adds the IsMatch column to the sample frame in place, then displays the frame.
determine_search_matches(test_search_results_df)
test_search_results_df

Unnamed: 0,SearchTerm,BusinessName,UBINumber,BusinessId,Address,IsMatch
0,TMUD GSL LLC,"TMUD GSL, LLC",604 874 529,1537697,"11010 NE 8TH ST STE 465, BELLEVUE, WA, 98004-4...",True
1,TMUD GSL LLC,"TMUD GSL, LLC",604 848 437,1518300,,True
2,TMUD GSL LLC,"TMUD-IDMGT, LLC",604 848 437,1518300,"11010 NE 8TH ST STE 465, BELLEVUE, WA, 98004-4...",True


In [89]:
def get_business_details(business_id):
    """GET the BusinessInformation record for ``business_id`` as decoded JSON."""
    endpoint = 'https://cfda.sos.wa.gov/api/BusinessSearch/BusinessInformation'
    response = requests.get(endpoint + '?businessID={}'.format(business_id))
    return json.loads(response.text)

In [90]:
# Fetch the detail record for BusinessId 1537697 (first row of the sample search results).
dets = get_business_details(1537697)
dets

{'IsAvailable': False,
 'PrincipalOffice': {'PrincipalID': 0,
  'SequenceNo': 0,
  'FirstName': None,
  'LastName': None,
  'FullName': None,
  'Title': None,
  'Name': None,
  'MiddleName': None,
  'PhoneNumber': '',
  'EmailAddress': 'INFO@CHANGLAWGROUP.COM',
  'TypeID': None,
  'PrincipalBaseType': None,
  'PrincipalMailingAddress': {'Attention': None,
   'NotificationAttention': None,
   'CorrespondenceEmailAddress': None,
   'ConsolidationCorrespondenceEmailAddress': None,
   'ZipExtension': None,
   'AddressEntityType': None,
   'IsAddressSame': False,
   'isUserNonCommercialRegisteredAgent': False,
   'baseEntity': {'FilerID': 0,
    'UserID': 0,
    'CreatedBy': 0,
    'IPAddress': None,
    'ModifiedBy': 0,
    'ModifiedIPAddress': None},
   'IsInvalidState': False,
   'IsAgentInWA': None,
   'isRAStreetAddressValid': False,
   'IsAddressReturnedMail': False,
   'FullAddress': '11010 NE 8TH ST STE 465, BELLEVUE, WA, 98004-4408, UNITED STATES',
   'ID': 2668350,
   'StreetAddre

In [103]:
def extract_principals(business_res):
    """Tabulate the principals of a business detail record.

    Args:
        business_res: decoded business detail response; must contain
            ``Agent.EntityName`` and a ``PrincipalsList`` of principal dicts.

    Returns:
        DataFrame with columns Agent, EntityType, PrincipalID, PrincipalName.
        Principals whose TypeID is 'E' are labelled 'Entity' and use their
        ``Name``; all others are labelled 'Individual' and use first and last
        name joined by a space.
    """
    agent = business_res['Agent']['EntityName']
    rows = []
    for principal in business_res['PrincipalsList']:
        is_entity = principal['TypeID'] == 'E'
        if is_entity:
            principal_name = principal['Name']
        else:
            # Name fields come back as None elsewhere in this API's responses;
            # skip missing parts instead of failing on None + str.
            name_parts = (principal['FirstName'], principal['LastName'])
            principal_name = ' '.join(part for part in name_parts if part)
        rows.append([
            agent,
            'Entity' if is_entity else 'Individual',
            principal['PrincipalID'],
            principal_name,
        ])
    return pd.DataFrame(rows, columns=['Agent', 'EntityType', 'PrincipalID', 'PrincipalName'])

# Tabulate the principals listed on the sample business record and display them.
e = extract_principals(dets)
e

Unnamed: 0,Agent,EntityType,PrincipalID,PrincipalName
0,CHANG LAW GROUP PC,Entity,3778468,"TMUD-IDMGT, LLC"


In [None]:
# Sample owner names for exercising the search pipeline. The commas matter:
# adjacent string literals without them are concatenated into a single string.
test_search_term_list = [
    'TMUD GSL LLC',
    'HH SEATTLE LLC',
    'ACORN DEVELOPMENT LLC',
    'BRE-BMR 34TH LLC',
    'MIDTOWN21 LLC',
    'SEATTLE CITY OF',
]

# only give this a list that is unique names and has no "not found" values

def find_potential_matches(search_list):
    """Search every business name in ``search_list`` and flag potential matches.

    For each name, the search results are flattened into a DataFrame and
    annotated in place with an ``IsMatch`` column by
    ``determine_search_matches``.

    Args:
        search_list: iterable of business names to search for.

    Returns:
        One DataFrame holding the annotated results of all searches, re-indexed
        from 0. It is empty (with the expected columns) when ``search_list``
        is empty.
    """
    frames = []
    for business in search_list:
        search_results = get_business_search_results(business)
        search_df = extract_search_results(business, search_results)
        determine_search_matches(search_df)  # adds IsMatch in place
        frames.append(search_df)

    if not frames:
        # pd.concat raises on an empty list, so build the empty frame directly.
        return pd.DataFrame(
            columns=['SearchTerm', 'BusinessName', 'UBINumber', 'BusinessId', 'Address', 'IsMatch']
        )
    return pd.concat(frames, ignore_index=True)

In [120]:
def get_governor_payload(governor_name, page_id):
    """Build the form-encoded body for an advanced search by principal name.

    The body reproduces the field set captured from the site's advanced-search
    form, including its empty ``&&`` separators. Only the principal name and
    the page number vary; the page size is fixed at 100.

    Args:
        governor_name: principal (governor) name to search for. It is
            percent-encoded so spaces, ``&`` and ``=`` cannot corrupt the body.
        page_id: 1-based page number to request.

    Returns:
        The ``application/x-www-form-urlencoded`` request body as a string.
    """
    from urllib.parse import quote

    template = (
        "Type=Principal&BusinessStatusID=0&SearchEntityName=&SearchType=&BusinessTypeID=0&AgentName="
        "&PrincipalName={governor_name}"
        "&StartDateOfIncorporation=&EndDateOfIncorporation=&ExpirationDate="
        "&IsSearch=true&IsShowAdvanceSearch=true&&"
        "&AgentAddress%5BIsAddressSame%5D=false"
        "&AgentAddress%5BIsValidAddress%5D=false"
        "&AgentAddress%5BisUserNonCommercialRegisteredAgent%5D=false"
        "&AgentAddress%5BIsInvalidState%5D=false"
        "&AgentAddress%5BbaseEntity%5D%5BFilerID%5D=0"
        "&AgentAddress%5BbaseEntity%5D%5BUserID%5D=0"
        "&AgentAddress%5BbaseEntity%5D%5BCreatedBy%5D=0&"
        "&AgentAddress%5BbaseEntity%5D%5BModifiedBy%5D=0&"
        "&AgentAddress%5BFullAddress%5D=%2C%20WA%2C%20USA"
        "&AgentAddress%5BID%5D=0&&&"
        "&AgentAddress%5BState%5D=WA&"
        "&AgentAddress%5BCountry%5D=USA&&&&&&&"
        "&PrincipalAddress%5BIsAddressSame%5D=false"
        "&PrincipalAddress%5BIsValidAddress%5D=false"
        "&PrincipalAddress%5BisUserNonCommercialRegisteredAgent%5D=false"
        "&PrincipalAddress%5BIsInvalidState%5D=false"
        "&PrincipalAddress%5BbaseEntity%5D%5BFilerID%5D=0"
        "&PrincipalAddress%5BbaseEntity%5D%5BUserID%5D=0"
        "&PrincipalAddress%5BbaseEntity%5D%5BCreatedBy%5D=0&"
        "&PrincipalAddress%5BbaseEntity%5D%5BModifiedBy%5D=0&"
        "&PrincipalAddress%5BFullAddress%5D=%2C%20WA%2C%20USA"
        "&PrincipalAddress%5BID%5D=0&&&"
        "&PrincipalAddress%5BState%5D=&"
        "&PrincipalAddress%5BCountry%5D=USA&&&&&"
        "&IsHostHomeSearch=&IsPublicBenefitNonProfitSearch=&IsCharitableNonProfitSearch="
        "&IsGrossRevenueNonProfitSearch=&IsHasMembersSearch=&IsHasFEINSearch="
        "&NonProfit%5BIsNonProfitEnabled%5D=false"
        "&NonProfit%5BchkSearchByIsHostHome%5D=false"
        "&NonProfit%5BchkSearchByIsPublicBenefitNonProfit%5D=false"
        "&NonProfit%5BchkSearchByIsCharitableNonProfit%5D=false"
        "&NonProfit%5BchkSearchByIsGrossRevenueNonProfit%5D=false"
        "&NonProfit%5BchkSearchByIsHasMembers%5D=false"
        "&NonProfit%5BchkSearchByIsHasFEIN%5D=false"
        "&NonProfit%5BFEINNoSearch%5D="
        "&NonProfit%5BchkIsHostHome%5D%5Bnone%5D=false"
        "&NonProfit%5BchkIsHostHome%5D%5Byes%5D=false"
        "&NonProfit%5BchkIsHostHome%5D%5Bno%5D=false"
        "&NonProfit%5BchkIsPublicBenefitNonProfit%5D%5Bnone%5D=false"
        "&NonProfit%5BchkIsPublicBenefitNonProfit%5D%5Byes%5D=false"
        "&NonProfit%5BchkIsPublicBenefitNonProfit%5D%5Bno%5D=false"
        "&NonProfit%5BchkIsCharitableNonProfit%5D%5Bnone%5D=false"
        "&NonProfit%5BchkIsCharitableNonProfit%5D%5Byes%5D=false"
        "&NonProfit%5BchkIsCharitableNonProfit%5D%5Bno%5D=false"
        "&NonProfit%5BchkIsGrossRevenueNonProfit%5D%5Bnone%5D=false"
        "&NonProfit%5BchkIsGrossRevenueNonProfit%5D%5Byes%5D=false"
        "&NonProfit%5BchkIsGrossRevenueNonProfit%5D%5Bno%5D=false"
        "&NonProfit%5BchkIsGrossRevenueNonProfit%5D%5Bover500k%5D=false"
        "&NonProfit%5BchkIsGrossRevenueNonProfit%5D%5Bunder500k%5D=false"
        "&NonProfit%5BchkIsHasMembers%5D%5Bnone%5D=false"
        "&NonProfit%5BchkIsHasMembers%5D%5Byes%5D=false"
        "&NonProfit%5BchkIsHasMembers%5D%5Bno%5D=false"
        "&NonProfit%5BchkIsHasFEIN%5D%5Byes%5D=false"
        "&NonProfit%5BchkIsHasFEIN%5D%5Bno%5D=false"
        "&PageID={page_id}&PageCount=100"
    )
    # safe='' also escapes '/', matching the %XX style used by the rest of the body.
    encoded_name = quote(str(governor_name), safe='')
    return template.format(governor_name=encoded_name, page_id=page_id)

In [113]:
# Request headers copied from a browser session against the advanced-search
# endpoint. The captured 'Content-Length' is deliberately left out: the body
# length varies with the principal name and page number, so a fixed value
# would be wrong, and the HTTP client computes the real length itself.
governor_headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'DNT': '1',
    'Host': 'cfda.sos.wa.gov',
    'Origin': 'https://ccfs.sos.wa.gov',
    'Referer': 'https://ccfs.sos.wa.gov/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Google Chrome";v="111", "Not(A:Brand";v="8", "Chromium";v="111"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': "macOS",
}

In [122]:
# Sample advanced search by principal name ('pong'), requesting page 1.
results = requests.post('https://cfda.sos.wa.gov/api/BusinessSearch/GetAdvanceBusinessSearchList', data=get_governor_payload('pong', 1), headers=governor_headers)

# Decode and display the JSON response body.
json.loads(results.text)

[{'IsAvailable': False,
  'PrincipalOffice': {'PrincipalID': 0,
   'SequenceNo': 0,
   'FirstName': None,
   'LastName': None,
   'FullName': None,
   'Title': None,
   'Name': None,
   'MiddleName': None,
   'PhoneNumber': None,
   'EmailAddress': None,
   'TypeID': None,
   'PrincipalBaseType': None,
   'PrincipalMailingAddress': {'Attention': None,
    'NotificationAttention': None,
    'CorrespondenceEmailAddress': None,
    'ConsolidationCorrespondenceEmailAddress': None,
    'ZipExtension': None,
    'AddressEntityType': None,
    'IsAddressSame': False,
    'isUserNonCommercialRegisteredAgent': False,
    'baseEntity': {'FilerID': 0,
     'UserID': 0,
     'CreatedBy': 0,
     'IPAddress': None,
     'ModifiedBy': 0,
     'ModifiedIPAddress': None},
    'IsInvalidState': False,
    'IsAgentInWA': None,
    'isRAStreetAddressValid': False,
    'IsAddressReturnedMail': False,
    'FullAddress': '',
    'ID': 0,
    'StreetAddress1': None,
    'StreetAddress2': None,
    'City': No

32512

In [127]:
def fetch_businesses_from_governor(governor_name):
    """Fetch every advanced-search result for a principal (governor) name.

    Pages are requested in order starting at 1. A page shorter than the page
    size marks the end of the results.

    Args:
        governor_name: principal name to search for.

    Returns:
        List of raw result dicts from all pages, in page order.

    Raises:
        ValueError: if the API answers with something other than a JSON list.
    """
    url = 'https://cfda.sos.wa.gov/api/BusinessSearch/GetAdvanceBusinessSearchList'
    page_size = 100  # must match PageCount in get_governor_payload
    results = []

    page_id = 1
    while True:
        results_page = requests.post(
            url,
            data=get_governor_payload(governor_name, page_id),
            headers=governor_headers,
        )
        page = json.loads(results_page.text)
        if not isinstance(page, list):
            # The API reports errors as a JSON object such as {'Message': ...}.
            raise ValueError('Unexpected governor search response: {!r}'.format(page))
        results += page
        if len(page) < page_size:
            break
        page_id += 1

    return results

def extract_businesses_from_governor_name(governor_name, search_results):
    """Reduce raw governor-search results to id/name records.

    Args:
        governor_name: the name that was searched for.
        search_results: list of raw result dicts, each with an
            ``OnlineReportPrincipalOffice`` dict holding ``BusinessID`` and
            ``BusinessName``.

    Returns:
        List of dicts with keys ``governor_name``, ``business_id`` and
        ``business_name``, one per result that passes the filter.
    """
    businesses = []
    for result in search_results:
        office = result['OnlineReportPrincipalOffice']
        # NOTE(review): this keeps only results whose *business* name equals the
        # governor name, as the original filter did. That looks like it was
        # meant to compare a principal-name field instead; confirm against the
        # API response before relying on it.
        if office['BusinessName'] == governor_name:
            businesses.append({
                'governor_name': governor_name,
                'business_id': office['BusinessID'],
                'business_name': office['BusinessName'],
            })
    return businesses

In [133]:
# Disabled trial call of extract_businesses_from_governor_name:
# businesses_test = extract_businesses_from_governor_name('ALICE SHEN', results)
# businesses_test

# Inspect the first record of the governor search response and show its agent name.
t = results.text
tb = json.loads(t)[0]
tb['AgentName']

'CHANG LAW GROUP PC'

In [None]:
how to filter out values as you go?

- start w/ unique owner names
- then, when grabbing search results, check for a match on ID and name, and skip the search if they already match

can you search by address?--no, alas

In [92]:
# Wrong URL: the BusinessInformation endpoint has no route for a principalID
# query, as the error response shown below reports.

# def get_principal_details(principal_id):
#     url = 'https://cfda.sos.wa.gov/api/BusinessSearch/BusinessInformation?principalID={principal_id}'.format(principal_id=principal_id)
#     r = requests.get(url)
#     return json.loads(r.text)

# l = get_principal_details(3778468)
# NOTE(review): `l` is only defined if the commented-out call above was run
# earlier in the session; on a fresh kernel this line raises NameError.
l

{'Message': "No HTTP resource was found that matches the request URI 'https://ccfsinternal.sos.wa.gov:444/api/BusinessSearch/BusinessInformation?principalID=3778468'."}