# Group by all_matches_principals instead of a second lookup
We're looking for overlap of governors among the companies we already have listed. We don't care about other companies (e.g., ones that could be located in Spokane, not Seattle), so we don't need a second lookup against the corps and charities database. Instead, we can make clusters based on the contents of all_matches_principals.csv, which has one row for each governor of each company in our list.

In [1]:
import pandas as pd
import numpy as np
import requests
import json
import os
import re
import geopandas as gp
import urllib.parse

In [13]:
# Load the per-principal match table, using the first CSV column as the row index.
df = pd.read_csv("all_matches_principals.csv", index_col=[0])
# Give the index an explicit name so it is labelled when displayed.
df.index.name = "index"
# Preview the first rows.
df.head()

Unnamed: 0_level_0,SearchTerm,BusinessName,UBINumber,BusinessId,Address,Status,address_match,ubi_match,id_match,isMatch,Agent,EntityType,PrincipalID,PrincipalName
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,KAR STANDARD LLC,KAR STANDARD LLC,604 145 518,763141,"1 FEDERAL ST FL 17, BOSTON, MA, 02110-2003, UN...",Active,False,False,False,1.0,CORPORATION SERVICE COMPANY,Entity,1570323.0,KAONOULU RANCH LLLP
1,300 FIFTH AVENUE LLC,300 FIFTH AVENUE LLC,604 670 512,1399595,"1000 2ND AVE STE 1800, SEATTLE, WA, 98104-3619...",Active,False,False,False,1.0,,Individual,3315049.0,JOHN M. GREELEY
1,300 FIFTH AVENUE LLC,300 FIFTH AVENUE LLC,604 670 512,1399595,"1000 2ND AVE STE 1800, SEATTLE, WA, 98104-3619...",Active,False,False,False,1.0,,Individual,3349317.0,MARTIN SELIG
2,MSI - 1ST & KING LLC,MSI - 1ST & KING LLC,602 739 680,880019,"316 OCCIDENTAL AVE S, STE 300, SEATTLE, WA, 98...",Active,False,False,False,1.0,FIKSO KRETSCHMER SMITH DIXON ORMSETH PS,Individual,2114613.0,"H MARTIN SMITH, III"
3,BRICKMAN PACIFIC LLC,BRICKMAN PACIFIC LLC,603 445 367,74763,"C/O BRICKMAN, ONCE GREENWICH OFFICE PARK, BUIL...",Active,False,False,False,1.0,C T CORPORATION SYSTEM,Entity,3617104.0,BRICKMAN FUND VI REIT INC.


In [43]:
# Count how many rows each BusinessName has in the table.
df.value_counts(subset=["BusinessName"])

BusinessName                                
BPP 800 FIFTH PROPERTY OWNER LLC                82
BPP 999 THIRD AVENUE OWNER LLC                  81
BPP 1420 FIFTH AVENUE OWNER LLC                 78
EXPEDITORS INTERNATIONAL OF WASHINGTON, INC.    52
TMT OLIVE LAB, INC.                             18
                                                ..
HINES NORTON SEATTLE LLC                         1
HOWARD BUILDING SEATTLE, LLC                     1
HUDSON 1099 STEWART STREET, LLC                  1
HUDSON 1918 EIGHTH AVENUE, LLC                   1
WRI 2200 WESTLAKE LP                             1
Length: 235, dtype: int64

In [107]:
def get_matching_companies_from_principal(principal_match_list, principal_name):
    """Return every row of the companies that list ``principal_name`` as a principal.

    Args:
        principal_match_list: DataFrame with at least the ``PrincipalName``
            and ``BusinessId`` columns.
        principal_name: Value compared for equality against ``PrincipalName``.

    Returns:
        The subset of ``principal_match_list`` whose ``BusinessId`` belongs to
        a company with at least one row naming ``principal_name``. All rows of
        those companies are returned, not only the rows for this principal.
    """
    # Step 1: which rows name this principal?
    names_principal = principal_match_list['PrincipalName'] == principal_name
    # Step 2: the distinct companies those rows belong to.
    shared_business_ids = principal_match_list.loc[names_principal, 'BusinessId'].unique()
    # Step 3: every row (i.e. every principal) of those companies.
    belongs_to_shared = principal_match_list["BusinessId"].isin(shared_business_ids)
    return principal_match_list[belongs_to_shared]


In [116]:
def _related_company_records(principal_match_list, principal_name):
    """Return one record per company (grouped by BusinessName) sharing a principal.

    Each record is a tuple of the first row's BusinessName, UBINumber,
    BusinessId, Address, Status and Agent, followed by the sorted list of all
    PrincipalName values found for that company.
    """
    matches = get_matching_companies_from_principal(principal_match_list, principal_name)
    records = []
    # groupby iterates the groups in sorted BusinessName order.
    for _, group in matches.groupby("BusinessName"):
        first = group.iloc[0]
        records.append((
            first["BusinessName"],
            first["UBINumber"],
            first["BusinessId"],
            first["Address"],
            first["Status"],
            first["Agent"],
            sorted(group["PrincipalName"].tolist()),
        ))
    return records


def group_companies_by_principal(principal_match_list):
    """Build a table of companies related to each other through shared principals.

    For every row of ``principal_match_list`` the companies that share that
    row's ``PrincipalName`` are looked up, and one output row is produced per
    related company. Output rows are unique per ``UBINumber``: when the same
    UBINumber is produced again, the newer row replaces the older one. Rows are
    ordered newest first.

    Args:
        principal_match_list: DataFrame with the columns ``SearchTerm``,
            ``BusinessName``, ``UBINumber``, ``BusinessId``, ``Address``,
            ``Status``, ``Agent`` and ``PrincipalName``.

    Returns:
        An object-dtype DataFrame with the columns ``SearchTerm``,
        ``BusinessName``, ``PotentialRelatedCompany``, ``UBINumber``,
        ``BusinessId``, ``Address``, ``Status``, ``Agent``, ``Principals``,
        ``isMatch`` and ``notes``. ``isMatch`` and ``notes`` are empty strings.
        An empty input yields an empty frame that still has these columns.

    Side effects:
        Writes the current results to
        ``companies_and_potential_matches_no_lookup.csv`` every 25 input rows
        and once more with the final results before returning.
    """
    columns=['SearchTerm', 'BusinessName', 'PotentialRelatedCompany', 'UBINumber', 'BusinessId', 'Address', 'Status', 'Agent', 'Principals', 'isMatch', 'notes']
    output_path = "companies_and_potential_matches_no_lookup.csv"

    # The lookup depends only on the principal name, so do it once per name.
    records_by_principal = {}
    # UBINumber -> newest output row. Dicts keep insertion order, so popping a
    # key before re-inserting it moves that UBINumber to the "newest" end.
    latest_by_ubi = {}

    def build_results():
        newest_first = list(latest_by_ubi.values())[::-1]
        # Pass the names as ``columns=``; the second positional argument of
        # pd.DataFrame is ``index``, not ``columns``.
        return pd.DataFrame(newest_first, columns=columns, dtype=object)

    for position, (_, row) in enumerate(principal_match_list.iterrows()):
        principal = row['PrincipalName']
        if pd.isna(principal):
            # A missing name never compares equal to anything, so it has no
            # related companies.
            related = ()
        else:
            if principal not in records_by_principal:
                records_by_principal[principal] = _related_company_records(
                    principal_match_list, principal
                )
            related = records_by_principal[principal]

        for name, ubi, business_id, address, status, agent, principals in related:
            # Treat all missing UBINumbers as one key.
            key = None if pd.isna(ubi) else ubi
            latest_by_ubi.pop(key, None)
            # row['BusinessName']: the name we mapped to from SearchTerm
            # name: the PotentialRelatedCompany
            latest_by_ubi[key] = [
                row['SearchTerm'],
                row["BusinessName"],
                name,
                ubi,
                business_id,
                address,
                status,
                agent,
                list(principals),  # copy so cached records are never shared
                "",  # isMatch
                "",  # notes
            ]

        # Checkpoint by row position so it works for any index labels.
        if position % 25 == 0:
            build_results().to_csv(output_path)

    results = build_results()
    # Always leave the complete result on disk, not only the last checkpoint.
    results.to_csv(output_path)
    return results

In [117]:
# Build the related-company table from the principal matches loaded above.
df_clusters = group_companies_by_principal(df)

In [118]:
# Display the resulting table.
df_clusters

Unnamed: 0,SearchTerm,BusinessName,PotentialRelatedCompany,UBINumber,BusinessId,Address,Status,Agent,Principals,isMatch,notes
0,COLLINS BUILDING LLC,"COLLINS FINE HOMEBUILDING, LLC","COLLINS FINE HOMEBUILDING, LLC",603 528 051,1105761,"1023 S THORP HWY, ELLENSBURG, WA, 98926-8006, ...",Active,LUCAS COLLINS,"[LUCAS COLLINS, LUCAS COLLINS]",,
1,HARTFORD BUILDING LLC,"HARTFORD BUILDING, LLC","WASHINGTON SHOE BUILDING, LLC",602 647 733,1054283,"208 JAMES ST #C, SEATTLE, WA, 98104, UNITED ST...",Active,LINDA SULLIVAN,"[ANNE DEVOE LAWLER, SAMIS LAND CO]",,
2,HARTFORD BUILDING LLC,"HARTFORD BUILDING, LLC","HARTFORD BUILDING, LLC",604 990 747,1609954,"208 JAMES ST, SEATTLE, WA, 98104-2220, UNITED ...",Active,SAMIS LAND CO,"[JEFFREY C STEINERT, JEFFREY C STEINERT, SAMIS...",,
3,HARTFORD BUILDING LLC,"HARTFORD BUILDING, LLC","COLLINS BUILDING, LLC",604 976 252,1601986,"208 JAMES ST, SEATTLE, WA, 98104-2220, UNITED ...",Active,SAMIS FOUNDATION,"[JEFFREY C STEINERT, SAMIS FOUNDATION]",,
4,8TH & PINE JOINT VENTURE,8TH & PINE JOINT VENTURE LLC,8TH & PINE JOINT VENTURE LLC,603 459 107,1092741,"10400 NE 4TH ST, #2225, BELLEVUE, WA, 98004-51...",Active,CORPORATION SERVICE COMPANY,[8TH & PINE FINANCIAL LIMITED LIABILITY COMPANY],,
...,...,...,...,...,...,...,...,...,...,...,...
366,300 FIFTH AVENUE LLC,300 FIFTH AVENUE LLC,SREH 2014 LLC,603 391 691,1017673,"1000 2ND AVE STE 1800, SEATTLE, WA, 98104-3619...",Active,,[MARTIN SELIG],,
367,300 FIFTH AVENUE LLC,300 FIFTH AVENUE LLC,SELIG HOLDINGS COMPANY L.L.C.,602 559 881,928036,"1000 2ND AVE STE 1800, SEATTLE, WA, 98104-3619...",Active,,[MARTIN SELIG],,
368,300 FIFTH AVENUE LLC,300 FIFTH AVENUE LLC,88 VINE LLC,604 638 328,1374207,"1000 2ND AVE STE 1800, SEATTLE, WA, 98104-3619...",Active,,"[JOHN M GREELEY, MARTIN SELIG]",,
369,300 FIFTH AVENUE LLC,300 FIFTH AVENUE LLC,300 FIFTH AVENUE LLC,604 670 512,1399595,"1000 2ND AVE STE 1800, SEATTLE, WA, 98104-3619...",Active,,"[JOHN M. GREELEY, MARTIN SELIG]",,
