# Automate Potential Matches
A lot of the time, CCFS database won't come back with an exact match from our search result. Results may instead vary simply by punctuation, which can still be easily automated to safe time with human verification.

NOTE: In `LookupCompaniesHelper` in `owners.py`, extracting exact matches with regex patterns has been added to the process. This notebook was used on old versions of potential_matches_* that didn't extract exact matches with regex matching. In the future, you can just use the helper functions and not worry about doing this by hand. 

In [37]:
import pandas as pd
import numpy as np
import requests
import json
import os
import re
import geopandas as gp
import urllib.parse

In [38]:
df = pd.read_csv("non-downtown buildings\potential_matches_5.csv", index_col=[0]).reset_index()

In [39]:
df.head()

Unnamed: 0,index,SearchTerm,BusinessName,UBINumber,BusinessId,Address,Status,address_match,ubi_match,id_match
0,0,FTW LLC,"-LDL- SOFTWARE, INC.",601 763 820,1425,"5129 NE 4TH CT, RENTON, WA, 98059-4566, UNITED...",Active,False,False,False
1,1,FTW LLC,"18009 DRIFTWOOD, LLC",604 534 883,1323128,"16816 24TH STREET CT E, LAKE TAPPS, WA, 98391-...",Active,False,False,False
2,2,FTW LLC,"1915 SOFTWARE, LLC",603 566 774,62799,"811 1ST AVE #480, SEATTLE, WA, 98104, UNITED S...",Administratively Dissolved,False,False,False
3,3,FTW LLC,205 SOFTWARE LLC,604 536 699,1326449,"9602 NE 63RD ST, VANCOUVER, WA, 98662, UNITED ...",Active,False,False,False
4,4,FTW LLC,"21ST CENTURY SOFTWARE TECHNOLOGIES, INC.",603 153 248,6769,"9623 173RD PL NE, REDMOND, WA, 98052, UNITED S...",Terminated,False,False,False


In [40]:
# standardize SearchTerm and BusinessName to see if they are matching
def is_exact_match(row):
    search = row["SearchTerm"]
    result = row["BusinessName"]

    p = re.compile("L[\s.]?L[\s,.]?[PC][.]" ,flags=re.IGNORECASE)
    result=result.replace(",", "")
    result= re.sub(p, "LLC", result)
    result=result.replace("LIMITED LIABILITY COMPANY", "LLC") 
    result=result.replace("LIMITED PARTNERSHIP", "LLC") 

    # Do the same for the search term, so that we have more exact matches
    search=search.replace(",", "")
    search=re.sub(p, "LLC", search)
    search=search.replace("LIMITED PARTNERSHIP", "LLC") 
    search=search.replace("LIMITED LIABILITY COMPANY", "LLC")
    return search == result

In [41]:
df_exact = df[df.apply(lambda row: is_exact_match(row), axis=1)]

In [42]:
df_exact.head()

Unnamed: 0,index,SearchTerm,BusinessName,UBINumber,BusinessId,Address,Status,address_match,ubi_match,id_match
966,966,FTW LLC,"FTW, LLC",602 088 226,128023,"1037 NE 65TH ST # 339, SEATTLE, WA, 98115-6655...",Active,False,False,False
2806,2806,QUAIL PARK OF WEST SEATTLE LLC,"QUAIL PARK OF WEST SEATTLE, LLC",603 348 245,952757,"146 N CANAL ST STE 350, SEATTLE, WA, 98103, UN...",Active,False,False,False
2807,2807,SABRA PARK WEST LLC,"SABRA PARK WEST, LLC",604 169 651,988232,"C/O SABRA HEALTH CARE REIT, INC., 18500 VON KA...",Active,False,False,False
2814,2814,DNXT LLC,"DNXT, LLC",604 219 279,1181278,"17735 NE 65TH ST STE 120, REDMOND, WA, 98052-4...",Active,True,False,False
2816,2816,ACI REAL ESTATE SPE 137 LLC,"ACI REAL ESTATE SPE 137, LLC",604 621 842,1364724,"250 E PARKCENTER BLVD, BOISE, ID, 83706-3940, ...",Active,False,False,False


In [43]:
df.loc[df['SearchTerm']==df['BusinessName'], 'isMatch']=1
print("Matches with regex: ", len(df_exact))
print("Total unique search terms: ", len(df['SearchTerm'].unique()))

Matches with regex:  56
Total unique search terms:  95


In [26]:
matched_values = df[df['SearchTerm']==df['BusinessName']]['SearchTerm'].values
df_pot_remains = df[~df['SearchTerm'].isin(matched_values)]

In [80]:
df_extracted_exact = pd.DataFrame([], columns=df.columns)
for i in range(5,10):
    print(f"Extracting standardized from potential_matches_{i}")
    df_pot = pd.read_csv(f"non-downtown buildings\potential_matches_{i}.csv", index_col=[0]).reset_index()
    df_exact = df_pot[df_pot.apply(lambda row: is_exact_match(row), axis=1)]
    matched_values = df_exact['SearchTerm'].unique()

    print("Matches with regex: ", len(df_exact))
    print("Total unique search terms: ", len(df_pot['SearchTerm'].unique()))    

    df_extracted_exact = pd.concat([df_extracted_exact, df_exact])
    # print(len(df_extracted_exact))

    df_pot_remains = df_pot[~df_pot['SearchTerm'].isin(matched_values)]
    df_pot_remains = df_pot_remains.set_index("index").reset_index(drop=True)
    df_pot_remains.to_csv(f"non-downtown buildings\potential_matches_{i}.csv", index=True)

Extracting standardized from potential_matches_5
Matches with regex:  56
Total unique search terms:  95
Extracting standardized from potential_matches_6
Matches with regex:  49
Total unique search terms:  101
Extracting standardized from potential_matches_7
Matches with regex:  41
Total unique search terms:  83
Extracting standardized from potential_matches_8


  df_extracted_exact = pd.concat([df_extracted_exact, df_exact])
  df_extracted_exact = pd.concat([df_extracted_exact, df_exact])


Matches with regex:  67
Total unique search terms:  109
Extracting standardized from potential_matches_9
Matches with regex:  60
Total unique search terms:  106


  df_extracted_exact = pd.concat([df_extracted_exact, df_exact])
  df_extracted_exact = pd.concat([df_extracted_exact, df_exact])


In [64]:
print(len(df_extracted_exact))
df_extracted_exact.to_csv("non-downtown buildings/exact_matches_10.csv", index=False)

273


In [75]:
df_extracted_exact.set_index('index').reset_index(drop=True).to_csv("non-downtown buildings/exact_matches_10.csv", index=False)