# Automate Potential Matches
A lot of the time, CCFS database won't come back with an exact match from our search result. Results may instead vary simply by punctuation, which can still be easily automated to safe time with human verification. 

In [20]:
import pandas as pd
import numpy as np
import requests
import json
import os
import re
import geopandas as gp
import urllib.parse

In [21]:
df = pd.read_csv("non-downtown buildings\potential_matches_5.csv", index_col=[0])

In [22]:
df.head()

Unnamed: 0,SearchTerm,BusinessName,UBINumber,BusinessId,Address,Status,address_match,ubi_match,id_match
0,FTW LLC,"-LDL- SOFTWARE, INC.",601 763 820,1425,"5129 NE 4TH CT, RENTON, WA, 98059-4566, UNITED...",Active,False,False,False
1,FTW LLC,"18009 DRIFTWOOD, LLC",604 534 883,1323128,"16816 24TH STREET CT E, LAKE TAPPS, WA, 98391-...",Active,False,False,False
2,FTW LLC,"1915 SOFTWARE, LLC",603 566 774,62799,"811 1ST AVE #480, SEATTLE, WA, 98104, UNITED S...",Administratively Dissolved,False,False,False
3,FTW LLC,205 SOFTWARE LLC,604 536 699,1326449,"9602 NE 63RD ST, VANCOUVER, WA, 98662, UNITED ...",Active,False,False,False
4,FTW LLC,"21ST CENTURY SOFTWARE TECHNOLOGIES, INC.",603 153 248,6769,"9623 173RD PL NE, REDMOND, WA, 98052, UNITED S...",Terminated,False,False,False


In [23]:
# standardize SearchTerm and BusinessName to see if they are matching
def standardize_result(row):
    # examples: LLC, LLP, L L C, L.L.C., L.L.C. L.L.P., L.L.P, LLC.
    # Limited Partnership, Limited liability company
    # Comma before any of the above
    # Just map all the results to be standardized to this name, then drop duplicates based on name? 
    p = re.compile("L[\s.]?L[\s,.]?[PC][.]" ,flags=re.IGNORECASE)

    row['BusinessName']=row["BusinessName"].replace(",", "")
    row['BusinessName']= re.sub(p, "LLC", row['BusinessName'])
    row['BusinessName']=row["BusinessName"].replace("LIMITED LIABILITY COMPANY", "LLC") 
    row['BusinessName']=row["BusinessName"].replace("LIMITED PARTNERSHIP", "LLC") 

    # Do the same for the search term, so that we have more exact matches
    # row["SearchTerm"]=row["SearchTerm"].replace(",", "")
    # row["SearchTerm"]=re.sub(p, "LLC", row["SearchTerm"])
    # row["SearchTerm"]=row["SearchTerm"].replace("LIMITED PARTNERSHIP", "LLC") 
    # row["SearchTerm"]=row["SearchTerm"].replace("LIMITED LIABILITY COMPANY", "LLC") 
    
    return row

In [24]:
df = df.apply(lambda row: standardize_result(row), axis=1)

In [25]:
df.loc[df['SearchTerm']==df['BusinessName'], 'isMatch']=1
print("Matches with regex: ", len(df[df['SearchTerm']==df['BusinessName']]))
print("Total unique search terms: ", len(df['SearchTerm'].unique()))

Matches with regex:  56
Total unique search terms:  95


In [26]:
matched_values = df[df['SearchTerm']==df['BusinessName']]['SearchTerm'].values


In [27]:
df_pot_remains = df[~df['SearchTerm'].isin(matched_values)]

In [28]:
len(df[df['isMatch']==1])

56

In [29]:
df_extracted_exact = pd.DataFrame([], columns=df.columns)
for i in range(5,10):
    print(f"Extracting standardized from potential_matches_{i}")
    df_pot = pd.read_csv(f"non-downtown buildings\potential_matches_{i}.csv", index_col=[0])
    df_pot = df_pot.apply(lambda row: standardize_result(row), axis=1)
    df_pot.loc[df_pot['SearchTerm']==df_pot['BusinessName'], 'isMatch']=1

    print("Matches with regex: ", len(df_pot[df_pot['SearchTerm']==df_pot['BusinessName']]))
    print("Total unique search terms: ", len(df_pot['SearchTerm'].unique()))    

    df_extracted_exact = pd.concat([df_extracted_exact, df_pot[df_pot['isMatch']==1]])
    print(len(df_extracted_exact))

    df_pot_remains = df_pot[~df_pot['SearchTerm'].isin(matched_values)]
    df_pot_remains.to_csv(f"non-downtown buildings\potential_matches_{i}_reduced.csv", index=False)

Extracting standardized from potential_matches_5
Matches with regex:  56
Total unique search terms:  95
56
Extracting standardized from potential_matches_6
Matches with regex:  48
Total unique search terms:  101
104
Extracting standardized from potential_matches_7
Matches with regex:  41
Total unique search terms:  83
145
Extracting standardized from potential_matches_8


  df_extracted_exact = pd.concat([df_extracted_exact, df_pot[df_pot['isMatch']==1]])
  df_extracted_exact = pd.concat([df_extracted_exact, df_pot[df_pot['isMatch']==1]])


Matches with regex:  66
Total unique search terms:  109
211
Extracting standardized from potential_matches_9


  df_extracted_exact = pd.concat([df_extracted_exact, df_pot[df_pot['isMatch']==1]])


Matches with regex:  60
Total unique search terms:  106
271


  df_extracted_exact = pd.concat([df_extracted_exact, df_pot[df_pot['isMatch']==1]])


In [30]:
print(len(df_extracted_exact))
df_extracted_exact.to_csv("non-downtown buildings/exact_matches_10.csv", index=False)

271
