# Automate Potential Matches
A lot of the time, CCFS database won't come back with an exact match from our search result. Results may instead vary simply by punctuation, which can still be easily automated to safe time with human verification. 

In [2]:
import pandas as pd
import numpy as np
import requests
import json
import os
import re
import geopandas as gp
import urllib.parse

In [33]:
df_pot = pd.read_csv("non-downtown buildings\potential_matches_5.csv", index_col=[0])

In [34]:
df_pot.head()

Unnamed: 0,SearchTerm,BusinessName,UBINumber,BusinessId,Address,Status,address_match,ubi_match,id_match
0,FTW LLC,"-LDL- SOFTWARE, INC.",601 763 820,1425,"5129 NE 4TH CT, RENTON, WA, 98059-4566, UNITED...",Active,False,False,False
1,FTW LLC,"18009 DRIFTWOOD, LLC",604 534 883,1323128,"16816 24TH STREET CT E, LAKE TAPPS, WA, 98391-...",Active,False,False,False
2,FTW LLC,"1915 SOFTWARE, LLC",603 566 774,62799,"811 1ST AVE #480, SEATTLE, WA, 98104, UNITED S...",Administratively Dissolved,False,False,False
3,FTW LLC,205 SOFTWARE LLC,604 536 699,1326449,"9602 NE 63RD ST, VANCOUVER, WA, 98662, UNITED ...",Active,False,False,False
4,FTW LLC,"21ST CENTURY SOFTWARE TECHNOLOGIES, INC.",603 153 248,6769,"9623 173RD PL NE, REDMOND, WA, 98052, UNITED S...",Terminated,False,False,False


In [35]:
# standardize SearchTerm and BusinessName to see if they are matching
def standardize_result(row):
    # LLC, LLP, L L C, L.L.C., L.L.C. L.L.P., L.L.P
    # Limited Partnership, Limited liability company
    # Comma before any of the above
    # Just map all the results to be standardized to this name, then drop duplicates based on name? 
    p = re.compile("L[\s.]?L[\s,.]?[PC]" ,flags=re.IGNORECASE)

    row['BusinessName']=row["BusinessName"].replace(",", "")
    row['BusinessName']= re.sub(p, "LLC", row['BusinessName'])
    row['BusinessName']=row["BusinessName"].replace("LIMITED LIABILITY COMPANY", "LLC") 
    row['BusinessName']=row["BusinessName"].replace("LIMITED PARTNERSHIP", "LLC") 

    # Do the same for the search term, so that we have more exact matches
    # TODO: need to add this as an element of the dict
    row["SearchTerm"]=row["SearchTerm"].replace(",", "")
    row["SearchTerm"]=re.sub(p, "LLC", row["SearchTerm"])
    row["SearchTerm"]=row["SearchTerm"].replace("LIMITED PARTNERSHIP", "LLC") 
    row["SearchTerm"]=row["SearchTerm"].replace("LIMITED LIABILITY COMPANY", "LLC") 
    
    # Strip addressses of all commas
    return row

In [36]:
df_pot = df_pot.apply(lambda row: standardize_result(row), axis=1)

In [38]:
df_pot['isMatch'] = df_pot[df_pot['SearchTerm']==df_pot['BusinessName']]

ValueError: Cannot set a DataFrame with multiple columns to the single column isMatch

In [40]:
df_pot[df_pot['SearchTerm']==df_pot['BusinessName']]['isMatch']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pot[df_pot['SearchTerm']==df_pot['BusinessName']]['isMatch'] = 1


In [55]:
df_pot.loc[df_pot['SearchTerm']==df_pot['BusinessName'], 'isMatch']=1
print("Matches with regex: ", len(df_pot[df_pot['SearchTerm']==df_pot['BusinessName']]))
print("Total unique search terms: ", len(df_pot['SearchTerm'].unique()))

Matches with regex:  55
Total unique search terms:  95


In [62]:
matched_values = df_pot[df_pot['SearchTerm']==df_pot['BusinessName']]['SearchTerm'].values


In [64]:
df_pot.drop()

array(['FTW LLC', 'QUAIL PARK OF WEST SEATTLE LLC', 'SABRA PARK WEST LLC',
       'DNXT LLC', 'ACI REAL ESTATE SPE 137 LLC', 'NEWARK MEADOWS LLC',
       'ROOSEVELT SQUARE REGENCY LLC', '68TH AND ROOSEVELT LLC',
       'SRMAHULTRACK66 LLC', 'SRMAHULBROOKLYN65 LLC',
       'GREEN LAKE PHASE II LLC', 'ROOSTER APARTMENTS LLC',
       'SRMAHULLUNA LLC', 'LANGER PROPERTIES LLC',
       'CASCADIA PARTNERS LLC', 'SOB DEVELOPMENT LLC',
       'HIVE APARTMENTS LLC', 'JASPER 91 LLC', 'RAVENNA HOUSE 2015 LLC',
       'CRYSTAL COURT APARTMENTS LLC', 'TRINITY LAKEVIEW COURT LLC',
       'NORTHGATE ATHLETIC CLUB LLC',
       'NORTH SEATTLE PROFESSIONAL CENTER LLC', 'CSH MAPLE LEAF LLC',
       'IDA CULVER HOUSE RAVENNA LLC', 'WILLOW CREEK PROPERTIES LLC',
       'BRYSON PROPERTIES LLC', 'NORTHGATE LODGING LLLC',
       'EPIC PROPERTY MANAGEMENT LLC', 'WC SEATTLE VI SPE LLC',
       '1608 EAST REPUBLICAN LLC', 'MADISON GATE INVESTORS 2016 LLC',
       'PARKSIDE LLC', 'RUTH COURT APARTMENTS LLC', 'RM 