# Address Matching

# 1. Introduction

## 1.1 String Match

### Brute Force Approach

In [None]:
#Brute Force Approach
def brute_force_search(text, pattern):
    #Returns list of all starting indices where pattern is found
    n, m = len(text), len(pattern)
    matches = []
    
    for i in range(n - m + 1):
        j = 0
        while j < m and text[i + j] == pattern[j]:
            j += 1
        if j == m:
            matches.append(i)
    
    return matches

### Rabin-Karp Algorithm

In [None]:
#Rabin-Karp Algorithm
def rabin_karp_search(text, pattern, d=256, q=101):

    #d = number of characters in alphabet
    #q = prime number for modulo'

    n, m = len(text), len(pattern)
    matches = []
    
    if m > n:
        return matches
    
    h = pow(d, m - 1, q)  # d^(m-1) % q
    p_hash = 0  # pattern hash
    t_hash = 0  # text window hash
    
    # Calculate initial hash values
    for i in range(m):
        p_hash = (d * p_hash + ord(pattern[i])) % q
        t_hash = (d * t_hash + ord(text[i])) % q
    
    # Slide pattern over text
    for i in range(n - m + 1):
        # Check if hash matches
        if p_hash == t_hash:
            # Verify character by character (spurious hit check)
            if text[i:i + m] == pattern:
                matches.append(i)
        
        # Calculate hash for next window
        if i < n - m:
            t_hash = (d * (t_hash - ord(text[i]) * h) + ord(text[i + m])) % q
            if t_hash < 0:
                t_hash += q
    
    return matches

### Knuth-Morris-Pratt (KMP) Algorithm

In [None]:
def kmp_search(text, pattern):
    
    #Prefix function to avoid redundant comparisons

    def compute_lps(pattern):
        """Compute Longest Prefix Suffix array"""
        m = len(pattern)
        lps = [0] * m
        length = 0
        i = 1
        
        while i < m:
            if pattern[i] == pattern[length]:
                length += 1
                lps[i] = length
                i += 1
            else:
                if length != 0:
                    length = lps[length - 1]
                else:
                    lps[i] = 0
                    i += 1
        return lps
    
    n, m = len(text), len(pattern)
    matches = []
    
    if m == 0 or m > n:
        return matches
    
    lps = compute_lps(pattern)
    i = j = 0
    
    while i < n:
        if pattern[j] == text[i]:
            i += 1
            j += 1
        
        if j == m:
            matches.append(i - j)
            j = lps[j - 1]
        elif i < n and pattern[j] != text[i]:
            if j != 0:
                j = lps[j - 1]
            else:
                i += 1
    
    return matches

### Boyer-Moore Algorithm

In [None]:
def boyer_moore_search(text, pattern):
    
    #Uses bad character heuristic

    def bad_char_heuristic(pattern):
        #Preprocessing for bad character heuristic
        m = len(pattern)
        bad_char = {}
        
        for i in range(m):
            bad_char[pattern[i]] = i
        
        return bad_char
    
    n, m = len(text), len(pattern)
    matches = []
    
    if m > n:
        return matches
    
    bad_char = bad_char_heuristic(pattern)
    s = 0  # shift of pattern
    
    while s <= n - m:
        j = m - 1
        
        # Reduce j while characters match
        while j >= 0 and pattern[j] == text[s + j]:
            j -= 1
        
        if j < 0:
            matches.append(s)
            # Shift pattern so next character aligns
            s += (m - bad_char.get(text[s + m], -1) if s + m < n else 1)
        else:
            # Shift pattern to align bad character
            s += max(1, j - bad_char.get(text[s + j], -1))
    
    return matches

### Apostolico-Giancarlo Algorithm

In [None]:
def apostolico_giancarlo_search(text, pattern):

    #Combines KMP and Boyer-Moore approaches
    def compute_lps(pattern):
        #Compute LPS array (KMP preprocessing)
        m = len(pattern)
        lps = [0] * m
        length = 0
        i = 1
        
        while i < m:
            if pattern[i] == pattern[length]:
                length += 1
                lps[i] = length
                i += 1
            else:
                if length != 0:
                    length = lps[length - 1]
                else:
                    lps[i] = 0
                    i += 1
        return lps
    
    def bad_char_heuristic(pattern):
        #Bad character preprocessing
        m = len(pattern)
        bad_char = {}
        for i in range(m):
            bad_char[pattern[i]] = i
        return bad_char
    
    n, m = len(text), len(pattern)
    matches = []
    
    if m > n:
        return matches
    
    # Preprocessing
    lps = compute_lps(pattern)
    bad_char = bad_char_heuristic(pattern)
    skip = [0] * n  # Skip array for text
    
    s = 0  # shift
    
    while s <= n - m:
        j = m - 1
        k = 0  # characters matched from previous occurrence
        
        # Use skip information if available
        if s > 0 and skip[s + m - 1] > 0:
            k = min(skip[s + m - 1], m - 1)
            j = m - 1 - k
        
        # Match from right to left
        while j >= 0 and pattern[j] == text[s + j]:
            j -= 1
        
        if j < 0:
            matches.append(s)
            # Store skip information
            if s + m < n:
                skip[s + m] = m
            s += (m - lps[m - 1] if m > 1 else 1)
        else:
            # Store partial match information
            if s + m < n:
                skip[s + m - 1] = m - 1 - j
            
            # Shift using bad character rule
            shift = max(1, j - bad_char.get(text[s + j], -1))
            s += shift
    
    return matches

### Example

In [None]:
# Test pattern searching
text = "ABABDABACDABABCABAB"
pattern = "ABABCABAB"

print(f"Text: {text}")
print(f"Pattern: {pattern}")
print()

print("Brute Force:")
result = brute_force_search(text, pattern)
print(f"Found at indices: {result}")
print()

print("Rabin-Karp:")
result = rabin_karp_search(text, pattern)
print(f"Found at indices: {result}")
print()

print("Knuth-Morris-Pratt:")
result = kmp_search(text, pattern)
print(f"Found at indices: {result}")
print()

print("Boyer-Moore:")
result = boyer_moore_search(text, pattern)
print(f"Found at indices: {result}")
print()

print("Apostolico-Giancarlo:")
result = apostolico_giancarlo_search(text, pattern)
print(f"Found at indices: {result}")
print()

# Performance comparison with longer text
text2 = "ABCDEFGH" * 1000 + "PATTERN" + "XYZABC" * 500
pattern2 = "PATTERN"

import time

print("Performance Test:")
algorithms = [
    ("Brute Force", brute_force_search),
    ("Rabin-Karp", rabin_karp_search),
    ("KMP", kmp_search),
    ("Boyer-Moore", boyer_moore_search),
    ("Apostolico-Giancarlo", apostolico_giancarlo_search)
]

for name, func in algorithms:
    start = time.perf_counter()
    result = func(text2, pattern2)
    elapsed = time.perf_counter() - start
    print(f"{name:25} - Found at {result[0] if result else 'N/A':5}, Time: {elapsed*1000:.3f}ms")

## 1.2 Similar String Matching (fuzzy matching)

### Hamming Distance

In [None]:
#pip install textdistance

Collecting textdistance
  Downloading textdistance-4.6.3-py3-none-any.whl.metadata (18 kB)
Downloading textdistance-4.6.3-py3-none-any.whl (31 kB)
Installing collected packages: textdistance
Successfully installed textdistance-4.6.3
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import textdistance

In [None]:
string1 = "ABCXYZ"
string2 = "ABCXYA"
hamming_distance = textdistance.hamming(string1, string2)
print(f"Hamming distance between '{string1}' and '{string2}': {hamming_distance}")

Hamming distance between 'ABCXYZ' and 'ABCXYA': 1


### Jaro-Winkler Distance

In [18]:
string1 = "ABCXYZ"
string2 = "ABXCYZ "
jaro_winkler_distance = textdistance.jaro_winkler(string1, string2)
print(f"Jaro-Winkler similarity between '{string1}' and '{string2}': {jaro_winkler_distance:.4f}")

Jaro-Winkler similarity between 'ABCXYZ' and 'ABXCYZ ': 0.9175


### Levenshtein Distance

In [19]:
string1 = "ABCXYZ"
string2 = "BBXYZZ"
levenshtein_distance = textdistance.levenshtein(string1, string2)
print(f"Levenshtein distance between '{string1}' and '{string2}': {jaro_winkler_distance:.4f}")

Levenshtein distance between 'ABCXYZ' and 'BBXYZZ': 0.9175


# 2. Address Matching

## 2.1 Similar Matching (Fuzzy Matching)


In [37]:
import pandas as pd

addrs_company1 = pd.read_csv('Source1.csv')
addrs_company2 = pd.read_csv('Source2.csv')

In [38]:
addrs_company1.head()

Unnamed: 0,Street,City,ZipCode
0,101 Maple St,Springfield,IL 62704
1,202 Oak Ave,Madison,WI 53703
2,303 Pine Dr,Raleigh,NC 27609
3,404 Cedar Ct,Denver,CO 80203
4,505 Birch Blvd,Portland,OR 97205


In [28]:
addrs_company1.describe()

Unnamed: 0,Street,City,ZipCode
count,40,40,40
unique,40,38,39
top,101 Maple St,Springfield,UT 84111
freq,1,2,2


In [39]:
addrs_company2.head()

Unnamed: 0,Street,City,ZipCode
0,101 Mapel Street,Springfeld,IL
1,202 Oke Av.,Madson WI,
2,303 Pyn Dr.,Raliegh NC,
3,404 Ceder Ct.,Denvr CO,
4,505 Burch Blv,Portlnd OR,


In [40]:
addrs_company2.describe()

Unnamed: 0,Street,City,ZipCode
count,30,30,11
unique,30,30,11
top,101 Mapel Street,Springfeld,IL
freq,1,1,1


### Address Normalization

USPS.com:

C1 Street Suffix Abbreviations:

https://pe.usps.com/text/pub28/28apc_003.htm

C2 Secondary Unit Designators:

https://pe.usps.com/text/pub28/28apc_002.htm

In [48]:
df_norm1 = pd.read_csv('Abb1.csv', sep='\t')
df_norm2 = pd.read_csv('Abb2.csv', sep='\t')

In [50]:
df_norm1.head()

Unnamed: 0,Used,Abb
0,ALLEE,ALY
1,ALLEY,ALY
2,ALLY,ALY
3,ALY,ALY
4,ANEX,ANX


In [51]:
df_norm2.head()

Unnamed: 0,Desc,Abb
0,Apartment,APT
1,Basement,BSMT**
2,"Blank, unable to determine*",
3,Building,BLDG
4,Department,DEPT


In [None]:
# Turn normalization data into a dictionary
norm1_dictionary = df_norm1.set_index('Used')['Abb'].to_dict()

In [54]:
norm1_dictionary

{'ALLEE': 'ALY',
 'ALLEY': 'ALY',
 'ALLY': 'ALY',
 'ALY': 'ALY',
 'ANEX': 'ANX',
 'ANNEX': 'ANX',
 'ANNX': 'ANX',
 'ANX': 'ANX',
 'ARC': 'ARC',
 'ARCADE': 'ARC',
 'AV': 'AVE',
 'AVE': 'AVE',
 'AVEN': 'AVE',
 'AVENU': 'AVE',
 'AVENUE': 'AVE',
 'AVN': 'AVE',
 'AVNUE': 'AVE',
 'BAYOO': 'BYU',
 'BAYOU': 'BYU',
 'BCH': 'BCH',
 'BEACH': 'BCH',
 'BEND': 'BND',
 'BND': 'BND',
 'BLF': 'BLF',
 'BLUF': 'BLF',
 'BLUFF': 'BLF',
 'BLUFFS': 'BLFS',
 'BOT': 'BTM',
 'BTM': 'BTM',
 'BOTTM': 'BTM',
 'BOTTOM': 'BTM',
 'BLVD': 'BLVD',
 'BOUL': 'BLVD',
 'BOULEVARD': 'BLVD',
 'BOULV': 'BLVD',
 'BR': 'BR',
 'BRNCH': 'BR',
 'BRANCH': 'BR',
 'BRDGE': 'BRG',
 'BRG': 'BRG',
 'BRIDGE': 'BRG',
 'BRK': 'BRK',
 'BROOK': 'BRK',
 'BROOKS': 'BRKS',
 'BURG': 'BG',
 'BURGS': 'BGS',
 'BYP': 'BYP',
 'BYPA': 'BYP',
 'BYPAS': 'BYP',
 'BYPASS': 'BYP',
 'BYPS': 'BYP',
 'CAMP': 'CP',
 'CP': 'CP',
 'CMP': 'CP',
 'CANYN': 'CYN',
 'CANYON': 'CYN',
 'CNYN': 'CYN',
 'CAPE': 'CPE',
 'CPE': 'CPE',
 'CAUSEWAY': 'CSWY',
 'CAUSWA': 'CSWY'

In [None]:
# Turn normalization data into a dictionary
norm2_dictionary = df_norm2.set_index('Desc')['Abb'].to_dict()

In [57]:
norm2_dictionary

{'Apartment': 'APT',
 'Basement': 'BSMT**',
 'Blank, unable to determine*': nan,
 'Building': 'BLDG',
 'Department': 'DEPT',
 'Floor': 'FL',
 'Front': 'FRNT**',
 'Hanger': 'HNGR',
 'Key': 'KEY',
 'Lobby': 'LBBY**',
 'Lot': 'LOT',
 'Lower': 'LOWR**',
 'Office': 'OFC**',
 'Penthouse': 'PH**',
 'Pier': 'PIER',
 'Rear': 'REAR**',
 'Room': 'RM',
 'Side': 'SIDE**',
 'Slip': 'SLIP',
 'Space': 'SPC',
 'Stop': 'STOP',
 'Suite': 'STE',
 'Trailer': 'TRLR',
 'Unit': 'UNIT',
 'Upper': 'UPPR**'}

In [119]:
# Create string for each address for both companies
# Fill NaN with empty string to avoid issues during concatenation ( string concatenated with NaN results in NaN )

addrs_string_comp1 = addrs_company1['Street'].fillna('') + addrs_company1['City'].fillna('') + addrs_company1['ZipCode'].fillna('')
addrs_string_comp2 = addrs_company2['Street'].fillna('') + addrs_company2['City'].fillna('') + addrs_company2['ZipCode'].fillna('')

In [None]:
import re

# Remove punctuation and convert to uppercase
# Use regex=True to ensure that the replacement is done correctly for all occurrences
# Note: This will remove all non-alphanumeric characters, including spaces. If you want to keep spaces, you can modify the regex pattern accordingly.

addrs_string_comp1 = addrs_string_comp1.str.replace(r'[^\w]', '', regex=True).str.upper()
addrs_string_comp1

0        101MAPLESTSPRINGFIELDIL62704
1             202OAKAVEMADISONWI53703
2             303PINEDRRALEIGHNC27609
3             404CEDARCTDENVERCO80203
4         505BIRCHBLVDPORTLANDOR97205
5           606WALNUTWAYAUSTINTX78701
6          707CHESTNUTSTBOSTONMA02108
7          808REDWOODRDSANJOSECA95112
8      909ASPENAVESALTLAKECITYUT84111
9          111HICKORYSTATLANTAGA30303
10          222POPLARPLSEATTLEWA98101
11          333MAGNOLIADRMIAMIFL33130
12       444SYCAMORELNCOLUMBUSOH43215
13     555DOGWOODSTMINNEAPOLISMN55401
14             666FIRCTPHOENIXAZ85004
15       777SPRUCESTKANSASCITYMO64106
16       888WILLOWWAYNASHVILLETN37203
17        999PALMAVELOSANGELESCA90012
18         121JUNIPERDRCHICAGOIL60601
19        131ALDERSTNEWORLEANSLA70112
20               10ELMSTALBANYNY12207
21              20OAKSTTRENTONNJ08608
22          30PINESTHARRISBURGPA17101
23              40CEDARSTDOVERDE19901
24           50BIRCHSTRICHMONDVA23219
25        60WALNUTSTCHARLESTONWV25301
26        70

In [121]:
addrs_string_comp2 = addrs_string_comp2.str.replace(r'[^\w]', '', regex=True).str.upper()
addrs_string_comp2

0     101MAPELSTREETSPRINGFELDIL
1               202OKEAVMADSONWI
2              303PYNDRRALIEGHNC
3              404CEDERCTDENVRCO
4           505BURCHBLVPORTLNDOR
5              606WALNTWYAUSTNTX
6         707CHESTNUTSTRTBOSTNMA
7            808REDWDRDSANJOSECA
8            909ASPNAVSLTLKCTYUT
9            111HICKRYSTATLNTAGA
10            222POPLRPLSEATLEWA
11           333MAGNLIADRMAIMIFL
12         444SYCMORELNCOLMBUSOH
13        555DOGWDSTMINNEAPLISMN
14              666FRRCTPHONIXAZ
15          777SPRCESTKNSASCTYMO
16          888WILOWWYNASHVILETN
17          999PLMAVLOSANGELESCA
18            121JUNPRDRCHICGOIL
19         131ALDRSTNEWORLEANSLA
20        11MAPLESTHELENAMT59601
21        22OAKSTCHEYENNEWY82001
22       33PINESTBISMARCKND58501
23        44CEDARSTPIERRESD57501
24        55BIRCHSTJUNEAUAK99801
25     66WALNUTSTHONOLULUHI96813
26      77CHESTNUTSTSALEMOR97301
27     88REDWOODSTOLYMPIAWA98501
28    99ASPENSTCARSONCITYNV89701
29    111HICKORYSTPHOENIXAZ85004
dtype: obj

In [122]:
# Replace abbreviations in the address strings using the normalization dictionary
# Use regex=True to ensure that the replacement is done correctly for all occurrences
# Loop through the normalization dictionary and replace abbreviations in the address strings

for abbr, full in norm1_dictionary.items():
    addrs_string_comp1 = addrs_string_comp1.replace(abbr, full, regex = True)
    addrs_string_comp2 = addrs_string_comp2.replace(abbr, full, regex = True)

In [123]:
addrs_string_comp1.head()

0       101MAPLESTSPGFLDIL62704
1      202OAKAVEEMADISONWI53703
2        303PNEDRRALEIGHNC27609
3       404CEDARCTDENVERCO80203
4    505BIRCHBLVDPRTLANDOR97205
dtype: object

In [124]:
addrs_string_comp2.head()

0    101MAPELSTSPGFELDIL
1      202OKEAVEMADSONWI
2      303PYNDRRALIEGHNC
3      404CEDERCTDENVRCO
4    505BURCHBLVPRTLNDOR
dtype: object

In [78]:
import numpy as np

In [None]:
# These are the lengths of the address lists for both companies to construct the similarity matrix

length_source1 = len(addrs_string_comp1)
length_source2 = len(addrs_string_comp2)

In [155]:
# Initialize similarity matrix
# Each entry (i, j) will hold the Levenshtein distance between addrs_string_comp1[i] and addrs_string_comp2[j]

similarity_matrix = np.zeros((length_source2, length_source1), dtype=int)

In [156]:
# Compute Levenshtein distance for each pair of addresses
# Loop through each address in the first company and compare it with each address in the second company
# Note: This can be very slow for large datasets, consider using more efficient algorithms or parallel processing for larger datasets

for i in range(length_source2):
    for j in range(length_source1):
        similarity_matrix[i][j] = textdistance.levenshtein(addrs_string_comp2[i], addrs_string_comp1[j])


In [157]:
# Find the index of the minimum distance for each address in the first company

argmin_index = similarity_matrix.argmin(axis=1)


In [169]:
# Combine the results into a DataFrame for better visualization
# Note: We reset the index of the second company to ensure it aligns correctly with the first company

addrs_company1_sim_match = addrs_company1.iloc[argmin_index,:].reset_index(drop=True)

addrs_comp1vs2 = pd.concat([addrs_company2, addrs_company1_sim_match], axis=1)

addrs_comp1vs2


Unnamed: 0,Street,City,ZipCode,Street.1,City.1,ZipCode.1
0,101 Mapel Street,Springfeld,IL,101 Maple St,Springfield,IL 62704
1,202 Oke Av.,Madson WI,,202 Oak Ave,Madison,WI 53703
2,303 Pyn Dr.,Raliegh NC,,303 Pine Dr,Raleigh,NC 27609
3,404 Ceder Ct.,Denvr CO,,404 Cedar Ct,Denver,CO 80203
4,505 Burch Blv,Portlnd OR,,505 Birch Blvd,Portland,OR 97205
5,606 Walnt Wy,Austn TX,,606 Walnut Way,Austin,TX 78701
6,707 Chestnut Strt,Bostn MA,,707 Chestnut St,Boston,MA 02108
7,808 Redwd Rd.,SanJose CA,,808 Redwood Rd,San Jose,CA 95112
8,909 Aspn Av,Slt Lk Cty UT,,909 Aspen Ave,Salt Lake City,UT 84111
9,111 Hickry St.,Atlnta GA,,111 Hickory St,Atlanta,GA 30303


Result: not perfect but we can make this more accurate by adding conditions:

* Enforce that the states match exactly
* Try other string distances
* Phonetic matching: jellyfish.soundex, jellyfish.metaphone

In [None]:
# Find the index of the second minimum distance for each address in the first company

argmin2_index = np.argsort(similarity_matrix, axis=1)[:,1]

addrs_company1_sim_match2 = addrs_company1.iloc[argmin2_index,:].reset_index(drop=True)

addrs_comp1vs2_2 = pd.concat([addrs_comp1vs2, addrs_company1_sim_match2], axis=1)

addrs_comp1vs2_2

Unnamed: 0,Street,City,ZipCode,Street.1,City.1,ZipCode.1,Street.2,City.2,ZipCode.2
0,101 Mapel Street,Springfeld,IL,101 Maple St,Springfield,IL 62704,130 Sycamore St,Springfield,MO 65806
1,202 Oke Av.,Madson WI,,202 Oak Ave,Madison,WI 53703,20 Oak St,Trenton,NJ 08608
2,303 Pyn Dr.,Raliegh NC,,303 Pine Dr,Raleigh,NC 27609,30 Pine St,Harrisburg,PA 17101
3,404 Ceder Ct.,Denvr CO,,404 Cedar Ct,Denver,CO 80203,40 Cedar St,Dover,DE 19901
4,505 Burch Blv,Portlnd OR,,505 Birch Blvd,Portland,OR 97205,50 Birch St,Richmond,VA 23219
5,606 Walnt Wy,Austn TX,,606 Walnut Way,Austin,TX 78701,10 Elm St,Albany,NY 12207
6,707 Chestnut Strt,Bostn MA,,707 Chestnut St,Boston,MA 02108,70 Chestnut St,Columbia,SC 29201
7,808 Redwd Rd.,SanJose CA,,808 Redwood Rd,San Jose,CA 95112,20 Oak St,Trenton,NJ 08608
8,909 Aspn Av,Slt Lk Cty UT,,909 Aspen Ave,Salt Lake City,UT 84111,190 Juniper St,Salt Lake City,UT 84111
9,111 Hickry St.,Atlnta GA,,111 Hickory St,Atlanta,GA 30303,100 Hickory St,Jackson,MS 39201


## 2.2 NLP

In [None]:
import spacy

# To use spaCy for NER, you need to install the English model first:
# Python -m spacy download en_core_web_sm
# Python -m spacy download en_core_web_md
# Python -m spacy download en_core_web_lg
# Python -m spacy download en

# Load the NER spaCy model
nlp = spacy.load("en_core_web_md")

In [186]:
nlp_addrs_string_comp1 = addrs_company1['Street'].fillna('') + addrs_company1['City'].fillna('') + addrs_company1['ZipCode'].fillna('')
nlp_addrs_string_comp2 = addrs_company2['Street'].fillna('') + addrs_company2['City'].fillna('') + addrs_company2['ZipCode'].fillna('')

In [188]:
nlp_addrs_string_comp1.head()

0    101 Maple St Springfield IL 62704
1         202 Oak Ave Madison WI 53703
2         303 Pine Dr Raleigh NC 27609
3         404 Cedar Ct Denver CO 80203
4     505 Birch Blvd Portland OR 97205
dtype: object

In [189]:
nlp_addrs_string_comp2.head()

0    101 Mapel Street Springfeld IL
1             202 Oke Av. Madson WI
2            303 Pyn Dr. Raliegh NC
3            404 Ceder Ct. Denvr CO
4          505 Burch Blv Portlnd OR
dtype: object

In [None]:
# Convert addresses in first list to vectors
docs1 = [nlp(addr) for addr in nlp_addrs_string_comp1]

# Match each address in list2
results = []
for addr2 in nlp_addrs_string_comp2:
    # Convert addresses in second list to vectors
    doc2 = nlp(addr2)
    best_match = None
    best_score = -1

    for doc1 in docs1:
        # Compute similarity score
        score = doc2.similarity(doc1)
        if score > best_score:
            best_score = score
            best_match = doc1.text

    results.append((addr2, best_match,round(best_score, 4)))

df_results = pd.DataFrame(results, columns=["Input Address", "Best Match in List1", "Similarity Score"])


In [194]:
df_results


Unnamed: 0,Input Address,Best Match in List1,Similarity Score
0,101 Mapel Street Springfeld IL,121 Juniper Dr Chicago IL 60601,0.707
1,202 Oke Av. Madson WI,170 Willow St Oklahoma City OK 73102,0.6581
2,303 Pyn Dr. Raliegh NC,303 Pine Dr Raleigh NC 27609,0.8377
3,404 Ceder Ct. Denvr CO,404 Cedar Ct Denver CO 80203,0.8372
4,505 Burch Blv Portlnd OR,505 Birch Blvd Portland OR 97205,0.5395
5,606 Walnt Wy Austn TX,555 Dogwood St Minneapolis MN 55401,0.6859
6,707 Chestnut Strt Bostn MA,707 Chestnut St Boston MA 02108,0.7877
7,808 Redwd Rd. SanJose CA,808 Redwood Rd San Jose CA 95112,0.7871
8,909 Aspn Av Slt Lk Cty UT,140 Dogwood St Des Moines IA 50309,0.6102
9,111 Hickry St. Atlnta GA,170 Willow St Oklahoma City OK 73102,0.8474
