In [39]:
from string import punctuation
import re

def clean(s):
    """Normalise spacing around punctuation and drop standalone numbers.

    Steps, in order:
      1. insert a space before "(" when it sits directly between two word
         characters;
      2. delete runs of digits that form a whole word;
      3. insert a space after a run of ")", ",", ".", ":" or ";" that sits
         directly between two word characters;
      4. insert a space between "." and "(" when that pair sits directly
         between two word characters;
      5. collapse every whitespace run to one space and trim both ends.

    Args:
        s: the text to clean (a str).

    Returns:
        The cleaned string.
    """
    # Lookarounds test the neighbouring word characters without consuming
    # them, so adjacent hits such as "a,b,c" or "a(b(c" are all rewritten.
    # Consuming groups would swallow the shared character and skip every
    # second match.
    res = re.sub(r'(?<=\w)\((?=\w)', ' (', s)
    res = re.sub(r'\b\d+\b', '', res)
    # Raw string: "\g" inside a normal literal is an invalid escape sequence.
    res = re.sub(r'(?<=\w)([),.:;]+)(?=\w)', r'\g<1> ', res)
    res = re.sub(r'(?<=\w)\.\((?=\w)', '. (', res)
    res = re.sub(r'\s+', ' ', res)
    return res.strip()

def stripclean(arr):
    """Join the non-empty, punctuation-trimmed items of arr in upper case.

    Each item has surrounding whitespace removed, then any leading or
    trailing characters found in string.punctuation. Items that end up
    empty are dropped; the rest are joined with single spaces and the
    result is upper-cased.
    """
    kept = []
    for item in arr:
        trimmed = item.strip().strip(punctuation)
        if trimmed != '':
            kept.append(trimmed)
    joined = ' '.join(kept)
    return joined.upper()

def dummy(x):
    """Return a new list containing the items of x.

    The copy is shallow: the returned list is a distinct object, but its
    elements are the very same objects held by x.
    """
    return list(x)

In [1]:
from collections import defaultdict

class TrieNode:
    """A single trie node.

    Attributes are documented inline in __init__.
    """

    def __init__(self):
        # Children are created automatically the first time a key is looked
        # up through the defaultdict.
        self.children = defaultdict(TrieNode)
        # Both string slots start unset; the code that fills them lives
        # outside this class.
        self.smallest_str = self.end = None

    def __getitem__(self, c):
        """Return the child stored under c, creating it if it is missing."""
        child = self.children[c]
        return child
class Trie:
    """Character-level prefix tree used to find a stored string by prefix."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, s: str):
        """Store s, creating one node per character along its path.

        Every node on the path that has no smallest_str yet records s there,
        so each node remembers the first inserted string that passed through
        it. The final node additionally records s in end.
        """
        node = self.root
        for c in s:
            node = node[c]
            if node.smallest_str is None:
                node.smallest_str = s
        node.end = s

    def get_similar(self, s):
        """Follow s through the trie as far as its characters match.

        Args:
            s: the query string.

        Returns:
            A tuple (candidate, matched). matched is the number of leading
            characters of s that were found. candidate is the smallest_str of
            the last node reached, falling back to that node's end; it is
            None when nothing usable is recorded there (for instance when
            not even the first character matches).
        """
        node = self.root
        # Explicit counter instead of reusing the loop index, so an empty
        # query yields (…, 0) rather than failing on an unbound variable.
        matched = 0
        for c in s:
            # Membership test on children avoids creating nodes on lookup.
            if c not in node.children:
                break
            node = node[c]
            matched += 1
        return (node.smallest_str or node.end, matched)


class Matcher:
    """Thin wrapper that loads the keys of a mapping into a Trie."""

    def __init__(self, dic: dict):
        """Build the trie from every key produced by iterating dic."""
        self.trie = Trie()
        add = self.trie.insert
        for candidate in dic:
            add(candidate)

    def get_match(self, s: str) -> tuple:
        """Return whatever the trie's get_similar yields for s."""
        result = self.trie.get_similar(s)
        return result



In [31]:
import pandas as pd 

# Load the reference data; a later cell reads the `text` and `lbl` columns
# of every row. The path is relative to the current working directory.
df = pd.read_csv('data-full-digit.csv')

In [42]:
# Sample query with irregular spacing and mixed case.
raw_txt = ' Số Thanh     Niên P Cẩm Th '
# clean() normalises the text, split() tokenises on whitespace, and
# stripclean() trims punctuation per token, re-joins and upper-cases.
query_txt = stripclean(clean(str(raw_txt)).split())

In [43]:
# Map each upper-cased reference text to its label. When the same text
# occurs more than once, the label from the last row wins.
# Built without a loop variable named `id`, which would shadow the builtin.
s = {text.upper(): lbl for text, lbl in zip(df['text'], df['lbl'])}
lbl_dict = {0: "SELLER", 1: "ADDRESS", 2: "TIMESTAMP", 3: "TOTAL_COST"}

matcher = Matcher(s)
key, score = matcher.get_match(query_txt)
if key is None:
    # The matcher found no candidate (e.g. the first character of the query
    # is not in the trie); indexing s with None would raise KeyError.
    print(f'no match found for {query_txt!r}')
else:
    # score is the number of leading characters of the query that matched.
    print(f'{key}\n\ntype:{lbl_dict[s[key]]}, match words: {score}')

SỐ THANH NIÊN P CẨM THÀNH

type:ADDRESS, match words: 22
