In [11]:
import time
from typing import List
from konlpy.tag import Mecab

def jaccard_similarity(tokens1: List[str], tokens2: List[str]) -> float:
    """Return the Jaccard similarity of two tokenized texts.

    Both token lists are reduced to sets of distinct tokens, so token order
    and repetition are ignored. The score is the number of shared distinct
    tokens divided by the number of distinct tokens across both inputs.

    Args:
        tokens1: Tokens of the first text.
        tokens2: Tokens of the second text.

    Returns:
        A float between 0.0 and 1.0. When both inputs are empty the union is
        empty, and 0.0 is returned instead of dividing by zero.
    """
    set1, set2 = set(tokens1), set(tokens2)
    union_size = len(set1 | set2)
    if union_size == 0:
        # Keep the return type a float, as annotated, even in the empty case.
        return 0.0
    return len(set1 & set2) / union_size

def deduplicate_strings(strings: List[str], threshold: float = 0.5,
                        tokenizer=None, verbose: bool = True) -> List[str]:
    """Drop near-duplicate strings, keeping the first occurrence of each.

    Strings are processed in order. Each one is tokenized and compared, via
    ``jaccard_similarity``, against the tokens of every string kept so far.
    It is kept only if no earlier kept string reaches ``threshold``.

    Args:
        strings: Input texts, in priority order (earlier entries win).
        threshold: Similarity at or above which a string counts as a
            duplicate of an already kept one.
        tokenizer: Optional callable mapping a string to a list of tokens.
            Defaults to ``Mecab().morphs``. Pass one in to reuse an existing
            tagger across calls or to use a different tokenization.
        verbose: When true (the default), print each computed similarity as
            ``SIM: <value>``. Set to False for large inputs, since one line
            is printed per comparison.

    Returns:
        The kept strings, in their original relative order.
    """
    if tokenizer is None:
        # Build the tagger only when the caller did not supply a tokenizer.
        tokenizer = Mecab().morphs

    deduped = []
    tokenized_deduped = []  # tokens of deduped[i], kept in lockstep

    for s in strings:
        tokens = tokenizer(s)
        for tokens_existing in tokenized_deduped:
            similarity = jaccard_similarity(tokens, tokens_existing)
            if verbose:
                print(f"SIM: {similarity:.4f}")
            if similarity >= threshold:
                # First sufficiently similar kept string decides; stop comparing.
                break
        else:
            # Loop finished without a break: nothing similar enough was kept.
            deduped.append(s)
            tokenized_deduped.append(tokens)
    return deduped

In [12]:
# Korean sample sentences: near-duplicates differing in punctuation and wording.
korean_samples = [
    "안녕하세요. 저는 학생입니다.",
    "안녕하세요 저는 학생입니다",
    "반갑습니다. 저는 학생이에요.",
    "안녕하세요, 학생입니다.",
    "이것은 다른 문장입니다."
]

# Long synthetic inputs (each word repeated 1000 times) used for the timing run.
repeated_word_samples = [
    "hello"*1000,
    "hallo"*1000,
    "hello"*1000,
    "world"*1000,
    "wurld"*1000,
    "worlds"*1000
]

# Choose the corpus explicitly; switch to korean_samples to run the Korean data.
strings = repeated_word_samples

# perf_counter is a monotonic, high-resolution clock meant for measuring intervals.
start = time.perf_counter()
result = deduplicate_strings(strings, threshold=0.5)
end = time.perf_counter()
print(f"{end-start:.3f}",len(result))

SIM: 0.4286
SIM: 1.0000
SIM: 0.2222
SIM: 0.2222
SIM: 0.1000
SIM: 0.1000
SIM: 0.5000
SIM: 0.2000
SIM: 0.2000
SIM: 0.6250
0.140 3


In [10]:
# Show every string that survived deduplication, one per line.
for kept_string in result:
    print(kept_string)

hellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohellohello