In [None]:
import pandas as pd
import re
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from rapidfuzz import process, fuzz
import wordninja

# -------------------------
# Step 1: Data preparation
# -------------------------
# The two CSV files are combined into the reference table in Step 3.
primary, alternate = (
    pd.read_csv(csv_path) for csv_path in ("primary.csv", "alternate.csv")
)
# The names to be matched are read from one sheet of an Excel workbook.
test = pd.read_excel("test_02.xlsx", sheet_name="Sheet2")

# -------------------------
# Step 2: Cleaning + word segmentation (for order-insensitive matching)
# -------------------------
def clean_for_rapidfuzz(s):
    """Normalize a raw name into space-separated lowercase words.

    The value is upper-cased, every character other than ``A-Z`` and ``0-9``
    is removed (spaces and punctuation included), and the remaining run of
    characters is re-segmented into words with ``wordninja``.

    Args:
        s: Raw name. May be a string, a missing value (``None``/NaN), or a
            non-string scalar such as a number.

    Returns:
        The segmented words joined by single spaces, or ``""`` when the
        value is missing or nothing remains after cleaning.
    """
    if pd.isna(s):
        return ""
    # Coerce to str so non-string values (e.g. numeric cells) do not raise
    # AttributeError on .upper().
    compact = re.sub(r'[^A-Z0-9]', '', str(s).upper())
    if not compact:
        # Nothing left to segment.
        return ""
    words = wordninja.split(compact.lower())
    return ' '.join(words)  # keep spaces so token-based scorers can work

# Add the normalized-name column to each of the three tables.
for _frame in (primary, alternate, test):
    _frame['NGRAM_NAME'] = _frame['NAME'].apply(clean_for_rapidfuzz)

# -------------------------
# Step 3: Build the reference set and lookup maps
# -------------------------
# Stack the primary and alternate tables into one candidate table.
reference = pd.concat(
    [frame[['ID', 'NGRAM_NAME']] for frame in (primary, alternate)],
    ignore_index=True,
)

# Candidate strings handed to RapidFuzz, in reference-row order.
choices = reference['NGRAM_NAME'].tolist()

# Normalized name -> every ID that carries that name.
name_to_ids = defaultdict(set)
for ref_name, ref_id in zip(choices, reference['ID']):
    name_to_ids[ref_name].add(ref_id)

# Normalized name -> a single ID; for a repeated name the last row wins.
name_to_id = dict(zip(reference['NGRAM_NAME'], reference['ID']))

# -------------------------
# Step 4: RapidFuzz matching (robust to word reordering)
# -------------------------
def match_rapidfuzz_token_sort(query):
    """Find the reference name closest to ``query`` by ``fuzz.token_sort_ratio``.

    Candidates come from the module-level ``choices`` list and the ID is
    looked up in the module-level ``name_to_id`` mapping.

    Args:
        query: Normalized, space-separated name to look up.

    Returns:
        A ``(matched_id, matched_name, score)`` tuple, where the score
        reported by RapidFuzz is divided by 100.  ``(None, None, 0)`` is
        returned when ``query`` is empty or RapidFuzz finds no candidate.
    """
    if not query:
        return None, None, 0
    best = process.extractOne(query, choices, scorer=fuzz.token_sort_ratio)
    if best is None:
        # extractOne returns None when there is nothing to match against
        # (e.g. an empty candidate list); unpacking it would raise TypeError.
        return None, None, 0
    matched_name, score, _ = best
    matched_id = name_to_id.get(matched_name)
    return matched_id, matched_name, score / 100  # rescale to the 0-1 range

# -------------------------
# Step 5: Parallel matching
# -------------------------
NUM_THREADS = 8

with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
    # Executor.map yields results in input order, so results[i] belongs to
    # the i-th row of `test`.  Collecting with as_completed() would return
    # them in completion order and attach matches to the wrong rows.
    results = list(tqdm(
        executor.map(match_rapidfuzz_token_sort, test['NGRAM_NAME']),
        total=len(test),
        desc="Matching",
    ))

# zip(*[]) yields nothing to unpack, so handle an empty query table explicitly.
matched_ids, matched_names, scores = zip(*results) if results else ((), (), ())
test['MATCHED_ID'] = matched_ids
test['MATCHED_NAME'] = matched_names
test['SCORE'] = scores

# -------------------------
# Step 6: Match validation and output
# -------------------------
def _is_correct(row):
    """Return True when the row's own ID is among the IDs of its matched name."""
    candidate_ids = name_to_ids.get(row['MATCHED_NAME'], set())
    return row['ID'] in candidate_ids


test['CORRECT'] = test.apply(_is_correct, axis=1)

# Show the first few rows, then the share of rows marked correct.
preview_columns = ['NAME', 'MATCHED_NAME', 'SCORE', 'ID', 'CORRECT']
print(test[preview_columns].head())
print(f"匹配准确率（RapidFuzz token_sort_ratio）: {test['CORRECT'].mean():.2%}")


Matching: 100%|██████████| 1000/1000 [00:00<00:00, 599100.70it/s]

                                                NAME  \
0                    TRADINGLIMITEDGENERALALCARDINAL   
1       ROSKAPSTROYWATERLIABILITYLIMITEDCLEANCOMPANY   
2                               AMERICANDESATUNECVUP   
3                           JorgeGOMEZRODRIGUEZJesus   
4  CLOSEDSTOCKJOINTSERVICESSPECIALCOMPANYTRANSPOR...   

                                        MATCHED_NAME     SCORE     ID  CORRECT  
0                          herculesinternationalship  0.576923  27591    False  
1  militaryconstructioncomplexoftheministryofdefense  0.552381  46040    False  
2     limitedliabilitycompanykeramaksverkhnyayasalda  0.652174  11246    False  
3                        mandegarbasparkimiyacompany  0.703704  25078    False  
4                          peridotshippingtradingllc  0.754717  47475    False  
匹配准确率（RapidFuzz fuzz.ratio）: 0.00%



