In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from fuzzywuzzy import process
from tqdm import tqdm

# Load the data
# NOTE(review): an earlier comment here said only the first 1000 rows are
# loaded, but none of the read_csv calls below passes a row limit.
alternate_names = pd.read_csv('alternate.csv')
primary_names = pd.read_csv('primary.csv')
test_names = pd.read_csv('test_01.csv')
# Print the first rows of each table as a quick sanity check.
print(alternate_names.head())
print(primary_names.head())
print(test_names.head())

# Number maps: each digit character maps to a list of alternative spellings
# (English word plus, apparently, look-alike characters and Roman numerals).
# Not referenced by any other cell visible in this part of the notebook.
number_map = {'0': ['zero', 'o'], 
              '1': ['one','|','i','l'], 
              '2': ['two', 'z', 'ii'],
              '3': ['three', 'iii'],
              '4': ['four', 'iv'], 
              '5': ['five', 's', 'v'], 
              '6': ['six', 'vi'],
              '7': ['seven', 'vii'],
              '8': ['eight', 'viii'],
              '9': ['nine', 'ix']}

    ID                   NAME
0   36         AERO-CARIBBEAN
1  173            AVIA IMPORT
2  306  NATIONAL BANK OF CUBA
3  540                  COIBA
4  552                 CRYMSA
    ID                       NAME TYPE
0   36     AEROCARIBBEAN AIRLINES    C
1  173  ANGLO-CARIBBEAN CO., LTD.    C
2  306     BANCO NACIONAL DE CUBA    C
3  424         BOUTIQUE LA MAISON    C
4  475               CASA DE CUBA    C
    ID                   VARIANT
0   36     AEROCARIBEA NAIRLINES
1  173  UNGLO-CARIBBEAN CO., LT.
2  306    BANCO NACIONAL DE gUBA
3  424       BOUTIQUE# LA MAISON
4  475               CASA E CUBA




In [3]:
def fuzzy_match_test_to_primary(test_names, primary_names):
    """Measure how often fuzzy matching maps a test variant to the right ID.

    Every ``VARIANT`` in ``test_names`` is matched against all ``NAME``
    values in ``primary_names`` with ``process.extractOne`` (default scorer).
    A test row counts as correct when the best-scoring primary row carries
    the same ``ID`` as the test row.

    Args:
        test_names: DataFrame with ``ID`` and ``VARIANT`` columns.
        primary_names: DataFrame with ``ID`` and ``NAME`` columns.

    Returns:
        float: fraction of test rows matched to the correct ID. Returns 0.0
        when ``test_names`` or ``primary_names`` is empty (previously an
        empty test set raised ZeroDivisionError).
    """
    total_tests = len(test_names)
    if total_tests == 0:
        # Nothing to score; avoids dividing by zero at the end.
        return 0.0

    # Build the candidates once instead of on every iteration. Keying them by
    # row position makes extractOne hand back the position of the winning
    # row, so the ID can be read directly rather than re-scanning the whole
    # frame for a row with an equal NAME.
    choices = dict(enumerate(primary_names['NAME']))
    if not choices:
        # No candidates at all: every test row is a miss.
        return 0.0
    ids_by_position = primary_names['ID'].tolist()

    correct_matches = 0
    for _, test_row in tqdm(test_names.iterrows(), total=total_tests):
        best = process.extractOne(test_row['VARIANT'], choices)
        if best is None:
            # The matcher produced no candidate for this query: count a miss.
            continue
        _match, _score, position = best
        if ids_by_position[position] == test_row['ID']:
            correct_matches += 1

    return correct_matches / total_tests

# Compute the accuracy of the sequential matcher and print it as a
# percentage with two decimals (the printed label means "accuracy").
accuracy = fuzzy_match_test_to_primary(test_names, primary_names)
print(f"准确率: {accuracy * 100:.2f}%")


100%|██████████| 1000/1000 [04:10<00:00,  3.99it/s]

准确率: 93.20%





In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import pandas as pd
from rapidfuzz import process, fuzz
from tqdm import tqdm

# Load the data
alternate_names = pd.read_csv('alternate.csv')
primary_names = pd.read_csv('primary.csv')
test_names = pd.read_csv('test_01.csv')

# Prepare the primary-name list and the parallel ID list: both come from the
# same frame, so primary_ids[i] is the ID of primary_names_list[i].
primary_names_list = primary_names['NAME'].tolist()
primary_ids = primary_names['ID'].tolist()

# Matching function
def match_test_variant(test_row):
    """Match one test variant against the primary names and report the result.

    The variant is compared with every entry of the module-level
    ``primary_names_list`` using ``fuzz.token_sort_ratio``; the ID of the
    best entry is read from the parallel module-level ``primary_ids`` list.

    Args:
        test_row: mapping with ``'ID'`` and ``'VARIANT'`` keys.

    Returns:
        dict with keys ``test_id``, ``test_variant``, ``matched_id``,
        ``matched_name``, ``score`` and ``is_correct`` (1 when the matched ID
        equals the test ID, else 0). If the matcher returns no candidate,
        ``matched_id`` and ``matched_name`` are None, ``score`` is 0.0 and
        ``is_correct`` is 0.
    """
    test_id = test_row['ID']
    test_variant = test_row['VARIANT']

    best = process.extractOne(
        test_variant,
        primary_names_list,
        scorer=fuzz.token_sort_ratio
    )

    if best is None:
        # No candidate came back (e.g. empty candidate list): record a miss
        # instead of failing on tuple unpacking.
        return {
            'test_id': test_id,
            'test_variant': test_variant,
            'matched_id': None,
            'matched_name': None,
            'score': 0.0,
            'is_correct': 0
        }

    # The third element is the position of the match in the choices list, so
    # the ID can be read directly without a linear list.index() scan per query.
    match, score, matched_index = best
    matched_id = primary_ids[matched_index]

    return {
        'test_id': test_id,
        'test_variant': test_variant,
        'matched_id': matched_id,
        'matched_name': match,
        'score': score,
        'is_correct': int(test_id == matched_id)
    }

# Process in parallel and collect the results
def fuzzy_match_test_to_primary(test_names):
    """Run ``match_test_variant`` over every test row using a thread pool.

    NOTE: this shares its name with the two-argument sequential function
    defined in an earlier cell; whichever cell ran last is the one in effect.

    Args:
        test_names: DataFrame whose rows (as ``to_dict('records')`` dicts)
            are passed to ``match_test_variant``.

    Returns:
        list: one result per test row, in the same order as the rows of
        ``test_names``. Results used to be appended in completion order,
        which made the saved output order vary between runs.
    """
    records = test_names.to_dict('records')
    total_tests = len(records)
    matched_results = [None] * total_tests

    with ThreadPoolExecutor() as executor:
        # Remember each future's row position so results can be stored in
        # input order even though they finish in arbitrary order.
        future_to_position = {
            executor.submit(match_test_variant, row): position
            for position, row in enumerate(records)
        }

        # as_completed keeps the progress bar moving as soon as any task ends.
        for future in tqdm(as_completed(future_to_position), total=total_tests, desc="匹配进度"):
            matched_results[future_to_position[future]] = future.result()

    return matched_results

# Run the matching
results = fuzzy_match_test_to_primary(test_names)

# Convert to a DataFrame
results_df = pd.DataFrame(results)

# Compute the accuracy
accuracy = results_df['is_correct'].sum() / len(results_df)
print(f"准确率: {accuracy * 100:.2f}%")

# Save as CSV. Create the output directory first so a missing folder does not
# throw away the result of the long matching run above.
output_path = Path('./Matched_results/Test1_matched_results.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(output_path, index=False)
# Report the path that was actually written (the old message named a
# different file).
print(f"匹配结果已保存到 {output_path}")


匹配进度: 100%|██████████| 16041/16041 [01:52<00:00, 143.02it/s]

准确率: 93.69%
匹配结果已保存到 matched_results.csv





In [7]:
import pandas as pd
from rapidfuzz import fuzz
from tqdm import tqdm
import re


# Simple normalisation helper: lower-case the text, then drop every character
# that is not an ASCII letter, a digit or a space.
def clean_name(name):
    """Return ``name`` lower-cased with all other characters stripped.

    Only ASCII letters, digits and spaces survive; removed characters are
    deleted outright (not replaced), so surrounding spaces are kept as-is.
    """
    lowered = name.lower()
    disallowed = re.compile(r'[^a-zA-Z0-9 ]')
    return disallowed.sub('', lowered)

def deduplicate_exact_then_fuzzy(df, threshold=90, high_score_cutoff=98):
    """Group rows by identical, then fuzzily similar, NAME and score the result.

    Step 1 buckets rows whose ``NAME`` is exactly equal (names are compared
    as-is; no normalisation is applied). Step 2 walks the bucket
    representatives in order: each remaining representative becomes a base
    and absorbs every later representative whose ``fuzz.token_sort_ratio``
    with the base is at least ``threshold``.

    Within a group the member with the shortest ``NAME`` supplies the
    reference ID; members with a different ID count as wrong. Pairs that
    scored above ``high_score_cutoff`` yet carry a different ID are printed.

    Args:
        df: DataFrame with ``ID`` and ``NAME`` columns.
        threshold: minimum score for a name to join the base's group.
        high_score_cutoff: scores strictly above this are reported when the
            IDs disagree.

    Returns:
        tuple ``(grouped, accuracy, compression_ratio)``:
        ``grouped`` is a list of groups, each a list of row dicts;
        ``accuracy`` is ``1 - wrong / total`` where ``total`` counts the
        members beyond the first in every group (1.0 if there are none);
        ``compression_ratio`` is the number of groups divided by the number
        of rows (1.0 for an empty frame, which used to raise
        ZeroDivisionError).
    """
    # --- Step 1: bucket rows with exactly identical NAME ---
    exact_groups = {}
    print(df.shape)
    for _, row in df.iterrows():
        exact_groups.setdefault(row['NAME'], []).append(row.to_dict())

    # The first row of every bucket represents it during fuzzy clustering.
    unique_representatives = [members[0] for members in exact_groups.values()]
    print(f"找到 {len(unique_representatives)} 个唯一代表项（精确匹配）")
    ungrouped = unique_representatives.copy()

    # --- Step 2: fuzzy clustering ---
    grouped = []
    total_matches = 0
    wrong_matches = 0
    compress = 0

    # High-scoring pairs whose IDs nevertheless disagree.
    high_score_errors = []

    pbar = tqdm(total=len(ungrouped), desc="模糊聚类中")

    while ungrouped:
        base = ungrouped.pop(0)
        base_name = base['NAME']
        base_group = [base]
        match_pairs = []  # (absorbed representative, score) for this base
        remaining = []    # representatives left for later rounds

        # One pass splits the candidates into absorbed and remaining, which
        # replaces the former quadratic "not in to_remove" filtering.
        for other in ungrouped:
            score = fuzz.token_sort_ratio(base_name, other['NAME'])
            if score >= threshold:
                base_group.append(other)
                match_pairs.append((other, score))
            else:
                remaining.append(other)

        # Expand each representative back into all rows of its exact bucket.
        final_group = []
        for item in base_group:
            final_group.extend(exact_groups[item['NAME']])

        # The shortest name in the group supplies the reference ID.
        compress += 1
        total_matches += len(final_group) - 1
        representative_id = min(final_group, key=lambda x: len(x['NAME']))['ID']

        # Record absorbed rows that scored very high but have another ID.
        for other, score in match_pairs:
            if score <= high_score_cutoff:
                continue
            for entry in exact_groups[other['NAME']]:
                if entry['ID'] != representative_id:
                    high_score_errors.append({
                        'base_ID': base['ID'],
                        'base_NAME': base['NAME'],
                        'other_ID': entry['ID'],
                        'other_NAME': entry['NAME'],
                        'score': score
                    })

        wrong_matches += sum(1 for item in final_group if item['ID'] != representative_id)

        grouped.append(final_group)

        ungrouped = remaining
        # Advance by every representative consumed this round so the bar
        # reaches its total instead of stopping at the number of groups.
        pbar.update(len(base_group))

    pbar.close()

    # Print the mismatches whose score exceeded high_score_cutoff.
    if high_score_errors:
        print(f"\n=== 高分（>{high_score_cutoff}）错误匹配列表 ===")
        for err in high_score_errors:
            print(f"Base(ID={err['base_ID']}, NAME='{err['base_NAME']}')  <->  "
                  f"Other(ID={err['other_ID']}, NAME='{err['other_NAME']}')  |  score={err['score']}")

    accuracy = 1 - (wrong_matches / total_matches) if total_matches else 1.0
    row_count = len(df)
    compression_ratio = compress / row_count if row_count else 1.0
    return grouped, accuracy, compression_ratio

# Usage example
# NOTE(review): alternate_names is reloaded here, but the call below
# deduplicates primary_names (defined in an earlier cell) — confirm which
# table is intended.
alternate_names = pd.read_csv('alternate.csv')
dedup_result_rows = []
groups, acc, compress_ratio = deduplicate_exact_then_fuzzy(primary_names, threshold=90)
# Report the deduplication accuracy and the compression ratio (4 decimals).
print(f"\n去重准确率（含精确+模糊）: {acc:.4f}")
print(f"压缩比: {compress_ratio:.4f}")
# Flatten the groups into one record per original row, tagged with the ID of
# the group's shortest-named member and the group's position in `groups`.
for group_idx, group in enumerate(groups):
    representative = min(group, key=lambda x: len(x['NAME']))
    rep_id = representative['ID']
    for entry in group:
        dedup_result_rows.append({
            'original_ID': entry['ID'],
            'original_NAME': entry['NAME'],
            'group_ID': rep_id,
            'group_index': group_idx
        })

dedup_df = pd.DataFrame(dedup_result_rows)
# Disabled export of the per-row grouping; the file name below still refers
# to the alternate names.
# dedup_df.to_csv('deduplicated_alternate_names.csv', index=False)
# print("去重结果已保存到 deduplicated_alternate_names.csv")


(16041, 3)
找到 16017 个唯一代表项（精确匹配）


模糊聚类中:  97%|█████████▋| 15525/16017 [02:05<00:03, 123.46it/s] 


=== 高分（>98）错误匹配列表 ===
Base(ID=26466, NAME='SMILE PROPERTY & TRAVEL LTD')  <->  Other(ID=26603, NAME='SMILE PROPERTY & TRAVEL LTD.')  |  score=98.18181818181819
Base(ID=43125, NAME='SEVERNAYA ZVEZDA LIMITED LIABILITY COMPANY')  <->  Other(ID=49147, NAME='LIMITED LIABILITY COMPANY SEVERNAYA ZVEZDA')  |  score=100.0
Base(ID=44264, NAME='NAI ENERGY EUROPE GMBH & CO. KG')  <->  Other(ID=44270, NAME='NAI EUROPE ENERGY GMBH & CO. KG')  |  score=100.0
Base(ID=46022, NAME='LIMITED LIABILITY COMPANY KISMET TELECOM INFRASTRUCTURE')  <->  Other(ID=46028, NAME='LIMITED LIABILITY COMPANY KISMET TELECOM INFRASTRUCTURE 2')  |  score=98.21428571428571
Base(ID=50016, NAME='FEDERAL STATE GOVERNMENTAL INSTITUTION 4 CENTRAL RESEARCH INSTITUTE OF THE MINISTRY OF DEFENSE OF THE RUSSIAN FEDERATION')  <->  Other(ID=50017, NAME='FEDERAL STATE GOVERNMENTAL INSTITUTION 27 CENTRAL RESEARCH INSTITUTE OF THE MINISTRY OF DEFENSE OF THE RUSSIAN FEDERATION')  |  score=98.7551867219917
Base(ID=6706, NAME='ARELLANO FELI




In [8]:
# Keep one row per group: the member with the shortest NAME (the first such
# member when several share the minimum length, as min() returns the first).
# Relies on `groups` produced by the deduplication cell above.
representative_rows = []
for group in groups:
    representative = min(group, key=lambda x: len(x['NAME']))
    representative_rows.append({
        'ID': representative['ID'],
        'NAME': representative['NAME']
    })

# Save as CSV
representative_df = pd.DataFrame(representative_rows)
representative_df.to_csv('deduplicated_representatives_primary.csv', index=False)
# Confirmation message naming the file written above.
print("已保存所有去重后代表项到 deduplicated_representatives_primary.csv")

已保存所有去重后代表项到 deduplicated_representatives_primary.csv
