In [15]:
import pandas as pd

df = pd.read_excel("test_02.xlsx")

print(df)

   Sheet1     remove all special characters and white spaces
0  Sheet2  shuffle the words then remove all special char...
1  Sheet3            separate every character by white space
2  Sheet4  shuffle the words then separate every characte...
3  Sheet5                                       word removal
4  Sheet6                                    word truncation
5  Sheet7                                           initials
6  Sheet8                     simulated names, no true match


In [16]:
df_1 = pd.read_excel("test_02.xlsx", sheet_name="Sheet1")
df_2 = pd.read_excel("test_02.xlsx", sheet_name="Sheet2")
df_3 = pd.read_excel("test_02.xlsx", sheet_name="Sheet3")
df_4 = pd.read_excel("test_02.xlsx", sheet_name="Sheet4")
df_5 = pd.read_excel("test_02.xlsx", sheet_name="Sheet5")
df_6 = pd.read_excel("test_02.xlsx", sheet_name="Sheet6")
df_7 = pd.read_excel("test_02.xlsx", sheet_name="Sheet7")
df_8 = pd.read_excel("test_02.xlsx", sheet_name="Sheet8")



In [18]:
import pandas as pd
import re
import wordninja
from rapidfuzz import process, fuzz
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# -------------------------
# Step 1: 数据加载与清洗
# -------------------------

# 读取 primary 和 alternate 数据
primary_names = pd.read_csv('primary.csv')
alternate_names = pd.read_csv('alternate.csv', nrows=1000)

# 读取测试数据（Sheet4）
df_4 = pd.read_excel("test_02.xlsx", sheet_name="Sheet2")

# -------------------------
# Step 2: 分词 + 排序 处理函数
# -------------------------

def normalize_and_sort_words(name):
    if pd.isna(name):
        return ''
    clean = re.sub(r'[^a-zA-Z0-9]', '', name)  # 去除非字母数字
    segments = wordninja.split(clean.lower())  # 自动分词
    return ' '.join(sorted(segments))  # 小写+排序+空格连接（适合 token_sort_ratio）

# 对测试集添加处理列
df_4['PROCESSED_NAME'] = df_4['NAME'].apply(normalize_and_sort_words)

# 对 primary_names 中的 NAME 字段也处理（用于匹配）
primary_names['PROCESSED_NAME'] = primary_names['NAME'].apply(normalize_and_sort_words)
primary_name_list = primary_names['PROCESSED_NAME'].tolist()
primary_id_list = primary_names['ID'].tolist()

# -------------------------
# Step 3: 匹配函数
# -------------------------

def match_test_variant(test_row):
    test_id = test_row['ID']
    test_variant = test_row['PROCESSED_NAME']

    # 使用 RapidFuzz 进行模糊匹配
    match, score, _ = process.extractOne(
        test_variant, 
        primary_name_list,
        processor=None,  # 已预处理，无需再次处理
        scorer=fuzz.token_sort_ratio
    )

    matched_index = primary_name_list.index(match)
    matched_id = primary_id_list[matched_index]

    return test_id, matched_id, match, score

# -------------------------
# Step 4: 并行匹配执行
# -------------------------

def fuzzy_match_test_to_primary(test_df):
    results = []
    correct_matches = 0
    total = len(test_df)

    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [
            executor.submit(match_test_variant, row)
            for row in test_df.to_dict('records')
        ]

        for future in tqdm(as_completed(futures), total=total, desc="匹配进度"):
            test_id, matched_id, matched_name, score = future.result()
            original_id = df_4.loc[df_4['ID'] == test_id, 'ID'].values[0]
            is_correct = (test_id == matched_id)
            if is_correct and score < 75:
                print(f"低分但正确：{test_id} 得分: {score}")
            if is_correct:
                correct_matches += 1
            results.append({
                'ID': test_id,
                'NAME': df_4.loc[df_4['ID'] == test_id, 'NAME'].values[0],
                'PROCESSED_NAME': df_4.loc[df_4['ID'] == test_id, 'PROCESSED_NAME'].values[0],
                'MATCHED_ID': matched_id,
                'MATCHED_NAME': matched_name,
                'SCORE': score,
                'CORRECT': is_correct
            })

    accuracy = correct_matches / total
    return pd.DataFrame(results), accuracy

# -------------------------
# Step 5: 执行并保存结果
# -------------------------

matched_df, acc = fuzzy_match_test_to_primary(df_4)

# 保存结果
matched_df.to_csv("matched_results_df2.csv", index=False)
print(f"\n准确率: {acc * 100:.2f}%")
print("匹配结果已保存至 matched_results.csv")


匹配进度:  28%|██▊       | 280/1000 [00:01<00:04, 154.00it/s] 

低分但正确：45741 得分: 67.45562130177515
低分但正确：46923 得分: 72.22222222222221
低分但正确：41277 得分: 74.35897435897436
低分但正确：16990 得分: 67.74193548387098
低分但正确：8332 得分: 61.53846153846154
低分但正确：24563 得分: 70.76923076923076
低分但正确：16476 得分: 73.07692307692308
低分但正确：15342 得分: 72.97297297297297
低分但正确：52249 得分: 71.69811320754718
低分但正确：49795 得分: 72.34042553191489
低分但正确：29023 得分: 68.08510638297872
低分但正确：11773 得分: 73.6842105263158
低分但正确：10359 得分: 74.57627118644068


匹配进度:  39%|███▉      | 389/1000 [00:04<00:10, 61.04it/s] 

低分但正确：48742 得分: 72.3076923076923
低分但正确：50159 得分: 73.1958762886598
低分但正确：7585 得分: 63.01369863013699
低分但正确：18095 得分: 69.38775510204081


匹配进度: 100%|██████████| 1000/1000 [00:04<00:00, 200.28it/s]

低分但正确：10849 得分: 70.0
低分但正确：13407 得分: 64.86486486486487
低分但正确：50980 得分: 73.2824427480916
低分但正确：8407 得分: 70.0
低分但正确：7286 得分: 64.28571428571428
低分但正确：40942 得分: 74.4186046511628
低分但正确：21114 得分: 69.23076923076923
低分但正确：10640 得分: 62.5
低分但正确：52689 得分: 72.26890756302521
低分但正确：44787 得分: 74.32432432432432
低分但正确：41178 得分: 59.01639344262295
低分但正确：18087 得分: 70.27027027027026
低分但正确：48836 得分: 68.35443037974683
低分但正确：47548 得分: 70.89947089947091
低分但正确：11909 得分: 64.28571428571428
低分但正确：10657 得分: 67.64705882352942
低分但正确：34916 得分: 62.96296296296296
低分但正确：27446 得分: 74.07407407407408

准确率: 96.10%
匹配结果已保存至 matched_results.csv





In [11]:
import pandas as pd
import re
import wordninja

# 分词函数：去除空格 + 自动分词 + 转大写
def recover_and_segment(name):
    if pd.isna(name):  # 处理缺失值
        return ''
    clean = re.sub(r'[^a-zA-Z0-9]', '', name)
    segments = wordninja.split(clean)  # 分词
    return ' '.join(sorted(segments))  # 转大写并排序

# 读取 Excel 的 Sheet2
df_4 = pd.read_excel("test_02.xlsx", sheet_name="Sheet4")

# 假设需要处理的列名为 'NAME'，可替换为你的实际列名
df_4['PROCESSED_NAME'] = df_4['NAME'].apply(recover_and_segment)

# 保存为 CSV
df_4.to_csv("processed_sheet2.csv", index=False)

print("处理完成并写入 processed_sheet2.csv")


处理完成并写入 processed_sheet2.csv


In [13]:
import pandas as pd
from rapidfuzz import process, fuzz
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm 
import re
import wordninja # 导入 tqdm

# 加载数据
alternate_names = pd.read_csv('alternate.csv', nrows=1000)
primary_names = pd.read_csv('primary.csv')
test_names = df_1

# 查看数据
print(alternate_names.head())
print(primary_names.head())
print(test_names.head())

# 将 primary_names 的 NAME 字段转换为列表，方便快速匹配
primary_names_list = primary_names['NAME'].tolist()
primary_ids = primary_names['ID'].tolist()

def normalize_and_sort_words(name):
    clean = re.sub(r'[^a-zA-Z0-9]', '', name)
    # Step 2: 小写（便于 wordninja 识别）再切分
    segments = wordninja.split(clean.lower())
    # Step 3: 恢复大写首字母（可选）
    return ' '.join(segments)

# 匹配函数
def match_test_variant(test_row):
    test_id = test_row['ID']
    test_variant = test_row['PROCESSED_NAME']

    # 使用 rapidfuzz 找到最匹配的 NAME
    match, score, _ = process.extractOne(
        test_variant, 
        primary_names_list,
        processor=lambda x: ' '.join(sorted(re.sub(r'[^a-z0-9 ]', '', x.lower()).split())),
        scorer= fuzz.token_sort_ratio  # 使用 token_sort_ratio 提高匹配精度
    )

    # 获取匹配的 NAME 对应的 ID
    matched_index = primary_names_list.index(match)
    matched_id = primary_ids[matched_index]

    # 返回匹配结果
    return test_id, matched_id, score

# 并行匹配
def fuzzy_match_test_to_primary(test_names, primary_names):
    correct_matches = 0  # 记录正确匹配的数量
    total_tests = len(test_names)  # 总测试数量

    # 使用多线程并行匹配
    with ThreadPoolExecutor() as executor:
        # 提交任务
        futures = [
            executor.submit(match_test_variant, test_row)
            for test_row in test_names.to_dict('records')
        ]

        # 使用 tqdm 包装 futures，显示进度条
        for future in tqdm(as_completed(futures), total=total_tests, desc="匹配进度"):
            test_id, matched_id, score = future.result()
            if test_id == matched_id:
                if score < 75:
                    print(test_id, score)
                correct_matches += 1

    # 计算准确率
    accuracy = correct_matches / total_tests
    return accuracy

accuracy = fuzzy_match_test_to_primary( df_4, primary_names)
print(f"准确率: {accuracy * 100:.2f}%")


    ID                   NAME
0   36         AERO-CARIBBEAN
1  173            AVIA IMPORT
2  306  NATIONAL BANK OF CUBA
3  540                  COIBA
4  552                 CRYMSA
    ID                       NAME TYPE
0   36     AEROCARIBBEAN AIRLINES    C
1  173  ANGLO-CARIBBEAN CO., LTD.    C
2  306     BANCO NACIONAL DE CUBA    C
3  424         BOUTIQUE LA MAISON    C
4  475               CASA DE CUBA    C
      ID                                               NAME
0  16463                            PADIERNAPENALuisOrlando
1  18750  INSURANCECOMPANYSBERBANKINSURANCELIMITEDLIABIL...
2  52600                                 PORTEXTRADELIMITED
3  48766                       MAKAROVSergeiVyacheslavovich
4  19889                                    ISILSAUDIARABIA


匹配进度:   1%|          | 6/1000 [00:01<03:36,  4.60it/s]

43252 65.7718120805369
9385 53.46534653465347


匹配进度: 100%|██████████| 1000/1000 [00:18<00:00, 55.17it/s]

51229 48.54368932038835
40586 47.72727272727273
50408 59.79381443298969
37006 57.692307692307686
48742 56.4516129032258
50249 68.13186813186813
37670 71.55963302752293
29506 72.41379310344827
7853 60.215053763440864
15270 65.85365853658536
12168 62.295081967213115
32118 74.4186046511628
29850 60.0
12457 52.25225225225225
36097 69.76744186046511
27286 69.87951807228916
9400 69.6969696969697
23889 55.696202531645575
50621 74.66666666666666
47333 53.65853658536586
18588 70.88607594936708
7288 61.29032258064516
48770 73.52941176470588
22483 69.44444444444444
19949 69.56521739130434
21575 68.0
33014 64.0
40212 74.50980392156863
49821 60.31746031746032
50437 63.565891472868216
37877 66.66666666666667
45743 57.971014492753625
34120 68.57142857142857
10601 60.46511627906976
31143 73.6842105263158
23346 63.1578947368421
12529 60.60606060606061
12667 69.44444444444444
52180 52.38095238095239
50110 49.382716049382715
52548 67.13286713286713
52186 71.83098591549295
51192 65.3061224489796
19389 68.




In [5]:
import pandas as pd
from rapidfuzz import process, fuzz
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm 
import re
import wordninja # 导入 tqdm

# 加载数据
alternate_names = pd.read_csv('alternate.csv', nrows=1000)
primary_names = pd.read_csv('primary.csv')
test_names = df_1

# 查看数据
print(alternate_names.head())
print(primary_names.head())
print(test_names.head())

# 将 primary_names 的 NAME 字段转换为列表，方便快速匹配
primary_names_list = primary_names['NAME'].tolist()
primary_ids = primary_names['ID'].tolist()

def normalize_and_sort_words(name):
    clean = re.sub(r'[^a-zA-Z0-9]', '', name)
    # Step 2: 小写（便于 wordninja 识别）再切分
    segments = wordninja.split(clean.lower())
    # Step 3: 恢复大写首字母（可选）
    return ' '.join(segments)

def recover_and_segment(name):
    if pd.isna(name):  # 处理缺失值
        return ''
    clean = re.sub(r'[^a-zA-Z0-9]', '', name)  # 去掉所有非字母数字字符
    segments = wordninja.split(clean.lower())  # 分词
    return ' '.join([w.upper() for w in segments])  # 全大写输出

# 匹配函数
def match_test_variant(test_row):
    test_id = test_row['ID']
    test_variant_split = recover_and_segment(test_row['NAME'])
    test_variant = test_row['NAME']

    
    # 使用 rapidfuzz 找到最匹配的 NAME
    match1, score1, _ = process.extractOne(
        test_variant, 
        primary_names_list,
        processor=lambda x: re.sub(r'[^a-z0-9]', '', x.lower()),
        scorer=fuzz.WRatio  # 使用 token_sort_ratio 提高匹配精度
    )

    match2, score2, _ = process.extractOne(
        test_variant_split, 
        primary_names_list,
        processor=lambda x: re.sub(r'[^a-z0-9 ]', '', x.lower()),
        scorer=fuzz.token_sort_ratio  # 使用 token_sort_ratio 提高匹配精度
    )

    if score1 > score2:
        match = match1
        score = score1
    else:
        match = match2
        score = score2
    # 获取匹配的 NAME 对应的 ID
    matched_index = primary_names_list.index(match)
    matched_id = primary_ids[matched_index]

    # 返回匹配结果
    return {
        'test_id': test_id,
        'test_name': test_variant,
        'matched_id': matched_id,
        'matched_name': match,
        'score': score,
        'is_correct': int(test_id == matched_id and score > 75)
    }

# 并行匹配
# def fuzzy_match_test_to_primary(test_names, primary_names):
    correct_matches = 0  # 记录正确匹配的数量
    total_tests = len(test_names)  # 总测试数量

    # 使用多线程并行匹配
    with ThreadPoolExecutor() as executor:
        # 提交任务
        futures = [
            executor.submit(match_test_variant, test_row)
            for test_row in test_names.to_dict('records')
        ]

        # 使用 tqdm 包装 futures，显示进度条
        for future in tqdm(as_completed(futures), total=total_tests, desc="匹配进度"):
            test_id, matched_id, score = future.result()
            if test_id == matched_id:
                if score > 80:
                    correct_matches += 1

    # 计算准确率
    accuracy = correct_matches / total_tests
    return accuracy

def fuzzy_match_test_to_primary(test_names, primary_names, file_prefix='df'):
    total_tests = len(test_names)
    matched_results = []

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(match_test_variant, row) for row in test_names.to_dict('records')]
        for future in tqdm(as_completed(futures), total=total_tests, desc=f"匹配中 {file_prefix}"):
            result = future.result()
            matched_results.append(result)

    # 转换为 DataFrame
    results_df = pd.DataFrame(matched_results)
    accuracy = results_df['is_correct'].sum() / len(results_df)

    # 保存 CSV
    results_df.to_csv(f'matched_result_{file_prefix}.csv', index=False)
    print(f"[{file_prefix}] 匹配结果已保存，准确率: {accuracy * 100:.2f}%")
    return accuracy

# 计算准确率
df_list = [df_1,df_2,df_3,df_4,df_5,df_6,df_7,df_8]
for i in range(7):
    accuracy = fuzzy_match_test_to_primary(df_list[i], primary_names, file_prefix=f'df{i+1}')
    print(f"准确率: {accuracy * 100:.2f}%")

    ID                   NAME
0   36         AERO-CARIBBEAN
1  173            AVIA IMPORT
2  306  NATIONAL BANK OF CUBA
3  540                  COIBA
4  552                 CRYMSA
    ID                       NAME TYPE
0   36     AEROCARIBBEAN AIRLINES    C
1  173  ANGLO-CARIBBEAN CO., LTD.    C
2  306     BANCO NACIONAL DE CUBA    C
3  424         BOUTIQUE LA MAISON    C
4  475               CASA DE CUBA    C
      ID                                               NAME
0  16463                            PADIERNAPENALuisOrlando
1  18750  INSURANCECOMPANYSBERBANKINSURANCELIMITEDLIABIL...
2  52600                                 PORTEXTRADELIMITED
3  48766                       MAKAROVSergeiVyacheslavovich
4  19889                                    ISILSAUDIARABIA


匹配中 df1: 100%|██████████| 1000/1000 [00:30<00:00, 32.43it/s]


[df1] 匹配结果已保存，准确率: 99.70%
准确率: 99.70%


匹配中 df2: 100%|██████████| 1000/1000 [00:48<00:00, 20.51it/s]


[df2] 匹配结果已保存，准确率: 70.30%
准确率: 70.30%


匹配中 df3: 100%|██████████| 1000/1000 [00:31<00:00, 32.11it/s]


[df3] 匹配结果已保存，准确率: 99.50%
准确率: 99.50%


匹配中 df4: 100%|██████████| 1000/1000 [00:47<00:00, 20.86it/s]


[df4] 匹配结果已保存，准确率: 70.40%
准确率: 70.40%


匹配中 df5: 100%|██████████| 1000/1000 [00:42<00:00, 23.79it/s]


[df5] 匹配结果已保存，准确率: 91.00%
准确率: 91.00%


匹配中 df6: 100%|██████████| 1000/1000 [00:40<00:00, 24.65it/s]


[df6] 匹配结果已保存，准确率: 98.90%
准确率: 98.90%


匹配中 df7: 100%|██████████| 1000/1000 [00:45<00:00, 21.91it/s]

[df7] 匹配结果已保存，准确率: 89.10%
准确率: 89.10%





In [7]:
import pandas as pd
from rapidfuzz import process, fuzz
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm 
import re
import wordninja # 导入 tqdm

# 加载数据
alternate_names = pd.read_csv('alternate.csv', nrows=1000)
primary_names = pd.read_csv('primary.csv')
test_names = df_1

# 查看数据
print(alternate_names.head())
print(primary_names.head())
print(test_names.head())

# 将 primary_names 的 NAME 字段转换为列表，方便快速匹配
primary_names_list = primary_names['NAME'].tolist()
primary_ids = primary_names['ID'].tolist()

def normalize_and_sort_words(name):
    clean = re.sub(r'[^a-zA-Z0-9]', '', name)
    # Step 2: 小写（便于 wordninja 识别）再切分
    segments = wordninja.split(clean.lower())
    # Step 3: 恢复大写首字母（可选）
    return ' '.join(segments)

def recover_and_segment(name):
    if pd.isna(name):  # 处理缺失值
        return ''
    clean = re.sub(r'[^a-zA-Z0-9]', '', name)  # 去掉所有非字母数字字符
    segments = wordninja.split(clean.lower())  # 分词
    return ' '.join([w.upper() for w in segments])  # 全大写输出

# 匹配函数
def match_test_variant(test_row):
    # test_id = test_row['ID']
    test_variant_split = recover_and_segment(test_row['NAME'])
    test_variant = test_row['NAME']

    
    # 使用 rapidfuzz 找到最匹配的 NAME
    match1, score1, _ = process.extractOne(
        test_variant, 
        primary_names_list,
        processor=lambda x: re.sub(r'[^a-z0-9]', '', x.lower()),
        scorer=fuzz.WRatio  # 使用 token_sort_ratio 提高匹配精度
    )

    match2, score2, _ = process.extractOne(
        test_variant_split, 
        primary_names_list,
        processor=lambda x: re.sub(r'[^a-z0-9 ]', '', x.lower()),
        scorer=fuzz.token_sort_ratio  # 使用 token_sort_ratio 提高匹配精度
    )

    if score1 > score2:
        match = match1
        score = score1
    else:
        match = match2
        score = score2
    # 获取匹配的 NAME 对应的 ID
    matched_index = primary_names_list.index(match)
    matched_id = primary_ids[matched_index]

    # 返回匹配结果
    return matched_id, score

# 并行匹配
def fuzzy_match_test_to_primary(test_names, primary_names):
    correct_matches = 0  # 记录正确匹配的数量
    total_tests = len(test_names)  # 总测试数量

    # 使用多线程并行匹配
    with ThreadPoolExecutor() as executor:
        # 提交任务
        futures = [
            executor.submit(match_test_variant, test_row)
            for test_row in test_names.to_dict('records')
        ]

        # 使用 tqdm 包装 futures，显示进度条
        for future in tqdm(as_completed(futures), total=total_tests, desc="匹配进度"):
            matched_id, score = future.result()
            if score > 80:
                correct_matches += 1

    # 计算准确率
    accuracy = correct_matches / total_tests
    return accuracy

# 计算准确率
# df_list = [df_1,df_2,df_3,df_4,df_5,df_6,df_7,df_8]
# for i in range(7):
#     accuracy = fuzzy_match_test_to_primary(df_list[i], primary_names)
#     print(f"准确率: {accuracy * 100:.2f}%")

accuracy = fuzzy_match_test_to_primary(df_8, primary_names)
print(f"准确率: {accuracy * 100:.2f}%")

    ID                   NAME
0   36         AERO-CARIBBEAN
1  173            AVIA IMPORT
2  306  NATIONAL BANK OF CUBA
3  540                  COIBA
4  552                 CRYMSA
    ID                       NAME TYPE
0   36     AEROCARIBBEAN AIRLINES    C
1  173  ANGLO-CARIBBEAN CO., LTD.    C
2  306     BANCO NACIONAL DE CUBA    C
3  424         BOUTIQUE LA MAISON    C
4  475               CASA DE CUBA    C
      ID                                               NAME
0  16463                            PADIERNAPENALuisOrlando
1  18750  INSURANCECOMPANYSBERBANKINSURANCELIMITEDLIABIL...
2  52600                                 PORTEXTRADELIMITED
3  48766                       MAKAROVSergeiVyacheslavovich
4  19889                                    ISILSAUDIARABIA


匹配进度: 100%|██████████| 1000/1000 [00:50<00:00, 19.83it/s]

准确率: 10.20%





In [20]:
import pandas as pd

test_cases = [
    ("JAMES", "James"),
    ("Emma", "Em-ma"),
    ("M & M L.L.C.", "MM LLC"),
    ("Anna", "An na"),
    ("John Edward Smith", "John Smith"),
    ("Lee", "Mr. Lee"),
    ("Carlos Alfonzo Diaz", "Diaz Carlos Alfonzo"),
    ("Technology", "Tech"),
    ("Abdul Rasheed", "Abd al-Rashid"),
    ("Sarah", "Sara"),
    ("Center", "Cetner"),
    ("James Earl Smith", "J.E. Smith"),
    ("James Earl Smith", "J. E. Smith"),
    ("James Earl Smith", "JE Smith"),
    ("James Earl Smith", "J E Smith"),
    ("ltd", "limited"),
    ("shpg", "shipping"),
    ("apt", "apartment"),
    ("No.", "number"),
    ("NE", "northeast"),
    ("rd", "road"),
    ("US", "United States"),
    ("USA", "United States of America"),
    ("Eagle Pharmaceuticals Inc.", "Eagle Drugs Co"),
    ("company", "compañía"),
    ("company", "компания"),
    ("公司", "会社"),
    ("公司", "شركة"),
    ("William", "Billy"),
    ("Robert", "Bob"),
    ("Elizabeth", "Betty"),
    ("4", "four"),
    ("IV", "four"),
    ("1st", "first"),
    ("0", "O"),
    ("1", "I"),
    ("1", "l"),
    ("2", "Z"),
    ("5", "S"),
    ("6", "b"),
    ("8", "B"),
    ("13", "B"),
    ("m", "rn"),
    ("L", "|_"),
    ("W", "\/\/")
]

df_test = pd.DataFrame(test_cases, columns=["Name_1", "Name_2"])
df_test.to_csv("fuzzy_match_test_cases.csv", index=False)

def match_test_variant(df_test):
    for i in range(len(df_test)):
        match1, score1, _ = process.extractOne(
            df_test['Name_1'], 
            df_test['Name_2'],
            processor=lambda x: re.sub(r'[^a-z0-9 ]'),
            scorer=fuzz.WRatio  # 使用 token_sort_ratio 提高匹配精度
        )
        print(match1, score1)

match_test_variant(df_test)

  ("W", "\/\/")
  ("W", "\/\/")


TypeError: sub() missing 2 required positional arguments: 'repl' and 'string'

In [8]:
import re
import wordninja
from rapidfuzz import fuzz

# ------------------ 文本预处理 ------------------
def normalize(text):
    # 全大写，去除标点和多余空格
    text = text.upper()
    text = re.sub(r'[^A-Z0-9]', '', text)
    return ' '.join(wordninja.split(text))

# ------------------ 匹配并评分 ------------------
def compare_pairs(name1, name2):
    raw1, raw2 = name1.strip(), name2.strip()
    norm1, norm2 = normalize(raw1), normalize(raw2)

    score_raw = fuzz.WRatio(raw1, raw2)
    score_norm = fuzz.token_sort_ratio(norm1, norm2)
    score_partial = fuzz.partial_ratio(norm1, norm2)

    best_score = max(score_raw, score_norm, score_partial)

    print("原始字符串:")
    print(f"  A: {raw1}")
    print(f"  B: {raw2}")
    print("预处理后:")
    print(f"  A: {norm1}")
    print(f"  B: {norm2}")
    print("匹配分数:")
    print(f"  WRatio:             {score_raw}")
    print(f"  token_sort_ratio:   {score_norm}")
    print(f"  partial_ratio:      {score_partial}")
    print(f"✅ 最佳匹配得分:        {best_score}")
    print("=" * 60)

# ------------------ 样本对 ------------------
pairs = [
    ("TRADINGLIMITEDGENERALALCARDINAL", "ALCARDINAL GENERAL TRiADING LIMITED"),
    ("PADIERNAPENALuisOrlando", "Luisg PENA, PADIERNAM Orlando"),
    ("G O L D E N S T A R C O", "GOLDEN TAR CO"),
    ("S H A N G H A I S E A X U A N W U L T D F R E I G H T Y O U B I C O", "XUANWU YOUBI SEA FREI`HT SHANGHAI CO LTD"),
    ("HITTA, Sidan", "HITTA, Sidan Ag"),
    ("PREM INVESTMENT GROUP SAL (OFF-SHORE)", "PREMIER INVESTMENT GROUP SAL (OFF-SHORE)"),
    ("SHELESTENKO, Hennadiy O.", "SHELESTENKO, Hennadiy Oleksandrovych")
]

# ------------------ 执行匹配 ------------------
for a, b in pairs:
    compare_pairs(a, b)


原始字符串:
  A: TRADINGLIMITEDGENERALALCARDINAL
  B: ALCARDINAL GENERAL TRiADING LIMITED
预处理后:
  A: TRADING LIMITED GENERAL AL CARDINAL
  B: AL CARDINAL GENERAL TRIAD ING LIMITED
匹配分数:
  WRatio:             51.515151515151516
  token_sort_ratio:   88.88888888888889
  partial_ratio:      57.692307692307686
✅ 最佳匹配得分:        88.88888888888889
原始字符串:
  A: PADIERNAPENALuisOrlando
  B: Luisg PENA, PADIERNAM Orlando
预处理后:
  A: PA DIE RNA PENA LUIS ORLANDO
  B: LUIS GP E NAPA DIE RNA M ORLANDO
匹配分数:
  WRatio:             57.692307692307686
  token_sort_ratio:   66.66666666666667
  partial_ratio:      79.16666666666666
✅ 最佳匹配得分:        79.16666666666666
原始字符串:
  A: G O L D E N S T A R C O
  B: GOLDEN TAR CO
预处理后:
  A: GOLDEN STAR CO
  B: GOLDEN T ARCO
匹配分数:
  WRatio:             72.22222222222221
  token_sort_ratio:   81.4814814814815
  partial_ratio:      84.61538461538461
✅ 最佳匹配得分:        84.61538461538461
原始字符串:
  A: S H A N G H A I S E A X U A N W U L T D F R E I G H T Y O U B I C O
  B: XUANWU