In [None]:
# set paths
from paths import DATA_DIR

In [15]:
import pandas as pd

In [16]:
# 重新加载原始数据
df_state_115 = pd.read_csv(DATA_DIR / "final_state_115_0803.csv")
df_state_name = pd.read_excel(DATA_DIR / "州名和州码.xlsx")

print("=== 去重前数据检查 ===")
print(f"df_state_name原始行数: {len(df_state_name)}")
print(f"df_state_name中state_name的唯一值数量: {df_state_name['state_name'].nunique()}")

# 对df_state_name按state_name去重，保留第一条记录
df_state_name_unique = df_state_name.drop_duplicates(subset=['state_name'], keep='first')

print(f"\n=== 去重后数据检查 ===")
print(f"df_state_name去重后行数: {len(df_state_name_unique)}")
print(f"df_state_name去重后state_name的唯一值数量: {df_state_name_unique['state_name'].nunique()}")

# 统一大小写和格式，提高匹配率
df_state_115['NAME_clean'] = df_state_115['NAME'].str.strip().str.lower()
df_state_name_unique['state_name_clean'] = df_state_name_unique['state_name'].str.strip().str.lower()

print(f"\n=== 清理后的数据样例 ===")
print("df_state_115中的NAME_clean前10个:")
print(df_state_115['NAME_clean'].head(10).tolist())
print("\ndf_state_name_unique中的state_name_clean前10个:")
print(df_state_name_unique['state_name_clean'].head(10).tolist())

# 执行合并，使用清理后的字段
df_state_115_merged = df_state_115.merge(
    df_state_name_unique,
    left_on='NAME_clean',
    right_on='state_name_clean',
    how='left'
)

print(f"\n=== 合并结果 ===")
print(f"合并前df_state_115行数: {len(df_state_115)}")
print(f"合并后行数: {len(df_state_115_merged)}")

# 统计匹配情况
if 'state_code' in df_state_115_merged.columns:
    matched_count = df_state_115_merged['state_code'].notna().sum()
    unmatched_count = df_state_115_merged['state_code'].isna().sum()
    total_count = len(df_state_115_merged)

    print(f"\n匹配上的记录数: {matched_count}")
    print(f"未匹配上的记录数: {unmatched_count}")
    print(f"总记录数: {total_count}")
    print(f"匹配率: {matched_count / total_count * 100:.2f}%")

    # 查看未匹配的州名
    if unmatched_count > 0:
        print(f"\n未匹配的州名:")
        unmatched_states = df_state_115_merged[df_state_115_merged['state_code'].isna()]['NAME'].unique()
        for state in unmatched_states:
            print(f"  - {state}")

# 删除临时的清理列
df_state_115_merged = df_state_115_merged.drop(columns=['NAME_clean', 'state_name_clean'])

# 更新df_state_115
df_state_115 = df_state_115_merged

=== 去重前数据检查 ===
df_state_name原始行数: 156
df_state_name中state_name的唯一值数量: 52

=== 去重后数据检查 ===
df_state_name去重后行数: 52
df_state_name去重后state_name的唯一值数量: 52

=== 清理后的数据样例 ===
df_state_115中的NAME_clean前10个:
['alabama', 'alaska', 'arizona', 'arkansas', 'california', 'colorado', 'connecticut', 'delaware', 'district of columbia', 'florida']

df_state_name_unique中的state_name_clean前10个:
['alaska', 'alabama', 'arkansas', 'colorado', 'district of columbia', 'georgia', 'iowa', 'idaho', 'illinois', 'kansas']

=== 合并结果 ===
合并前df_state_115行数: 52
合并后行数: 52

匹配上的记录数: 52
未匹配上的记录数: 0
总记录数: 52
匹配率: 100.00%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_state_name_unique['state_name_clean'] = df_state_name_unique['state_name'].str.strip().str.lower()


In [17]:
df_state_115.to_csv(DATA_DIR / "state_115_1220_with_codes.csv", index=False)