In [14]:
!pip install wbgapi pandas



In [15]:
import wbgapi as wb
import pandas as pd
from datetime import date

### 1) 设定区域与年份 

In [16]:
APAC_REGIONS = ['EAS', 'SAS']   # 东亚太平洋 + 南亚
END_YEAR = date.today().year - 1         # 用去年作为最近完整年
START_YEAR = END_YEAR - 9                 # 近10年
YEARS = range(START_YEAR, END_YEAR + 1)

### 2) 选择指标（>=8 个，含时间维度）

In [17]:
INDICATORS = {
    'NY.GDP.MKTP.CD'      : 'gdp_current_usd',        # GDP(现价美元)
    'NY.GDP.MKTP.KD.ZG'   : 'gdp_real_growth_pct',    # 实际GDP增速(%)
    'NY.GDP.PCAP.CD'      : 'gdp_per_capita_usd',     # 人均GDP(美元)
    'FP.CPI.TOTL.ZG'      : 'inflation_cpi_pct',      # 居民消费价格通胀(%)
    'SL.UEM.TOTL.ZS'      : 'unemployment_pct',       # 失业率(%)
    'NE.EXP.GNFS.ZS'      : 'exports_gdp_pct',        # 货物与服务出口/GDP(%)
    'NE.IMP.GNFS.ZS'      : 'imports_gdp_pct',        # 货物与服务进口/GDP(%)
    'GC.NLD.TOTL.GD.ZS'   : 'fiscal_balance_gdp_pct', # 政府现金收支差/GDP(%)
    'BX.KLT.DINV.WD.GD.ZS': 'fdi_inflow_gdp_pct',     # 外商直接投资净流入/GDP(%)
}

# === 3) 取亚太地区成员经济体（返回 ISO3 代码列表）===
# WBGAPI 支持按“地区→成员国”查询
economies = sorted({e for r in APAC_REGIONS for e in wb.region.members(r)})



### 4) 批量拉取数据：多个指标 × 多国 × 多年 → DataFrame

In [18]:
df_wide = wb.data.DataFrame(
    list(INDICATORS.keys()),
    economies,
    time=YEARS,
    index=['economy','time'],
    columns='series',
    labels=True,
    numericTimeKeys=True
)

# 3) 整理列名
df = (df_wide
      .rename(columns=INDICATORS)
      .reset_index()
      .rename(columns={'economy': 'iso3', 'time': 'year', 'Country': 'country_name'}))


In [19]:
df_wide.columns

Index(['Country', 'Time', 'BX.KLT.DINV.WD.GD.ZS', 'FP.CPI.TOTL.ZG',
       'GC.NLD.TOTL.GD.ZS', 'NE.EXP.GNFS.ZS', 'NE.IMP.GNFS.ZS',
       'NY.GDP.MKTP.CD', 'NY.GDP.MKTP.KD.ZG', 'NY.GDP.PCAP.CD',
       'SL.UEM.TOTL.ZS'],
      dtype='object')

In [20]:
df

Unnamed: 0,iso3,year,country_name,Time,fdi_inflow_gdp_pct,inflation_cpi_pct,fiscal_balance_gdp_pct,exports_gdp_pct,imports_gdp_pct,gdp_current_usd,gdp_real_growth_pct,gdp_per_capita_usd,unemployment_pct
0,WSM,2024,Samoa,2024,0.350274,2.172455,,29.315509,53.816154,1.068025e+09,9.422350,4898.771408,4.551
1,WSM,2023,Samoa,2023,0.255376,7.921647,4.885785,28.753800,61.903681,9.381894e+08,9.208649,4330.178405,4.977
2,WSM,2022,Samoa,2022,0.576323,10.961882,6.559183,12.266383,52.716798,8.329452e+08,-5.306291,3869.466395,5.048
3,WSM,2021,Samoa,2021,1.060395,3.133205,0.983685,11.685690,49.203196,8.439236e+08,-7.078831,3947.645179,6.612
4,WSM,2020,Samoa,2020,0.509390,-1.568912,7.638424,29.195960,48.287640,8.688984e+08,-3.108444,4099.660091,7.435
...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,ASM,2019,American Samoa,2019,,,,61.669243,94.899536,6.470000e+08,-0.487805,12886.135952,
426,ASM,2018,American Samoa,2018,,,,68.231612,103.599374,6.390000e+08,2.671119,12552.054687,
427,ASM,2017,American Samoa,2017,,,,59.150327,102.287582,6.120000e+08,-6.987578,11863.683945,
428,ASM,2016,American Samoa,2016,,,,63.934426,95.230999,6.710000e+08,-1.679389,12843.334290,


In [21]:
meta = wb.economy.DataFrame(economies).reset_index().rename(columns={'index': 'iso3'})
#re colum name id to iso3
meta = meta.rename(columns={'id': 'iso3'})

In [22]:

df = df.merge(meta[['iso3', 'region', 'incomeLevel']], on='iso3', how='left')

# 5) 数值列转数值
value_cols = list(INDICATORS.values())
df[value_cols] = df[value_cols].apply(pd.to_numeric, errors='coerce')

# 6) 保存
df = df[['iso3','country_name','region','incomeLevel','year'] + value_cols].sort_values(['iso3','year'])
df.to_csv('apac_wdi_10y.csv', index=False)
print(f"OK: {START_YEAR}-{END_YEAR}, {len(economies)} 个经济体, {len(INDICATORS)} 指标 → apac_wdi_10y.csv")

OK: 2015-2024, 43 个经济体, 9 指标 → apac_wdi_10y.csv


In [27]:
# 你计划使用的“可读指标名”（有多少就用多少）
all_candidates = [
    'gdp_current_usd', 'gdp_per_capita_usd', 'gdp_real_growth_pct',
    'inflation_cpi_pct', 'unemployment_pct',
    'exports_gdp_pct', 'imports_gdp_pct',
    'fdi_inflow_gdp_pct', 'fiscal_balance_gdp_pct'
]
value_cols = [c for c in all_candidates if c in df.columns]

# 类型划分
level_cols = [c for c in ['gdp_current_usd', 'gdp_per_capita_usd'] if c in value_cols]
rate_cols  = [c for c in value_cols if c not in level_cols]  # 百分比/比率/增速等

# ---- 1) 缺失诊断报告（可选打印） ----
missing_by_col = df[value_cols].isna().mean().sort_values(ascending=False)
print('[各指标缺失率]')
print((missing_by_col*100).round(1).astype(str) + '%')

coverage_by_country = 1 - df.groupby('iso3')[value_cols].apply(lambda x: x.isna().mean()).mean(axis=1)
print('\n[各国家平均覆盖率Top/Bottom]')
print(coverage_by_country.sort_values(ascending=False).head(5))
print(coverage_by_country.sort_values().head(5))



[各指标缺失率]
fiscal_balance_gdp_pct    45.8%
inflation_cpi_pct         23.3%
unemployment_pct          18.6%
exports_gdp_pct           18.1%
imports_gdp_pct           18.1%
fdi_inflow_gdp_pct        14.2%
gdp_real_growth_pct        6.3%
gdp_current_usd            5.6%
gdp_per_capita_usd         5.6%
dtype: object

[各国家平均覆盖率Top/Bottom]
iso3
WSM    0.988889
PHL    0.988889
MYS    0.988889
THA    0.988889
LKA    0.988889
dtype: float64
iso3
PRK    0.111111
TUV    0.400000
ASM    0.444444
MNP    0.444444
NCL    0.522222
dtype: float64
