# Q1: Filter the data for the corresponding country.



In [1]:
import pandas as pd
from pathlib import Path

# Load the raw WDIData.csv file into `df`. The location is resolved relative
# to the current user's home directory rather than an absolute path.
wdi_csv_path = Path.home().joinpath(
    "OneDrive", "Big Data Analysis", "01 WDI data analysis", "Data", "WDIData.csv"
)
df = pd.read_csv(wdi_csv_path)

In [2]:
### Select U.S. data.
# Build a boolean mask on the "Country Name" column, then keep the matching rows.
is_united_states = df["Country Name"].eq("United States")
df_US = df.loc[is_united_states]

# Q2: How many rows and columns are in this country's data?

In [3]:
# (number of rows, number of columns) of the U.S. subset selected above.
df_US.shape


(1478, 68)

# Q3: Convert your data to panel data.

In [4]:
### Convert to panel data.
# Reshape wide (one column per year) -> long -> one row per year with one
# column per indicator.
#
# Only columns whose label is a year are melted. The raw frame may carry
# non-year columns as well (presumably an empty "Unnamed: N" column caused by
# a trailing comma in the CSV -- verify against the file). Listing the year
# columns explicitly keeps such labels out of "Year" instead of relying on
# pivot_table silently discarding them because they happen to be all-NaN.
# "Indicator Code" is neither an id nor a value column, so melt leaves it out.
id_cols = ["Country Name", "Country Code", "Indicator Name"]
year_cols = [col for col in df_US.columns if str(col).strip().isdigit()]
assert year_cols, "no year-labelled columns found in df_US"

data_pd = (
    df_US
    .melt(id_vars=id_cols, value_vars=year_cols, var_name="Year", value_name="value")
    # pivot_table defaults: aggfunc="mean", dropna=True.
    .pivot_table(
        values="value",
        index=["Country Name", "Country Code", "Year"],
        columns="Indicator Name",
    )
    .reset_index()
    # Blank out the "Indicator Name" label that pivot_table puts on the column axis.
    .rename_axis("", axis=1)
)

data_pd.head()

Unnamed: 0,Country Name,Country Code,Year,Access to clean fuels and technologies for cooking (% of population),"Access to clean fuels and technologies for cooking, rural (% of rural population)","Access to clean fuels and technologies for cooking, urban (% of urban population)",Access to electricity (% of population),"Access to electricity, rural (% of rural population)","Access to electricity, urban (% of urban population)",Account ownership at a financial institution or with a mobile-money-service provider (% of population ages 15+),...,"Vulnerable employment, female (% of female employment) (modeled ILO estimate)","Vulnerable employment, male (% of male employment) (modeled ILO estimate)","Vulnerable employment, total (% of total employment) (modeled ILO estimate)","Wage and salaried workers, female (% of female employment) (modeled ILO estimate)","Wage and salaried workers, male (% of male employment) (modeled ILO estimate)","Wage and salaried workers, total (% of total employment) (modeled ILO estimate)","Water productivity, total (constant 2015 US$ GDP per cubic meter of total freshwater withdrawal)",Women Business and the Law Index Score (scale 1-100),Women's share of population ages 15+ living with HIV (%),Young people (ages 15-24) newly infected with HIV
0,United States,USA,1960,,,,,,,,...,,,,,,,,,,
1,United States,USA,1961,,,,,,,,...,,,,,,,,,,
2,United States,USA,1962,,,,,,,,...,,,,,,,,,,
3,United States,USA,1963,,,,,,,,...,,,,,,,,,,
4,United States,USA,1964,,,,,,,,...,,,,,,,,,,


In [5]:
### Change the Year data format
# "Year" comes from the melted column labels, so it is text at this point.
# "int64" is spelled out because plain `int` maps to the platform's default
# integer, which is 32-bit on Windows with NumPy < 2.
data_pd["Year"] = data_pd["Year"].astype("int64")
### Re-examine the data format
# Only the last expression of a cell is displayed, so the dtypes are inspected
# once, after the conversion.
data_pd.dtypes


Country Name                                                                                         object
Country Code                                                                                         object
Year                                                                                                  int64
Access to clean fuels and technologies for cooking (% of population)                                float64
Access to clean fuels and technologies for cooking, rural (% of rural population)                   float64
                                                                                                     ...   
Wage and salaried workers, total (% of total employment) (modeled ILO estimate)                     float64
Water productivity, total (constant 2015 US$ GDP per cubic meter of total freshwater withdrawal)    float64
Women Business and the Law Index Score (scale 1-100)                                                float64
Women's share of population

# Q4: Export the panel data

In [6]:
# Write the panel to WDI_US.csv under the user's home directory.
# index=False keeps the row index out of the file.
panel_csv_path = Path.home().joinpath(
    "OneDrive", "Big Data Analysis", "01 WDI data analysis", "Data", "WDI_US.csv"
)
data_pd.to_csv(panel_csv_path, index=False)

# Q5: How many missing values are there in each variable?

In [7]:
# Count the NaNs in every column, then discard the three identifier columns
# so that only indicator variables remain.
identifier_columns = ['Country Name', 'Country Code', 'Year']
missing_counts = data_pd.isna().sum().drop(labels=identifier_columns)
# The 30 indicators with the fewest missing values; ties keep their original order.
least_missing_vars = missing_counts.nsmallest(n=30, keep="first")
least_missing_vars


Age dependency ratio (% of working-age population)                               0
Age dependency ratio, old (% of working-age population)                          0
Age dependency ratio, young (% of working-age population)                        0
Arms exports (SIPRI trend indicator values)                                      0
Arms imports (SIPRI trend indicator values)                                      0
Claims on central government, etc. (% GDP)                                       0
Consumer price index (2010 = 100)                                                0
DEC alternative conversion factor (LCU per US$)                                  0
Domestic credit to private sector by banks (% of GDP)                            0
Employment to population ratio, 15+, female (%) (national estimate)              0
Employment to population ratio, 15+, male (%) (national estimate)                0
Employment to population ratio, 15+, total (%) (national estimate)               0
Emp

# Bonus
# Q6: Select the top 30 indicators with the fewest missing values and organize them into panel data.

In [8]:
### Non-standard answer.
# Keep two identifier columns plus the selected indicators; the index of
# `least_missing_vars` holds the indicator column names.
columns_to_keep = ['Country Name', 'Year', *least_missing_vars.index]
panel_data = data_pd[columns_to_keep]

panel_data

Unnamed: 0,Country Name,Year,Age dependency ratio (% of working-age population),"Age dependency ratio, old (% of working-age population)","Age dependency ratio, young (% of working-age population)",Arms exports (SIPRI trend indicator values),Arms imports (SIPRI trend indicator values),"Claims on central government, etc. (% GDP)",Consumer price index (2010 = 100),DEC alternative conversion factor (LCU per US$),...,GDP per capita (constant 2015 US$),GDP per capita (constant LCU),GDP per capita (current LCU),GDP per capita (current US$),GNI (current LCU),GNI (current US$),GNI per capita (current LCU),"Inflation, consumer prices (annual %)","Labor force participation rate for ages 15-24, female (%) (national estimate)","Labor force participation rate for ages 15-24, male (%) (national estimate)"
0,United States,1960,66.793151,15.395741,51.397410,6.132000e+09,221000000.0,23.797534,13.563061,1.0,...,19135.268182,19135.268182,3007.123445,3007.123445,5.464000e+11,5.464000e+11,3024.281705,1.457976,42.80,71.70
1,United States,1961,67.286807,15.575261,51.711546,6.290000e+09,238000000.0,24.512693,13.708284,1.0,...,19253.547329,19253.547329,3066.562869,3066.562869,5.668000e+11,5.668000e+11,3085.616606,1.070724,43.40,70.80
2,United States,1962,67.007202,15.658328,51.348875,6.130000e+09,264000000.0,23.362585,13.872615,1.0,...,20116.235124,20116.235124,3243.843078,3243.843078,6.092000e+11,6.092000e+11,3265.822513,1.198773,43.30,70.00
3,United States,1963,66.738972,15.759154,50.979818,9.360000e+09,323000000.0,22.395866,14.044590,1.0,...,20701.269947,20701.269947,3374.515171,3374.515171,6.431000e+11,6.431000e+11,3398.294248,1.239669,42.90,69.10
4,United States,1964,66.832702,15.880100,50.952602,1.231600e+10,326000000.0,21.592593,14.224207,1.0,...,21599.818705,21599.818705,3573.941185,3573.941185,6.907000e+11,6.907000e+11,3599.476781,1.278912,43.30,68.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,United States,2018,52.227204,23.439484,28.787720,9.674000e+09,380000000.0,38.991742,115.157303,1.0,...,59607.393660,59607.393660,62823.309438,62823.309438,2.093736e+13,2.093736e+13,64060.305876,2.442583,54.52,55.90
59,United States,2019,52.736350,24.119820,28.616530,1.088800e+10,890000000.0,41.231486,117.244195,1.0,...,60698.011299,60698.011299,65120.394663,65120.394663,2.176454e+13,2.176454e+13,66288.612419,1.812210,55.15,56.60
60,United States,2020,53.220894,24.857639,28.363255,9.426000e+09,801000000.0,53.591031,118.690502,1.0,...,58451.606715,58451.606715,63528.634303,63528.634303,2.147236e+13,2.147236e+13,64771.084028,1.233584,53.21,54.60
61,United States,2021,53.661595,25.629057,28.032538,1.099400e+10,868000000.0,62.072280,124.266414,1.0,...,61829.845627,61829.845627,70219.472454,70219.472454,2.361711e+13,2.361711e+13,71129.122264,4.697859,54.53,56.51
