In [11]:
import pandas as pd
from pathlib import Path
# Read the complete World Development Indicators table. na_values='..' makes
# pandas treat the literal string '..' as a missing value (NaN).
wdi_raw = pd.read_csv(
    Path.home() / 'OneDrive' / 'Rawdata' / 'World Bank World Development Index' / 'WDI_csv' / 'WDIData.csv',
    na_values='..',
)

# pandas labels a header-less column 'Unnamed: <position>', so the number in
# that label shifts whenever the file gains or loses a column (for example when
# a new year is added). Drop such placeholder columns by prefix instead of
# hard-coding 'Unnamed: 67', which raises KeyError once the position changes.
wdi_raw = wdi_raw.loc[:, ~wdi_raw.columns.str.startswith('Unnamed:')]

# Keep only the rows for the United States. The raw table stays available as
# `wdi_raw`; `df` is the US subset used by the following cells.
df = wdi_raw[wdi_raw['Country Name'] == 'United States']
df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
376890,United States,USA,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,
376891,United States,USA,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.RU.ZS,,,,,,,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,
376892,United States,USA,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.UR.ZS,,,,,,,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,
376893,United States,USA,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,
376894,United States,USA,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,


# Q8 How to reshape data?
## Pandas provides several methods for reshaping data, such as melt(), pivot_table(), stack(), and unstack().
### https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html

## 1) melt(). This function is used to transform or reshape data from a wide format to a long format. It essentially unpivots the DataFrame, converting columns into rows.
### https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.melt.html#pandas.DataFrame.melt

### Key parameters:
#### id_vars: A list or tuple of column names to use as identifier variables. These columns will remain as columns in the resulting DataFrame.
#### value_vars: A list or tuple of column names to unpivot. These columns will be converted into a single column in the resulting DataFrame.
#### var_name: The name to use for the column that contains the variable names (default is 'variable').
#### value_name: The name to use for the column that contains the values (default is 'value').

In [2]:
# Reshape from wide to long: the three identifier columns are kept as they are,
# and every remaining column (one per year) is unpivoted into a pair of columns,
# 'Year' (the former column label) and 'value' (the cell content).
#
# var_name must be a scalar when the columns are not a MultiIndex. A list such
# as ['Year'] is rejected by newer pandas releases ("must be a scalar") and was
# only accepted by accident in older ones, so a plain string is passed here.
df_melt = (
    df.drop(columns='Indicator Code')  # the indicator name alone identifies each row
    .melt(
        id_vars=['Country Name', 'Country Code', 'Indicator Name'],
        var_name='Year',
        value_name='value',  # same as the default, spelled out because later cells use it
    )
)

df_melt.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Year,value
0,United States,USA,Access to clean fuels and technologies for coo...,1960,
1,United States,USA,Access to clean fuels and technologies for coo...,1960,
2,United States,USA,Access to clean fuels and technologies for coo...,1960,
3,United States,USA,Access to electricity (% of population),1960,
4,United States,USA,"Access to electricity, rural (% of rural popul...",1960,


## 2) pivot_table(). This function is used to create a pivot table from a DataFrame. It allows you to summarize and aggregate data based on one or more columns, providing insights into the relationships between different variables.
### https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.pivot_table.html#pandas.pivot_table

### Key parameters:

#### data: The DataFrame to be used for creating the pivot table.
#### values: The column(s) to aggregate.
#### index: The column(s) to be used as the index of the resulting pivot table.
#### columns: The column(s) to be used as the columns of the resulting pivot table.
#### aggfunc: The aggregation function(s) to apply to the values (default is 'mean'). It can be a single function, a list of functions, or a dictionary mapping columns to functions.
#### fill_value: The value to replace missing values with (default is None).

In [3]:
# Spread the long table back out: one row per (Country Name, Country Code, Year)
# combination and one column per indicator, filled from the 'value' column.
# No aggfunc is passed, so pivot_table applies its default aggregation (mean)
# to any cell that receives more than one value.
df_pivottable = pd.pivot_table(
    df_melt,
    index=['Country Name', 'Country Code', 'Year'],
    columns='Indicator Name',
    values='value',
)

df_pivottable.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Indicator Name,Access to clean fuels and technologies for cooking (% of population),"Access to clean fuels and technologies for cooking, rural (% of rural population)","Access to clean fuels and technologies for cooking, urban (% of urban population)",Access to electricity (% of population),"Access to electricity, rural (% of rural population)","Access to electricity, urban (% of urban population)",Account ownership at a financial institution or with a mobile-money-service provider (% of population ages 15+),"Account ownership at a financial institution or with a mobile-money-service provider, female (% of population ages 15+)","Account ownership at a financial institution or with a mobile-money-service provider, male (% of population ages 15+)","Account ownership at a financial institution or with a mobile-money-service provider, older adults (% of population ages 25+)",...,"Vulnerable employment, female (% of female employment) (modeled ILO estimate)","Vulnerable employment, male (% of male employment) (modeled ILO estimate)","Vulnerable employment, total (% of total employment) (modeled ILO estimate)","Wage and salaried workers, female (% of female employment) (modeled ILO estimate)","Wage and salaried workers, male (% of male employment) (modeled ILO estimate)","Wage and salaried workers, total (% of total employment) (modeled ILO estimate)","Water productivity, total (constant 2015 US$ GDP per cubic meter of total freshwater withdrawal)",Women Business and the Law Index Score (scale 1-100),Women's share of population ages 15+ living with HIV (%),Young people (ages 15-24) newly infected with HIV
Country Name,Country Code,Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
United States,USA,1960,,,,,,,,,,,...,,,,,,,,,,
United States,USA,1961,,,,,,,,,,,...,,,,,,,,,,
United States,USA,1962,,,,,,,,,,,...,,,,,,,,,,
United States,USA,1963,,,,,,,,,,,...,,,,,,,,,,
United States,USA,1964,,,,,,,,,,,...,,,,,,,,,,


In [4]:
# reset_index() moves the index levels (Country Name, Country Code, Year) back
# into ordinary columns; rename_axis() then replaces the label of the columns
# axis ('Indicator Name', left over from the pivot) with an empty string.
WDI_US = (
    df_pivottable
    .reset_index()
    .rename_axis(columns='')
)
WDI_US.head()

Unnamed: 0,Country Name,Country Code,Year,Access to clean fuels and technologies for cooking (% of population),"Access to clean fuels and technologies for cooking, rural (% of rural population)","Access to clean fuels and technologies for cooking, urban (% of urban population)",Access to electricity (% of population),"Access to electricity, rural (% of rural population)","Access to electricity, urban (% of urban population)",Account ownership at a financial institution or with a mobile-money-service provider (% of population ages 15+),...,"Vulnerable employment, female (% of female employment) (modeled ILO estimate)","Vulnerable employment, male (% of male employment) (modeled ILO estimate)","Vulnerable employment, total (% of total employment) (modeled ILO estimate)","Wage and salaried workers, female (% of female employment) (modeled ILO estimate)","Wage and salaried workers, male (% of male employment) (modeled ILO estimate)","Wage and salaried workers, total (% of total employment) (modeled ILO estimate)","Water productivity, total (constant 2015 US$ GDP per cubic meter of total freshwater withdrawal)",Women Business and the Law Index Score (scale 1-100),Women's share of population ages 15+ living with HIV (%),Young people (ages 15-24) newly infected with HIV
0,United States,USA,1960,,,,,,,,...,,,,,,,,,,
1,United States,USA,1961,,,,,,,,...,,,,,,,,,,
2,United States,USA,1962,,,,,,,,...,,,,,,,,,,
3,United States,USA,1963,,,,,,,,...,,,,,,,,,,
4,United States,USA,1964,,,,,,,,...,,,,,,,,,,


# Q9 How to export data to csv or xlsx?

In [5]:
# Destination file for the reshaped US table, below the user's home directory.
wdi_us_csv = Path.home() / 'OneDrive' / '2024' / 'Big Data Analysis' / 'WDI_US.csv'

# index=False keeps the integer row labels out of the written file.
WDI_US.to_csv(wdi_us_csv, index=False)

# Q10 How many missing values for each variable?

In [6]:
# For every column, count the cells that are missing (NaN), then order the
# columns from the fewest missing values to the most (ascending is the default).
isna_data = (
    WDI_US
    .isna()
    .sum()
    .sort_values()
)
isna_data


Country Name                                                                                     0
Labor force participation rate for ages 15-24, female (%) (national estimate)                    0
Labor force participation rate for ages 15-24, male (%) (national estimate)                      0
Labor force participation rate for ages 15-24, total (%) (national estimate)                     0
Labor force participation rate, female (% of female population ages 15+) (national estimate)     0
                                                                                                ..
Trained teachers in lower secondary education, male (% of male teachers)                        62
Trained teachers in lower secondary education, female (% of female teachers)                    62
Trained teachers in lower secondary education (% of total teachers)                             62
Completeness of birth registration, female (%)                                                  62
Mammal sp

In [7]:
# The complementary view: the number of non-missing cells in every column,
# listed from the best-populated column down to the sparsest.
count_data = (
    WDI_US
    .count()
    .sort_values(ascending=False)
)
count_data


Country Name                                                          63
Population ages 10-14, female (% of female population)                63
Net foreign assets (current LCU)                                      63
Official exchange rate (LCU per US$, period average)                  63
Population ages 0-14 (% of total population)                          63
                                                                      ..
Progression to secondary school, female (%)                            1
Progression to secondary school, male (%)                              1
Bird species, threatened                                               1
Mammal species, threatened                                             1
Ease of doing business rank (1=most business-friendly regulations)     1
Length: 1166, dtype: int64