# Population Dataset Cleaning

### Data Sources
- Population Dataset: https://data.worldbank.org/indicator/SP.POP.TOTL
    - **Important Note:** The raw .csv file contains a column whose string values include commas, which prevented the file from being parsed properly with `pd.read_csv()`. That column is not needed for this analysis, so it was removed in Excel beforehand to circumvent the issue; the edited file is the one loaded below.

### Importing Libraries:
`country-converter`: https://pypi.org/project/country-converter/

In [1]:
from pathlib import Path

import country_converter as coco
import numpy as np
import pandas as pd

### 1. Load Dataset:

In [2]:
# Location of the manually edited World Bank export (relative to the notebook)
path_import = r"raw_data/population_raw_edited.csv"

# Read the file as-is; the preamble rows and the real header row come in as
# ordinary data rows and are dealt with in the following sections.
df_csv = pd.read_csv(filepath_or_buffer=path_import)

In [3]:
# Work on a deep copy so the freshly loaded frame stays untouched
df = df_csv.copy(deep=True)

# Preview the first ten rows
df.head(n=10)

Unnamed: 0,Data Source,World Development Indicators,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63,Unnamed: 64,Unnamed: 65,Unnamed: 66
0,,,,,,,,,,,...,,,,,,,,,,
1,Last Updated Date,2024-06-28,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,Country Name,Country Code,Indicator Code,1960.0,1961.0,1962.0,1963.0,1964.0,1965.0,1966.0,...,2014.0,2015.0,2016.0,2017.0,2018.0,2019.0,2020.0,2021.0,2022.0,2023.0
4,Aruba,ABW,SP.POP.TOTL,54608.0,55811.0,56682.0,57475.0,58178.0,58782.0,59291.0,...,103594.0,104257.0,104874.0,105439.0,105962.0,106442.0,106585.0,106537.0,106445.0,106277.0
5,Africa Eastern and Southern,AFE,SP.POP.TOTL,130692579.0,134169237.0,137835590.0,141630546.0,145605995.0,149742351.0,153955516.0,...,583651101.0,600008424.0,616377605.0,632746570.0,649757148.0,667242986.0,685112979.0,702977106.0,720859132.0,739108306.0
6,Afghanistan,AFG,SP.POP.TOTL,8622466.0,8790140.0,8969047.0,9157465.0,9355514.0,9565147.0,9783147.0,...,32716210.0,33753499.0,34636207.0,35643418.0,36686784.0,37769499.0,38972230.0,40099462.0,41128771.0,42239854.0
7,Africa Western and Central,AFW,SP.POP.TOTL,97256290.0,99314028.0,101445032.0,103667517.0,105959979.0,108336203.0,110798486.0,...,397855507.0,408690375.0,419778384.0,431138704.0,442646825.0,454306063.0,466189102.0,478185907.0,490330870.0,502789511.0
8,Angola,AGO,SP.POP.TOTL,5357195.0,5441333.0,5521400.0,5599827.0,5673199.0,5736582.0,5787044.0,...,27128337.0,28127721.0,29154746.0,30208628.0,31273533.0,32353588.0,33428486.0,34503774.0,35588987.0,36684202.0
9,Albania,ALB,SP.POP.TOTL,1608800.0,1659800.0,1711319.0,1762621.0,1814135.0,1864791.0,1914573.0,...,2889104.0,2880703.0,2876101.0,2873457.0,2866376.0,2854191.0,2837849.0,2811666.0,2777689.0,2745972.0


### 2. Assign Correct Column Headers and Convert Years to `int`:

In [4]:
# Converts finite floats to int; everything else is returned untouched
def convert_to_int(value):
    """Return ``value`` as an ``int`` when it is a finite float.

    Parameters
    ----------
    value : Any
        A single cell value, e.g. a header label such as ``1960.0`` or
        ``"Country Name"``.

    Returns
    -------
    Any
        ``int(value)`` (truncated toward zero) for finite floats. Every other
        value is returned unchanged, including non-float objects and the
        non-finite floats NaN / +inf / -inf.
    """
    # int() raises ValueError on NaN and OverflowError on +/-inf, so a blank
    # cell would otherwise crash the conversion; pass such values through.
    if isinstance(value, float) and np.isfinite(value):
        return int(value)
    return value

In [5]:
# Row 3 of the raw frame carries the real header labels; the year labels were
# read as floats, so normalise them to int before using them as column names.
header_row = df.iloc[3]

df_headers = df.copy()
df_headers.columns = header_row.apply(convert_to_int)

df_headers.head(n=7)

3,Country Name,Country Code,Indicator Code,1960,1961,1962,1963,1964,1965,1966,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,,,,,,,,,,,...,,,,,,,,,,
1,Last Updated Date,2024-06-28,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,Country Name,Country Code,Indicator Code,1960.0,1961.0,1962.0,1963.0,1964.0,1965.0,1966.0,...,2014.0,2015.0,2016.0,2017.0,2018.0,2019.0,2020.0,2021.0,2022.0,2023.0
4,Aruba,ABW,SP.POP.TOTL,54608.0,55811.0,56682.0,57475.0,58178.0,58782.0,59291.0,...,103594.0,104257.0,104874.0,105439.0,105962.0,106442.0,106585.0,106537.0,106445.0,106277.0
5,Africa Eastern and Southern,AFE,SP.POP.TOTL,130692579.0,134169237.0,137835590.0,141630546.0,145605995.0,149742351.0,153955516.0,...,583651101.0,600008424.0,616377605.0,632746570.0,649757148.0,667242986.0,685112979.0,702977106.0,720859132.0,739108306.0
6,Afghanistan,AFG,SP.POP.TOTL,8622466.0,8790140.0,8969047.0,9157465.0,9355514.0,9565147.0,9783147.0,...,32716210.0,33753499.0,34636207.0,35643418.0,36686784.0,37769499.0,38972230.0,40099462.0,41128771.0,42239854.0


In [6]:
# Clear the label left on the columns axis by the header assignment
df_headers = df_headers.rename_axis(None, axis="columns")

### 3. Drop Unnecessary Rows:

In [7]:
# Rows 0-2 are file preamble and row 3 is the header row that now lives in the
# column labels, so none of them are data.
df_rows_dropped = df_headers.drop(index=[0, 1, 2, 3])

df_rows_dropped.head(n=7)

Unnamed: 0,Country Name,Country Code,Indicator Code,1960,1961,1962,1963,1964,1965,1966,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
4,Aruba,ABW,SP.POP.TOTL,54608.0,55811.0,56682.0,57475.0,58178.0,58782.0,59291.0,...,103594.0,104257.0,104874.0,105439.0,105962.0,106442.0,106585.0,106537.0,106445.0,106277.0
5,Africa Eastern and Southern,AFE,SP.POP.TOTL,130692579.0,134169237.0,137835590.0,141630546.0,145605995.0,149742351.0,153955516.0,...,583651101.0,600008424.0,616377605.0,632746570.0,649757148.0,667242986.0,685112979.0,702977106.0,720859132.0,739108306.0
6,Afghanistan,AFG,SP.POP.TOTL,8622466.0,8790140.0,8969047.0,9157465.0,9355514.0,9565147.0,9783147.0,...,32716210.0,33753499.0,34636207.0,35643418.0,36686784.0,37769499.0,38972230.0,40099462.0,41128771.0,42239854.0
7,Africa Western and Central,AFW,SP.POP.TOTL,97256290.0,99314028.0,101445032.0,103667517.0,105959979.0,108336203.0,110798486.0,...,397855507.0,408690375.0,419778384.0,431138704.0,442646825.0,454306063.0,466189102.0,478185907.0,490330870.0,502789511.0
8,Angola,AGO,SP.POP.TOTL,5357195.0,5441333.0,5521400.0,5599827.0,5673199.0,5736582.0,5787044.0,...,27128337.0,28127721.0,29154746.0,30208628.0,31273533.0,32353588.0,33428486.0,34503774.0,35588987.0,36684202.0
9,Albania,ALB,SP.POP.TOTL,1608800.0,1659800.0,1711319.0,1762621.0,1814135.0,1864791.0,1914573.0,...,2889104.0,2880703.0,2876101.0,2873457.0,2866376.0,2854191.0,2837849.0,2811666.0,2777689.0,2745972.0
10,Andorra,AND,SP.POP.TOTL,9443.0,10216.0,11014.0,11839.0,12690.0,13563.0,14546.0,...,71621.0,71746.0,72540.0,73837.0,75013.0,76343.0,77700.0,79034.0,79824.0,80088.0


### 4. Keep Only Relevant Columns:
- We only need data from 2020-2023.

In [8]:
# Keep only the country names and the 2020-2023 population columns
df_cols_dropped = df_rows_dropped.loc[:, ["Country Name", 2020, 2021, 2022, 2023]]
df_cols_dropped

Unnamed: 0,Country Name,2020,2021,2022,2023
4,Aruba,106585.0,106537.0,106445.0,106277.0
5,Africa Eastern and Southern,685112979.0,702977106.0,720859132.0,739108306.0
6,Afghanistan,38972230.0,40099462.0,41128771.0,42239854.0
7,Africa Western and Central,466189102.0,478185907.0,490330870.0,502789511.0
8,Angola,33428486.0,34503774.0,35588987.0,36684202.0
...,...,...,...,...,...
265,Kosovo,1790133.0,1786038.0,1768086.0,1756374.0
266,"Yemen, Rep.",32284046.0,32981641.0,33696614.0,34449825.0
267,South Africa,58801927.0,59392255.0,59893885.0,60414495.0
268,Zambia,18927715.0,19473125.0,20017675.0,20569737.0


### 5. Check for Duplicates:

In [9]:
# Show rows that are exact repeats of an earlier row (all columns compared).
# The method must be called: indexing with the bare bound method only works
# because pandas invokes callables passed as an indexer.
df_cols_dropped[df_cols_dropped.duplicated()]

Unnamed: 0,Country Name,2020,2021,2022,2023


### 6. Check for Missing Values:

In [10]:
# Show every row that has a missing value in at least one column
df_cols_dropped.loc[df_cols_dropped.isna().any(axis="columns")]

Unnamed: 0,Country Name,2020,2021,2022,2023
114,Not classified,,,,


In [11]:
# Remove every row that contains at least one missing value
df_na_dropped = df_cols_dropped.dropna(axis="index", how="any")

### 7. Standardize Country Names:
- Using [`country-converter`](https://pypi.org/project/country-converter/) package.

In [12]:
# Distinct raw country labels present in the cleaned frame
country_names = df_na_dropped["Country Name"].unique()

# Ask country_converter for the short standard name of each label
standard_names = coco.convert(country_names, to="name_short")

# Lookup table: raw label -> standardized name
country_dict = {
    raw_name: standard_name
    for raw_name, standard_name in zip(country_names, standard_names)
}

Africa Eastern and Southern not found in regex
Africa Western and Central not found in regex
Arab World not found in regex
Central Europe and the Baltics not found in regex
Channel Islands not found in regex
Caribbean small states not found in regex
East Asia & Pacific ( not found in regex
Early-demographic dividend not found in regex
East Asia & Pacific not found in regex
Europe & Central Asia ( not found in regex
Europe & Central Asia not found in regex
Euro area not found in regex
European Union not found in regex
Fragile and conflict affected situations not found in regex
High income not found in regex
Heavily indebted poor countries (HIPC) not found in regex
IBRD only not found in regex
IDA & IBRD total not found in regex
IDA total not found in regex
IDA blend not found in regex
IDA only not found in regex
Latin America & Caribbean ( not found in regex
Latin America & Caribbean not found in regex
Least developed countries: UN classification not found in regex
Low income not found 

In [13]:
# Swap each raw label for its standardized name (assign returns a new frame,
# so df_na_dropped is left unchanged)
df_standardized = df_na_dropped.assign(
    **{"Country Name": df_na_dropped["Country Name"].replace(country_dict)}
)

# Keep only the rows whose name is something other than 'not found'
df_standardized = df_standardized[df_standardized["Country Name"].ne("not found")]

df_standardized.head(n=7)

Unnamed: 0,Country Name,2020,2021,2022,2023
4,Aruba,106585.0,106537.0,106445.0,106277.0
6,Afghanistan,38972230.0,40099462.0,41128771.0,42239854.0
8,Angola,33428486.0,34503774.0,35588987.0,36684202.0
9,Albania,2837849.0,2811666.0,2777689.0,2745972.0
10,Andorra,77700.0,79034.0,79824.0,80088.0
12,United Arab Emirates,9287289.0,9365145.0,9441129.0,9516871.0
13,Argentina,45376763.0,45808747.0,46234830.0,46654581.0


### 8. Unpivot Years and Data:

In [14]:
# Reshape wide -> long: one row per (country, year) with its population value,
# ordered by country then year, with a fresh 0..n-1 index.
df_melted = (
    df_standardized
    .melt(id_vars="Country Name", var_name="year", value_name="population")
    .sort_values(by=["Country Name", "year"])
    .reset_index(drop=True)
)

df_melted.head(n=7)

Unnamed: 0,Country Name,year,population
0,Afghanistan,2020,38972230.0
1,Afghanistan,2021,40099462.0
2,Afghanistan,2022,41128771.0
3,Afghanistan,2023,42239854.0
4,Albania,2020,2837849.0
5,Albania,2021,2811666.0
6,Albania,2022,2777689.0


### 9. Rename Column:

In [15]:
# Give the country column a short, lowercase name like the other columns
df_melted = df_melted.rename({"Country Name": "country"}, axis="columns")
df_melted.head(n=5)

Unnamed: 0,country,year,population
0,Afghanistan,2020,38972230.0
1,Afghanistan,2021,40099462.0
2,Afghanistan,2022,41128771.0
3,Afghanistan,2023,42239854.0
4,Albania,2020,2837849.0


### 10. Export to .csv:

In [16]:
path_export = r"cleaned_data/population.csv"

# Create the output folder when it is missing so the export also succeeds on a
# fresh checkout; to_csv does not create parent directories itself.
Path(path_export).parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): with the default index=True the 0..n-1 index is written as an
# unnamed first column — confirm whether downstream readers rely on it before
# switching to index=False.
df_melted.to_csv(path_export)