# this file will import and clean the socio-economic data

to do: put everything into PEP8 and beautify code

In [1]:
import pandas as pd
import country_converter as coco

### 1. import the socio-economic datasets

In [2]:
# Read the two raw socio-economic tables (AQUASTAT first, then UNICEF).
aqua_se, unicef_se = (
    pd.read_csv(raw_csv)
    for raw_csv in ('raw data/aquastat_socio_economic.csv',
                    'raw data/unicef_socio_economic.csv')
)

next steps include:
1. remove unnecessary columns and rows
2. check where the missing values occur
3. make sure no duplicates are present
4. save a cleaner dataset to clean data folder

**check the general structure of both datasets**

In [3]:
# Preview the first five AQUASTAT rows to inspect the raw column layout.
aqua_se.head(n=5)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,1998-2002,Unnamed: 4,Unnamed: 5,2003-2007,Unnamed: 7,Unnamed: 8,2008-2012,Unnamed: 10,Unnamed: 11,2013-2017,Unnamed: 13,Unnamed: 14,2018-2022,Unnamed: 16
0,Afghanistan,Rural population (1000 inhab),2002.0,17086.91,,2007.0,20464.923,,2012.0,23280.663,,2017.0,26558.609,,,,
1,Afghanistan,Urban population (1000 inhab),2002.0,4893.013,,2007.0,6151.869,,2012.0,7416.295,,2017.0,8971.472,,,,
2,Afghanistan,Population density (inhab/km2),2002.0,34.6180957633,,2007.0,41.5104861686,,2012.0,47.73056398,,2017.0,55.5955534111,,,,
3,Afghanistan,GDP per capita (current US$/inhab),2002.0,194.958382,,2007.0,389.985586,,2012.0,694.885618,,2017.0,605.557362,,,,
4,Afghanistan,Human Development Index (HDI) [highest = 1] (-),2002.0,0.378,,2007.0,0.431,,2012.0,0.479,,2017.0,0.493,,,,


In [4]:
# Preview the first five UNICEF rows to inspect the raw column layout.
unicef_se.head(n=5)

Unnamed: 0,DEMO_IND,Indicator,LOCATION,Country,TIME,Time,Value,Flag Codes,Flags
0,SP_DYN_TFRT_IN,"Fertility rate, total (births per woman)",AUS,Australia,2000,2000,1.8,,
1,SP_DYN_TFRT_IN,"Fertility rate, total (births per woman)",AUS,Australia,2001,2001,1.7,,
2,SP_DYN_TFRT_IN,"Fertility rate, total (births per woman)",AUS,Australia,2002,2002,1.8,,
3,SP_DYN_TFRT_IN,"Fertility rate, total (births per woman)",AUS,Australia,2003,2003,1.7,,
4,SP_DYN_TFRT_IN,"Fertility rate, total (births per woman)",AUS,Australia,2004,2004,1.8,,


**1. remove unnecessary columns and rows**



In [5]:
# Give the two identifying columns readable names before filtering.
renamed = aqua_se.rename(columns={'Unnamed: 0': 'country', 'Unnamed: 1': 'variable'})

# Every column whose name does not start with 'unnamed' (case-insensitive) is kept.
cols = [name for name in renamed.columns if not name.lower().startswith('unnamed')]

aqua_se = (
    renamed.loc[:, cols]
    .dropna(subset=['country'])    # rows without a country are dropped
    .drop(columns=['2018-2022'])   # this period column is empty
)


In [38]:
# Narrow the UNICEF table to the four columns used in the rest of the notebook.
cols_to_keep = ['Indicator', 'Country', 'Time', 'Value']
unicef_se = unicef_se.loc[:, cols_to_keep]


**2. check for missing values**

In [39]:
# Number of missing cells in each AQUASTAT column.
aqua_se.isnull().sum()

country        0
variable       7
1998-2002    243
2003-2007    153
2008-2012    126
2013-2017    108
dtype: int64

In [40]:
# check for missing per specific variable
missing_per_variable = aqua_se.groupby(['variable']).apply(lambda x: x.isnull().sum()).sum(axis=1).sort_values()
# remove variables GII, total water withdrawal per capita because more than half are missing
aqua_se = aqua_se[~aqua_se.isin(["Total water withdrawal per capita (m<sup>3</sup>/year per inhabitant)",
                                'Gender Inequality Index (GII) [equality = 0; inequality = 1) (-)']).any(axis=1)]


In [41]:
# Discard the rows that have no variable name.
has_variable = aqua_se['variable'].notna()
aqua_se = aqua_se[has_variable]

# Total number of missing cells per country, in ascending order.
missing_per_country = (
    aqua_se.isnull()
    .groupby(aqua_se['country'])
    .sum()
    .sum(axis=1)
    .sort_values()
)
missing_per_country


country
Afghanistan       0
Morocco           0
Mozambique        0
Myanmar           0
Namibia           0
                 ..
Sudan            16
San Marino       16
South Sudan      16
Faroe Islands    20
Holy See         24
Length: 200, dtype: int64

In [42]:
# Number of missing cells in each UNICEF column.
unicef_se.isnull().sum()

Indicator      0
Country        0
Time           0
Value        642
dtype: int64

In [43]:
# check for missing per indicator
missing_per_variable_unicef = unicef_se.groupby(['Indicator']).apply(lambda x: x.isnull().sum()).sum(axis=1).sort_values()
remove_variables_unicef = missing_per_variable_unicef[missing_per_variable_unicef > 60].index
unicef_se = unicef_se[~unicef_se.isin(remove_variables_unicef).any(axis=1)]


In [44]:
# Total number of missing cells per country in the UNICEF table, in ascending order.
missing_unicef = (
    unicef_se.isnull()
    .groupby(unicef_se['Country'])
    .sum()
    .sum(axis=1)
    .sort_values()
)
missing_unicef

Country
Afghanistan     0
Malawi          0
Malaysia        0
Maldives        0
Mali            0
               ..
Guyana          4
Japan           4
Timor-Leste     5
Tokelau        11
Holy See       21
Length: 233, dtype: int64

**3. check for duplicates**

In [45]:
# Comparing nunique() with a fixed country count cannot reveal repeated rows
# and breaks when the input data changes, so count repeated rows directly.
# A UNICEF row is treated as a duplicate when indicator, country and time
# repeat; an AQUASTAT row when country and variable repeat.
unicef_duplicates = unicef_se.duplicated(subset=['Indicator', 'Country', 'Time']).sum()
aqua_duplicates = aqua_se.duplicated(subset=['country', 'variable']).sum()

# Prints True only when neither table contains a repeated row.
print(unicef_duplicates == 0 and aqua_duplicates == 0)

True


### change the country codes

In [46]:
# first keep only useful countries
clean_countries = missing_per_country[missing_per_country==0].index # keep countries with less than X missing values
aqua_clean = aqua_se[aqua_se['country'].isin(clean_countries)] # create a clean dataset, 169 countries


clean_countries_unicef = missing_unicef[missing_unicef==0].index # 197 countries
unicef_clean = unicef_se[unicef_se['Country'].isin(clean_countries_unicef)]

# change codes and remove the unfound ones 
aqua_clean['country'] = coco.convert(names=aqua_clean['country'], to='ISO3', not_found='XXX') # name Grenade to XXX
aqua_clean = aqua_clean[~aqua_clean.isin(['XXX']).any(axis=1)] # remove Grenade

unicef_clean['Country'] = coco.convert(names=unicef_clean['Country'], to='ISO3', not_found='XXX') # name channel Islands to XXX
unicef_clean = unicef_clean[~unicef_clean['Country'].isin(['XXX'])] # remove channel islands


Grenade not found in regex
Grenade not found in regex
Grenade not found in regex
Grenade not found in regex
Grenade not found in regex
Grenade not found in regex
Grenade not found in regex
Grenade not found in regex
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
Channel Islands not found in regex
Channel Islands not found in regex
Channel Islands not found in regex
Channel Islands not found in regex
Channel Islands not found in regex
Channel Islands not found in regex
Channel Islands not found in regex
Channel Islands not found in regex
Channel Islands not found in regex
Channel Islands not found in regex
Channel Islands not found in regex
Channel Islands not found in regex
Channel Islands not found in regex
Channel Isla

**keep all matching countries**

In [51]:
# Country codes present in the cleaned UNICEF data.
uni_countries = unicef_clean['Country'].unique()

# Restrict AQUASTAT to those codes (140 countries left) ...
in_unicef = aqua_clean['country'].isin(uni_countries)
aqua_clean = aqua_clean[in_unicef]

# ... then restrict UNICEF to the codes that remain in AQUASTAT.
in_aqua = unicef_clean['Country'].isin(aqua_clean['country'].unique())
unicef_clean = unicef_clean[in_aqua]

**4. export cleaned datasets to the clean data folder**

In [55]:
# Write both cleaned tables to the clean data folder.
for frame, target in (
    (aqua_clean, 'clean data/aqua_socec_clean.csv'),
    (unicef_clean, 'clean data/unicef_socec_clean.csv'),
):
    frame.to_csv(target)

# Also save the list of country codes that survived the cleaning.
countries_df = pd.DataFrame(unicef_clean['Country'].unique())
countries_df.to_csv('countries_available_socio.csv')