In [None]:
import pandas as pd
import os
import math

This notebook was developed on Google Colab, so Google Drive must be mounted first. Skip or remove the next cell if you run the notebook on a local computer.

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that paths below
# that start with '/content/drive' resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Set `root` to your own project directory before running this notebook.

In [None]:
# Project directory; every file path in this notebook is built from it.
root = '/content/drive/My Drive/DS_World Happiness'

# Processed dataset, given relative to the project directory.
processed_csv = 'data/data_processed.csv'
data_path = os.path.join(root, processed_csv)

# **1. Check missing values**

Read the data, sort by country name and rearrange the columns.

In [None]:
# Column layout used for the rest of the notebook: the two identifier columns
# first, then the numeric indicators, with 'Happiness Index' last.
columns = [
    'Country',
    'Regional Indicator',
    'Population density (people per sq. km of land area)',
    'GDP per capita (current US$)',
    'Unemployment, total (% of labour force)',
    'Consumer price index (2010 = 100)',
    'Mean years of schooling (years)',
    'Life expectancy at birth (years)',
    'Healthy life expectancy (HALE) at birth (years)',
    'Current health expenditure (CHE) as percentage of gross domestic product (GDP) (%)',
    'Age-standardized suicide rates (per 100 000 population)',
    'Gender Development Index (GDI)',
    'Person held per 100,000 population',
    'Carbon dioxide emissions, production emissions per capita (tonnes)',
    'PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)',
    'Forest area (% of land area)',
    'Happiness Index',
]

# Load the CSV (its first column is the stored index), sort the rows by
# country name, renumber the index from zero, and keep only the columns listed
# above, in that order.
data = (
    pd.read_csv(data_path, index_col=0)
    .sort_values(by=['Country'])
    .reset_index(drop=True)
    .loc[:, columns]
)
data

Unnamed: 0,Country,Regional Indicator,Population density (people per sq. km of land area),GDP per capita (current US$),"Unemployment, total (% of labour force)",Consumer price index (2010 = 100),Mean years of schooling (years),Life expectancy at birth (years),Healthy life expectancy (HALE) at birth (years),Current health expenditure (CHE) as percentage of gross domestic product (GDP) (%),Age-standardized suicide rates (per 100 000 population),Gender Development Index (GDI),"Person held per 100,000 population","Carbon dioxide emissions, production emissions per capita (tonnes)","PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)",Forest area (% of land area),Happiness Index
0,Afghanistan,South Asia,56.933998,498.842544,11.119667,147.343066,3.880000,64.482000,53.94961,10.590,5.940000,0.660000,82.718353,0.2505,56.910808,1.850994,2.5669
1,Albania,Central and Eastern Europe,104.550170,5071.447533,12.807000,117.167089,10.085333,78.453667,69.07545,5.135,3.923333,0.967333,185.418472,1.5990,18.200603,28.792001,4.8827
2,Algeria,Middle East and North Africa,17.728018,4080.461684,11.860667,147.403289,7.980333,76.690667,66.38806,6.300,2.573333,0.859667,148.376871,3.6225,38.884011,0.813411,5.0051
3,Argentina,Latin America and Caribbean,16.257843,12154.946384,9.118667,,10.690333,76.520667,67.12776,10.040,8.606667,0.992667,206.361994,4.4750,13.311834,10.520130,5.9747
4,Armenia,Commonwealth of Independent States,103.667697,4246.571500,17.397000,126.907196,11.302000,74.944000,67.11749,10.195,2.246667,0.979333,90.645396,1.7935,32.528118,11.552160,4.6768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,Venezuela,Latin America and Caribbean,32.804424,,7.773333,,10.312333,72.144667,64.43692,4.890,2.183333,1.013000,194.187701,5.0150,17.008554,52.543484,5.0532
149,Vietnam,Southeast Asia,308.111835,2549.081694,2.019667,158.739414,8.240000,75.319333,65.29554,5.925,7.363333,0.998000,133.769222,2.0580,29.626728,46.735544,5.3535
150,Yemen,Middle East and North Africa,53.977446,861.205026,13.021333,,3.133333,66.104000,57.53111,,7.003333,0.486333,,0.3595,50.456007,1.039832,3.5274
151,Zambia,Sub-Saharan Africa,23.346362,1452.188659,11.517000,195.922759,7.104000,63.481000,54.40409,4.665,15.666300,0.972000,134.680854,0.3025,27.438035,60.789702,3.7594


Let's check the missing values in each column. <br>
All numeric explanatory attributes have missing data; the most affected one ("Person held per 100,000 population") is missing in more than 15% of the rows.

In [None]:
# Count the NaN entries column by column; the resulting Series is the cell output.
print('Number of missing data in each attribute:')
data.isna().sum()

Number of missing data in each attribute:


Country                                                                                0
Regional Indicator                                                                     0
Population density (people per sq. km of land area)                                    3
GDP per capita (current US$)                                                           5
Unemployment, total (% of labour force)                                                4
Consumer price index (2010 = 100)                                                     12
Mean years of schooling (years)                                                        4
Life expectancy at birth (years)                                                       4
Healthy life expectancy (HALE) at birth (years)                                        5
Current health expenditure (CHE) as percentage of gross domestic product (GDP) (%)     7
Age-standardized suicide rates (per 100 000 population)                                5
Gender Development In

And at the row level. <br>
Here we list the number of missing indicators for each affected country (31 countries in total). Countries with 10 or more missing indicators are removed; for the remaining countries, each missing value is replaced by the mean of that indicator over the country's region.

In [None]:
# Keep only the rows (countries) that have at least one missing value.
missing_value = data[data.isnull().any(axis=1)]

# Count the missing indicators per row once, then report them country by country.
missing_counts = missing_value.isnull().sum(axis=1)
for country, n_missing in zip(missing_value['Country'], missing_counts):
    print('Number of missing indicators in {}: {} '.format(country, n_missing))

Number of missing indicators in Argentina: 1 
Number of missing indicators in Central African Republic: 1 
Number of missing indicators in Comoros: 1 
Number of missing indicators in Congo (Brazzaville): 1 
Number of missing indicators in Congo (Kinshasa): 2 
Number of missing indicators in Egypt: 1 
Number of missing indicators in Ethiopia: 1 
Number of missing indicators in Gambia: 1 
Number of missing indicators in Hong Kong S.A.R. of China: 5 
Number of missing indicators in Iraq: 1 
Number of missing indicators in Ivory Coast: 1 
Number of missing indicators in Kosovo: 10 
Number of missing indicators in Laos: 1 
Number of missing indicators in Lesotho: 1 
Number of missing indicators in Libya: 3 
Number of missing indicators in Macedonia: 1 
Number of missing indicators in Mali: 1 
Number of missing indicators in Mauritania: 1 
Number of missing indicators in Namibia: 1 
Number of missing indicators in North Cyprus: 14 
Number of missing indicators in Palestinian Territories: 13 

# **2. Handle missing value**

Remove the rows that have too many missing values (10 or more), then fill the remaining missing values with the regional mean of the corresponding indicator.

In [None]:
# A row missing this many indicators (or more) is dropped instead of imputed.
DROP_THRESHOLD = 10

# Mean of every numeric indicator within each region, computed over all rows of
# `data` (including the rows dropped below). numeric_only=True is required on
# pandas >= 2.0, where the default tries to average the string 'Country'
# column and raises TypeError.
data_region = data.groupby('Regional Indicator').mean(numeric_only=True)

# Indicator columns: everything after 'Country' and 'Regional Indicator'.
columns = data.columns[2:].values

# Drop the rows with too many gaps; .copy() leaves `data` itself untouched.
missing_per_row = data.isnull().sum(axis=1)
data_filled = data.loc[missing_per_row < DROP_THRESHOLD].copy()

# Fill each remaining gap with the mean of that indicator over the row's region.
# Series.map looks up every row's region in the per-region means, so the
# replacement values stay aligned with the rows of data_filled.
for column in columns:
    region_mean = data_filled['Regional Indicator'].map(data_region[column])
    data_filled[column] = data_filled[column].fillna(region_mean)

data_filled = data_filled.reset_index(drop=True)

In [None]:
# Display the table after dropping and filling, to check the result by eye.
data_filled

Unnamed: 0,Country,Regional Indicator,Population density (people per sq. km of land area),GDP per capita (current US$),"Unemployment, total (% of labour force)",Consumer price index (2010 = 100),Mean years of schooling (years),Life expectancy at birth (years),Healthy life expectancy (HALE) at birth (years),Current health expenditure (CHE) as percentage of gross domestic product (GDP) (%),Age-standardized suicide rates (per 100 000 population),Gender Development Index (GDI),"Person held per 100,000 population","Carbon dioxide emissions, production emissions per capita (tonnes)","PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)",Forest area (% of land area),Happiness Index
0,Afghanistan,South Asia,56.933998,498.842544,11.119667,147.343066,3.880000,64.482000,53.94961,10.590000,5.940000,0.660000,82.718353,0.2505,56.910808,1.850994,2.5669
1,Albania,Central and Eastern Europe,104.550170,5071.447533,12.807000,117.167089,10.085333,78.453667,69.07545,5.135000,3.923333,0.967333,185.418472,1.5990,18.200603,28.792001,4.8827
2,Algeria,Middle East and North Africa,17.728018,4080.461684,11.860667,147.403289,7.980333,76.690667,66.38806,6.300000,2.573333,0.859667,148.376871,3.6225,38.884011,0.813411,5.0051
3,Argentina,Latin America and Caribbean,16.257843,12154.946384,9.118667,141.821140,10.690333,76.520667,67.12776,10.040000,8.606667,0.992667,206.361994,4.4750,13.311834,10.520130,5.9747
4,Armenia,Commonwealth of Independent States,103.667697,4246.571500,17.397000,126.907196,11.302000,74.944000,67.11749,10.195000,2.246667,0.979333,90.645396,1.7935,32.528118,11.552160,4.6768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,Venezuela,Latin America and Caribbean,32.804424,8287.383552,7.773333,141.821140,10.312333,72.144667,64.43692,4.890000,2.183333,1.013000,194.187701,5.0150,17.008554,52.543484,5.0532
145,Vietnam,Southeast Asia,308.111835,2549.081694,2.019667,158.739414,8.240000,75.319333,65.29554,5.925000,7.363333,0.998000,133.769222,2.0580,29.626728,46.735544,5.3535
146,Yemen,Middle East and North Africa,53.977446,861.205026,13.021333,160.479847,3.133333,66.104000,57.53111,6.099286,7.003333,0.486333,182.256693,0.3595,50.456007,1.039832,3.5274
147,Zambia,Sub-Saharan Africa,23.346362,1452.188659,11.517000,195.922759,7.104000,63.481000,54.40409,4.665000,15.666300,0.972000,134.680854,0.3025,27.438035,60.789702,3.7594


Save the completed data.

In [None]:
# Write the completed table under the project directory. index=False keeps the
# row numbers out of the CSV.
completed_csv = 'data/data_completed.csv'
file_path = os.path.join(root, completed_csv)
data_filled.to_csv(file_path, index=False)

Check the saved data.

In [None]:
# Read the file back from disk and display it to confirm what was written.
data_saved = pd.read_csv(filepath_or_buffer=file_path)
data_saved

Unnamed: 0,Country,Regional Indicator,Population density (people per sq. km of land area),GDP per capita (current US$),"Unemployment, total (% of labour force)",Consumer price index (2010 = 100),Mean years of schooling (years),Life expectancy at birth (years),Healthy life expectancy (HALE) at birth (years),Current health expenditure (CHE) as percentage of gross domestic product (GDP) (%),Age-standardized suicide rates (per 100 000 population),Gender Development Index (GDI),"Person held per 100,000 population","Carbon dioxide emissions, production emissions per capita (tonnes)","PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)",Forest area (% of land area),Happiness Index
0,Afghanistan,South Asia,56.933998,498.842544,11.119667,147.343066,3.880000,64.482000,53.94961,10.590000,5.940000,0.660000,82.718353,0.2505,56.910808,1.850994,2.5669
1,Albania,Central and Eastern Europe,104.550170,5071.447533,12.807000,117.167089,10.085333,78.453667,69.07545,5.135000,3.923333,0.967333,185.418472,1.5990,18.200603,28.792001,4.8827
2,Algeria,Middle East and North Africa,17.728018,4080.461684,11.860667,147.403289,7.980333,76.690667,66.38806,6.300000,2.573333,0.859667,148.376871,3.6225,38.884011,0.813411,5.0051
3,Argentina,Latin America and Caribbean,16.257843,12154.946384,9.118667,141.821140,10.690333,76.520667,67.12776,10.040000,8.606667,0.992667,206.361994,4.4750,13.311834,10.520130,5.9747
4,Armenia,Commonwealth of Independent States,103.667697,4246.571500,17.397000,126.907196,11.302000,74.944000,67.11749,10.195000,2.246667,0.979333,90.645396,1.7935,32.528118,11.552160,4.6768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,Venezuela,Latin America and Caribbean,32.804424,8287.383552,7.773333,141.821140,10.312333,72.144667,64.43692,4.890000,2.183333,1.013000,194.187701,5.0150,17.008554,52.543484,5.0532
145,Vietnam,Southeast Asia,308.111835,2549.081694,2.019667,158.739414,8.240000,75.319333,65.29554,5.925000,7.363333,0.998000,133.769222,2.0580,29.626728,46.735544,5.3535
146,Yemen,Middle East and North Africa,53.977446,861.205026,13.021333,160.479847,3.133333,66.104000,57.53111,6.099286,7.003333,0.486333,182.256693,0.3595,50.456007,1.039832,3.5274
147,Zambia,Sub-Saharan Africa,23.346362,1452.188659,11.517000,195.922759,7.104000,63.481000,54.40409,4.665000,15.666300,0.972000,134.680854,0.3025,27.438035,60.789702,3.7594
