In [1]:
import pandas as pd
import numpy as np

In [2]:
def clean_data(merged_df, path, renamed_column, quad=False):
    df = pd.read_csv(f'../inputs_extended/{path}.csv', delimiter=',')[['LOCATION', 'TIME', 'Value']]
    if quad: 
        df['TIME'] = df['TIME'].str.slice(0,4).astype('int64')
        
    df.drop_duplicates(subset=['LOCATION', 'TIME'], keep='first', inplace=True)
    df_final = pd.merge(merged_df, df, on=['LOCATION', 'TIME'], how="outer")
    df_final.rename(columns={'Value': renamed_column}, inplace=True)
    df_final = df_final.dropna(thresh=9).reset_index(drop=True)
    df_final.reset_index(drop=True, inplace=True)
    return df_final

In [3]:
#main dataframe
main_df = pd.read_excel('../inputs_extended/happiness.xls')
main_df.rename(columns={'year': 'TIME'}, inplace=True)

In [4]:
# merge with country codes
df = pd.read_csv('../inputs_extended/codes.csv', delimiter=',')[['English short name lower case', 'Alpha-3 code']]
df.rename(columns={'Alpha-3 code': 'LOCATION', 'English short name lower case': 'Country name'}, inplace=True)
df_v1 = pd.merge(main_df, df, on=['Country name'], how="outer").dropna(thresh=4)
df_v1 = df_v1.dropna().reset_index(drop=True)

In [5]:
# inflation
df_v2 = clean_data(df_v1, 'inflation', 'Inflation rate', True)

In [6]:
# adult education
df_v3 = clean_data(df_v2, 'adult-education', 'Adult education')
df_v3

Unnamed: 0,Country name,TIME,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,LOCATION,Inflation rate,Adult education
0,Afghanistan,2008.0,3.723590,7.370100,0.450662,50.799999,0.718114,0.167640,0.881686,0.517637,0.258195,AFG,,
1,Afghanistan,2009.0,4.401778,7.539972,0.552308,51.200001,0.678896,0.190099,0.850035,0.583926,0.237092,AFG,,
2,Afghanistan,2010.0,4.758381,7.646709,0.539075,51.599998,0.600127,0.120590,0.706766,0.618265,0.275324,AFG,,
3,Afghanistan,2011.0,3.831719,7.619532,0.521104,51.919998,0.495901,0.162427,0.731109,0.611387,0.267175,AFG,,
4,Afghanistan,2012.0,3.782938,7.705479,0.520637,52.240002,0.530935,0.236032,0.775620,0.710385,0.267919,AFG,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1640,Zimbabwe,2016.0,3.735400,7.984372,0.768425,54.400002,0.732971,-0.094634,0.723612,0.737636,0.208555,ZWE,,
1641,Zimbabwe,2017.0,3.638300,8.015738,0.754147,55.000000,0.752826,-0.097645,0.751208,0.806428,0.224051,ZWE,,
1642,Zimbabwe,2018.0,3.616480,8.048798,0.775388,55.599998,0.762675,-0.068427,0.844209,0.710119,0.211726,ZWE,,
1643,Zimbabwe,2019.0,2.693523,7.950132,0.759162,56.200001,0.631908,-0.063791,0.830652,0.716004,0.235354,ZWE,,


In [7]:
# housing prices
df_v4 = clean_data(df_v3, 'housing-prices', 'Housing prices', True)
df_v4

Unnamed: 0,Country name,TIME,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,LOCATION,Inflation rate,Adult education,Housing prices
0,Afghanistan,2008.0,3.723590,7.370100,0.450662,50.799999,0.718114,0.167640,0.881686,0.517637,0.258195,AFG,,,
1,Afghanistan,2009.0,4.401778,7.539972,0.552308,51.200001,0.678896,0.190099,0.850035,0.583926,0.237092,AFG,,,
2,Afghanistan,2010.0,4.758381,7.646709,0.539075,51.599998,0.600127,0.120590,0.706766,0.618265,0.275324,AFG,,,
3,Afghanistan,2011.0,3.831719,7.619532,0.521104,51.919998,0.495901,0.162427,0.731109,0.611387,0.267175,AFG,,,
4,Afghanistan,2012.0,3.782938,7.705479,0.520637,52.240002,0.530935,0.236032,0.775620,0.710385,0.267919,AFG,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1640,Zimbabwe,2016.0,3.735400,7.984372,0.768425,54.400002,0.732971,-0.094634,0.723612,0.737636,0.208555,ZWE,,,
1641,Zimbabwe,2017.0,3.638300,8.015738,0.754147,55.000000,0.752826,-0.097645,0.751208,0.806428,0.224051,ZWE,,,
1642,Zimbabwe,2018.0,3.616480,8.048798,0.775388,55.599998,0.762675,-0.068427,0.844209,0.710119,0.211726,ZWE,,,
1643,Zimbabwe,2019.0,2.693523,7.950132,0.759162,56.200001,0.631908,-0.063791,0.830652,0.716004,0.235354,ZWE,,,


In [8]:
# adult education
df_v5 = clean_data(df_v4, 'trust-in-gov', 'Trust in government')
df_v5

Unnamed: 0,Country name,TIME,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,LOCATION,Inflation rate,Adult education,Housing prices,Trust in government
0,Afghanistan,2008.0,3.723590,7.370100,0.450662,50.799999,0.718114,0.167640,0.881686,0.517637,0.258195,AFG,,,,
1,Afghanistan,2009.0,4.401778,7.539972,0.552308,51.200001,0.678896,0.190099,0.850035,0.583926,0.237092,AFG,,,,
2,Afghanistan,2010.0,4.758381,7.646709,0.539075,51.599998,0.600127,0.120590,0.706766,0.618265,0.275324,AFG,,,,
3,Afghanistan,2011.0,3.831719,7.619532,0.521104,51.919998,0.495901,0.162427,0.731109,0.611387,0.267175,AFG,,,,
4,Afghanistan,2012.0,3.782938,7.705479,0.520637,52.240002,0.530935,0.236032,0.775620,0.710385,0.267919,AFG,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1640,Zimbabwe,2016.0,3.735400,7.984372,0.768425,54.400002,0.732971,-0.094634,0.723612,0.737636,0.208555,ZWE,,,,
1641,Zimbabwe,2017.0,3.638300,8.015738,0.754147,55.000000,0.752826,-0.097645,0.751208,0.806428,0.224051,ZWE,,,,
1642,Zimbabwe,2018.0,3.616480,8.048798,0.775388,55.599998,0.762675,-0.068427,0.844209,0.710119,0.211726,ZWE,,,,
1643,Zimbabwe,2019.0,2.693523,7.950132,0.759162,56.200001,0.631908,-0.063791,0.830652,0.716004,0.235354,ZWE,,,,


In [9]:
# Dataframe check
print(f"""
{df_v5['Inflation rate'].isna().sum()}
{df_v5['Adult education'].isna().sum()}
{df_v5['Housing prices'].isna().sum()}
{df_v5['Trust in government'].isna().sum()}
""")
df_v5


1084
1112
1056
1213



Unnamed: 0,Country name,TIME,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,LOCATION,Inflation rate,Adult education,Housing prices,Trust in government
0,Afghanistan,2008.0,3.723590,7.370100,0.450662,50.799999,0.718114,0.167640,0.881686,0.517637,0.258195,AFG,,,,
1,Afghanistan,2009.0,4.401778,7.539972,0.552308,51.200001,0.678896,0.190099,0.850035,0.583926,0.237092,AFG,,,,
2,Afghanistan,2010.0,4.758381,7.646709,0.539075,51.599998,0.600127,0.120590,0.706766,0.618265,0.275324,AFG,,,,
3,Afghanistan,2011.0,3.831719,7.619532,0.521104,51.919998,0.495901,0.162427,0.731109,0.611387,0.267175,AFG,,,,
4,Afghanistan,2012.0,3.782938,7.705479,0.520637,52.240002,0.530935,0.236032,0.775620,0.710385,0.267919,AFG,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1640,Zimbabwe,2016.0,3.735400,7.984372,0.768425,54.400002,0.732971,-0.094634,0.723612,0.737636,0.208555,ZWE,,,,
1641,Zimbabwe,2017.0,3.638300,8.015738,0.754147,55.000000,0.752826,-0.097645,0.751208,0.806428,0.224051,ZWE,,,,
1642,Zimbabwe,2018.0,3.616480,8.048798,0.775388,55.599998,0.762675,-0.068427,0.844209,0.710119,0.211726,ZWE,,,,
1643,Zimbabwe,2019.0,2.693523,7.950132,0.759162,56.200001,0.631908,-0.063791,0.830652,0.716004,0.235354,ZWE,,,,


In [10]:
# adult education
df_v6 = clean_data(df_v5, 'self-employment', 'Self-employment rate')

In [11]:
# poverty rates
df_v7 = clean_data(df_v6, 'poverty', 'Poverty rate')

In [12]:
# unemployed youth rates
df_v8 = clean_data(df_v7, 'unemployed-youth', 'Unemployed youth rate')
df_v8

Unnamed: 0,Country name,TIME,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,LOCATION,Inflation rate,Adult education,Housing prices,Trust in government,Self-employment rate,Poverty rate,Unemployed youth rate
0,Afghanistan,2008.0,3.723590,7.370100,0.450662,50.799999,0.718114,0.167640,0.881686,0.517637,0.258195,AFG,,,,,,,
1,Afghanistan,2009.0,4.401778,7.539972,0.552308,51.200001,0.678896,0.190099,0.850035,0.583926,0.237092,AFG,,,,,,,
2,Afghanistan,2010.0,4.758381,7.646709,0.539075,51.599998,0.600127,0.120590,0.706766,0.618265,0.275324,AFG,,,,,,,
3,Afghanistan,2011.0,3.831719,7.619532,0.521104,51.919998,0.495901,0.162427,0.731109,0.611387,0.267175,AFG,,,,,,,
4,Afghanistan,2012.0,3.782938,7.705479,0.520637,52.240002,0.530935,0.236032,0.775620,0.710385,0.267919,AFG,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1640,Zimbabwe,2016.0,3.735400,7.984372,0.768425,54.400002,0.732971,-0.094634,0.723612,0.737636,0.208555,ZWE,,,,,,,
1641,Zimbabwe,2017.0,3.638300,8.015738,0.754147,55.000000,0.752826,-0.097645,0.751208,0.806428,0.224051,ZWE,,,,,,,
1642,Zimbabwe,2018.0,3.616480,8.048798,0.775388,55.599998,0.762675,-0.068427,0.844209,0.710119,0.211726,ZWE,,,,,,,
1643,Zimbabwe,2019.0,2.693523,7.950132,0.759162,56.200001,0.631908,-0.063791,0.830652,0.716004,0.235354,ZWE,,,,,,,


In [14]:
# housing overcrowding
df = clean_data(df_v8, 'overcrowding', 'Households overcrowding')
df

Unnamed: 0,Country name,TIME,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,LOCATION,Inflation rate,Adult education,Housing prices,Trust in government,Self-employment rate,Poverty rate,Unemployed youth rate,Households overcrowding
0,Afghanistan,2008.0,3.723590,7.370100,0.450662,50.799999,0.718114,0.167640,0.881686,0.517637,0.258195,AFG,,,,,,,,
1,Afghanistan,2009.0,4.401778,7.539972,0.552308,51.200001,0.678896,0.190099,0.850035,0.583926,0.237092,AFG,,,,,,,,
2,Afghanistan,2010.0,4.758381,7.646709,0.539075,51.599998,0.600127,0.120590,0.706766,0.618265,0.275324,AFG,,,,,,,,
3,Afghanistan,2011.0,3.831719,7.619532,0.521104,51.919998,0.495901,0.162427,0.731109,0.611387,0.267175,AFG,,,,,,,,
4,Afghanistan,2012.0,3.782938,7.705479,0.520637,52.240002,0.530935,0.236032,0.775620,0.710385,0.267919,AFG,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1640,Zimbabwe,2016.0,3.735400,7.984372,0.768425,54.400002,0.732971,-0.094634,0.723612,0.737636,0.208555,ZWE,,,,,,,,
1641,Zimbabwe,2017.0,3.638300,8.015738,0.754147,55.000000,0.752826,-0.097645,0.751208,0.806428,0.224051,ZWE,,,,,,,,
1642,Zimbabwe,2018.0,3.616480,8.048798,0.775388,55.599998,0.762675,-0.068427,0.844209,0.710119,0.211726,ZWE,,,,,,,,
1643,Zimbabwe,2019.0,2.693523,7.950132,0.759162,56.200001,0.631908,-0.063791,0.830652,0.716004,0.235354,ZWE,,,,,,,,


In [16]:
# measure of discrimination against women in social institutions
df = clean_data(df, 'childcarecosts', 'Childcare costs')
df

Unnamed: 0,Country name,TIME,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,...,Inflation rate,Adult education,Housing prices,Trust in government,Self-employment rate,Poverty rate,Unemployed youth rate,Households overcrowding,% of elderly population,Childcare costs
0,Afghanistan,2008.0,3.723590,7.370100,0.450662,50.799999,0.718114,0.167640,0.881686,0.517637,...,,,,,,,,,,
1,Afghanistan,2009.0,4.401778,7.539972,0.552308,51.200001,0.678896,0.190099,0.850035,0.583926,...,,,,,,,,,,
2,Afghanistan,2010.0,4.758381,7.646709,0.539075,51.599998,0.600127,0.120590,0.706766,0.618265,...,,,,,,,,,,
3,Afghanistan,2011.0,3.831719,7.619532,0.521104,51.919998,0.495901,0.162427,0.731109,0.611387,...,,,,,,,,,,
4,Afghanistan,2012.0,3.782938,7.705479,0.520637,52.240002,0.530935,0.236032,0.775620,0.710385,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1640,Zimbabwe,2016.0,3.735400,7.984372,0.768425,54.400002,0.732971,-0.094634,0.723612,0.737636,...,,,,,,,,,,
1641,Zimbabwe,2017.0,3.638300,8.015738,0.754147,55.000000,0.752826,-0.097645,0.751208,0.806428,...,,,,,,,,,,
1642,Zimbabwe,2018.0,3.616480,8.048798,0.775388,55.599998,0.762675,-0.068427,0.844209,0.710119,...,,,,,,,,,,
1643,Zimbabwe,2019.0,2.693523,7.950132,0.759162,56.200001,0.631908,-0.063791,0.830652,0.716004,...,,,,,,,,,,


In [17]:
# measure of discrimination against women in social institutions
df = clean_data(df, 'netaccess', 'Internet access')
df

Unnamed: 0,Country name,TIME,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,...,Adult education,Housing prices,Trust in government,Self-employment rate,Poverty rate,Unemployed youth rate,Households overcrowding,% of elderly population,Childcare costs,Internet access
0,Afghanistan,2008.0,3.723590,7.370100,0.450662,50.799999,0.718114,0.167640,0.881686,0.517637,...,,,,,,,,,,
1,Afghanistan,2009.0,4.401778,7.539972,0.552308,51.200001,0.678896,0.190099,0.850035,0.583926,...,,,,,,,,,,
2,Afghanistan,2010.0,4.758381,7.646709,0.539075,51.599998,0.600127,0.120590,0.706766,0.618265,...,,,,,,,,,,
3,Afghanistan,2011.0,3.831719,7.619532,0.521104,51.919998,0.495901,0.162427,0.731109,0.611387,...,,,,,,,,,,
4,Afghanistan,2012.0,3.782938,7.705479,0.520637,52.240002,0.530935,0.236032,0.775620,0.710385,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1640,Zimbabwe,2016.0,3.735400,7.984372,0.768425,54.400002,0.732971,-0.094634,0.723612,0.737636,...,,,,,,,,,,
1641,Zimbabwe,2017.0,3.638300,8.015738,0.754147,55.000000,0.752826,-0.097645,0.751208,0.806428,...,,,,,,,,,,
1642,Zimbabwe,2018.0,3.616480,8.048798,0.775388,55.599998,0.762675,-0.068427,0.844209,0.710119,...,,,,,,,,,,
1643,Zimbabwe,2019.0,2.693523,7.950132,0.759162,56.200001,0.631908,-0.063791,0.830652,0.716004,...,,,,,,,,,,


In [20]:
df.rename(columns={'Country name': 'Country', 'TIME': 'Year', 'Life Ladder': 'Happiness index', 'Log GDP per capita': 'GDP per capita', 'LOCATION': 'Country code'}, inplace=True)
df

Unnamed: 0,Country,Year,Happiness index,GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,...,Adult education,Housing prices,Trust in government,Self-employment rate,Poverty rate,Unemployed youth rate,Households overcrowding,% of elderly population,Childcare costs,Internet access
0,Afghanistan,2008.0,3.723590,7.370100,0.450662,50.799999,0.718114,0.167640,0.881686,0.517637,...,,,,,,,,,,
1,Afghanistan,2009.0,4.401778,7.539972,0.552308,51.200001,0.678896,0.190099,0.850035,0.583926,...,,,,,,,,,,
2,Afghanistan,2010.0,4.758381,7.646709,0.539075,51.599998,0.600127,0.120590,0.706766,0.618265,...,,,,,,,,,,
3,Afghanistan,2011.0,3.831719,7.619532,0.521104,51.919998,0.495901,0.162427,0.731109,0.611387,...,,,,,,,,,,
4,Afghanistan,2012.0,3.782938,7.705479,0.520637,52.240002,0.530935,0.236032,0.775620,0.710385,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1640,Zimbabwe,2016.0,3.735400,7.984372,0.768425,54.400002,0.732971,-0.094634,0.723612,0.737636,...,,,,,,,,,,
1641,Zimbabwe,2017.0,3.638300,8.015738,0.754147,55.000000,0.752826,-0.097645,0.751208,0.806428,...,,,,,,,,,,
1642,Zimbabwe,2018.0,3.616480,8.048798,0.775388,55.599998,0.762675,-0.068427,0.844209,0.710119,...,,,,,,,,,,
1643,Zimbabwe,2019.0,2.693523,7.950132,0.759162,56.200001,0.631908,-0.063791,0.830652,0.716004,...,,,,,,,,,,


In [21]:
df1 = df.copy()

In [29]:
countries = df1['Country'].unique()
cols = df1.columns.values.tolist()
df_v10 = pd.DataFrame(columns=cols)

# fill the NaN values with latest available value in the column for the country
for country in countries:
    temp_df = df.loc[df['Country'] == country]
    rows = len(temp_df)
    temp_df = temp_df.loc[:, cols].ffill()
    temp_df = temp_df.loc[:, cols].bfill()
    df_v10 = pd.concat([df_v10, temp_df])

# discard countries that did not hold anything in more than 4 added categories 
# fyi: 8 cols are filled for sure for each country by default, dictated by the base dataset
df_v10.dropna(thresh=17, inplace=True)
df_v10.reset_index(drop=True, inplace=True)
display(df_v10)

Unnamed: 0,Country,Year,Happiness index,GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,...,Adult education,Housing prices,Trust in government,Self-employment rate,Poverty rate,Unemployed youth rate,Households overcrowding,% of elderly population,Childcare costs,Internet access
0,Australia,2007.0,7.285391,10.702894,0.965276,71.720001,0.890682,0.347052,0.512578,0.826251,...,31.816732,66.96025,60.975292,14.08298,0.129,6.48775,,13.13933,24.0,67.0
1,Australia,2008.0,7.253757,10.71878,0.946635,71.879997,0.915733,0.30529,0.430811,0.826391,...,30.064018,69.60275,60.975292,13.71566,0.129,6.310053,,13.201283,24.0,72.0
2,Australia,2010.0,7.450047,10.722262,0.95452,72.199997,0.932059,0.316744,0.366127,0.834236,...,26.792744,80.877,60.975292,13.77405,0.129,8.059968,,13.556231,24.0,78.92
3,Australia,2011.0,7.405616,10.732697,0.967029,72.300003,0.944586,0.36934,0.381772,0.81586,...,25.922543,79.13475,53.078672,13.2206,0.129,7.839816,,13.822326,24.0,78.92
4,Australia,2012.0,7.195586,10.753672,0.944599,72.400002,0.935146,0.273635,0.368252,0.810742,...,23.563389,78.88025,42.041918,12.21689,0.129,7.231065,,14.137101,21.0,83.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550,United States,2016.0,6.8036,10.985777,0.896751,68.5,0.757893,0.144048,0.73892,0.805674,...,9.85275,105.637315,29.720566,7.466977,0.209,8.533521,4.244195,15.23144,23.0,73.37
551,United States,2017.0,6.991759,11.001395,0.921003,68.400002,0.868497,0.197317,0.681191,0.826555,...,9.354493,112.170681,38.653502,7.109162,0.212,7.45546,4.18607,15.611868,23.0,77.97
552,United States,2018.0,6.882685,11.025024,0.903856,68.300003,0.824607,0.116116,0.709928,0.815383,...,9.176164,119.143333,31.381609,7.262526,0.211,7.458875,4.188988,16.018509,23.0,77.97
553,United States,2019.0,6.943701,11.043353,0.916691,68.199997,0.836139,0.144299,0.706716,0.814985,...,9.189871,125.207117,36.277444,6.994946,0.21,7.903465,4.236565,16.458058,23.0,79.88


In [30]:
df_v10.isna().sum()

Country                               0
Year                                  0
Happiness index                       0
GDP per capita                        0
Social support                        0
Healthy life expectancy at birth      0
Freedom to make life choices          0
Generosity                            0
Perceptions of corruption             0
Positive affect                       0
Negative affect                       0
Country code                          0
Inflation rate                       14
Adult education                       0
Housing prices                        0
Trust in government                  14
Self-employment rate                 63
Poverty rate                         15
Unemployed youth rate                28
Households overcrowding             126
% of elderly population               0
Childcare costs                      84
Internet access                      43
dtype: int64

In [37]:
# since there is no value available in the entire column for a country, lets fill it with mean. 
# those countries may still hold valuable info for other columns so discarding them based on omne NaN value
# would be a loss of meaninful data.

# NaN values were detected for 'Self-employment rate', 'Poverty rate' and 'Unemployed youth rate'
mean_infl = df_v10['Inflation rate'].mean()
mean_gov_trust = df_v10['Trust in government'].mean()
mean_self_emp=df_v10['Self-employment rate'].mean()
mean_poverty=df_v10['Poverty rate'].mean()
mean_unemp_youth=df_v10['Unemployed youth rate'].mean()
mean_overcr=df_v10['Households overcrowding'].mean()
mean_childcare=df_v10['Childcare costs'].mean()
mean_internet=df_v10['Internet access'].mean()

df_v10['Trust in government'].fillna(mean_gov_trust, inplace=True)
df_v10['Self-employment rate'].fillna(mean_self_emp, inplace=True)
df_v10['Poverty rate'].fillna(mean_poverty, inplace=True)
df_v10['Unemployed youth rate'].fillna(mean_unemp_youth, inplace=True)
df_v10['Inflation rate'].fillna(mean_infl, inplace=True)
df_v10['Households overcrowding'].fillna(mean_overcr, inplace=True)
df_v10['Childcare costs'].fillna(mean_childcare, inplace=True)
df_v10['Internet access'].fillna(mean_internet, inplace=True)

In [38]:
df_v10.isna().sum()

Country                             0
Year                                0
Happiness index                     0
GDP per capita                      0
Social support                      0
Healthy life expectancy at birth    0
Freedom to make life choices        0
Generosity                          0
Perceptions of corruption           0
Positive affect                     0
Negative affect                     0
Inflation rate                      0
Adult education                     0
Housing prices                      0
Trust in government                 0
Self-employment rate                0
Poverty rate                        0
Unemployed youth rate               0
Households overcrowding             0
% of elderly population             0
Childcare costs                     0
Internet access                     0
Happy country                       0
dtype: int64

In [40]:
df_v10['Year'] = df['Year'].astype(int)
df_v10['Happy country'] = (df['Happiness index'] > 6).astype(bool).astype(int)
df_v10.sort_values(by=['Year', 'Country'], inplace=True)
#df_v10.drop('Country code', axis=1, inplace=True)
df_v10.reset_index(drop=True, inplace=True)
df_v10['Happy country'].value_counts()
df_v10

Unnamed: 0,Country,Year,Happiness index,GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,...,Housing prices,Trust in government,Self-employment rate,Poverty rate,Unemployed youth rate,Households overcrowding,% of elderly population,Childcare costs,Internet access,Happy country
0,South Korea,2005,5.970564,10.593056,0.811163,73.0,0.590956,0.026376,0.861816,0.676223,...,101.621556,23.708158,27.516530,0.176000,8.244254,5.752162,13.19284,4.000000,99.194448,1
1,Colombia,2006,5.983512,9.578836,0.87097,67.699997,0.850766,-0.148472,0.854821,0.825455,...,124.7345,27.486950,53.744190,0.151515,19.381748,11.405780,8.060025,15.044586,52.661142,0
2,Denmark,2006,7.970892,10.880102,0.953912,70.080002,0.969788,0.272087,0.247505,0.756866,...,100.200251,58.668944,12.588030,0.035000,3.967437,7.143568,15.743005,11.000000,81.881100,0
3,Estonia,2006,5.332044,10.34734,0.895632,65.32,0.712121,-0.245836,0.742697,0.66557,...,106.534984,41.754021,19.878318,0.141000,5.743918,9.080895,17.370663,3.000000,52.202200,0
4,Finland,2006,7.88935,10.750446,0.961621,72.099998,0.962424,-0.115532,0.163636,0.744292,...,106.3645,80.863288,17.295360,0.035000,3.832753,8.930486,22.490169,18.000000,95.998600,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550,South Africa,2020,5.034863,9.432028,0.84772,56.900002,0.738339,-0.13397,0.819824,0.800584,...,118.272954,49.590632,19.878318,0.277000,13.382333,11.405780,5.980963,15.044586,73.116780,0
551,Spain,2020,6.502175,10.488059,0.934935,75.0,0.783257,-0.120613,0.729977,0.686178,...,127.505181,38.176710,19.381760,0.209000,8.124655,3.013694,19.646805,7.000000,95.377500,0
552,Turkey,2020,5.579794,10.124029,0.863288,65.360001,0.649196,-0.023702,0.764014,0.483359,...,86.1545,55.647830,31.485320,0.253000,20.970842,11.405780,7.82869,0.000000,60.236000,1
553,United States,2020,6.943701,11.043353,0.916691,68.199997,0.836139,0.144299,0.706716,0.814985,...,125.207117,36.277444,6.994946,0.210000,7.903465,4.236565,16.458058,23.000000,79.880000,1


In [41]:
# final export
df_v10.to_csv('../cleaned_data/clean_v4_fillednans.csv', index=False)

relative povert
GDP
extend. by years(?)
quality of life in eiuropean cities