In [1]:
import pandas as pd
import numpy as np

In [2]:
def clean_data(merged_df, path, renamed_column, quad=False, data_dir='../inputs_extended'):
    """Outer-merge one indicator CSV into ``merged_df`` on (LOCATION, TIME).

    Reads ``{data_dir}/{path}.csv``, keeps only its ``LOCATION``, ``TIME``
    and ``Value`` columns, retains the first row for every
    (LOCATION, TIME) pair, renames ``Value`` to ``renamed_column`` and
    outer-merges the result into ``merged_df``. Because the merge is an
    outer join, keys that exist on only one side are kept, with NaN in the
    columns coming from the other side.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Frame accumulated so far; must contain ``LOCATION`` and ``TIME``
        columns. It is not modified.
    path : str
        CSV file name without the ``.csv`` extension.
    renamed_column : str
        Column name given to the indicator's ``Value`` column in the result.
    quad : bool, default False
        When True, ``TIME`` is reduced to its first four characters and cast
        to ``int64`` before de-duplication, so only the first row per
        (LOCATION, four-character prefix) survives.
        NOTE(review): presumably for period labels such as "2010-Q1" —
        confirm against the input files; later periods of the same year are
        discarded, not aggregated.
    data_dir : str, default '../inputs_extended'
        Directory that contains the CSV file.

    Returns
    -------
    pandas.DataFrame
        The merged frame with a fresh 0..n-1 index.
    """
    indicator = pd.read_csv(f'{data_dir}/{path}.csv', delimiter=',')[['LOCATION', 'TIME', 'Value']]

    if quad:
        # Cast to str first: the .str accessor raises when read_csv has
        # already parsed TIME as a numeric column.
        indicator = indicator.assign(
            TIME=indicator['TIME'].astype(str).str.slice(0, 4).astype('int64')
        )

    # Non-mutating chain: avoids in-place edits on a column-subset slice.
    indicator = (
        indicator
        .drop_duplicates(subset=['LOCATION', 'TIME'], keep='first')
        .rename(columns={'Value': renamed_column})
    )

    merged = pd.merge(merged_df, indicator, on=['LOCATION', 'TIME'], how='outer')
    return merged.reset_index(drop=True)

In [3]:
# Main dataframe: load the happiness workbook, discard the four columns that
# are not carried forward, and expose the year column under the 'TIME' name
# that the later merges use as a join key.
main_df = (
    pd.read_excel('../inputs_extended/happiness.xls')
    .drop(columns=[
        'Healthy life expectancy at birth',
        'Generosity',
        'Positive affect',
        'Negative affect',
    ])
    .rename(columns={'year': 'TIME'})
)

In [4]:
# Attach ISO alpha-3 codes: the codes table is trimmed to name + code and
# relabelled so that it shares 'Country name' with main_df and provides the
# 'LOCATION' key.
df = (
    pd.read_csv('../inputs_extended/codes.csv', delimiter=',')
    [['English short name lower case', 'Alpha-3 code']]
    .rename(columns={'Alpha-3 code': 'LOCATION', 'English short name lower case': 'Country name'})
)

# Outer join, then keep only fully populated rows. The thresh=4 pass is
# followed by a plain dropna(), which removes every row that still has a NaN.
df_v1 = (
    pd.merge(main_df, df, on=['Country name'], how="outer")
    .dropna(thresh=4)
    .dropna()
    .reset_index(drop=True)
)

In [5]:
# Inflation: quad=True makes clean_data cut TIME down to its 4-character
# prefix before merging.
df_v2 = clean_data(
    merged_df=df_v1,
    path='inflation',
    renamed_column='Inflation rate',
    quad=True,
)
df_v2

Unnamed: 0,Country name,TIME,Life Ladder,Log GDP per capita,Social support,Freedom to make life choices,Perceptions of corruption,LOCATION,Inflation rate
0,Afghanistan,2008.0,3.723590,7.370100,0.450662,0.718114,0.881686,AFG,
1,Afghanistan,2009.0,4.401778,7.539972,0.552308,0.678896,0.850035,AFG,
2,Afghanistan,2010.0,4.758381,7.646709,0.539075,0.600127,0.706766,AFG,
3,Afghanistan,2011.0,3.831719,7.619532,0.521104,0.495901,0.731109,AFG,
4,Afghanistan,2012.0,3.782938,7.705479,0.520637,0.530935,0.775620,AFG,
...,...,...,...,...,...,...,...,...,...
3937,,2021.0,,,,,,EU27_2020,12.60
3938,,2001.0,,,,,,EU27_2020,6.80
3939,,2000.0,,,,,,EU27_2020,69.60
3940,,2022.0,,,,,,EU27_2020,27.00


In [6]:
# Adult education: merged on (LOCATION, TIME) with TIME used as read from the CSV.
df_v3 = clean_data(
    merged_df=df_v2,
    path='adult-education',
    renamed_column='Adult education',
)
df_v3

Unnamed: 0,Country name,TIME,Life Ladder,Log GDP per capita,Social support,Freedom to make life choices,Perceptions of corruption,LOCATION,Inflation rate,Adult education
0,Afghanistan,2008.0,3.723590,7.370100,0.450662,0.718114,0.881686,AFG,,
1,Afghanistan,2009.0,4.401778,7.539972,0.552308,0.678896,0.850035,AFG,,
2,Afghanistan,2010.0,4.758381,7.646709,0.539075,0.600127,0.706766,AFG,,
3,Afghanistan,2011.0,3.831719,7.619532,0.521104,0.495901,0.731109,AFG,,
4,Afghanistan,2012.0,3.782938,7.705479,0.520637,0.530935,0.775620,AFG,,
...,...,...,...,...,...,...,...,...,...,...
3989,,2010.0,,,,,,G20,,37.730685
3990,,2011.0,,,,,,G20,,38.054268
3991,,2012.0,,,,,,G20,,38.172514
3992,,2018.0,,,,,,G20,,34.080354


In [8]:
# Housing prices: quad=True makes clean_data cut TIME down to its
# 4-character prefix before merging.
df_v4 = clean_data(
    merged_df=df_v3,
    path='housing-prices',
    renamed_column='Housing prices',
    quad=True,
)

Unnamed: 0,Country name,TIME,Life Ladder,Log GDP per capita,Social support,Freedom to make life choices,Perceptions of corruption,LOCATION,Inflation rate,Adult education,Housing prices
0,Afghanistan,2008.0,3.723590,7.370100,0.450662,0.718114,0.881686,AFG,,,
1,Afghanistan,2009.0,4.401778,7.539972,0.552308,0.678896,0.850035,AFG,,,
2,Afghanistan,2010.0,4.758381,7.646709,0.539075,0.600127,0.706766,AFG,,,
3,Afghanistan,2011.0,3.831719,7.619532,0.521104,0.495901,0.731109,AFG,,,
4,Afghanistan,2012.0,3.782938,7.705479,0.520637,0.530935,0.775620,AFG,,,
...,...,...,...,...,...,...,...,...,...,...,...
4092,,2008.0,,,,,,BGR,,,147.972500
4093,,2009.0,,,,,,BGR,,,117.785000
4094,,2020.0,,,,,,ROU,,,128.435000
4095,,2021.0,,,,,,BGR,,,144.852440


In [27]:
# Trust in government: merged on (LOCATION, TIME) with TIME used as read from the CSV.
df_v5 = clean_data(
    merged_df=df_v4,
    path='trust-in-gov',
    renamed_column='Trust in government',
)
df_v5

Unnamed: 0,Country name,TIME,Life Ladder,Log GDP per capita,Social support,Freedom to make life choices,Perceptions of corruption,LOCATION,Inflation rate,Adult education,Housing prices,Trust in government
0,Afghanistan,2008.0,3.723590,7.370100,0.450662,0.718114,0.881686,AFG,,,,
1,Afghanistan,2009.0,4.401778,7.539972,0.552308,0.678896,0.850035,AFG,,,,
2,Afghanistan,2010.0,4.758381,7.646709,0.539075,0.600127,0.706766,AFG,,,,
3,Afghanistan,2011.0,3.831719,7.619532,0.521104,0.495901,0.731109,AFG,,,,
4,Afghanistan,2012.0,3.782938,7.705479,0.520637,0.530935,0.775620,AFG,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
4092,,2008.0,,,,,,BGR,,,147.972500,
4093,,2009.0,,,,,,BGR,,,117.785000,
4094,,2020.0,,,,,,ROU,,,128.435000,
4095,,2021.0,,,,,,BGR,,,144.852440,


In [28]:
# Dataframe check: NaN count of each merged indicator, printed before and
# after discarding sparse rows.
indicator_columns = ['Inflation rate', 'Adult education', 'Housing prices', 'Trust in government']

nan_counts = "\n".join(str(df_v5[column].isna().sum()) for column in indicator_columns)
print("\n" + nan_counts + "\n")

# Keep only rows holding at least 10 non-NaN cells, then renumber the index.
df_v5 = df_v5.dropna(thresh=10).reset_index(drop=True)

nan_counts = "\n".join(str(df_v5[column].isna().sum()) for column in indicator_columns)
print("\n" + nan_counts + "\n")
df_v5


1254
3067
1996
3660


14
44
3
156



Unnamed: 0,Country name,TIME,Life Ladder,Log GDP per capita,Social support,Freedom to make life choices,Perceptions of corruption,LOCATION,Inflation rate,Adult education,Housing prices,Trust in government
0,Argentina,2017.0,6.039330,10.067430,0.906699,0.831966,0.841052,ARG,24.795600,17.488495,,
1,Argentina,2018.0,5.792797,10.032141,0.899912,0.845895,0.855255,ARG,34.277230,31.336042,,
2,Australia,2005.0,7.340688,10.658608,0.967892,0.934973,0.390416,AUS,9.712082,34.967968,56.664750,
3,Australia,2007.0,7.285391,10.702894,0.965276,0.890682,0.512578,AUS,1.405548,31.816732,66.960250,
4,Australia,2008.0,7.253757,10.718780,0.946635,0.915733,0.430811,AUS,12.489460,30.064018,69.602750,
...,...,...,...,...,...,...,...,...,...,...,...,...
587,United States,2016.0,6.803600,10.985777,0.896751,0.757893,0.738920,USA,-6.584728,9.852750,105.637315,29.720566
588,United States,2017.0,6.991759,11.001395,0.921003,0.868497,0.681191,USA,7.916659,9.354493,112.170681,38.653502
589,United States,2018.0,6.882685,11.025024,0.903856,0.824607,0.709928,USA,7.529873,9.176164,119.143333,31.381609
590,United States,2019.0,6.943701,11.043353,0.916691,0.836139,0.706716,USA,-2.112950,9.189871,125.207117,36.277444


In [29]:
# Self-employment rate: merged on (LOCATION, TIME) with TIME used as read from the CSV.
df_v6 = clean_data(
    merged_df=df_v5,
    path='self-employment',
    renamed_column='Self-employment rate',
)

In [30]:
# Poverty rate: merged on (LOCATION, TIME) with TIME used as read from the CSV.
df_v7 = clean_data(
    merged_df=df_v6,
    path='poverty',
    renamed_column='Poverty rate',
)

In [52]:
# Unemployed youth rate: merge the indicator, keep only rows holding at
# least 14 non-NaN cells, and renumber the index.
df_v8 = (
    clean_data(
        merged_df=df_v7,
        path='unemployed-youth',
        renamed_column='Unemployed youth rate',
    )
    .dropna(thresh=14)
    .reset_index(drop=True)
)
df_v8

Unnamed: 0,Country name,TIME,Life Ladder,Log GDP per capita,Social support,Freedom to make life choices,Perceptions of corruption,LOCATION,Inflation rate,Adult education,Housing prices,Trust in government,Self-employment rate,Poverty rate,Unemployed youth rate
0,Australia,2010.0,7.450047,10.722262,0.954520,0.932059,0.366127,AUS,8.548600,26.792744,80.877000,60.975292,13.774050,,8.059968
1,Australia,2011.0,7.405616,10.732697,0.967029,0.944586,0.381772,AUS,10.980910,25.922543,79.134750,53.078672,13.220600,,7.839816
2,Australia,2012.0,7.195586,10.753672,0.944599,0.935146,0.368252,AUS,7.324251,23.563389,78.880250,42.041918,12.216890,0.129,7.231065
3,Australia,2013.0,7.364169,10.761981,0.928205,0.933379,0.431539,AUS,6.317400,24.284430,84.106500,45.587147,11.975450,,7.336304
4,Australia,2014.0,7.288550,10.772080,0.923799,0.922932,0.442021,AUS,1.184765,22.895836,91.720250,46.467623,12.052440,0.130,7.217288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411,United States,2016.0,6.803600,10.985777,0.896751,0.757893,0.738920,USA,-6.584728,9.852750,105.637315,29.720566,7.466977,0.209,8.533521
412,United States,2017.0,6.991759,11.001395,0.921003,0.868497,0.681191,USA,7.916659,9.354493,112.170681,38.653502,7.109162,0.212,7.455460
413,United States,2018.0,6.882685,11.025024,0.903856,0.824607,0.709928,USA,7.529873,9.176164,119.143333,31.381609,7.262526,0.211,7.458875
414,United States,2019.0,6.943701,11.043353,0.916691,0.836139,0.706716,USA,-2.112950,9.189871,125.207117,36.277444,6.994946,0.210,7.903465


In [55]:
# Remaining NaN count for every merged indicator column, one per line.
report_columns = [
    'Inflation rate',
    'Adult education',
    'Housing prices',
    'Trust in government',
    'Self-employment rate',
    'Poverty rate',
    'Unemployed youth rate',
]
nan_report = "\n".join(str(df_v8[column].isna().sum()) for column in report_columns)
print("\n" + nan_report + "\n")


7
0
0
37
36
97
14



In [57]:
# Replace the merge-key column names with their final labels.
df_v8 = df_v8.rename(columns={'TIME': 'Year', 'LOCATION': 'Country code'})
df_v8

Unnamed: 0,Country name,Year,Life Ladder,Log GDP per capita,Social support,Freedom to make life choices,Perceptions of corruption,Country code,Inflation rate,Adult education,Housing prices,Trust in government,Self-employment rate,Poverty rate,Unemployed youth rate
0,Australia,2010.0,7.450047,10.722262,0.954520,0.932059,0.366127,AUS,8.548600,26.792744,80.877000,60.975292,13.774050,,8.059968
1,Australia,2011.0,7.405616,10.732697,0.967029,0.944586,0.381772,AUS,10.980910,25.922543,79.134750,53.078672,13.220600,,7.839816
2,Australia,2012.0,7.195586,10.753672,0.944599,0.935146,0.368252,AUS,7.324251,23.563389,78.880250,42.041918,12.216890,0.129,7.231065
3,Australia,2013.0,7.364169,10.761981,0.928205,0.933379,0.431539,AUS,6.317400,24.284430,84.106500,45.587147,11.975450,,7.336304
4,Australia,2014.0,7.288550,10.772080,0.923799,0.922932,0.442021,AUS,1.184765,22.895836,91.720250,46.467623,12.052440,0.130,7.217288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411,United States,2016.0,6.803600,10.985777,0.896751,0.757893,0.738920,USA,-6.584728,9.852750,105.637315,29.720566,7.466977,0.209,8.533521
412,United States,2017.0,6.991759,11.001395,0.921003,0.868497,0.681191,USA,7.916659,9.354493,112.170681,38.653502,7.109162,0.212,7.455460
413,United States,2018.0,6.882685,11.025024,0.903856,0.824607,0.709928,USA,7.529873,9.176164,119.143333,31.381609,7.262526,0.211,7.458875
414,United States,2019.0,6.943701,11.043353,0.916691,0.836139,0.706716,USA,-2.112950,9.189871,125.207117,36.277444,6.994946,0.210,7.903465


In [54]:
# Write the final table to disk; index=False leaves the row index out of the file.
df_v8.to_csv(
    path_or_buf='../cleaned_data/extended_happiness.csv',
    index=False,
)

relative poverty
GDP
extend by years (?)
quality of life in European cities