In [62]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the combined multi-year table and report how many values are missing in
# each column before any cleaning is done.
dataset = pd.read_csv('dataset_world_happiness.csv')
missing_per_column = dataset.isna().sum()
print(missing_per_column)

Country                          303
Region                           150
Happiness Rank                     0
Happiness Score                    0
GDP Per Capita                     0
Family                           303
Health (Life Expectancy)           0
Freedom                            0
Trust (Government Corruption)      1
Generosity                         0
Dystopia Residual                303
Year                               0
dtype: int64


In [63]:
# break up data by years
def separateYears(data,year):
    """Return the rows of ``data`` whose ``Year`` column equals ``year``.

    For years greater than 2015 the selection is passed through
    ``reset_index()``: the result gets a fresh 0..n-1 index and the previous
    row labels are kept as an extra column (named ``index`` when the original
    index is unnamed). For any other year the selection is returned as-is,
    keeping its original row labels and no extra column.
    """
    selected = data.loc[data['Year'] == year]
    if year > 2015:
        # Later years do not start at row 0 of the combined table.
        return selected.reset_index()
    return selected

In [64]:
# 2015
# The Year column is constant within a single-year frame, so it is removed.
dataset_2015 = separateYears(dataset, 2015).drop(columns='Year')
print(dataset_2015.isna().sum())

Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
GDP Per Capita                   0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
dtype: int64


In [65]:
# 2016
# separateYears() adds an 'index' column for years after 2015; it is discarded
# together with the now-constant Year column.
dataset_2016 = separateYears(dataset, 2016).drop(columns=['index', 'Year'])
print(dataset_2016.isna().sum())

Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
GDP Per Capita                   0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
dtype: int64


In [66]:
# 2017
# separateYears() adds an 'index' column for years after 2015; it is discarded
# together with the now-constant Year column.
dataset_2017 = separateYears(dataset, 2017).drop(columns=['index', 'Year'])
print(dataset_2017.isna().sum())

Country                            0
Region                           150
Happiness Rank                     0
Happiness Score                    0
GDP Per Capita                     0
Family                             0
Health (Life Expectancy)           0
Freedom                            0
Trust (Government Corruption)      0
Generosity                         0
Dystopia Residual                  0
dtype: int64


In [67]:
# Preview the first five 2017 rows.
print(dataset_2017.head(n=5))

       Country Region  Happiness Rank  Happiness Score  GDP Per Capita  \
0       Norway    NaN               1            7.537        1.616463   
1      Denmark    NaN               2            7.522        1.482383   
2      Iceland    NaN               3            7.504        1.480633   
3  Switzerland    NaN               4            7.494        1.564980   
4      Finland    NaN               5            7.469        1.443572   

     Family  Health (Life Expectancy)   Freedom  \
0  1.533524                  0.796667  0.635423   
1  1.551122                  0.792566  0.626007   
2  1.610574                  0.833552  0.627163   
3  1.516912                  0.858131  0.620071   
4  1.540247                  0.809158  0.617951   

   Trust (Government Corruption)  Generosity  Dystopia Residual  
0                       0.315964    0.362012           2.277027  
1                       0.400770    0.355280           2.313707  
2                       0.153527    0.475540      

In [68]:
#2018
# Remove the constant Year column and the 'index' column that separateYears()
# adds for years after 2015, then report missing values before any repair.
dataset_2018 = separateYears(dataset, 2018).drop(columns=['Year', 'index'])
print("Before")
print(dataset_2018.isna().sum())

Before
Country                          149
Region                             0
Happiness Rank                     0
Happiness Score                    0
GDP Per Capita                     0
Family                           149
Health (Life Expectancy)           0
Freedom                            0
Trust (Government Corruption)      1
Generosity                         0
Dystopia Residual                149
dtype: int64


In [69]:
# Preview the first five 2018 rows before the Country column is repaired.
print(dataset_2018.head(n=5))

  Country       Region  Happiness Rank  Happiness Score  GDP Per Capita  \
0     NaN      Finland               1            7.632           1.305   
1     NaN      Denmark               3            7.555           1.351   
2     NaN      Iceland               4            7.495           1.343   
3     NaN  Switzerland               5            7.487           1.420   
4     NaN  Netherlands               6            7.441           1.361   

   Family  Health (Life Expectancy)  Freedom  Trust (Government Corruption)  \
0     NaN                     0.874    0.681                          0.393   
1     NaN                     0.868    0.683                          0.408   
2     NaN                     0.914    0.677                          0.138   
3     NaN                     0.927    0.660                          0.357   
4     NaN                     0.878    0.638                          0.295   

   Generosity  Dystopia Residual  
0       0.202                NaN  
1   

In [70]:
# The 2018 rows have no Country value; the country names are stored in the
# Region column instead, so use Region to fill the missing Country entries.
# The result is assigned back to the frame rather than calling
# fillna(inplace=True) on the attribute-accessed column: that form is a chained
# in-place update, which recent pandas warns about and which leaves the frame
# unchanged under Copy-on-Write.
dataset_2018['Country'] = dataset_2018['Country'].fillna(dataset_2018['Region'])
print("\nAfter")
print(dataset_2018.isnull().sum())


After
Country                            0
Region                             0
Happiness Rank                     0
Happiness Score                    0
GDP Per Capita                     0
Family                           149
Health (Life Expectancy)           0
Freedom                            0
Trust (Government Corruption)      1
Generosity                         0
Dystopia Residual                149
dtype: int64


In [71]:
# Preview the first five 2018 rows after Country has been filled from Region.
print(dataset_2018.head(n=5))

       Country       Region  Happiness Rank  Happiness Score  GDP Per Capita  \
0      Finland      Finland               1            7.632           1.305   
1      Denmark      Denmark               3            7.555           1.351   
2      Iceland      Iceland               4            7.495           1.343   
3  Switzerland  Switzerland               5            7.487           1.420   
4  Netherlands  Netherlands               6            7.441           1.361   

   Family  Health (Life Expectancy)  Freedom  Trust (Government Corruption)  \
0     NaN                     0.874    0.681                          0.393   
1     NaN                     0.868    0.683                          0.408   
2     NaN                     0.914    0.677                          0.138   
3     NaN                     0.927    0.660                          0.357   
4     NaN                     0.878    0.638                          0.295   

   Generosity  Dystopia Residual  
0       0

In [72]:
# 2019
# Remove the 'index' column that separateYears() adds for years after 2015 and
# the constant Year column, then report missing values before any repair.
dataset_2019 = separateYears(dataset, 2019).drop(columns=['index', 'Year'])
print("Before")
print(dataset_2019.isna().sum())

Before
Country                          154
Region                             0
Happiness Rank                     0
Happiness Score                    0
GDP Per Capita                     0
Family                           154
Health (Life Expectancy)           0
Freedom                            0
Trust (Government Corruption)      0
Generosity                         0
Dystopia Residual                154
dtype: int64


In [73]:
# Preview the first five 2019 rows before the Country column is repaired.
print(dataset_2019.head(n=5))

  Country       Region  Happiness Rank  Happiness Score  GDP Per Capita  \
0     NaN      Finland               1            7.769           1.340   
1     NaN      Denmark               2            7.600           1.383   
2     NaN       Norway               3            7.554           1.488   
3     NaN      Iceland               4            7.494           1.380   
4     NaN  Netherlands               5            7.488           1.396   

   Family  Health (Life Expectancy)  Freedom  Trust (Government Corruption)  \
0     NaN                     0.986    0.596                          0.393   
1     NaN                     0.996    0.592                          0.410   
2     NaN                     1.028    0.603                          0.341   
3     NaN                     1.026    0.591                          0.118   
4     NaN                     0.999    0.557                          0.298   

   Generosity  Dystopia Residual  
0       0.153                NaN  
1   

In [74]:
# The 2019 rows have no Country value; the country names are stored in the
# Region column instead, so use Region to fill the missing Country entries.
# The result is assigned back to the frame rather than calling
# fillna(inplace=True) on the attribute-accessed column: that form is a chained
# in-place update, which recent pandas warns about and which leaves the frame
# unchanged under Copy-on-Write.
dataset_2019['Country'] = dataset_2019['Country'].fillna(dataset_2019['Region'])
print("\nAfter")
print(dataset_2019.isnull().sum())


After
Country                            0
Region                             0
Happiness Rank                     0
Happiness Score                    0
GDP Per Capita                     0
Family                           154
Health (Life Expectancy)           0
Freedom                            0
Trust (Government Corruption)      0
Generosity                         0
Dystopia Residual                154
dtype: int64


In [75]:
# Preview the first five 2019 rows after Country has been filled from Region.
print(dataset_2019.head(n=5))

       Country       Region  Happiness Rank  Happiness Score  GDP Per Capita  \
0      Finland      Finland               1            7.769           1.340   
1      Denmark      Denmark               2            7.600           1.383   
2       Norway       Norway               3            7.554           1.488   
3      Iceland      Iceland               4            7.494           1.380   
4  Netherlands  Netherlands               5            7.488           1.396   

   Family  Health (Life Expectancy)  Freedom  Trust (Government Corruption)  \
0     NaN                     0.986    0.596                          0.393   
1     NaN                     0.996    0.592                          0.410   
2     NaN                     1.028    0.603                          0.341   
3     NaN                     1.026    0.591                          0.118   
4     NaN                     0.999    0.557                          0.298   

   Generosity  Dystopia Residual  
0       0

In [76]:
# Merging 2015 and 2016 (Call temp)
# Both yearly frames have the same non-key columns, so every one of them
# collides in the merge. Passing the year as the merge suffix tags them
# directly; only the 2015 region is then renamed to the plain 'Region'.
temp = (
    pd.merge(dataset_2015, dataset_2016, on='Country', suffixes=('_2015', '_2016'))
    .rename(columns={'Region_2015': 'Region'})
)

In [77]:
# Merging 2015,2016 with 2017
# 'Region' is the only column name shared by both sides, so the merge splits it
# into Region_x (from temp) and Region_y (from 2017). The 2017 copy is dropped;
# the 2017 metric columns keep their unsuffixed names for now.
temp2 = pd.merge(temp, dataset_2017, on='Country').drop(columns='Region_y')

In [78]:
# Merging 2015-2017 with 2018
# The unsuffixed 2017 metric columns in temp2 collide with the 2018 ones, so
# the year is passed as the merge suffix for each side. The 2018 'Region' column
# and the leftover 'Region_2016' are not needed and are dropped.
temp3 = (
    pd.merge(temp2, dataset_2018, on='Country', suffixes=('_2017', '_2018'))
    .drop(columns=['Region', 'Region_2016'])
)

In [79]:
# Merging 2015-2018 with 2019
# Every metric column in temp3 already carries a year suffix, so this merge has
# no overlapping column names and pandas adds no '_x'/'_y' suffixes. Only the
# unsuffixed 2019 columns need a year tag, and the 'Region_x' column carried
# over from the earlier merges gets its final name.
newData = pd.merge(temp3,dataset_2019,on = 'Country')
# Drop the Region column contributed by the 2019 frame.
newData.drop('Region', inplace =True, axis =1)
newData.rename(columns = {'Happiness Rank' : 'Happiness Rank_2019',
                      'Happiness Score' : 'Happiness Score_2019',
                      'GDP Per Capita':'GDP Per Capita_2019',
                      'Family':'Family_2019',
                      'Health (Life Expectancy)':'Health (Life Expectancy)_2019',
                      'Freedom':'Freedom_2019',
                      'Trust (Government Corruption)':'Trust (Government Corruption)_2019',
                      'Generosity':'Generosity_2019',
                      'Dystopia Residual':'Dystopia Residual_2019',
                       'Region_x':'Region'}, inplace = True)

In [80]:
# Filling in the NaN with the averages of previous years
# For each metric, 2018 becomes the row-wise mean of 2015-2017, and 2019 becomes
# the row-wise mean of 2015-2017 plus the freshly computed 2018 value. Both
# target columns are overwritten in full.
for _metric in ('Family', 'Dystopia Residual'):
    _observed = [_metric + '_' + str(_yr) for _yr in (2015, 2016, 2017)]
    newData[_metric + '_2018'] = newData[_observed].mean(axis=1)
    newData[_metric + '_2019'] = newData[_observed + [_metric + '_2018']].mean(axis=1)

In [81]:
# Filling in the NaN in Trust (2018) for the United Arab Emirates by averaging
# the other years (2015, 2016, 2017 and 2019).
# The row is selected by country name instead of a hard-coded row label, so the
# fill still lands on the right row if the merge order or contents change, and
# it is a no-op if the country is absent.
is_uae = newData['Country']=='United Arab Emirates'
uae = newData.loc[is_uae]
avg_uae=uae[['Trust (Government Corruption)_2015','Trust (Government Corruption)_2016','Trust (Government Corruption)_2017','Trust (Government Corruption)_2019']].mean(axis=1)
# Assignment aligns avg_uae with the selected rows by index label.
newData.loc[is_uae,'Trust (Government Corruption)_2018']=avg_uae
# Persist the merged wide table (the row index is written as the first column).
newData.to_csv('data.csv')
newData.head()

Unnamed: 0,Country,Region,Happiness Rank_2015,Happiness Score_2015,GDP Per Capita_2015,Family_2015,Health (Life Expectancy)_2015,Freedom_2015,Trust (Government Corruption)_2015,Generosity_2015,...,Dystopia Residual_2018,Happiness Rank_2019,Happiness Score_2019,GDP Per Capita_2019,Family_2019,Health (Life Expectancy)_2019,Freedom_2019,Trust (Government Corruption)_2019,Generosity_2019,Dystopia Residual_2019
0,Switzerland,Western Europe,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,...,2.496242,6,7.48,1.452,1.337221,1.052,0.572,0.343,0.263,2.496242
1,Iceland,Western Europe,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,...,2.618698,4,7.494,1.38,1.398688,1.026,0.591,0.118,0.354,2.618698
2,Denmark,Western Europe,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,...,2.515046,2,7.6,1.383,1.358481,0.996,0.592,0.41,0.252,2.515046
3,Canada,North America,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,...,2.447958,9,7.278,1.365,1.30002,1.039,0.584,0.308,0.285,2.447958
4,Finland,Western Europe,6,7.406,1.29025,1.31826,0.88911,0.64169,0.41372,0.23351,...,2.625231,1,7.769,1.34,1.331049,0.986,0.596,0.393,0.153,2.625231


In [137]:
# Shared short column names used for every per-year long-format frame.
columns = ['Country','Region','Happiness Rank','Happiness Score','GDP Per Capita','Family','HLE','Freedom','Trust','Generosity','Dystopia Residual']
# Take the 2015 columns out of the wide table, give them the shared names and
# tag every row with its year.
data2015 = newData[
    ['Country', 'Region']
    + [metric + '_2015' for metric in (
        'Happiness Rank', 'Happiness Score', 'GDP Per Capita', 'Family',
        'Health (Life Expectancy)', 'Freedom',
        'Trust (Government Corruption)', 'Generosity', 'Dystopia Residual')]
].copy()
data2015.columns = columns
data2015['Year'] = 2015

In [138]:
# Take the 2016 columns out of the wide table, give them the shared names and
# tag every row with its year.
data2016 = newData[
    ['Country', 'Region']
    + [metric + '_2016' for metric in (
        'Happiness Rank', 'Happiness Score', 'GDP Per Capita', 'Family',
        'Health (Life Expectancy)', 'Freedom',
        'Trust (Government Corruption)', 'Generosity', 'Dystopia Residual')]
].copy()
data2016.columns = columns
data2016['Year'] = 2016

In [139]:
# Take the 2017 columns out of the wide table, give them the shared names and
# tag every row with its year.
data2017 = newData[
    ['Country', 'Region']
    + [metric + '_2017' for metric in (
        'Happiness Rank', 'Happiness Score', 'GDP Per Capita', 'Family',
        'Health (Life Expectancy)', 'Freedom',
        'Trust (Government Corruption)', 'Generosity', 'Dystopia Residual')]
].copy()
data2017.columns = columns
data2017['Year'] = 2017

In [140]:
# Take the 2018 columns out of the wide table, give them the shared names and
# tag every row with its year.
data2018 = newData[
    ['Country', 'Region']
    + [metric + '_2018' for metric in (
        'Happiness Rank', 'Happiness Score', 'GDP Per Capita', 'Family',
        'Health (Life Expectancy)', 'Freedom',
        'Trust (Government Corruption)', 'Generosity', 'Dystopia Residual')]
].copy()
data2018.columns = columns
data2018['Year'] = 2018

In [141]:
# Take the 2019 columns out of the wide table, give them the shared names and
# tag every row with its year.
data2019 = newData[
    ['Country', 'Region']
    + [metric + '_2019' for metric in (
        'Happiness Rank', 'Happiness Score', 'GDP Per Capita', 'Family',
        'Health (Life Expectancy)', 'Freedom',
        'Trust (Government Corruption)', 'Generosity', 'Dystopia Residual')]
].copy()
data2019.columns = columns
data2019['Year'] = 2019

# 2015

In [142]:
# Show the first five rows of the 2015 frame.
data2015.head(n=5)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,GDP Per Capita,Family,HLE,Freedom,Trust,Generosity,Dystopia Residual,Year
0,Switzerland,Western Europe,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738,2015
1,Iceland,Western Europe,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201,2015
2,Denmark,Western Europe,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204,2015
3,Canada,North America,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176,2015
4,Finland,Western Europe,6,7.406,1.29025,1.31826,0.88911,0.64169,0.41372,0.23351,2.61955,2015


# 2016

In [143]:
# Show the first five rows of the 2016 frame.
data2016.head(n=5)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,GDP Per Capita,Family,HLE,Freedom,Trust,Generosity,Dystopia Residual,Year
0,Switzerland,Western Europe,2,7.509,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2.69463,2016
1,Iceland,Western Europe,3,7.501,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678,2.83137,2016
2,Denmark,Western Europe,1,7.526,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2.73939,2016
3,Canada,North America,6,7.404,1.44015,1.0961,0.8276,0.5737,0.31329,0.44834,2.70485,2016
4,Finland,Western Europe,5,7.413,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492,2.82596,2016


# 2017

In [144]:
# Show the first five rows of the 2017 frame.
data2017.head(n=5)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,GDP Per Capita,Family,HLE,Freedom,Trust,Generosity,Dystopia Residual,Year
0,Switzerland,Western Europe,4,7.494,1.56498,1.516912,0.858131,0.620071,0.367007,0.290549,2.276716,2017
1,Iceland,Western Europe,3,7.504,1.480633,1.610574,0.833552,0.627163,0.153527,0.47554,2.322715,2017
2,Denmark,Western Europe,2,7.522,1.482383,1.551122,0.792566,0.626007,0.40077,0.35528,2.313707,2017
3,Canada,North America,7,7.316,1.479204,1.481349,0.834558,0.611101,0.287372,0.43554,2.187264,2017
4,Finland,Western Europe,5,7.469,1.443572,1.540247,0.809158,0.617951,0.382612,0.245483,2.430182,2017


# 2018

In [145]:
# Show the first five rows of the 2018 frame (Family and Dystopia Residual are
# the imputed row means).
data2018.head(n=5)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,GDP Per Capita,Family,HLE,Freedom,Trust,Generosity,Dystopia Residual,Year
0,Switzerland,Western Europe,5,7.487,1.42,1.337221,0.927,0.66,0.357,0.256,2.496242,2018
1,Iceland,Western Europe,4,7.495,1.343,1.398688,0.914,0.677,0.138,0.353,2.618698,2018
2,Denmark,Western Europe,3,7.555,1.351,1.358481,0.868,0.683,0.408,0.284,2.515046,2018
3,Canada,North America,7,7.328,1.33,1.30002,0.896,0.653,0.291,0.321,2.447958,2018
4,Finland,Western Europe,1,7.632,1.305,1.331049,0.874,0.681,0.393,0.202,2.625231,2018


# 2019

In [146]:
# Show the first five rows of the 2019 frame (Family and Dystopia Residual are
# the imputed row means).
data2019.head(n=5)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,GDP Per Capita,Family,HLE,Freedom,Trust,Generosity,Dystopia Residual,Year
0,Switzerland,Western Europe,6,7.48,1.452,1.337221,1.052,0.572,0.343,0.263,2.496242,2019
1,Iceland,Western Europe,4,7.494,1.38,1.398688,1.026,0.591,0.118,0.354,2.618698,2019
2,Denmark,Western Europe,2,7.6,1.383,1.358481,0.996,0.592,0.41,0.252,2.515046,2019
3,Canada,North America,9,7.278,1.365,1.30002,1.039,0.584,0.308,0.285,2.447958,2019
4,Finland,Western Europe,1,7.769,1.34,1.331049,0.986,0.596,0.393,0.153,2.625231,2019
