# Case Study - World Happiness Analysis Project

## Table of Contents
1. Importing libraries
2. Import Data
* a. 2015 Data
* b. 2016 Data
* c. 2017 Data
* d. 2018 Data
* e. 2019 Data
3. Data Wrangling & Subsetting
* a. 2015 Data
* b. 2016 Data
* c. 2017 Data
* d. 2018 Data
* e. 2019 Data
4. Data Consistency Check
* a. 2015 Data
* b. 2016 Data
* c. 2017 Data
* d. 2018 Data
* e. 2019 Data
5. Export Data


# 01. Importing Libraries

In [158]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02a. Importing Data - 2015 Data

In [159]:
# Project root folder. Defaults to the original author's location; set the
# WORLD_HAPPINESS_PATH environment variable to run the notebook on another machine.
path = os.environ.get(
    'WORLD_HAPPINESS_PATH',
    r'/Users/ChuahLH/Desktop/Data Science/Data Immersion/Achievement 6/World Happiness Analysis',
)

In [160]:
# Load the 2015 survey file from the project's 'Original Data' folder
csv_2015 = os.path.join(path, '02 Data', 'Original Data', '2015.csv')
df_2015 = pd.read_csv(csv_2015, index_col = False)

In [161]:
# Investigate the dataframe
df_2015.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy GDP per Capita,Family,Health Life Expectancy,Freedom,Trust Government Corruption,Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [162]:
# check the shape of the dataframe
df_2015.shape

(158, 12)

In [163]:
# check the information of each column
df_2015.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Country                      158 non-null    object 
 1   Region                       158 non-null    object 
 2   Happiness Rank               158 non-null    int64  
 3   Happiness Score              158 non-null    float64
 4   Standard Error               158 non-null    float64
 5   Economy GDP per Capita       158 non-null    float64
 6   Family                       158 non-null    float64
 7   Health Life Expectancy       158 non-null    float64
 8   Freedom                      158 non-null    float64
 9   Trust Government Corruption  158 non-null    float64
 10  Generosity                   158 non-null    float64
 11  Dystopia Residual            158 non-null    float64
dtypes: float64(9), int64(1), object(2)
memory usage: 14.9+ KB


In [164]:
# Summary statistics of the numeric columns
df_2015.describe()

Unnamed: 0,Happiness Rank,Happiness Score,Standard Error,Economy GDP per Capita,Family,Health Life Expectancy,Freedom,Trust Government Corruption,Generosity,Dystopia Residual
count,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0
mean,79.493671,5.375734,0.047885,0.846137,0.991046,0.630259,0.428615,0.143422,0.237296,2.098977
std,45.754363,1.14501,0.017146,0.403121,0.272369,0.247078,0.150693,0.120034,0.126685,0.55355
min,1.0,2.839,0.01848,0.0,0.0,0.0,0.0,0.0,0.0,0.32858
25%,40.25,4.526,0.037268,0.545808,0.856823,0.439185,0.32833,0.061675,0.150553,1.75941
50%,79.5,5.2325,0.04394,0.910245,1.02951,0.696705,0.435515,0.10722,0.21613,2.095415
75%,118.75,6.24375,0.0523,1.158448,1.214405,0.811013,0.549092,0.180255,0.309883,2.462415
max,158.0,7.587,0.13693,1.69042,1.40223,1.02525,0.66973,0.55191,0.79588,3.60214


# 03a. Data Wrangling & Subsetting - 2015 Data

In [165]:
# There is one dataframe per year, so suffix every measure column with its year.
# A single mapping replaces the ten separate one-column rename cells.
# NOTE(review): 'Country' and 'Region' keep their capitalised names here, while the
# 2018/2019 frames use lowercase 'country' - confirm before joining the years.
df_2015 = df_2015.rename(
    columns = {
        'Happiness Rank': 'happiness_rank_2015',
        'Happiness Score': 'happiness_score_2015',
        'Standard Error': 'standard_error_2015',
        'Economy GDP per Capita': 'economy_gdp_2015',
        'Family': 'family_2015',
        'Health Life Expectancy': 'health_life_expectancy_2015',
        'Freedom': 'freedom_2015',
        'Trust Government Corruption': 'trust_2015',
        'Generosity': 'generosity_2015',
        'Dystopia Residual': 'dystopia_residual_2015',
    }
)

In [175]:
# Confirm the renamed columns
df_2015.head()

Unnamed: 0,Country,Region,happiness_rank_2015,happiness_score_2015,standard_error_2015,economy_gdp_2015,family_2015,health_life_expectancy_2015,freedom_2015,trust_2015,generosity_2015,dystopia_residual_2015
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


# 04a. Data Consistency Check - 2015 Data

In [245]:
# Check for mixed types: print every column whose cells hold more than one Python type.
# Series.map is used instead of DataFrame.applymap (deprecated since pandas 2.1), and
# counting distinct types avoids the row-0 lookup that raises on an empty dataframe.
for col in df_2015.columns.tolist():
    if df_2015[col].map(type).nunique() > 1:
        print (col)

There are no mixed data types.

In [246]:
# Finding Missing Values in df_2015
df_2015.isnull().sum()

Country                        0
Region                         0
happiness_rank_2015            0
happiness_score_2015           0
standard_error_2015            0
economy_gdp_2015               0
family_2015                    0
health_life_expectancy_2015    0
freedom_2015                   0
trust_2015                     0
generosity_2015                0
dystopia_residual_2015         0
dtype: int64

There are no missing values.

In [247]:
# Collect the rows that exactly repeat an earlier row
dup_mask_2015 = df_2015.duplicated()
df_dups_2015 = df_2015[dup_mask_2015]

In [248]:
# Show the duplicated rows, if any
df_dups_2015

Unnamed: 0,Country,Region,happiness_rank_2015,happiness_score_2015,standard_error_2015,economy_gdp_2015,family_2015,health_life_expectancy_2015,freedom_2015,trust_2015,generosity_2015,dystopia_residual_2015


There are no duplicated rows.

# 02b. Importing Data - 2016 Data

In [176]:
# Load the 2016 survey file from the project's 'Original Data' folder
csv_2016 = os.path.join(path, '02 Data', 'Original Data', '2016.csv')
df_2016 = pd.read_csv(csv_2016, index_col = False)

In [177]:
# Investigate the dataframe
df_2016.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Lower Confidence Interval,Upper Confidence Interval,Economy GDP per Capita,Family,Health Life Expectancy,Freedom,Trust Government Corruption,Generosity,Dystopia Residual
0,Denmark,Western Europe,1,7.526,7.46,7.592,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2.73939
1,Switzerland,Western Europe,2,7.509,7.428,7.59,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2.69463
2,Iceland,Western Europe,3,7.501,7.333,7.669,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678,2.83137
3,Norway,Western Europe,4,7.498,7.421,7.575,1.57744,1.1269,0.79579,0.59609,0.35776,0.37895,2.66465
4,Finland,Western Europe,5,7.413,7.351,7.475,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492,2.82596


In [178]:
# check the shape of the dataframe
df_2016.shape

(157, 13)

In [179]:
# check the information of each column
df_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Country                      157 non-null    object 
 1   Region                       157 non-null    object 
 2   Happiness Rank               157 non-null    int64  
 3   Happiness Score              157 non-null    float64
 4   Lower Confidence Interval    157 non-null    float64
 5   Upper Confidence Interval    157 non-null    float64
 6   Economy GDP per Capita       157 non-null    float64
 7   Family                       157 non-null    float64
 8   Health Life Expectancy       157 non-null    float64
 9   Freedom                      157 non-null    float64
 10  Trust Government Corruption  157 non-null    float64
 11  Generosity                   157 non-null    float64
 12  Dystopia Residual            157 non-null    float64
dtypes: float64(10), int64(1), object(2)
memory usage: 16.1+ KB

In [180]:
# Summary statistics of the numeric columns
df_2016.describe()

Unnamed: 0,Happiness Rank,Happiness Score,Lower Confidence Interval,Upper Confidence Interval,Economy GDP per Capita,Family,Health Life Expectancy,Freedom,Trust Government Corruption,Generosity,Dystopia Residual
count,157.0,157.0,157.0,157.0,157.0,157.0,157.0,157.0,157.0,157.0,157.0
mean,78.980892,5.382185,5.282395,5.481975,0.95388,0.793621,0.557619,0.370994,0.137624,0.242635,2.325807
std,45.46603,1.141674,1.148043,1.136493,0.412595,0.266706,0.229349,0.145507,0.111038,0.133756,0.54222
min,1.0,2.905,2.732,3.078,0.0,0.0,0.0,0.0,0.0,0.0,0.81789
25%,40.0,4.404,4.327,4.465,0.67024,0.64184,0.38291,0.25748,0.06126,0.15457,2.03171
50%,79.0,5.314,5.237,5.419,1.0278,0.84142,0.59659,0.39747,0.10547,0.22245,2.29074
75%,118.0,6.269,6.154,6.434,1.27964,1.02152,0.72993,0.48453,0.17554,0.31185,2.66465
max,157.0,7.526,7.46,7.669,1.82427,1.18326,0.95277,0.60848,0.50521,0.81971,3.83772


# 03b. Data Wrangling & Subsetting - 2016 Data

In [181]:
# There is one dataframe per year, so suffix every measure column with its year.
# A single mapping replaces the eleven separate one-column rename cells.
# NOTE(review): 'Country' and 'Region' keep their capitalised names here, while the
# 2018/2019 frames use lowercase 'country' - confirm before joining the years.
df_2016 = df_2016.rename(
    columns = {
        'Happiness Rank': 'happiness_rank_2016',
        'Happiness Score': 'happiness_score_2016',
        'Lower Confidence Interval': 'lower_confidence_2016',
        'Upper Confidence Interval': 'upper_confidence_2016',
        'Economy GDP per Capita': 'economy_gdp_2016',
        'Family': 'family_2016',
        'Health Life Expectancy': 'health_life_expectancy_2016',
        'Freedom': 'freedom_2016',
        'Trust Government Corruption': 'trust_2016',
        'Generosity': 'generosity_2016',
        'Dystopia Residual': 'dystopia_residual_2016',
    }
)

In [192]:
# Confirm the renamed columns
df_2016.head()

Unnamed: 0,Country,Region,happiness_rank_2016,happiness_score_2016,lower_confidence_2016,upper_confidence_2016,economy_gdp_2016,family_2016,health_life_expectancy_2016,freedom_2016,trust_2016,generosity_2016,dystopia_residual_2016
0,Denmark,Western Europe,1,7.526,7.46,7.592,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2.73939
1,Switzerland,Western Europe,2,7.509,7.428,7.59,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2.69463
2,Iceland,Western Europe,3,7.501,7.333,7.669,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678,2.83137
3,Norway,Western Europe,4,7.498,7.421,7.575,1.57744,1.1269,0.79579,0.59609,0.35776,0.37895,2.66465
4,Finland,Western Europe,5,7.413,7.351,7.475,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492,2.82596


# 04b. Data Consistency Check - 2016 Data

In [249]:
# Check for mixed types: print every column whose cells hold more than one Python type.
# Series.map is used instead of DataFrame.applymap (deprecated since pandas 2.1), and
# counting distinct types avoids the row-0 lookup that raises on an empty dataframe.
for col in df_2016.columns.tolist():
    if df_2016[col].map(type).nunique() > 1:
        print (col)

In [250]:
# Finding Missing Values in df_2016
df_2016.isnull().sum()

Country                        0
Region                         0
happiness_rank_2016            0
happiness_score_2016           0
lower_confidence_2016          0
upper_confidence_2016          0
economy_gdp_2016               0
family_2016                    0
health_life_expectancy_2016    0
freedom_2016                   0
trust_2016                     0
generosity_2016                0
dystopia_residual_2016         0
dtype: int64

There are no missing values.

In [251]:
# Collect the rows that exactly repeat an earlier row
dup_mask_2016 = df_2016.duplicated()
df_dups_2016 = df_2016[dup_mask_2016]

In [252]:
# Show the duplicated rows, if any
df_dups_2016

Unnamed: 0,Country,Region,happiness_rank_2016,happiness_score_2016,lower_confidence_2016,upper_confidence_2016,economy_gdp_2016,family_2016,health_life_expectancy_2016,freedom_2016,trust_2016,generosity_2016,dystopia_residual_2016


There are no duplicated rows.

# 02c. Importing Data - 2017 Data

In [193]:
# Load the 2017 survey file from the project's 'Original Data' folder
csv_2017 = os.path.join(path, '02 Data', 'Original Data', '2017.csv')
df_2017 = pd.read_csv(csv_2017, index_col = False)

In [194]:
# Investigate the dataframe
df_2017.head()

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715
3,Switzerland,4,7.494,7.561772,7.426227,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182


In [195]:
# check the shape of the dataframe
df_2017.shape

(155, 12)

In [196]:
# check the information of each column
df_2017.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        155 non-null    object 
 1   Happiness.Rank                 155 non-null    int64  
 2   Happiness.Score                155 non-null    float64
 3   Whisker.high                   155 non-null    float64
 4   Whisker.low                    155 non-null    float64
 5   Economy..GDP.per.Capita.       155 non-null    float64
 6   Family                         155 non-null    float64
 7   Health..Life.Expectancy.       155 non-null    float64
 8   Freedom                        155 non-null    float64
 9   Generosity                     155 non-null    float64
 10  Trust..Government.Corruption.  155 non-null    float64
 11  Dystopia.Residual              155 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 14.7+ KB

In [197]:
# Summary statistics of the numeric columns
df_2017.describe()

Unnamed: 0,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
count,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0
mean,78.0,5.354019,5.452326,5.255713,0.984718,1.188898,0.551341,0.408786,0.246883,0.12312,1.850238
std,44.888751,1.13123,1.118542,1.14503,0.420793,0.287263,0.237073,0.149997,0.13478,0.101661,0.500028
min,1.0,2.693,2.864884,2.521116,0.0,0.0,0.0,0.0,0.0,0.0,0.377914
25%,39.5,4.5055,4.608172,4.374955,0.663371,1.042635,0.369866,0.303677,0.154106,0.057271,1.591291
50%,78.0,5.279,5.370032,5.193152,1.064578,1.253918,0.606042,0.437454,0.231538,0.089848,1.83291
75%,116.5,6.1015,6.1946,6.006527,1.318027,1.414316,0.723008,0.516561,0.323762,0.153296,2.144654
max,155.0,7.537,7.62203,7.479556,1.870766,1.610574,0.949492,0.658249,0.838075,0.464308,3.117485


# 03c. Data Wrangling & Subsetting - 2017 Data

In [198]:
# There is one dataframe per year, so suffix every measure column with its year.
# A single mapping replaces the eleven separate one-column rename cells.
# NOTE(review): 'Country' keeps its capitalised name here, while the 2018/2019
# frames use lowercase 'country' - confirm before joining the years.
df_2017 = df_2017.rename(
    columns = {
        'Happiness.Rank': 'happiness_rank_2017',
        'Happiness.Score': 'happiness_score_2017',
        'Whisker.high': 'whisker_high_2017',
        'Whisker.low': 'whisker_low_2017',
        'Economy..GDP.per.Capita.': 'economy_gdp_2017',
        'Family': 'family_2017',
        'Health..Life.Expectancy.': 'health_life_expectancy_2017',
        'Freedom': 'freedom_2017',
        'Generosity': 'generosity_2017',
        'Trust..Government.Corruption.': 'trust_2017',
        'Dystopia.Residual': 'dystopia_residual_2017',
    }
)

In [209]:
# Confirm the renamed columns
df_2017.head()

Unnamed: 0,Country,happiness_rank_2017,happiness_score_2017,whisker_high_2017,whisker_low_2017,economy_gdp_2017,family_2017,health_life_expectancy_2017,freedom_2017,generosity_2017,trust_2017,dystopia_residual_2017
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715
3,Switzerland,4,7.494,7.561772,7.426227,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182


# 04c. Data Consistency Check - 2017 Data

In [253]:
# Check for mixed types: print every column whose cells hold more than one Python type.
# Series.map is used instead of DataFrame.applymap (deprecated since pandas 2.1), and
# counting distinct types avoids the row-0 lookup that raises on an empty dataframe.
for col in df_2017.columns.tolist():
    if df_2017[col].map(type).nunique() > 1:
        print (col)

There is no mixed type of data.

In [254]:
# Finding Missing Values in df_2017
df_2017.isnull().sum()

Country                        0
happiness_rank_2017            0
happiness_score_2017           0
whisker_high_2017              0
whisker_low_2017               0
economy_gdp_2017               0
family_2017                    0
health_life_expectancy_2017    0
freedom_2017                   0
generosity_2017                0
trust_2017                     0
dystopia_residual_2017         0
dtype: int64

There are no missing values.

In [255]:
# Collect the rows that exactly repeat an earlier row
dup_mask_2017 = df_2017.duplicated()
df_dups_2017 = df_2017[dup_mask_2017]

In [256]:
# Show the duplicated rows, if any
df_dups_2017

Unnamed: 0,Country,happiness_rank_2017,happiness_score_2017,whisker_high_2017,whisker_low_2017,economy_gdp_2017,family_2017,health_life_expectancy_2017,freedom_2017,generosity_2017,trust_2017,dystopia_residual_2017


There are no duplicated values. 

# 02d. Importing Data - 2018 Data

In [210]:
# Load the 2018 survey file from the project's 'Original Data' folder
csv_2018 = os.path.join(path, '02 Data', 'Original Data', '2018.csv')
df_2018 = pd.read_csv(csv_2018, index_col = False)

In [211]:
# Investigate the dataframe
df_2018.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408
3,4,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138
4,5,Switzerland,7.487,1.42,1.549,0.927,0.66,0.256,0.357


In [212]:
# check the shape of the dataframe
df_2018.shape

(156, 9)

In [213]:
# check the information of each column
df_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     155 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB


In [214]:
# Summary statistics of the numeric columns
df_2018.describe()

Unnamed: 0,Overall rank,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
count,156.0,156.0,156.0,156.0,156.0,156.0,156.0,155.0
mean,78.5,5.375917,0.891449,1.213237,0.597346,0.454506,0.181006,0.112
std,45.177428,1.119506,0.391921,0.302372,0.247579,0.162424,0.098471,0.096492
min,1.0,2.905,0.0,0.0,0.0,0.0,0.0,0.0
25%,39.75,4.45375,0.61625,1.06675,0.42225,0.356,0.1095,0.051
50%,78.5,5.378,0.9495,1.255,0.644,0.487,0.174,0.082
75%,117.25,6.1685,1.19775,1.463,0.77725,0.5785,0.239,0.137
max,156.0,7.632,2.096,1.644,1.03,0.724,0.598,0.457


# 03d. Data Wrangling & Subsetting - 2018 Data

In [215]:
# There is one dataframe per year, so suffix every measure column with its year.
# A single mapping replaces the nine separate one-column rename cells.
# NOTE(review): this frame uses 'healthy_life_expectancy_2018' and lowercase 'country',
# while 2015-2017 use 'health_life_expectancy_<year>' and 'Country' - confirm before joining.
df_2018 = df_2018.rename(
    columns = {
        'Overall rank': 'happiness_rank_2018',
        'Country or region': 'country',
        'Score': 'happiness_score_2018',
        'GDP per capita': 'economy_gdp_2018',
        'Social support': 'social_support_2018',
        'Healthy life expectancy': 'healthy_life_expectancy_2018',
        'Freedom to make life choices': 'freedom_2018',
        'Generosity': 'generosity_2018',
        'Perceptions of corruption': 'trust_2018',
    }
)

In [224]:
# Confirm the renamed columns
df_2018.head()

Unnamed: 0,happiness_rank_2018,country,happiness_score_2018,economy_gdp_2018,social_support_2018,healthy_life_expectancy_2018,freedom_2018,generosity_2018,trust_2018
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408
3,4,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138
4,5,Switzerland,7.487,1.42,1.549,0.927,0.66,0.256,0.357


# 04d. Data Consistency Check - 2018 Data

In [257]:
# Check for mixed types: print every column whose cells hold more than one Python type.
# Series.map is used instead of DataFrame.applymap (deprecated since pandas 2.1), and
# counting distinct types avoids the row-0 lookup that raises on an empty dataframe.
for col in df_2018.columns.tolist():
    if df_2018[col].map(type).nunique() > 1:
        print (col)

There is no mixed type of data. 

In [258]:
# Finding Missing Values in df_2018
df_2018.isnull().sum()

happiness_rank_2018             0
country                         0
happiness_score_2018            0
economy_gdp_2018                0
social_support_2018             0
healthy_life_expectancy_2018    0
freedom_2018                    0
generosity_2018                 0
trust_2018                      1
dtype: int64

In [259]:
# Select the rows whose trust_2018 value is missing
trust_missing_2018 = df_2018['trust_2018'].isnull()
df_nan_2018 = df_2018[trust_missing_2018]

In [260]:
# Show the rows where trust_2018 is missing
df_nan_2018

Unnamed: 0,happiness_rank_2018,country,happiness_score_2018,economy_gdp_2018,social_support_2018,healthy_life_expectancy_2018,freedom_2018,generosity_2018,trust_2018
19,20,United Arab Emirates,6.774,2.096,0.776,0.67,0.284,0.186,


Only 1 of the 156 trust_2018 values is missing (well under 5%), so the row is left as it is: the other information in this row is valuable.

In [261]:
# Collect the rows that exactly repeat an earlier row
dup_mask_2018 = df_2018.duplicated()
df_dups_2018 = df_2018[dup_mask_2018]

In [262]:
# Show the duplicated rows, if any
df_dups_2018

Unnamed: 0,happiness_rank_2018,country,happiness_score_2018,economy_gdp_2018,social_support_2018,healthy_life_expectancy_2018,freedom_2018,generosity_2018,trust_2018


There are no duplicated rows.

# 02e. Importing Data - 2019 Data

In [225]:
# Load the 2019 survey file from the project's 'Original Data' folder
csv_2019 = os.path.join(path, '02 Data', 'Original Data', '2019.csv')
df_2019 = pd.read_csv(csv_2019, index_col = False)

In [226]:
# Investigate the dataframe
df_2019.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


In [227]:
# check the shape of the dataframe
df_2019.shape

(156, 9)

In [228]:
# check the information of each column
df_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     156 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB


In [229]:
# Summary statistics of the numeric columns
df_2019.describe()

Unnamed: 0,Overall rank,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
count,156.0,156.0,156.0,156.0,156.0,156.0,156.0,156.0
mean,78.5,5.407096,0.905147,1.208814,0.725244,0.392571,0.184846,0.110603
std,45.177428,1.11312,0.398389,0.299191,0.242124,0.143289,0.095254,0.094538
min,1.0,2.853,0.0,0.0,0.0,0.0,0.0,0.0
25%,39.75,4.5445,0.60275,1.05575,0.54775,0.308,0.10875,0.047
50%,78.5,5.3795,0.96,1.2715,0.789,0.417,0.1775,0.0855
75%,117.25,6.1845,1.2325,1.4525,0.88175,0.50725,0.24825,0.14125
max,156.0,7.769,1.684,1.624,1.141,0.631,0.566,0.453


# 03e. Data Wrangling & Subsetting - 2019 Data

In [230]:
# There is one dataframe per year, so suffix every measure column with its year.
# A single mapping replaces the nine separate one-column rename cells.
# NOTE(review): this frame uses 'healthy_life_expectancy_2019' and lowercase 'country',
# while 2015-2017 use 'health_life_expectancy_<year>' and 'Country' - confirm before joining.
df_2019 = df_2019.rename(
    columns = {
        'Overall rank': 'happiness_rank_2019',
        'Country or region': 'country',
        'Score': 'happiness_score_2019',
        'GDP per capita': 'economy_gdp_2019',
        'Social support': 'social_support_2019',
        'Healthy life expectancy': 'healthy_life_expectancy_2019',
        'Freedom to make life choices': 'freedom_2019',
        'Generosity': 'generosity_2019',
        'Perceptions of corruption': 'trust_2019',
    }
)

In [239]:
# Confirm the renamed columns
df_2019.head()

Unnamed: 0,happiness_rank_2019,country,happiness_score_2019,economy_gdp_2019,social_support_2019,healthy_life_expectancy_2019,freedom_2019,generosity_2019,trust_2019
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


# 04e. Data Consistency Check - 2019 Data

In [263]:
# Check for mixed types: print every column whose cells hold more than one Python type.
# Series.map is used instead of DataFrame.applymap (deprecated since pandas 2.1), and
# counting distinct types avoids the row-0 lookup that raises on an empty dataframe.
for col in df_2019.columns.tolist():
    if df_2019[col].map(type).nunique() > 1:
        print (col)

There are no mixed data types.

In [264]:
# Finding Missing Values in df_2019
df_2019.isnull().sum()

happiness_rank_2019             0
country                         0
happiness_score_2019            0
economy_gdp_2019                0
social_support_2019             0
healthy_life_expectancy_2019    0
freedom_2019                    0
generosity_2019                 0
trust_2019                      0
dtype: int64

There are no missing values.

In [265]:
# Collect the rows that exactly repeat an earlier row
dup_mask_2019 = df_2019.duplicated()
df_dups_2019 = df_2019[dup_mask_2019]

In [266]:
# Show the duplicated rows, if any
df_dups_2019

Unnamed: 0,happiness_rank_2019,country,happiness_score_2019,economy_gdp_2019,social_support_2019,healthy_life_expectancy_2019,freedom_2019,generosity_2019,trust_2019


There are no duplicated rows.

# 05. Export Data

In [267]:
# Export every cleaned yearly dataframe as csv into the 'Prepared Data' folder.
# One loop replaces five copy-pasted cells, four of which were mislabelled as df_2015.
# NOTE(review): index=False is not passed, so the dataframe index is written as the
# first, unnamed csv column - confirm that downstream notebooks expect it.
prepared_dir = os.path.join(path, '02 Data', 'Prepared Data')
clean_frames = {
    '2015': df_2015,
    '2016': df_2016,
    '2017': df_2017,
    '2018': df_2018,
    '2019': df_2019,
}
for year, df_clean in clean_frames.items():
    df_clean.to_csv(os.path.join(prepared_dir, year + '_clean_data.csv'))