# World Happiness Report Data Set Cleaning and Consistency Checks

## 1. Import libraries and data

In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [4]:
# Set a path
path = r'/Users/test/Desktop/Data Analysis/World Happiness Report'

In [9]:
df_2015 = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', '2015.csv'), index_col = False)
df_2016 = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', '2016.csv'), index_col = False)
df_2017 = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', '2017.csv'), index_col = False)
df_2018 = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', '2018.csv'), index_col = False)
df_2019 = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', '2019.csv'), index_col = False)

## 2. Check dataframe columns

In [12]:
# Overview all the dataframes
print(df_2015.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        158 non-null    object 
 1   Region                         158 non-null    object 
 2   Happiness Rank                 158 non-null    int64  
 3   Happiness Score                158 non-null    float64
 4   Standard Error                 158 non-null    float64
 5   Economy (GDP per Capita)       158 non-null    float64
 6   Family                         158 non-null    float64
 7   Health (Life Expectancy)       158 non-null    float64
 8   Freedom                        158 non-null    float64
 9   Trust (Government Corruption)  158 non-null    float64
 10  Generosity                     158 non-null    float64
 11  Dystopia Residual              158 non-null    float64
dtypes: float64(9), int64(1), object(2)
memory usage: 1

In [14]:
print(df_2016.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        157 non-null    object 
 1   Region                         157 non-null    object 
 2   Happiness Rank                 157 non-null    int64  
 3   Happiness Score                157 non-null    float64
 4   Lower Confidence Interval      157 non-null    float64
 5   Upper Confidence Interval      157 non-null    float64
 6   Economy (GDP per Capita)       157 non-null    float64
 7   Family                         157 non-null    float64
 8   Health (Life Expectancy)       157 non-null    float64
 9   Freedom                        157 non-null    float64
 10  Trust (Government Corruption)  157 non-null    float64
 11  Generosity                     157 non-null    float64
 12  Dystopia Residual              157 non-null    flo

In [16]:
print(df_2017.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        155 non-null    object 
 1   Happiness.Rank                 155 non-null    int64  
 2   Happiness.Score                155 non-null    float64
 3   Whisker.high                   155 non-null    float64
 4   Whisker.low                    155 non-null    float64
 5   Economy..GDP.per.Capita.       155 non-null    float64
 6   Family                         155 non-null    float64
 7   Health..Life.Expectancy.       155 non-null    float64
 8   Freedom                        155 non-null    float64
 9   Generosity                     155 non-null    float64
 10  Trust..Government.Corruption.  155 non-null    float64
 11  Dystopia.Residual              155 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 

In [18]:
print(df_2018.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     155 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB
None


In [20]:
print(df_2019.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     156 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB
None


### 2.2 Drop unnessary columns

In [25]:
df_2015 = df_2015.drop(columns=['Standard Error'])
df_2016 = df_2016.drop(columns=['Lower Confidence Interval', 'Upper Confidence Interval'])
df_2017 = df_2017.drop(columns=['Whisker.high', 'Whisker.low'])

### 2.3 Change inconsistent column names

In [28]:
columns_mapping = {
    'Country': 'Country',
    'Happiness Rank': 'Happiness Rank',
    'Happiness Score': 'Happiness Score',
    'Economy (GDP per Capita)': 'GDP per capita',
    'Family': 'Social Support',
    'Health (Life Expectancy)': 'Life Expectancy',
    'Freedom': 'Freedom',
    'Trust (Government Corruption)': 'Trust (Government Corruption)',
    'Generosity': 'Generosity',
    'Dystopia Residual': 'Dystopia Residual'}

In [30]:
df_2015.rename(columns=columns_mapping, inplace=True)
df_2016.rename(columns=columns_mapping, inplace=True)

In [43]:
df_2017.rename(columns={
    'Happiness.Rank': 'Happiness Rank', 
    'Happiness.Score': 'Happiness Score',
    'Economy..GDP.per.Capita.': 'GDP per capita',
    'Health..Life.Expectancy.': 'Life Expectancy',
    'Trust..Government.Corruption.': 'Trust (Government Corruption)',
    'Dystopia.Residual':'Dystopia Residual'
}, inplace=True)

In [47]:
df_2018.rename(columns={
    'Overall rank': 'Happiness Rank', 
    'Score': 'Happiness Score',
    'Country or region': 'Country',
    'Health..Life.Expectancy.': 'Life Expectancy',
    'Perceptions of corruption': 'Trust (Government Corruption)',
    'Social support': 'Social Support',
    'Healthy life expectancy': 'Life Expectancy',
    'Freedom to make life choices': 'Freedom'
}, inplace=True)

In [51]:
df_2019.rename(columns={
    'Overall rank': 'Happiness Rank', 
    'Score': 'Happiness Score',
    'Country or region': 'Country',
    'Health..Life.Expectancy.': 'Life Expectancy',
    'Perceptions of corruption': 'Trust (Government Corruption)',
    'Social support': 'Social Support',
    'Healthy life expectancy': 'Life Expectancy',
    'Freedom to make life choices': 'Freedom'
}, inplace=True)

### 2.4 Fill in missing columns

In [63]:
# Add region value for 2017, 2018 and 2019 from 2015 dataset
region_mapping = df_2015[['Country', 'Region']].set_index('Country').to_dict()['Region']

In [65]:
df_2017['Region'] = df_2017['Country'].map(region_mapping)
df_2018['Region'] = df_2018['Country'].map(region_mapping)
df_2019['Region'] = df_2019['Country'].map(region_mapping)

In [83]:
# Add and calculate Dystopia Residual column for df_2018 and df_2019
df_2019['Dystopia Residual'] = (
    df_2019['Happiness Score'] 
    - df_2019['GDP per capita'] 
    - df_2019['Social Support'] 
    - df_2019['Life Expectancy'] 
    - df_2019['Freedom'] 
    - df_2019['Trust (Government Corruption)'] 
    - df_2019['Generosity']
)
df_2018['Dystopia Residual'] = (
    df_2018['Happiness Score'] 
    - df_2018['GDP per capita'] 
    - df_2018['Social Support'] 
    - df_2018['Life Expectancy'] 
    - df_2018['Freedom'] 
    - df_2018['Trust (Government Corruption)'] 
    - df_2018['Generosity']
)

In [89]:
df_2019.head()

Unnamed: 0,Happiness Rank,Country,Happiness Score,GDP per capita,Social Support,Life Expectancy,Freedom,Generosity,Trust (Government Corruption),Region,Dystopia Residual
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393,Western Europe,2.714
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41,Western Europe,2.394
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341,Western Europe,2.241
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118,Western Europe,2.401
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298,Western Europe,2.394


In [91]:
df_2018.head()

Unnamed: 0,Happiness Rank,Country,Happiness Score,GDP per capita,Social Support,Life Expectancy,Freedom,Generosity,Trust (Government Corruption),Region,Dystopia Residual
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393,Western Europe,2.585
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34,Western Europe,2.383
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408,Western Europe,2.371
3,4,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138,Western Europe,2.426
4,5,Switzerland,7.487,1.42,1.549,0.927,0.66,0.256,0.357,Western Europe,2.318


In [93]:
df_2017.head()

Unnamed: 0,Country,Happiness Rank,Happiness Score,GDP per capita,Social Support,Life Expectancy,Freedom,Generosity,Trust (Government Corruption),Dystopia Residual,Region
0,Norway,1,7.537,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027,Western Europe
1,Denmark,2,7.522,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707,Western Europe
2,Iceland,3,7.504,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715,Western Europe
3,Switzerland,4,7.494,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716,Western Europe
4,Finland,5,7.469,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182,Western Europe


In [95]:
df_2016.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,GDP per capita,Social Support,Life Expectancy,Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Denmark,Western Europe,1,7.526,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2.73939
1,Switzerland,Western Europe,2,7.509,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2.69463
2,Iceland,Western Europe,3,7.501,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678,2.83137
3,Norway,Western Europe,4,7.498,1.57744,1.1269,0.79579,0.59609,0.35776,0.37895,2.66465
4,Finland,Western Europe,5,7.413,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492,2.82596


In [97]:
df_2015.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,GDP per capita,Social Support,Life Expectancy,Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [102]:
# Add a 'Year' column for every dataframe
df_2015['Year'] = 2015
df_2016['Year'] = 2016
df_2017['Year'] = 2017
df_2018['Year'] = 2018
df_2019['Year'] = 2019

## 3. Export dataframes separately

In [104]:
df_2015.to_csv(os.path.join(path, '02 Data','Prepared Data', '2015_cleaned.csv'))
df_2016.to_csv(os.path.join(path, '02 Data','Prepared Data', '2016_cleaned.csv'))
df_2017.to_csv(os.path.join(path, '02 Data','Prepared Data', '2017_cleaned.csv'))
df_2018.to_csv(os.path.join(path, '02 Data','Prepared Data', '2018_cleaned.csv'))
df_2019.to_csv(os.path.join(path, '02 Data','Prepared Data', '2019_cleaned.csv'))

## 4. Combine all the dataframes

In [116]:
df_combined = pd.concat([df_2015, df_2016, df_2017, df_2018, df_2019], ignore_index=True)

## 5. Export combined dataframe

In [119]:
df_combined.to_csv(os.path.join(path, '02 Data','Prepared Data', 'world_happiness_combined.csv'))

## 6. Missing values and consistency check for combined dataframe 

In [127]:
# Check datatypes
print(df_combined.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        782 non-null    object 
 1   Region                         763 non-null    object 
 2   Happiness Rank                 782 non-null    int64  
 3   Happiness Score                782 non-null    float64
 4   GDP per capita                 782 non-null    float64
 5   Social Support                 782 non-null    float64
 6   Life Expectancy                782 non-null    float64
 7   Freedom                        782 non-null    float64
 8   Trust (Government Corruption)  781 non-null    float64
 9   Generosity                     782 non-null    float64
 10  Dystopia Residual              781 non-null    float64
 11  Year                           782 non-null    int64  
dtypes: float64(8), int64(2), object(2)
memory usage: 7

In [129]:
df = df_combined

In [131]:
# Optimize datatypes
df['Region'] = df['Region'].astype('category')
df['Country'] = df['Country'].astype('category')
df['Year'] = pd.to_datetime(df['Year'], format='%Y')

In [133]:
# Check mixed datatype
for col in df.columns.tolist():
  weird = (df[[col]].map(type) != df[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df[weird]) > 0:
    print (col)

Region


In [135]:
# Check 'Region' value counts
region_counts_with_nan = df['Region'].value_counts(dropna=False)
print(region_counts_with_nan)

Region
Sub-Saharan Africa                 186
Central and Eastern Europe         144
Latin America and Caribbean        107
Western Europe                     103
Middle East and Northern Africa     96
Southeastern Asia                   44
Southern Asia                       35
Eastern Asia                        28
NaN                                 19
Australia and New Zealand           10
North America                       10
Name: count, dtype: int64


In [137]:
# Find NaN value of 'Region' column
missing_region = df[df['Region'].isna()]
result = missing_region[['Country', 'Region']]
print(result)

                      Country Region
347  Taiwan Province of China    NaN
364                    Belize    NaN
385   Hong Kong S.A.R., China    NaN
407                   Somalia    NaN
425                   Namibia    NaN
461               South Sudan    NaN
507         Trinidad & Tobago    NaN
518                    Belize    NaN
527           Northern Cyprus    NaN
567                   Somalia    NaN
588                   Namibia    NaN
623               South Sudan    NaN
664         Trinidad & Tobago    NaN
689           Northern Cyprus    NaN
709           North Macedonia    NaN
737                   Somalia    NaN
738                   Namibia    NaN
745                    Gambia    NaN
781               South Sudan    NaN


In [139]:
# Fill in the right value of region
region_mapping = {
    "Taiwan Province of China": "Eastern Asia",
    "Hong Kong S.A.R., China": "Eastern Asia",
    "Belize": "Latin America and Caribbean",
    "Trinidad & Tobago": "Latin America and Caribbean",
    "Somalia": "Sub-Saharan Africa",
    "South Sudan": "Sub-Saharan Africa",
    "Namibia": "Sub-Saharan Africa",
    "Gambia": "Sub-Saharan Africa",
    "Northern Cyprus": "Central and Eastern Europe",
    "North Macedonia": "Central and Eastern Europe"
}

df['Region'] = df['Country'].map(region_mapping).fillna(df['Region'])

In [141]:
# Check the output
print(df[df['Region'].isna()])

Empty DataFrame
Columns: [Country, Region, Happiness Rank, Happiness Score, GDP per capita, Social Support, Life Expectancy, Freedom, Trust (Government Corruption), Generosity, Dystopia Residual, Year]
Index: []


In [143]:
# Find missing value
df.isnull().sum()

Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
GDP per capita                   0
Social Support                   0
Life Expectancy                  0
Freedom                          0
Trust (Government Corruption)    1
Generosity                       0
Dystopia Residual                1
Year                             0
dtype: int64

In [145]:
residual_with_nan = df['Dystopia Residual'].value_counts(dropna=False)
print(residual_with_nan)

Dystopia Residual
1.85100    2
1.44300    2
2.44800    2
2.04400    2
2.13000    2
          ..
2.88586    1
2.29551    1
1.99817    1
2.58991    1
1.37400    1
Name: count, Length: 772, dtype: int64


In [147]:
missing_trust = df[df['Trust (Government Corruption)'].isna()]
print("Missing Trust (Government Corruption):")
print(missing_trust[['Country', 'Region', 'Year', 'Trust (Government Corruption)']])

Missing Trust (Government Corruption):
                  Country                           Region       Year  \
489  United Arab Emirates  Middle East and Northern Africa 2018-01-01   

     Trust (Government Corruption)  
489                            NaN  


In [151]:
# Fill in with mean of UAE
mean_trust = df[df['Country'] == 'United Arab Emirates']['Trust (Government Corruption)'].mean()
df['Trust (Government Corruption)'].fillna(mean_trust, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Trust (Government Corruption)'].fillna(mean_trust, inplace=True)


In [153]:
print(df['Trust (Government Corruption)'].isna().sum())

0


In [155]:
missing_dystopia = df[df['Dystopia Residual'].isna()]
print("Missing Dystopia Residual:")
print(missing_dystopia[['Country', 'Region', 'Year', 'Dystopia Residual']])

Missing Dystopia Residual:
                  Country                           Region       Year  \
489  United Arab Emirates  Middle East and Northern Africa 2018-01-01   

     Dystopia Residual  
489                NaN  


In [157]:
# Locate the row of Dystopia Residual missing value
missing_dystopia_index = df[df['Dystopia Residual'].isna()].index

# Calculate the value
df.loc[missing_dystopia_index, 'Dystopia Residual'] = (
    df.loc[missing_dystopia_index, 'Happiness Score']
    - df.loc[missing_dystopia_index, 'GDP per capita']
    - df.loc[missing_dystopia_index, 'Social Support']
    - df.loc[missing_dystopia_index, 'Life Expectancy']
    - df.loc[missing_dystopia_index, 'Freedom']
    - df.loc[missing_dystopia_index, 'Trust (Government Corruption)']
    - df.loc[missing_dystopia_index, 'Generosity']
)

In [159]:
# Check the result
print(df['Dystopia Residual'].isna().sum())

0


In [161]:
df.isnull().sum()

Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
GDP per capita                   0
Social Support                   0
Life Expectancy                  0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
Year                             0
dtype: int64

## 7. Check duplicate values

In [164]:
df_dup = df[df.duplicated()]
df_dup

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,GDP per capita,Social Support,Life Expectancy,Freedom,Trust (Government Corruption),Generosity,Dystopia Residual,Year


No duplicate values.

In [167]:
# Check dataframe again
df.head(20)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,GDP per capita,Social Support,Life Expectancy,Freedom,Trust (Government Corruption),Generosity,Dystopia Residual,Year
0,Switzerland,Western Europe,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738,2015-01-01
1,Iceland,Western Europe,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201,2015-01-01
2,Denmark,Western Europe,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204,2015-01-01
3,Norway,Western Europe,4,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531,2015-01-01
4,Canada,North America,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176,2015-01-01
5,Finland,Western Europe,6,7.406,1.29025,1.31826,0.88911,0.64169,0.41372,0.23351,2.61955,2015-01-01
6,Netherlands,Western Europe,7,7.378,1.32944,1.28017,0.89284,0.61576,0.31814,0.4761,2.4657,2015-01-01
7,Sweden,Western Europe,8,7.364,1.33171,1.28907,0.91087,0.6598,0.43844,0.36262,2.37119,2015-01-01
8,New Zealand,Australia and New Zealand,9,7.286,1.25018,1.31967,0.90837,0.63938,0.42922,0.47501,2.26425,2015-01-01
9,Australia,Australia and New Zealand,10,7.284,1.33358,1.30923,0.93156,0.65124,0.35637,0.43562,2.26646,2015-01-01


 Year value seems weird. Let's change them.


In [170]:
df['Year'] = df['Year'].dt.year

In [172]:
df.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,GDP per capita,Social Support,Life Expectancy,Freedom,Trust (Government Corruption),Generosity,Dystopia Residual,Year
0,Switzerland,Western Europe,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738,2015
1,Iceland,Western Europe,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201,2015
2,Denmark,Western Europe,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204,2015
3,Norway,Western Europe,4,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531,2015
4,Canada,North America,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176,2015


## 8. Export the cleaned dataframe.

In [177]:
df.to_csv(os.path.join(path, '02 Data','Prepared Data', 'world_happiness_checked.csv'))

## 9. Supplementary check

In [7]:
df = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'world_happiness_checked.csv'), index_col = False)

In [11]:
# Sort the country counts descending
country_counts_sorted = df['Country'].value_counts().sort_values(ascending=True).head(20)

print(country_counts_sorted)

Country
Gambia                      1
Hong Kong S.A.R., China     1
North Macedonia             1
Somaliland region           1
Oman                        1
Taiwan Province of China    1
Somaliland Region           1
Djibouti                    1
Puerto Rico                 1
Swaziland                   2
Northern Cyprus             2
Suriname                    2
Trinidad & Tobago           2
Trinidad and Tobago         3
North Cyprus                3
Belize                      3
Comoros                     3
Central African Republic    4
Hong Kong                   4
Somalia                     4
Name: count, dtype: int64


We found that there are some values have different names, but actually the same country or region.

In [14]:
df['Country'] = df['Country'].replace({
    'Hong Kong': 'Hong Kong S.A.R., China',
    'Somaliland region': 'Somaliland Region',
    'Trinidad and Tobago': 'Trinidad & Tobago',
    'Taiwan': 'Taiwan Province of China'
})

In [18]:
# Export the revised data
df.to_csv(os.path.join(path, '02 Data','Prepared Data', 'world_happiness_checked.csv'))