## Load libraries 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Read and first analysis of the data

In [2]:
data = pd.read_csv("World Happiness Report 2021.csv", sep = ';')

In [3]:
data.head()

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.724,7.37,0.451,50.8,0.718,0.168,0.882,0.518,0.258
1,Afghanistan,2009,4.402,7.54,0.552,51.2,0.679,0.19,0.85,0.584,0.237
2,Afghanistan,2010,4.758,7.647,0.539,51.6,0.6,0.121,0.707,0.618,0.275
3,Afghanistan,2011,3.832,7.62,0.521,51.92,0.496,0.162,0.731,0.611,0.267
4,Afghanistan,2012,3.783,7.705,0.521,52.24,0.531,0.236,0.776,0.71,0.268


In [89]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1949 entries, 0 to 1948
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      1949 non-null   object 
 1   year                              1949 non-null   int64  
 2   Life Ladder                       1949 non-null   float64
 3   Log GDP per capita                1913 non-null   float64
 4   Social support                    1936 non-null   float64
 5   Healthy life expectancy at birth  1894 non-null   float64
 6   Freedom to make life choices      1917 non-null   float64
 7   Generosity                        1860 non-null   float64
 8   Perceptions of corruption         1839 non-null   float64
 9   Positive affect                   1927 non-null   float64
 10  Negative affect                   1933 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 167.6+ KB


### Cleaning Text and Removing Special Characters

In [90]:
data.columns = data.columns.str.replace(' ', '_')
data.head()

Unnamed: 0,Country_name,year,Life_Ladder,Log_GDP_per_capita,Social_support,Healthy_life_expectancy_at_birth,Freedom_to_make_life_choices,Generosity,Perceptions_of_corruption,Positive_affect,Negative_affect
0,Afghanistan,2008,3.724,7.37,0.451,50.8,0.718,0.168,0.882,0.518,0.258
1,Afghanistan,2009,4.402,7.54,0.552,51.2,0.679,0.19,0.85,0.584,0.237
2,Afghanistan,2010,4.758,7.647,0.539,51.6,0.6,0.121,0.707,0.618,0.275
3,Afghanistan,2011,3.832,7.62,0.521,51.92,0.496,0.162,0.731,0.611,0.267
4,Afghanistan,2012,3.783,7.705,0.521,52.24,0.531,0.236,0.776,0.71,0.268


### Check out the duplicates in the ID column

In [91]:
before = len(data)
data = data.drop_duplicates()
after = len(data)
print('Number of duplicate records dropped: ', str(before - after))

Number of duplicate records dropped:  0


### Check for missing values and fill them if needed

In [94]:
data.isnull().sum()

Country_name                          0
year                                  0
Life_Ladder                           0
Log_GDP_per_capita                   36
Social_support                       13
Healthy_life_expectancy_at_birth     55
Freedom_to_make_life_choices         32
Generosity                           89
Perceptions_of_corruption           110
Positive_affect                      22
Negative_affect                      16
dtype: int64

##### For Perception_of_corruption

In [101]:
# Group the DataFrame by country and calculate the mean of each group
grouped = data.groupby("Country_name")['Perceptions_of_corruption'].mean()

# Replace null values with the mean of each country
for country in data["Country_name"].unique():
    data.loc[data["Country_name"] == country, 'Perceptions_of_corruption'] = data.loc[data["Country_name"] == country, 'Perceptions_of_corruption'].fillna(grouped[country])

# Concatenate the country groups back into a single DataFrame
data = pd.concat([data[data["Country_name"] == country] for country in data["Country_name"].unique()])

In [102]:
data = perception_corruption(data, country)
data.isnull().sum()

Country_name                         0
year                                 0
Life_Ladder                          0
Log_GDP_per_capita                  36
Social_support                      13
Healthy_life_expectancy_at_birth    55
Freedom_to_make_life_choices        32
Generosity                          89
Perceptions_of_corruption           28
Positive_affect                     22
Negative_affect                     16
dtype: int64

##### For Log_GDP_per_capita

In [103]:
grouped = data.groupby("Country_name")['Log_GDP_per_capita'].mean()
for country in data["Country_name"].unique():
    data.loc[data["Country_name"] == country, 'Log_GDP_per_capita'] = data.loc[data["Country_name"] == country, 'Log_GDP_per_capita'].fillna(grouped[country])
data = pd.concat([data[data["Country_name"] == country] for country in data["Country_name"].unique()])

In [104]:
data.isnull().sum()

Country_name                         0
year                                 0
Life_Ladder                          0
Log_GDP_per_capita                  19
Social_support                      13
Healthy_life_expectancy_at_birth    55
Freedom_to_make_life_choices        32
Generosity                          89
Perceptions_of_corruption           28
Positive_affect                     22
Negative_affect                     16
dtype: int64

##### For Social_support 

In [106]:
grouped = data.groupby("Country_name")['Social_support'].mean()
for country in data["Country_name"].unique():
    data.loc[data["Country_name"] == country, 'Social_support'] = data.loc[data["Country_name"] == country, 'Social_support'].fillna(grouped[country])
data = pd.concat([data[data["Country_name"] == country] for country in data["Country_name"].unique()])

In [107]:
data.isnull().sum()

Country_name                         0
year                                 0
Life_Ladder                          0
Log_GDP_per_capita                  19
Social_support                       1
Healthy_life_expectancy_at_birth    55
Freedom_to_make_life_choices        32
Generosity                          89
Perceptions_of_corruption           28
Positive_affect                     22
Negative_affect                     16
dtype: int64

##### For Healthy_life_expectancy_at_birth

In [108]:
grouped = data.groupby("Country_name")['Healthy_life_expectancy_at_birth'].mean()
for country in data["Country_name"].unique():
    data.loc[data["Country_name"] == country, 'Healthy_life_expectancy_at_birth'] = data.loc[data["Country_name"] == country, 'Healthy_life_expectancy_at_birth'].fillna(grouped[country])
data = pd.concat([data[data["Country_name"] == country] for country in data["Country_name"].unique()])

In [109]:
data.isnull().sum()

Country_name                         0
year                                 0
Life_Ladder                          0
Log_GDP_per_capita                  19
Social_support                       1
Healthy_life_expectancy_at_birth    36
Freedom_to_make_life_choices        32
Generosity                          89
Perceptions_of_corruption           28
Positive_affect                     22
Negative_affect                     16
dtype: int64

##### For Freedom_to_make_life_choices

In [110]:
grouped = data.groupby("Country_name")['Freedom_to_make_life_choices'].mean()
for country in data["Country_name"].unique():
    data.loc[data["Country_name"] == country, 'Freedom_to_make_life_choices'] = data.loc[data["Country_name"] == country, 'Freedom_to_make_life_choices'].fillna(grouped[country])
data = pd.concat([data[data["Country_name"] == country] for country in data["Country_name"].unique()])

In [112]:
data.isnull().sum()

Country_name                         0
year                                 0
Life_Ladder                          0
Log_GDP_per_capita                  19
Social_support                       1
Healthy_life_expectancy_at_birth    36
Freedom_to_make_life_choices         0
Generosity                          89
Perceptions_of_corruption           28
Positive_affect                     22
Negative_affect                     16
dtype: int64

##### For Generosity

In [114]:
grouped = data.groupby("Country_name")['Generosity'].mean()
for country in data["Country_name"].unique():
    data.loc[data["Country_name"] == country, 'Generosity'] = data.loc[data["Country_name"] == country, 'Generosity'].fillna(grouped[country])
data = pd.concat([data[data["Country_name"] == country] for country in data["Country_name"].unique()])

In [116]:
data.isnull().sum()

Country_name                         0
year                                 0
Life_Ladder                          0
Log_GDP_per_capita                  19
Social_support                       1
Healthy_life_expectancy_at_birth    36
Freedom_to_make_life_choices         0
Generosity                          19
Perceptions_of_corruption           28
Positive_affect                     22
Negative_affect                     16
dtype: int64

##### For Positive_affect 

In [118]:
grouped = data.groupby("Country_name")['Positive_affect'].mean()
for country in data["Country_name"].unique():
    data.loc[data["Country_name"] == country, 'Positive_affect'] = data.loc[data["Country_name"] == country, 'Positive_affect'].fillna(grouped[country])
data = pd.concat([data[data["Country_name"] == country] for country in data["Country_name"].unique()])

In [119]:
data.isnull().sum()

Country_name                         0
year                                 0
Life_Ladder                          0
Log_GDP_per_capita                  19
Social_support                       1
Healthy_life_expectancy_at_birth    36
Freedom_to_make_life_choices         0
Generosity                          19
Perceptions_of_corruption           28
Positive_affect                      2
Negative_affect                     16
dtype: int64

##### For Negative_affect 

In [121]:
grouped = data.groupby("Country_name")['Negative_affect'].mean()
for country in data["Country_name"].unique():
    data.loc[data["Country_name"] == country, 'Negative_affect'] = data.loc[data["Country_name"] == country, 'Negative_affect'].fillna(grouped[country])
data = pd.concat([data[data["Country_name"] == country] for country in data["Country_name"].unique()])

In [122]:
data.isnull().sum()

Country_name                         0
year                                 0
Life_Ladder                          0
Log_GDP_per_capita                  19
Social_support                       1
Healthy_life_expectancy_at_birth    36
Freedom_to_make_life_choices         0
Generosity                          19
Perceptions_of_corruption           28
Positive_affect                      2
Negative_affect                      1
dtype: int64

##### We decided to drop the rows where the missing values remain showing the data of an entire country is missing for the columns in question

In [132]:
data = data.dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1878 entries, 0 to 1948
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country_name                      1878 non-null   object 
 1   year                              1878 non-null   int64  
 2   Life_Ladder                       1878 non-null   float64
 3   Log_GDP_per_capita                1878 non-null   float64
 4   Social_support                    1878 non-null   float64
 5   Healthy_life_expectancy_at_birth  1878 non-null   float64
 6   Freedom_to_make_life_choices      1878 non-null   float64
 7   Generosity                        1878 non-null   float64
 8   Perceptions_of_corruption         1878 non-null   float64
 9   Positive_affect                   1878 non-null   float64
 10  Negative_affect                   1878 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 176.1+ KB


We end up with 1878 rows instead of 1949 rows

In [133]:
data

Unnamed: 0,Country_name,year,Life_Ladder,Log_GDP_per_capita,Social_support,Healthy_life_expectancy_at_birth,Freedom_to_make_life_choices,Generosity,Perceptions_of_corruption,Positive_affect,Negative_affect
0,Afghanistan,2008,3.724,7.370,0.451,50.80,0.718,0.168,0.882,0.518,0.258
1,Afghanistan,2009,4.402,7.540,0.552,51.20,0.679,0.190,0.850,0.584,0.237
2,Afghanistan,2010,4.758,7.647,0.539,51.60,0.600,0.121,0.707,0.618,0.275
3,Afghanistan,2011,3.832,7.620,0.521,51.92,0.496,0.162,0.731,0.611,0.267
4,Afghanistan,2012,3.783,7.705,0.521,52.24,0.531,0.236,0.776,0.710,0.268
...,...,...,...,...,...,...,...,...,...,...,...
1944,Zimbabwe,2016,3.735,7.984,0.768,54.40,0.733,-0.095,0.724,0.738,0.209
1945,Zimbabwe,2017,3.638,8.016,0.754,55.00,0.753,-0.098,0.751,0.806,0.224
1946,Zimbabwe,2018,3.616,8.049,0.775,55.60,0.763,-0.068,0.844,0.710,0.212
1947,Zimbabwe,2019,2.694,7.950,0.759,56.20,0.632,-0.064,0.831,0.716,0.235


In [125]:
#country = list(null_Perceptions_of_corruption["Country_name"].unique())
#def perception_corruption(data, country):
 #   for i in country: 
 #       mean_perception_corruption=data.loc[data["Country_name"] == i, 'Perceptions_of_corruption'].mean()
  #      data.loc[(data['Perceptions_of_corruption'].isnull()==True)&(data["Country_name"]==i),'Perceptions_of_corruption']= mean_perception_corruption
   #     return data

### Check for low variance columns and possible outliers (check ou the 0)

__check for low variance columns:__

In [134]:
low_variance = []
for col in data._get_numeric_data():
    minimum = min(data[col])
    ninety_perc = np.percentile(data[col], 90)
    if ninety_perc == minimum:
        low_variance.append(col)
print(low_variance)

[]


__check for outliers:__

In [135]:
outliers = pd.DataFrame(columns=data.columns)
stats = data.describe().transpose()
stats['IQR'] = stats['75%'] - stats['25%']
for col in stats.index:
    iqr = stats.at[col,'IQR']
    cutoff = iqr * 1.5
    lower = stats.at[col,'25%'] - cutoff
    upper = stats.at[col,'75%'] + cutoff
    results = data[(data[col] < lower) |
                   (data[col] > upper)].copy()
    results['Outlier'] = col
    outliers = outliers.append(results)
outliers.value_counts('Outlier')

  outliers = outliers.append(results)
  outliers = outliers.append(results)
  outliers = outliers.append(results)
  outliers = outliers.append(results)
  outliers = outliers.append(results)
  outliers = outliers.append(results)
  outliers = outliers.append(results)
  outliers = outliers.append(results)
  outliers = outliers.append(results)
  outliers = outliers.append(results)


Outlier
Perceptions_of_corruption           157
Social_support                       40
Generosity                           36
Negative_affect                      27
Healthy_life_expectancy_at_birth     15
Freedom_to_make_life_choices          7
Positive_affect                       3
dtype: int64

In [140]:
test = outliers.loc[outliers['Outlier'] == "Perceptions_of_corruption"]
test["Country_name"].unique()

array(['Australia', 'Canada', 'Denmark', 'Estonia', 'Finland', 'Georgia',
       'Ireland', 'Kuwait', 'Luxembourg', 'Netherlands', 'New Zealand',
       'Norway', 'Qatar', 'Rwanda', 'Singapore', 'Sweden', 'Switzerland',
       'United Arab Emirates', 'United Kingdom'], dtype=object)

##### We decided to keep the outliers that could be important factors for the anlysis 

## Export the cleaned dataset as .csv

In [136]:
data.to_csv("World_Happiness_Report_Cleaned.csv", sep = ';')

In [137]:
pd.read_csv("World_Happiness_Report_Cleaned.csv", sep = ";")

Unnamed: 0.1,Unnamed: 0,Country_name,year,Life_Ladder,Log_GDP_per_capita,Social_support,Healthy_life_expectancy_at_birth,Freedom_to_make_life_choices,Generosity,Perceptions_of_corruption,Positive_affect,Negative_affect
0,0,Afghanistan,2008,3.724,7.370,0.451,50.80,0.718,0.168,0.882,0.518,0.258
1,1,Afghanistan,2009,4.402,7.540,0.552,51.20,0.679,0.190,0.850,0.584,0.237
2,2,Afghanistan,2010,4.758,7.647,0.539,51.60,0.600,0.121,0.707,0.618,0.275
3,3,Afghanistan,2011,3.832,7.620,0.521,51.92,0.496,0.162,0.731,0.611,0.267
4,4,Afghanistan,2012,3.783,7.705,0.521,52.24,0.531,0.236,0.776,0.710,0.268
...,...,...,...,...,...,...,...,...,...,...,...,...
1873,1944,Zimbabwe,2016,3.735,7.984,0.768,54.40,0.733,-0.095,0.724,0.738,0.209
1874,1945,Zimbabwe,2017,3.638,8.016,0.754,55.00,0.753,-0.098,0.751,0.806,0.224
1875,1946,Zimbabwe,2018,3.616,8.049,0.775,55.60,0.763,-0.068,0.844,0.710,0.212
1876,1947,Zimbabwe,2019,2.694,7.950,0.759,56.20,0.632,-0.064,0.831,0.716,0.235
