# Contents
## 1. Importing Libraries
## 2. Data Consistency
## 3. Data Wrangling
## 4. Basic Stats of the Data
## 5. Exporting Data

# 1. Importing Libraries

In [2]:
#importing libraries
import pandas as pd
import numpy as np
import os

In [3]:
path= r'C:\Users\spada\OneDrive\Data Analytics\World Happiness Report'

In [4]:
df18 = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', '2018.csv'), index_col = False)

In [5]:
df18

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.340
2,3,Denmark,7.555,1.351,1.590,0.868,0.683,0.284,0.408
3,4,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138
4,5,Switzerland,7.487,1.420,1.549,0.927,0.660,0.256,0.357
...,...,...,...,...,...,...,...,...,...
151,152,Yemen,3.355,0.442,1.073,0.343,0.244,0.083,0.064
152,153,Tanzania,3.303,0.455,0.991,0.381,0.481,0.270,0.097
153,154,South Sudan,3.254,0.337,0.608,0.177,0.112,0.224,0.106
154,155,Central African Republic,3.083,0.024,0.000,0.010,0.305,0.218,0.038


In [6]:
#understanding more about the data
df18.info

<bound method DataFrame.info of      Overall rank         Country or region  Score  GDP per capita  \
0               1                   Finland  7.632           1.305   
1               2                    Norway  7.594           1.456   
2               3                   Denmark  7.555           1.351   
3               4                   Iceland  7.495           1.343   
4               5               Switzerland  7.487           1.420   
..            ...                       ...    ...             ...   
151           152                     Yemen  3.355           0.442   
152           153                  Tanzania  3.303           0.455   
153           154               South Sudan  3.254           0.337   
154           155  Central African Republic  3.083           0.024   
155           156                   Burundi  2.905           0.091   

     Social support  Healthy life expectancy  Freedom to make life choices  \
0             1.592                    0.874     

# 2. Data Consistency

In [7]:
#checking for missing values
df18.isnull().sum()

Overall rank                    0
Country or region               0
Score                           0
GDP per capita                  0
Social support                  0
Healthy life expectancy         0
Freedom to make life choices    0
Generosity                      0
Perceptions of corruption       1
dtype: int64

#### 1 missing value within "perceptions of corruption"

In [8]:
#Checking for duplicates
df18_dups=df18[df18.duplicated()]

In [9]:
df18_dups

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption


#### No duplicates

In [10]:
#finding the median of columns
df18.median()

  df18.median()


Overall rank                    78.5000
Score                            5.3780
GDP per capita                   0.9495
Social support                   1.2550
Healthy life expectancy          0.6440
Freedom to make life choices     0.4870
Generosity                       0.1740
Perceptions of corruption        0.0820
dtype: float64

In [11]:
#addressing missing value by replacing with median
df18['Perceptions of corruption'].fillna(0.0820, inplace=True)

In [12]:
#checking to make sure missing value was filled with median
df18.isnull().sum()

Overall rank                    0
Country or region               0
Score                           0
GDP per capita                  0
Social support                  0
Healthy life expectancy         0
Freedom to make life choices    0
Generosity                      0
Perceptions of corruption       0
dtype: int64

#### No further missing values

In [13]:
#Finding column names
df18.columns

Index(['Overall rank', 'Country or region', 'Score', 'GDP per capita',
       'Social support', 'Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption'],
      dtype='object')

# 3. Data Wrangling

In [14]:
#renaming 'Overall Rank' column
df18.rename(columns={'Overall rank':'Happiness Rank'}, inplace=True)

In [15]:
#renaming 'Score' column
df18.rename(columns={'Score':'Happiness Score'}, inplace=True)

In [16]:
#renaming 'Healthy life expectancy' column
df18.rename(columns={'Healthy life expectancy':'Health (Life Expectancy)'}, inplace=True)

In [25]:
#renaming 'Social support' column
df18.rename(columns={'Social support':'Social Support'}, inplace=True)

In [17]:
#renaming 'Freedom to make life choices' column
df18.rename(columns={'Freedom to make life choices':'Freedom'}, inplace=True)

In [18]:
#renaming 'Freedom to make life choices' column
df18.rename(columns={'Freedom to make life choices':'Freedom'}, inplace=True)

In [19]:
#renaming 'Overall Rank' column
df18.rename(columns={'Perceptions of corruption':'Trust (Government Corruption)'}, inplace=True)

In [20]:
#renaming 'Country or region' column
df18.rename(columns={'Country or region':'Country or Region'}, inplace=True)

In [21]:
#Creating 'Year' column
df18.loc[:, ["Year"]] = [2018]

In [26]:
df18

Unnamed: 0,Happiness Rank,Country or Region,Happiness Score,GDP per capita,Social Support,Health (Life Expectancy),Freedom,Generosity,Trust (Government Corruption),Year
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393,2018
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.340,2018
2,3,Denmark,7.555,1.351,1.590,0.868,0.683,0.284,0.408,2018
3,4,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138,2018
4,5,Switzerland,7.487,1.420,1.549,0.927,0.660,0.256,0.357,2018
...,...,...,...,...,...,...,...,...,...,...
151,152,Yemen,3.355,0.442,1.073,0.343,0.244,0.083,0.064,2018
152,153,Tanzania,3.303,0.455,0.991,0.381,0.481,0.270,0.097,2018
153,154,South Sudan,3.254,0.337,0.608,0.177,0.112,0.224,0.106,2018
154,155,Central African Republic,3.083,0.024,0.000,0.010,0.305,0.218,0.038,2018


# 4. Basic Stats on the Data

In [27]:
df18.describe()

Unnamed: 0,Happiness Rank,Happiness Score,GDP per capita,Social Support,Health (Life Expectancy),Freedom,Generosity,Trust (Government Corruption),Year
count,156.0,156.0,156.0,156.0,156.0,156.0,156.0,156.0,156.0
mean,78.5,5.375917,0.891449,1.213237,0.597346,0.454506,0.181006,0.111808,2018.0
std,45.177428,1.119506,0.391921,0.302372,0.247579,0.162424,0.098471,0.09621,0.0
min,1.0,2.905,0.0,0.0,0.0,0.0,0.0,0.0,2018.0
25%,39.75,4.45375,0.61625,1.06675,0.42225,0.356,0.1095,0.051,2018.0
50%,78.5,5.378,0.9495,1.255,0.644,0.487,0.174,0.082,2018.0
75%,117.25,6.1685,1.19775,1.463,0.77725,0.5785,0.239,0.1365,2018.0
max,156.0,7.632,2.096,1.644,1.03,0.724,0.598,0.457,2018.0


# 5. Exporting data

In [28]:
# Export 2018 Data dataframe
df18.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'df18_clean.pkl'))