## Pipelines Project 

# Step 0 : Importing libraries and dataset


In [24]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [25]:
# Import our dataset
data = pd.read_csv('./2015.csv')


# Step 1 : Data Acquisition

In [26]:
data.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [27]:
data.shape


(158, 12)

In [28]:
data.dtypes

Country                           object
Region                            object
Happiness Rank                     int64
Happiness Score                  float64
Standard Error                   float64
Economy (GDP per Capita)         float64
Family                           float64
Health (Life Expectancy)         float64
Freedom                          float64
Trust (Government Corruption)    float64
Generosity                       float64
Dystopia Residual                float64
dtype: object

## STEP 2: Data Wrangling 


In [29]:
# Find how prevalent missing values are in our data 
null_cols = data.isnull().sum()
null_cols

Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Standard Error                   0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
dtype: int64

In [30]:
stats = data.describe().transpose()
stats['IQR'] = stats['75%'] - stats['25%']
stats


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,IQR
Happiness Rank,158.0,79.493671,45.754363,1.0,40.25,79.5,118.75,158.0,78.5
Happiness Score,158.0,5.375734,1.14501,2.839,4.526,5.2325,6.24375,7.587,1.71775
Standard Error,158.0,0.047885,0.017146,0.01848,0.037268,0.04394,0.0523,0.13693,0.015032
Economy (GDP per Capita),158.0,0.846137,0.403121,0.0,0.545808,0.910245,1.158448,1.69042,0.61264
Family,158.0,0.991046,0.272369,0.0,0.856823,1.02951,1.214405,1.40223,0.357582
Health (Life Expectancy),158.0,0.630259,0.247078,0.0,0.439185,0.696705,0.811013,1.02525,0.371828
Freedom,158.0,0.428615,0.150693,0.0,0.32833,0.435515,0.549092,0.66973,0.220762
Trust (Government Corruption),158.0,0.143422,0.120034,0.0,0.061675,0.10722,0.180255,0.55191,0.11858
Generosity,158.0,0.237296,0.126685,0.0,0.150553,0.21613,0.309883,0.79588,0.15933
Dystopia Residual,158.0,2.098977,0.55355,0.32858,1.75941,2.095415,2.462415,3.60214,0.703005


In [31]:
outliers = pd.DataFrame(columns=data.columns)
outliers

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual


In [32]:
for col in stats.index:
    iqr = stats.at[col,'IQR']
    cutoff = iqr * 3
    lower = stats.at[col,'25%'] - cutoff
    upper = stats.at[col,'75%'] + cutoff
    results = data[(data[col] < lower) | 
                   (data[col] > upper)].copy()
    results['Outlier'] = col
    outliers = outliers.append(results)
    
outliers

Unnamed: 0,Country,Dystopia Residual,Economy (GDP per Capita),Family,Freedom,Generosity,Happiness Rank,Happiness Score,Health (Life Expectancy),Outlier,Region,Standard Error,Trust (Government Corruption)
39,Suriname,2.79094,0.99534,0.972,0.59657,0.16991,40,6.269,0.6082,Standard Error,Latin America and Caribbean,0.09811,0.13633
40,Trinidad and Tobago,2.26882,1.21183,1.18354,0.55884,0.31844,41,6.168,0.61483,Standard Error,Latin America and Caribbean,0.10895,0.0114
64,Jamaica,2.32038,0.81038,1.15102,0.50442,0.2123,65,5.709,0.68741,Standard Error,Latin America and Caribbean,0.13693,0.02299
115,Liberia,2.77729,0.0712,0.78968,0.28531,0.24362,116,4.571,0.34201,Standard Error,Sub-Saharan Africa,0.11068,0.06232
153,Rwanda,0.67042,0.22208,0.7737,0.59201,0.22628,154,3.465,0.42864,Trust (Government Corruption),Sub-Saharan Africa,0.03464,0.55191
128,Myanmar,1.41805,0.27108,0.70905,0.44017,0.79588,129,4.307,0.48246,Generosity,Southeastern Asia,0.04351,0.19034


In [33]:
data.drop(index = list(outliers.index), inplace=True)

In [42]:
outliers.index

Int64Index([39, 40, 64, 115, 153, 128], dtype='int64')

In [17]:
data.head(68)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.43630,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.03880,1.45900,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176
5,Finland,Western Europe,6,7.406,0.03140,1.29025,1.31826,0.88911,0.64169,0.41372,0.23351,2.61955
6,Netherlands,Western Europe,7,7.378,0.02799,1.32944,1.28017,0.89284,0.61576,0.31814,0.47610,2.46570
7,Sweden,Western Europe,8,7.364,0.03157,1.33171,1.28907,0.91087,0.65980,0.43844,0.36262,2.37119
8,New Zealand,Australia and New Zealand,9,7.286,0.03371,1.25018,1.31967,0.90837,0.63938,0.42922,0.47501,2.26425
9,Australia,Australia and New Zealand,10,7.284,0.04083,1.33358,1.30923,0.93156,0.65124,0.35637,0.43562,2.26646


In [34]:
before = len(data)
data = data.drop_duplicates()
after = len(data)
print('Number of duplicate records dropped: ', str(before - after))


Number of duplicate records dropped:  0


## STEP 3: Data Analysis


In [35]:
data.describe()


Unnamed: 0,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
count,152.0,152.0,152.0,152.0,152.0,152.0,152.0,152.0,152.0,152.0
mean,79.046053,5.387349,0.04627,0.855972,0.993462,0.634325,0.425946,0.142667,0.233725,2.101266
std,45.750307,1.147519,0.013481,0.398847,0.274957,0.249932,0.151501,0.116641,0.120535,0.543386
min,1.0,2.839,0.01848,0.0,0.0,0.0,0.0,0.0,0.0,0.32858
25%,38.75,4.542,0.037145,0.579618,0.863468,0.440095,0.326415,0.062795,0.144985,1.76077
50%,79.5,5.2325,0.043805,0.91856,1.03396,0.70593,0.43252,0.10722,0.21515,2.08797
75%,118.25,6.29575,0.051095,1.16216,1.220705,0.813547,0.5434,0.179565,0.307548,2.456625
max,158.0,7.587,0.09438,1.69042,1.40223,1.02525,0.66973,0.52208,0.5763,3.60214


In [36]:
# Most important determinants happinness

print(data['Happiness Score'].corr(data['Economy (GDP per Capita)']))
print("---")

print(data['Happiness Score'].corr(data['Family']))
print(data['Happiness Score'].corr(data['Health (Life Expectancy)']))

print("---")
print(data['Happiness Score'].corr(data['Freedom']))
print(data['Happiness Score'].corr(data['Trust (Government Corruption)']))
print(data['Happiness Score'].corr(data['Generosity']))

0.7775354184051506
---
0.7386212094949969
0.7249822033825948
---
0.5838941318387033
0.467341068868184
0.2249416650496363


In [37]:
data.corr

<bound method DataFrame.corr of                       Country                           Region  \
0                 Switzerland                   Western Europe   
1                     Iceland                   Western Europe   
2                     Denmark                   Western Europe   
3                      Norway                   Western Europe   
4                      Canada                    North America   
5                     Finland                   Western Europe   
6                 Netherlands                   Western Europe   
7                      Sweden                   Western Europe   
8                 New Zealand        Australia and New Zealand   
9                   Australia        Australia and New Zealand   
10                     Israel  Middle East and Northern Africa   
11                 Costa Rica      Latin America and Caribbean   
12                    Austria                   Western Europe   
13                     Mexico      Latin Ame

##  STEP 4: Reporting and Distribution  


In [39]:
"""
#### Important determinant happinness
Economy (GDP per Capita)

#### Moderately important determinant happinness
Family
Health (Life Expectancy)

#### Not very important determinant happinness
Freedom
Trust (Government Corruption)
Generosity
"""


'\n#### Important determinant happinness\nEconomy (GDP per Capita)\n\n#### Moderately important determinant happinness\nFamily\nHealth (Life Expectancy)\n\n#### Not very important determinant happinness\nFreedom\nTrust (Government Corruption)\nGenerosity\n'

## STEP 5: Get yearly report as csv


In [40]:
data.to_csv('./2015_happinness.csv', index=False)