In [9]:
# Import libraries and load dataset
from sklearn import preprocessing
import pandas as pd
import numpy as np

# Load the 2017 World Happiness data from a CSV in the working directory
happiness_csv = "world_happiness_2017.csv"
world_happiness = pd.read_csv(happiness_csv)

In [10]:
# Keep only the first ten rows, i.e. the top 10 countries
# (the CSV is assumed to be sorted by Happiness.Rank)
world_happiness10 = world_happiness.iloc[:10]

In [11]:
# Normalization (z-score standardization): rescale "Dystopia.Residual" so it
# has zero mean and unit variance, then print before/after statistics.
dystopia_residual = world_happiness10["Dystopia.Residual"]
scaled_dystopia_residual = preprocessing.scale(dystopia_residual.copy())

# .values gives the underlying NumPy array, so .std() below is NumPy's std.
raw_values = dystopia_residual.values
for label, value in (
    ("Mean: ", raw_values.mean()),
    ("Standard Deviation:", raw_values.std()),
    ("Normalized Mean: ", scaled_dystopia_residual.mean()),
    ("Normalize Standard Deviation:", scaled_dystopia_residual.std()),
):
    print(label, value)

Mean:  2.231162071228027
Standard Deviation: 0.12019647207777867
Normalized Mean:  -2.2204460492503132e-17
Normalize Standard Deviation: 1.0


In [150]:
# Categorical Encoding in Pandas
# One indicator column per country, named "enc_<Country>".
# dtype is pinned explicitly: the get_dummies default changed from uint8 to
# bool in pandas 2.0, and pinning keeps the 0/1 columns on every version.
one_hot_df = pd.get_dummies(world_happiness10["Country"], prefix="enc", dtype="uint8")
# Display the encoded frame (reuse the result instead of encoding a second time)
one_hot_df

Unnamed: 0,enc_Australia,enc_Canada,enc_Denmark,enc_Finland,enc_Iceland,enc_Netherlands,enc_New Zealand,enc_Norway,enc_Sweden,enc_Switzerland
0,0,0,0,0,0,0,0,1,0,0
1,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1
4,0,0,0,1,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0
6,0,1,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,1,0,0,0
8,0,0,0,0,0,0,0,0,1,0
9,1,0,0,0,0,0,0,0,0,0


In [151]:
# Inspect the ten-row subset as it stands at this point in the notebook
world_happiness10

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715
3,Switzerland,4,7.494,7.561772,7.426227,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182
5,Netherlands,6,7.377,7.427426,7.326574,1.503945,1.428939,0.810696,0.585384,0.47049,0.282662,2.294804
6,Canada,7,7.316,7.384403,7.247597,1.479204,1.481349,0.834558,0.611101,0.43554,0.287372,2.187264
7,New Zealand,8,7.314,7.37951,7.24849,1.405706,1.548195,0.81676,0.614062,0.500005,0.382817,2.046456
8,Sweden,9,7.284,7.344095,7.223905,1.494387,1.478162,0.830875,0.612924,0.385399,0.384399,2.097538
9,Australia,10,7.284,7.356651,7.211349,1.484415,1.510042,0.843887,0.601607,0.477699,0.301184,2.065211


In [152]:
# Put it all in one DataFrame
# Drop any encoded columns left over from a previous run of this cell first:
# DataFrame.join raises ValueError on overlapping column names when no suffix
# is given, so without this the cell could only be executed once per kernel.
world_happiness10 = (
    world_happiness10
    .drop(columns=one_hot_df.columns, errors="ignore")
    .join(one_hot_df)
)
# scaled_dystopia_residual is a plain array, so it is assigned positionally.
world_happiness10["Scaled.Dystopia.Residual"] = scaled_dystopia_residual
# Show the identifying columns next to every engineered feature; the encoded
# column names are taken from one_hot_df rather than being hard-coded.
world_happiness10[
    ["Country", "Happiness.Rank", *one_hot_df.columns, "Scaled.Dystopia.Residual"]
]

Unnamed: 0,Country,Happiness.Rank,enc_Australia,enc_Canada,enc_Denmark,enc_Finland,enc_Iceland,enc_Netherlands,enc_New Zealand,enc_Norway,enc_Sweden,enc_Switzerland,Scaled.Dystopia.Residual
0,Norway,1,0,0,0,0,0,0,0,1,0,0,0.38158
1,Denmark,2,0,0,1,0,0,0,0,0,0,0,0.686753
2,Iceland,3,0,0,0,0,1,0,0,0,0,0,0.761696
3,Switzerland,4,0,0,0,0,0,0,0,0,0,1,0.378997
4,Finland,5,0,0,0,1,0,0,0,0,0,0,1.655784
5,Netherlands,6,0,0,0,0,0,1,0,0,0,0,0.529483
6,Canada,7,0,1,0,0,0,0,0,0,0,0,-0.365216
7,New Zealand,8,0,0,0,0,0,0,1,0,0,0,-1.536698
8,Sweden,9,0,0,0,0,0,0,0,0,1,0,-1.111714
9,Australia,10,1,0,0,0,0,0,0,0,0,0,-1.380667


In [155]:
# WOE: load the raw per-age-group counts of goods and bads
woe_csv = "woe_raw.csv"
woe = pd.read_csv(woe_csv)
# Bare last expression so the notebook renders the raw table
woe

Unnamed: 0,Age Group,Number of Goods,Number of Bads
0,<=21,63,42
1,21-24,82,52
2,24-31,188,87
3,31-34,90,23
4,34-42,128,46
5,>42,149,50


In [156]:
# https://documentation.statsoft.com/STATISTICAHelp.aspx?path=WeightofEvidence/WeightofEvidenceWoEIntroductoryOverview
# Weight of Evidence (WOE) and Information Value (IV) per age group.
# Every derived column is computed from the raw count columns only, so this
# cell can be re-run without changing the result.

# Get Total N (observations per group)
woe["Total N"] = woe["Number of Goods"] + woe["Number of Bads"]
# Get Distribution of Goods (share of all goods that falls into each group)
woe["Distribution of Goods"] = woe["Number of Goods"] / woe["Number of Goods"].sum()
# Get Distribution of Bads (share of all bads that falls into each group)
woe["Distribution of Bads"] = woe["Number of Bads"] / woe["Number of Bads"].sum()

# ln("Distribution of Goods" / "Distribution of Bads"), shared by both columns below.
# NOTE: a group with zero goods or zero bads yields inf/NaN here.
log_ratio = np.log(woe["Distribution of Goods"] / woe["Distribution of Bads"])

# Information Value = ("Distribution of Goods" - "Distribution of Bads") * ln("Distribution of Goods" / "Distribution of Bads")
# NOTE: despite the column name, these are per-group terms, not a running total.
woe["Cumulative Information Value"] = (
    woe["Distribution of Goods"] - woe["Distribution of Bads"]
) * log_ratio
# WOE = 100 * ln("Distribution of Goods" / "Distribution of Bads"), i.e. expressed in percent
woe["WOE"] = 100 * log_ratio

In [157]:
# Full WOE table: the raw counts plus whatever derived columns `woe` carries at this point
woe

Unnamed: 0,Age Group,Number of Goods,Number of Bads,Total N,Distribution of Goods,Distribution of Bads,Cumulative Information Value,WOE
0,<=21,63,42,105,0.09,0.14,0.022092,-44.183275
1,21-24,82,52,134,0.117143,0.173333,0.022017,-39.182233
2,24-31,188,87,275,0.268571,0.29,0.001645,-7.676402
3,31-34,90,23,113,0.128571,0.076667,0.026836,51.701759
4,34-42,128,46,174,0.182857,0.153333,0.005199,17.609101
5,>42,149,50,199,0.212857,0.166667,0.011299,24.462544
