# GDP and Happiness plots

Does money make people happier?

## Setup

In [6]:
import pandas as pd
import numpy as np
%matplotlib inline
import altair as alt
from sklearn.model_selection import train_test_split

## Data

### Import data

In [1]:
# Read the CSV (country, GDP per capita, life satisfaction) directly from GitHub
LINK = "https://raw.githubusercontent.com/kirenz/datasets/master/oecd_gdp.csv"
df = pd.read_csv(filepath_or_buffer=LINK)

### Data corrections

In [2]:
# Normalise the column names: lower-case them and swap spaces for underscores
df.columns = [name.lower().replace(' ', '_') for name in df.columns]


Unnamed: 0,country,gdp_per_capita,life_satisfaction
0,Russia,9054.914,6.0
1,Turkey,9437.372,5.6
2,Hungary,12239.894,4.9
3,Poland,12495.334,5.8
4,Slovak Republic,15991.736,6.1


In [8]:

# Summarise the frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   country            29 non-null     object 
 1   gdp_per_capita     29 non-null     float64
 2   life_satisfaction  29 non-null     float64
dtypes: float64(2), object(1)
memory usage: 824.0+ bytes


In [None]:
# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
# The feature is GDP per capita and the target is life satisfaction, matching
# the X / y used by the models further down. The frame has no column called
# 'gdp', so selecting that name would raise a KeyError.
X_train, X_test, y_train, y_test = train_test_split(
    df[['gdp_per_capita']],
    df['life_satisfaction'],
    test_size=0.2,
    random_state=42,
)

In [37]:
# Randomly tag every row as "Train" or "Test"; the 'Data' column is used to
# colour the scatter plot further down.
# - A seeded generator makes the labels identical after a kernel restart.
# - len(df) replaces the hard-coded row count, so the cell keeps working if
#   the dataset changes size.
# - The labels are written in one assignment instead of an in-place replace,
#   so re-running the cell is idempotent.
rng = np.random.default_rng(42)
df['Data'] = np.where(rng.integers(0, 2, size=len(df)) == 0, 'Train', 'Test')

In [38]:
# Show the first five rows, including the new 'Data' label column
df.head()

Unnamed: 0,country,gdp_per_capita,life_satisfaction,class,data,Data
0,Russia,9054.914,6.0,train,test,Test
1,Turkey,9437.372,5.6,train,test,Test
2,Hungary,12239.894,4.9,test,test,Train
3,Poland,12495.334,5.8,test,test,Train
4,Slovak Republic,15991.736,6.1,train,test,Train


### Variable lists

Define the outcome variable, the feature matrix `X`, and the response `y` used by the models below.

In [39]:
# Name of the outcome (target) column
y_label = 'life_satisfaction'

# Feature matrix: a one-column DataFrame holding GDP per capita
X = df.loc[:, ["gdp_per_capita"]]

# Response: the outcome column as a Series
y = df.loc[:, y_label]

### Data exploration

In [56]:
# Interactive scatter plot of life satisfaction against GDP per capita.
# The y-axis is pinned to 0-10; hovering a point shows the country and values.
chart = (
    alt.Chart(df)
    .mark_circle(size=100)
    .encode(
        x=alt.X('gdp_per_capita:Q', axis=alt.Axis(title='GDP per capita')),
        y=alt.Y(
            'life_satisfaction:Q',
            axis=alt.Axis(title='Life satisfaction'),
            scale=alt.Scale(domain=[0, 10]),
        ),
        tooltip=['country', 'gdp_per_capita', 'life_satisfaction'],
    )
    .interactive()
)

chart

In [57]:
# Scatter plot with a country label next to each point.
# `points` is the base chart; the text layer is derived from it so both layers
# share the same data and x/y encodings. It is defined here because no other
# cell defines `points`, which would otherwise raise a NameError on a fresh kernel.
points = alt.Chart(df).mark_circle(size=100).encode(
    alt.X('gdp_per_capita:Q', axis=alt.Axis(title='GDP per capita')),
    alt.Y('life_satisfaction:Q', axis=alt.Axis(title='Life satisfaction'), scale=alt.Scale(domain=[0, 10])),
    tooltip=['country', 'gdp_per_capita', 'life_satisfaction']
)

# Swap the circle mark for a text mark, left-aligned and shifted 7 to the right
text = points.mark_text(
    align='left',
    baseline='middle',
    dx=7
).encode(
    text='country'
)

# Layer first, then make the combined chart interactive so pan/zoom is
# attached once to the layered chart rather than separately to each layer
(points + text).interactive()

In [71]:
# Base (non-interactive) scatter plot, reused by the regression overlays
plot = (
    alt.Chart(df)
    .mark_circle(size=100)
    .encode(
        x=alt.X('gdp_per_capita:Q', axis=alt.Axis(title='GDP per capita')),
        y=alt.Y(
            'life_satisfaction:Q',
            axis=alt.Axis(title='Life satisfaction'),
            scale=alt.Scale(domain=[0, 10]),
        ),
    )
)

# Overlay a regression line (default method) drawn over GDP values 0 to 60000
plot + (
    plot
    .transform_regression('gdp_per_capita', 'life_satisfaction', extent=(0, 60000))
    .mark_line()
)

In [73]:

# Overlay an order-20 polynomial regression fit on the base scatter plot
plot + (
    plot
    .transform_regression('gdp_per_capita', 'life_satisfaction', method="poly", order=20)
    .mark_line()
)


In [40]:
# Scatter plot with points coloured by the nominal 'Data' label
(
    alt.Chart(df)
    .mark_circle(size=100)
    .encode(
        x=alt.X('gdp_per_capita:Q', axis=alt.Axis(title='GDP per capita')),
        y=alt.Y('life_satisfaction:Q', axis=alt.Axis(title='Life satisfaction')),
        color='Data:N',
    )
)

In [4]:


# Interactive scatter plot with one colour per country (legend hidden);
# hovering a point shows the country and both values
(
    alt.Chart(df)
    .mark_circle(size=100)
    .encode(
        alt.X('gdp_per_capita:Q'),
        alt.Y('life_satisfaction:Q'),
        color=alt.Color('country', legend=None),
        tooltip=['country', 'gdp_per_capita', 'life_satisfaction'],
    )
    .interactive()
)

## Linear regression model

In [18]:
from sklearn.linear_model import LinearRegression

In [19]:
# Instantiate an unfitted linear regression estimator with default settings
reg = LinearRegression()

### Training

In [20]:
# Fit the model on the full dataset (X: GDP per capita, y: life satisfaction)
reg.fit(X, y)

In [22]:
# Fitted intercept: the predicted life satisfaction at a GDP per capita of 0
reg.intercept_

4.853052800266436

In [21]:
# Fitted slope: change in predicted life satisfaction per unit of GDP per capita
reg.coef_

array([4.91154459e-05])

### Prediction

In [23]:
# In-sample predictions: predict on the same X the model was fitted on
y_pred = reg.predict(X)

In [38]:
# Predict life satisfaction for one hypothetical GDP per capita of 50000.
# The input is a one-row DataFrame with the same column name used for fitting.
X_new = pd.DataFrame([[50000]], columns=['gdp_per_capita'])

reg.predict(X_new)

array([7.30882509])

### Evaluation

In [30]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [33]:
# Mean squared error of the linear model.
# NOTE: y_pred was computed on the same data used for fitting, so this is
# training error, not an estimate of out-of-sample error.
mean_squared_error(y, y_pred)

0.18075033705835142

In [34]:
# Root mean squared error of the linear model (on the training data).
# The square root is taken explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every scikit-learn version.
np.sqrt(mean_squared_error(y, y_pred))

0.4251474297915388

In [32]:
# Mean absolute error of the linear model (on the training data)
mean_absolute_error(y, y_pred)

0.35530429427921734

## K-Nearest Neighbor Model

In [40]:
from sklearn.neighbors import KNeighborsRegressor

In [46]:
# K-nearest-neighbours regressor that predicts from the 2 nearest training points
reg2 = KNeighborsRegressor(n_neighbors=2)

In [47]:
# Fit the KNN model on the same full dataset used for the linear model
reg2.fit(X, y)

In [48]:
# In-sample KNN predictions (same X that was used for fitting)
y_pred2 = reg2.predict(X)

In [49]:
# KNN prediction for the hypothetical GDP per capita stored in X_new
reg2.predict(X_new) 

array([7.35])

In [50]:
# Mean squared error of the KNN model.
# NOTE: y_pred2 was predicted on the training rows themselves, and with
# n_neighbors=2 each row is typically one of its own two nearest neighbours,
# so this training error is optimistic. It is not a fair comparison with the
# linear model; use held-out data for that.
mean_squared_error(y, y_pred2)

0.06181034482758619

In [51]:
# Mean absolute error of the KNN model (training data; see caveat on in-sample error)
mean_absolute_error(y, y_pred2)

0.20517241379310344