# Does money make people happier, part 2

Version with data splitting.

## Setup

In [25]:
import pandas as pd
import altair as alt

from sklearn.model_selection import train_test_split # <-
from sklearn.model_selection import cross_val_score # <-
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

## Data

### Import data

In [26]:
# Load the data set (country, GDP per capita, life satisfaction) as a CSV
# file directly from GitHub; this cell needs network access to run.
LINK = "https://raw.githubusercontent.com/kirenz/datasets/master/oecd_gdp.csv"
df = pd.read_csv(LINK)

### Data structure

In [27]:
# Preview the first five rows to check that the file loaded as expected
df.head()

Unnamed: 0,Country,GDP per capita,Life satisfaction
0,Russia,9054.914,6.0
1,Turkey,9437.372,5.6
2,Hungary,12239.894,4.9
3,Poland,12495.334,5.8
4,Slovak Republic,15991.736,6.1


In [28]:
# Show column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Country            29 non-null     object 
 1   GDP per capita     29 non-null     float64
 2   Life satisfaction  29 non-null     float64
dtypes: float64(2), object(1)
memory usage: 824.0+ bytes


### Data corrections

In [29]:
# Normalise the column labels: lower-case them and use underscores instead of spaces
df.columns = [label.lower().replace(' ', '_') for label in df.columns]

df.head()

Unnamed: 0,country,gdp_per_capita,life_satisfaction
0,Russia,9054.914,6.0
1,Turkey,9437.372,5.6
2,Hungary,12239.894,4.9
3,Poland,12495.334,5.8
4,Slovak Republic,15991.736,6.1


### Variable lists

Prepare the feature matrix `X` and the response `y` for later use.

In [30]:
# name of the outcome (response) column
y_label = 'life_satisfaction'

# response: the outcome column as a Series
y = df.loc[:, y_label]

# features: a one-column DataFrame holding the single predictor
X = df.loc[:, ["gdp_per_capita"]]

## Data splitting

In [31]:
# Fill in the blanks: split X and y into training and test sets,
# using a test size of 0.2 and random state 42
# (the fixed random state keeps the shuffled split reproducible)
___, ___, ___, ___ = ___(___, ___, test_size=___, shuffle=True, random_state=___)

Investigate the data:

((23, 1), (23,))

Unnamed: 0,gdp_per_capita
21,43724.031
0,9054.914


((6, 1), (6,))

- We make a copy of the training data since we don’t want to alter our data during data exploration. 

- We will use this data for our exploratory data analysis.

In [35]:
# Fill in the blank with your training features: build a DataFrame from a
# copy of them, so that exploration cannot modify the original training data
df_train = pd.DataFrame(___.copy())

In [36]:
# Fill in the blank with your training labels: they are added as a new column
# (DataFrame.join matches the rows on the index).
# NOTE(review): this cell reassigns df_train, so running it a second time will
# presumably fail on the already existing label column -- re-create df_train first.
df_train = df_train.join(pd.DataFrame(___))

In [37]:
# Check the result: the feature and the label should now sit side by side
df_train.head(2)

Unnamed: 0,gdp_per_capita,life_satisfaction
21,43724.031,6.9
0,9054.914,6.0


### Data exploration

In [40]:
# Interactive scatter plot of the training data:
# GDP per capita on the x-axis, life satisfaction on the y-axis
(
    alt.Chart(df_train)
    .mark_circle(size=100)
    .encode(
        x=alt.X('gdp_per_capita:Q'),
        y=alt.Y('life_satisfaction:Q'),
        tooltip=['gdp_per_capita', 'life_satisfaction'],
    )
    .interactive()
)

## Model

In [None]:
# Linear regression model with scikit-learn's default settings
reg = LinearRegression()

### Training & validation

In [None]:
# Fill in the blanks: cross-validation with 5 folds.
# The 'neg_mean_squared_error' scorer returns the negated MSE (scikit-learn
# scorers follow a "higher is better" convention), so multiplying by -1
# turns the scores back into ordinary, positive mean squared errors.
scores = ___(___, ___, ___, cv=___, scoring='neg_mean_squared_error') *-1

In [None]:
# store the cross-validation scores, one row per fold, in a DataFrame
# (we call the column "lr" for "linear regression")
df_scores = pd.DataFrame({"___": scores})

In [None]:
# Number the rows 1..k so that the index matches the fold numbers.
# The index is assigned explicitly instead of being shifted with `+= 1`,
# so re-running this cell does not move the labels a second time.
df_scores.index = pd.RangeIndex(start=1, stop=len(df_scores) + 1)

Unnamed: 0,lr
1,0.335187
2,0.101782
3,0.097167
4,0.252955
5,0.443893


In [None]:

# Show the score of each fold
df_scores

In [None]:
# Display the same table with a colour gradient ('Blues' colormap) behind the values
df_scores.style.background_gradient(cmap='Blues')

Unnamed: 0,lr
1,0.335187
2,0.101782
3,0.097167
4,0.252955
5,0.443893


In [None]:
# Summary statistics of the fold scores; .T puts each statistic in its own column
df_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
lr,5.0,0.246197,0.150095,0.097167,0.101782,0.252955,0.335187,0.443893


### Final training

Train your model with the complete training data (without cross-validation).

In [None]:
# Fill in the blanks: fit the model on the complete training data
reg.fit(___, ___)

In [None]:
# Intercept of the fitted regression line
reg.intercept_


 Intercept: 4.87 
 Slope: 0.00005


In [None]:
# Coefficient(s) of the fitted model, one value per feature (here: the slope)
reg.coef_

### Test error

Evaluate the final model on the test set. 

In [None]:
# Fill in the blank: predictions for our test data
y_pred = reg.predict(___)

# Fill in the blanks: mean squared error on the test set
# (presumably the true test labels first, then the predictions)
mean_squared_error(___, ___)

0.09021411430745652