In [40]:
import pandas as pd

# Working-hours data set, served as CSV from the Rdatasets site (Ecdat collection).
WORKINGHOURS_CSV_URL = (
    "https://vincentarelbundock.github.io/Rdatasets/csv/Ecdat/Workinghours.csv"
)

# Use the first CSV column as the DataFrame index rather than as a feature.
wkhrs_df = pd.read_csv(WORKINGHOURS_CSV_URL, index_col=0)
wkhrs_df

Unnamed: 0,hours,income,age,education,child5,child13,child17,nonwhite,owned,mortgage,occupation,unemp
1,2000,350,26,12,0,1,0,0,1,1,swcc,7
2,390,241,29,8,0,1,1,0,1,1,other,4
3,1900,160,33,10,0,2,0,0,1,0,swcc,7
4,0,80,20,9,2,0,0,0,1,1,other,7
5,3177,456,33,12,0,2,0,0,1,1,swcc,7
...,...,...,...,...,...,...,...,...,...,...,...,...
3378,0,150,64,9,0,3,2,1,1,0,other,9
3379,344,369,42,11,0,0,0,1,0,0,other,9
3380,0,70,23,13,2,0,0,1,0,0,other,9
3381,784,301,25,15,1,0,0,0,0,0,swcc,6


#### Check number of values for categorical attributes

In [39]:
# List the distinct labels in `occupation` to see how many categories it has.
wkhrs_df["occupation"].unique()

array(['swcc', 'other', 'mp', 'fr'], dtype=object)

#### Encode data

The attribute `occupation` is categorical with four distinct string values, so it needs to be dummy (one-hot) encoded before it can be used as a numeric feature.

In [46]:
# One-hot encode `occupation`, then remove the target column `hours` so that
# only predictor columns remain in the feature frame.
wkhrs_x_enc_df = pd.get_dummies(wkhrs_df, columns=["occupation"]).drop("hours", axis=1)
wkhrs_x_enc_df

Unnamed: 0,income,age,education,child5,child13,child17,nonwhite,owned,mortgage,unemp,occupation_fr,occupation_mp,occupation_other,occupation_swcc
1,350,26,12,0,1,0,0,1,1,7,0,0,0,1
2,241,29,8,0,1,1,0,1,1,4,0,0,1,0
3,160,33,10,0,2,0,0,1,0,7,0,0,0,1
4,80,20,9,2,0,0,0,1,1,7,0,0,1,0
5,456,33,12,0,2,0,0,1,1,7,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3378,150,64,9,0,3,2,1,1,0,9,0,0,1,0
3379,369,42,11,0,0,0,1,0,0,9,0,0,1,0
3380,70,23,13,2,0,0,1,0,0,9,0,0,1,0
3381,301,25,15,1,0,0,0,0,0,6,0,0,0,1


## Linear Regression

#### Train/test split

In [48]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. `random_state` pins the shuffle so the
# split -- and every metric computed from it -- is identical on each re-run.
X_train, X_test, y_train, y_test = train_test_split(
    wkhrs_x_enc_df, wkhrs_df.hours, test_size=0.2, random_state=42
)

#### Fit model and predict

In [49]:
from sklearn.linear_model import LinearRegression

# Fit a linear model on the training rows (`fit` returns the estimator itself),
# then predict the target for the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

#### Evaluate model

In [51]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error: same units as the target, lower is better.
# The square root is taken explicitly instead of passing `squared=False`,
# because that keyword is deprecated and later removed in newer scikit-learn
# releases; this form works on old and new versions alike.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
rmse

824.1944106271233

In [52]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the held-out rows.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
r2


0.15407177674621098

Our $R^2$ (r-squared) score gives the proportion of the variation in the target values that can be predicted by the model. The remaining proportion is variation that the model leaves unexplained; it comes from the distribution of the variable itself. A positive value of `r2` means that the model predicts better than always guessing the mean of the target, i.e. the model is 'doing something'.

## Regression Tree

#### Train/test split

In [57]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed `random_state` makes the split
# reproducible; using the same seed for every model's split means their
# metrics are computed on identical test rows and can be compared directly.
X_train, X_test, y_train, y_test = train_test_split(
    wkhrs_x_enc_df, wkhrs_df.hours, test_size=0.2, random_state=42
)

#### Fit model and predict

In [58]:
from sklearn.tree import DecisionTreeRegressor

# `random_state` fixes how ties between equally good splits are broken, so the
# fitted tree (and its score) is the same on every run.
# NOTE(review): the tree is grown with no depth or leaf-size limit, which can
# overfit the training rows -- consider tuning `max_depth` / `min_samples_leaf`.
rt = DecisionTreeRegressor(random_state=42)
rt.fit(X_train, y_train)
y_pred = rt.predict(X_test)

#### Evaluate model

In [59]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error: same units as the target, lower is better.
# The square root is taken explicitly instead of passing `squared=False`,
# because that keyword is deprecated and later removed in newer scikit-learn
# releases; this form works on old and new versions alike.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
rmse

1133.7217829965023

In [60]:
from sklearn.metrics import r2_score

# Coefficient of determination of the tree's predictions on the held-out rows.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
r2


-0.6475323138931937