# Regression & Interpretation labs

Example notebook for exploring linear models (Regression) and its interpretation


@Ricardo Almeida

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

In [2]:
import pandas as pd
import numpy as np
import shap
from tensorflow import keras
from sklearn.model_selection import train_test_split

In [3]:
RANDOM_SEED = 7657

TEST_SIZE=0.20

#### Loading dataset

Boston housing dataset

In [4]:
# Load the Boston Housing price regression dataset from Keras dataset
(X_train, y_train), (X_test, y_test) = keras.datasets.boston_housing.load_data()

In [5]:
# Get the feature names from the dataset
feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']

feature_desc = {
    'CRIM': 'per capita crime rate by town',
    'ZN': 'proportion of residential land zoned for lots over 25,000 sq.ft.',
    'INDUS': 'proportion of non-retail business acres per town.',
    'CHAS': 'Charles River dummy variable (1 if tract bounds river; 0 otherwise)',
    'NOX': 'nitric oxides concentration (parts per 10 million)',
    'RM': 'average number of rooms per dwelling',
    'AGE': 'proportion of owner-occupied units built prior to 1940',
    'DIS': 'weighted distances to five Boston employment centres',
    'RAD': 'index of accessibility to radial highways',
    'TAX': 'full-value property-tax rate per $10,000',
    'PTRATIO': 'pupil-teacher ratio by town',
    'LSTAT': '% lower status of the population',
    'MEDV': 'Median value of owner-occupied homes in $1000\'s'
}

In [6]:
for key, value in feature_desc.items():
    print(f"{key} =  {value}")

CRIM =  per capita crime rate by town
ZN =  proportion of residential land zoned for lots over 25,000 sq.ft.
INDUS =  proportion of non-retail business acres per town.
CHAS =  Charles River dummy variable (1 if tract bounds river; 0 otherwise)
NOX =  nitric oxides concentration (parts per 10 million)
RM =  average number of rooms per dwelling
AGE =  proportion of owner-occupied units built prior to 1940
DIS =  weighted distances to five Boston employment centres
RAD =  index of accessibility to radial highways
TAX =  full-value property-tax rate per $10,000
PTRATIO =  pupil-teacher ratio by town
LSTAT =  % lower status of the population
MEDV =  Median value of owner-occupied homes in $1000's


In [7]:
df_train = pd.DataFrame(X_train, columns=feature_names)
df_test = pd.DataFrame(X_test, columns=feature_names)

### Exercise

Fit a model to the data (`X_train` train dataset) and perform interpretation with SHAP (Shapley Values) in order to answer these questions:

- **Task 1**: Which are the 3 most impactful features for Boston house value?

- **Task 2**: Is crime rate (CRIM feature) a very relevant predictor of house value in Boston?

- **Task 3**: How does distance to employment centers (DIS feature) affects house value? Do houses closer to employment centers tend to be more or less valuable?

- **Task 4**: Consider the (single) case of polution rate (NOX) of 0.4, residential land zoned (ZN) of 70, and average number of rooms (RM) lower than 6.5. What drives house value in this case?

In [8]:
### Create and fit a Linear Regression model on the data (df_train)
### YOUR CODE HERE:

from sklearn.linear_model import LinearRegression

model = ...

In [9]:
#df_train

In [10]:
#y_train

In [11]:
### Create the explainer and compute SHAP Values for further explanation (use df_train)
### YOUR CODE HERE:

explainer = shap.LinearExplainer(...)
shap_values = ...

In [1]:
### Create and analyze different SHAP plots to answer the task questions
### YOUR CODE HERE:


In [None]:
### Potentially useful SHAP plots

# shap.summary_plot()
# shap.plots.bar()
# shap.plots.violin()
# shap.plots.waterfall()