# Simple linear regression


### Housing Data

- CRIM: Per capita crime rate by town
- ZN: Proportion of residential land zoned for lots over 25,000 sq. ft
- INDUS: Proportion of non-retail business acres per town
- CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX: Nitric oxide concentration (parts per 10 million)
- RM: Average number of rooms per dwelling
- AGE: Proportion of owner-occupied units built prior to 1940
- DIS: Weighted distances to five Boston employment centers
- RAD: Index of accessibility to radial highways
- TAX: Full-value property tax rate per $10,000
- PTRATIO: Pupil-teacher ratio by town
- B: 1000(Bk − 0.63)², where Bk is the proportion of [people of African American descent] by town
- LSTAT: Percentage of lower status of the population
- MEDV: Median value of owner-occupied homes in $1000s


### Setup
`pip install seaborn`

In [None]:
import os
import pathlib
import sys

# Name of the utilities folder; it is resolved below relative to the parent
# of the current working directory.
UTILS_FOLDER = 'S00 - Utils'

curPath = os.getcwd()
parPath = pathlib.Path(curPath).parent
utilPath = os.path.join(parPath, UTILS_FOLDER)

# Make the working directory, its parent and the utilities folder importable.
# Entries already on sys.path are skipped so that re-running this cell does
# not keep appending duplicates.
for p in [curPath, str(parPath), utilPath]:
    if p not in sys.path:
        sys.path.append(p)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns

## Loading data


In [None]:
# Read the housing workbook stored in the utilities folder.
housing_xlsx = f"{utilPath}/housing_data.xlsx"
df = pd.read_excel(housing_xlsx)

# Preview the first rows.
df.head()



## Exploratory data analysis


### Histogram


In [None]:
# One histogram per column of the DataFrame, drawn on a single 10x10 figure.
df.hist(figsize=(10, 10))
# Render the figure explicitly. An argument-less display() shows nothing;
# plt.show() draws the plot and, because it returns None, keeps the cell from
# echoing the return value of df.hist().
plt.show()


In [None]:
cols = ["LSTAT", "MEDV"]
# Shared histogram settings: 30 bins plus a kernel density estimate overlay.
hist_style = {"bins": 30, "kde": True}
for col in cols:
    plt.figure()  # fresh figure per column so the plots do not overlap
    sns.histplot(data=df, x=col, **hist_style)


### Pairplot


In [None]:
cols = ["LSTAT", "INDUS", "NOX", "RM", "MEDV"]
# Restrict the frame to the selected columns, then plot every pair of them.
pair_data = df.loc[:, cols]
sns.pairplot(pair_data)


### Correlation matrix


In [None]:
# Pairwise Pearson correlation between the columns of df
# (Pearson is the pandas default; it is spelled out here for readability).
corr = df.corr(method="pearson")
display(corr)


In [None]:
# Mask the upper triangle (diagonal included): the matrix is symmetric, so
# those cells only repeat the lower triangle.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Diverging colormap so negative and positive correlations get opposite hues
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Correlation coefficients lie in [-1, 1]. Pin the color scale to that full
# range: a narrower limit (e.g. vmax=0.3) saturates the colormap, so a
# correlation of 0.3 and one of 0.9 would be drawn in the same color.
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
    annot=True,
    fmt=".2f",  # two decimals keep the cell annotations compact
    ax=ax,  # draw on the axes created above rather than the implicit current one
)



## Extract and split data


In [None]:
# Feature matrix: LSTAT only, selected with a list so the result stays 2-D.
X = df[["LSTAT"]].to_numpy()
# Target vector: MEDV.
y = df["MEDV"].to_numpy()


In [None]:
# Hold out 30% of the samples for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)



## Model training


In [None]:
from sklearn.linear_model import LinearRegression

# Fit the linear model on the training partition (fit returns the estimator).
slr = LinearRegression().fit(X_train, y_train)

# Report the first (here: only) coefficient and the intercept.
print("Slope: %.3f" % slr.coef_[0])
print("Intercept: %.3f" % slr.intercept_)



## Evaluation


In [None]:
# Predict on the training partition first, then on the test partition.
y_train_pred, y_test_pred = (slr.predict(part) for part in (X_train, X_test))


### Plotting


In [None]:
def reg_plot(ax, X, y, y_pred, title, color="steelblue"):
    """Draw the observations and the fitted line on a single axes.

    Parameters
    ----------
    ax : axes-like object
        Target of the drawing; must provide ``scatter``, ``plot`` and
        ``set_title``.
    X, y : observed feature values and targets, drawn as scatter points.
    y_pred : predictions for ``X``, drawn as a black line over the points.
    title : title set on ``ax``.
    color : fill color of the scatter points.

    Returns
    -------
    None
    """
    point_style = {"c": color, "edgecolor": "white", "s": 70}
    line_style = {"color": "black", "lw": 2}

    ax.scatter(X, y, **point_style)
    ax.plot(X, y_pred, **line_style)
    ax.set_title(title)



In [None]:
# Two side-by-side panels sharing both axes so train and test are comparable.
fig, ax = plt.subplots(
    nrows=1,
    ncols=2,
    figsize=(10, 5),
    constrained_layout=True,
    sharex=True,
    sharey=True,
)
train_ax, test_ax = ax

# Left panel: training data; right panel: test data.
reg_plot(ax=train_ax, X=X_train, y=y_train, y_pred=y_train_pred, title="Train")
reg_plot(
    ax=test_ax,
    X=X_test,
    y=y_test,
    y_pred=y_test_pred,
    title="Test",
    color="limegreen",
)

# Figure-level axis labels shared by both panels.
fig.supxlabel("Lower status of the population [LSTAT]")
fig.supylabel("Median value of homes in $1000s [MEDV]")



## Residual plot


In [None]:
def res_plot(ax, y_pred, y, title, color="steelblue"):
    """Scatter the residuals against the predicted values on ``ax``.

    Parameters
    ----------
    ax : axes-like object
        Target of the drawing; must provide ``scatter``, ``axhline``,
        ``set_aspect`` and ``set_title``.
    y_pred : predicted values (x coordinate of every point).
    y : observed values; the plotted residual is ``y_pred - y``.
    title : title set on ``ax``.
    color : fill color of the scatter points.

    Returns
    -------
    None
    """
    ax.scatter(y_pred, y_pred - y, c=color, marker="o", edgecolor="white", s=70)
    # Zero-residual reference line. axhline spans the full width of the axes
    # whatever the data range is, whereas a line with fixed end points would
    # fall short for predictions outside them and would also stretch the
    # autoscaled x-limits to those end points.
    ax.axhline(y=0, color="black", lw=2)
    # Same scale on both axes.
    ax.set_aspect("equal")
    ax.set_title(title)


In [None]:
# Two side-by-side panels sharing both axes so train and test are comparable.
fig, ax = plt.subplots(
    nrows=1,
    ncols=2,
    figsize=(9, 5),
    constrained_layout=True,
    sharex=True,
    sharey=True,
)
train_ax, test_ax = ax

# Left panel: training residuals; right panel: test residuals.
res_plot(ax=train_ax, y_pred=y_train_pred, y=y_train, title="Train")
res_plot(
    ax=test_ax,
    y_pred=y_test_pred,
    y=y_test,
    title="Test",
    color="limegreen",
)

# The x-axis is shared, so limiting the first panel limits both.
train_ax.set_xlim([-5, 35])
fig.supxlabel("Predicted values")
fig.supylabel("Residual")


### R2 and MSE


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error on each partition.
MSE_train = mean_squared_error(y_train, y_train_pred)
MSE_test = mean_squared_error(y_test, y_test_pred)

# Coefficient of determination on each partition.
R2_train = r2_score(y_train, y_train_pred)
R2_test = r2_score(y_test, y_test_pred)

# One-row summary table; every column is given as a one-element list.
data = {
    "Model": ["Simple linear regression"],
    "Coef": [slr.coef_[0]],
    "Intercept": [slr.intercept_],
    "MSE Train": [MSE_train],
    "MSE Test": [MSE_test],
    "R2 Train": [R2_train],
    "R2 Test": [R2_test],
}

dft = pd.DataFrame(data=data)

display(dft)
