# Regression labs

Example notebook for exploring linear models (Regression) and their interpretation


@Ricardo Almeida

In [1]:
# Additional requirements
# pip install ipywidgets

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from ipywidgets import interactive

In [3]:
# Seed handed to train_test_split so the train/test partition is repeatable.
RANDOM_SEED = 7657

# Fraction of the samples held out as the test set.
TEST_SIZE = 0.20

## Interactive Linear Regression

#### Loading dataset

California housing dataset

In [4]:
from sklearn.datasets import fetch_california_housing

In [5]:
# Load the California housing dataset; the cells below read its .DESCR,
# .target_names, .target, .feature_names and .data attributes.
housing = fetch_california_housing()

In [6]:
# Print the description text shipped with the dataset (attributes, source, target units).
print(housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [7]:
# Name of the target variable.
housing.target_names

['MedHouseVal']

In [8]:
# Target vector: median house value per district, in units of $100,000 (see the description above).
housing.target

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [9]:
# Names of the predictive features; later cells use a name's position in this
# list as the column index into the data matrix.
housing.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [10]:
# Keep only the rows whose target is below 5.
# NOTE(review): presumably this drops the values capped at the top of the range — confirm.
below_five = housing.target < 5

X = housing.data[below_five]
# The target is shifted by a constant 2.5. A constant shift moves the fitted
# intercept by the same amount and leaves the slope unchanged.
# NOTE(review): the reason for the shift is not stated in the notebook — confirm it is intentional.
y = housing.target[below_five] + 2.5

In [11]:
# Partition the samples into training and test subsets: TEST_SIZE of the rows
# are held out, and the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)

#### Simple Linear Regression

In [12]:
# Work with one feature only, so the model can be drawn as a line on a 2-D plot.
feature = 'MedInc'

# Position of that feature in the list of feature names, used as the column index.
feature_index = housing.feature_names.index(feature)

# A one-wide column slice (rather than a bare integer index) keeps the result
# two-dimensional, with a single column.
X_single_feature = X_train[:, feature_index:feature_index + 1]

In [13]:
def visualize_regression_line(slope, intercept):
    """Plot the training points together with the line ``y = slope * x + intercept``.

    Used as the callback handed to ``interactive``, with ``slope`` and
    ``intercept`` supplied by the sliders. A new figure is created on every call.

    Reads the notebook globals ``X_single_feature`` (single-feature training
    column), ``y_train`` (training targets) and ``feature`` (x-axis label).

    Parameters
    ----------
    slope : float
        Slope of the candidate line.
    intercept : float
        Intercept of the candidate line.
    """
    # A straight line is fully determined by two points, so evaluate it only at
    # the extremes of the feature rather than at every (unsorted) training
    # sample. The drawn line spans the same x-range as before.
    x_line = np.array([X_single_feature.min(), X_single_feature.max()])
    y_line = slope * x_line + intercept

    fig, ax = plt.subplots(figsize=(8, 6))

    # Plot the original data points
    ax.scatter(X_single_feature, y_train, color='blue', label='Original Data')

    # Plot the candidate regression line
    ax.plot(x_line, y_line, color='red', linewidth=2, label='Regression Line')

    ax.set_xlabel(feature)
    ax.set_ylabel('Target (house value)')
    ax.set_title('Interactive Linear Regression Simulation')
    ax.legend()
    plt.show()

In [14]:
# Create interactive widget for adjusting slope and intercept
interactive_plot = interactive(
    visualize_regression_line,
    slope=(-2.0, 2.0, 0.1),        # (min, max, step) for the slope slider
    intercept=(-6.0, 4.0, 1.0),    # (min, max, step) for the intercept slider
)

# Leaving the widget as the cell's last expression renders it.
interactive_plot

interactive(children=(FloatSlider(value=0.0, description='slope', max=2.0, min=-2.0), FloatSlider(value=-1.0, …

#### Linear Regression

Fit a Linear Regression to the data ("MedInc" feature only) to get the "best" slope and intercept.

Use the already prepared `X_single_feature` as input and `y_train` as the label vector.

Check the model performance on the same `X_single_feature` training set, using an appropriate metric of your choice.

In [15]:
### import LinearRegression and metric of choice
### YOUR CODE HERE




In [20]:
### fit a LinearRegression to the data
### YOUR CODE HERE

# Fit on the single-feature training column against the training targets.
model = LinearRegression().fit(
    X=X_single_feature,
    y=y_train,
)


In [17]:
### Check model performance, calculating a metric of your choice
### YOUR CODE HERE




In [23]:
### Display model's slope and intercept
### YOUR CODE HERE

# Fitted slope(s): one coefficient per input column, so a single value for this one-feature model.
model.coef_


array([0.40109772])

In [24]:
# Fitted intercept: the model's prediction when the feature value is 0.
model.intercept_

2.951121110699129