# Discussion Week 1

Creating Widgets for Multiple Linear Regression

1/9/2025

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import sklearn.linear_model
from sklearn.preprocessing import PolynomialFeatures
from ipywidgets import interact
import ipywidgets as widgets
from ipywidgets import interact, FloatSlider
from IPython.display import display, clear_output

In [2]:
do_data = pd.read_excel("data/Hurricane_Irene_and_the_Hudson_River.xlsx", sheet_name= 5).drop(['Piermont D.O. (ppm)'], axis = 1)
rainfall_data = pd.read_excel("data/Hurricane_Irene_and_the_Hudson_River.xlsx", sheet_name= "Rainfall").drop(['Piermont  Rainfall Daily Accumulation (Inches)'], axis = 1)
turbidity_data = pd.read_excel("data/Hurricane_Irene_and_the_Hudson_River.xlsx", sheet_name= "Turbidity").drop(['Piermont Turbidity in NTU'], axis = 1)

## Data Cleaning

Since we read in our data through multiple excel sheets, we need to merge our different dataframes. The datasheets have a column for dat so we merge on this. We can also change column names and update the date to be datetime type and set as index

In [10]:
# Merge the three datasets

data = do_data.merge(rainfall_data, on = 'Date Time (ET)')
data = data.merge(turbidity_data, on = 'Date Time (ET)')
data.head(3)

# Update the column names
data.columns = ['date', 'albany_do', 'norrie_do', 'albany_rainfall', 'norrie_rainfall', 'albany_turbidity', 'norrie_turbidity']

# Convert data to datetime format
data['date'] = pd.to_datetime(data['date'])

#Set the date as the index
data.set_index('date', inplace = True)
data.head(3)

Unnamed: 0_level_0,albany_do,norrie_do,albany_rainfall,norrie_rainfall,albany_turbidity,norrie_turbidity
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2011-08-25 00:00:00,7.68,7.81,0.0,0.0,4.0,9.3
2011-08-25 00:15:00,7.6,7.73,0.0,0.0,3.9,8.4
2011-08-25 00:30:00,7.57,7.63,0.0,0.0,4.3,7.9


## Multiple Linear Regression

In [13]:
# Define predictors and the target variable
X = data[['albany_do', 'albany_rainfall']]
y = data[['albany_turbidity']]

# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

# Create and fit the model
model = LinearRegression()
model.fit(X_train, y_train)


# Predict and evaluate
y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}")
print(f"R-squared: {r2_score(y_test, y_pred)}")      

RMSE: 221.9143474905527
R-squared: 0.4907389518457509


## Create a widget to visualize different models

In [18]:
predictor_selector = widgets.SelectMultiple(
    options = data.columns, 
    value = (data.columns[0],),
    description = 'Predictors'
)

# Create a dropdown for selecting the target variable
target_selector = widgets.Dropdown(
    option = data.columns,
    value = data.columns[1],
    description = 'Target'

)

# Button to evaluate the model
evaluate_button = widgets.Button(description = "Evaluate Model")

#Output widget to display results
output = widgets.Output


# Define the function to handle button clicks
def evaluate_model(b):
    with output:
        clear_output(wait=True)
        
        # Make sure the target is not in the predictors
        selected_predictors = [item for item in predictor_selector.value]
        if target_selector.value in selected_predictors:
            print("Target variable must not be in the predictors.")
            return
        
        # Prepare the data
        X = data[selected_predictors]
        y = data[target_selector.value]
        
        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
        
        # Create and fit the model
        model = LinearRegression()
        model.fit(X_train, y_train)
        
        # Predict and calculate
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f"MSE: {mean_squared_error(y_test, y_pred)}")
        print(f"R-squared: {r2_score(y_test, y_pred)}") 
        
# Display the widgets and connect the button to function
display(predictor_selector, target_selector, evaluate_button, output)
evaluate_button.on_click(evaluate_model)

TraitError: Invalid selection: value not found