# Week 1 Discussion


In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from ipywidgets import interact
import ipywidgets as widgets
from ipywidgets import interact, FloatSlider
from IPython.display import display, clear_output

In [6]:
# Source workbook; each measurement type is stored on its own sheet.
filepath = 'data/irene.xlsx'

# Integer sheet_name values are zero-based positions, so 5 selects the sixth sheet
# (presumably the dissolved-oxygen sheet, given the column dropped below — confirm).
do_data = (
    pd.read_excel(filepath, sheet_name=5)
    .drop(columns=['Piermont D.O. (ppm)'])
)

# The Piermont column is removed from every sheet before the tables are combined.
rainfall_data = (
    pd.read_excel(filepath, sheet_name='Rainfall')
    .drop(columns=['Piermont  Rainfall Daily Accumulation (Inches)'])
)
turbidity_data = (
    pd.read_excel(filepath, sheet_name='Turbidity')
    .drop(columns=['Piermont Turbidity in NTU'])
)

In [13]:
# Join the three tables (dissolved oxygen, rainfall, turbidity) on the shared timestamp column
data = (
    do_data
    .merge(rainfall_data, on='Date Time (ET)')
    .merge(turbidity_data, on='Date Time (ET)')
)

# Replace the verbose spreadsheet headers with short snake_case names
data.columns = [
    'date',
    'albany_do', 'norrie_do',
    'albany_rainfall', 'norrie_rainfall',
    'albany_turbidity', 'norrie_turbidity',
]

# Parse the timestamps and promote them to the index in a single, non-inplace step
data = data.assign(date=pd.to_datetime(data['date'])).set_index('date')
data.head()


Unnamed: 0_level_0,albany_do,norrie_do,albany_rainfall,norrie_rainfall,albany_turbidity,norrie_turbidity
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2011-08-25 00:00:00,7.68,7.81,0.0,0.0,4.0,9.3
2011-08-25 00:15:00,7.6,7.73,0.0,0.0,3.9,8.4
2011-08-25 00:30:00,7.57,7.63,0.0,0.0,4.3,7.9
2011-08-25 00:45:00,7.72,7.67,0.0,0.0,4.7,8.1
2011-08-25 01:00:00,7.74,7.63,0.0,0.0,4.4,8.4


In [17]:
# Predictors (Albany dissolved oxygen and rainfall) and target (Albany turbidity)
X = data[['albany_do', 'albany_rainfall']]
Y = data[['albany_turbidity']]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

# Fit ordinary least squares on the training rows (fit returns the estimator itself)
model = LinearRegression().fit(X_train, Y_train)

# Score the held-out rows
Y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
r_squared = r2_score(Y_test, Y_pred)
print(f"RMSE: {rmse}")
print(f"R-squared: {r_squared}")


RMSE: 221.9143474905527
R-squared: 0.4907389518457509


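The RMSE of about 222 means our predicted turbidity is typically off from the actual value by roughly 222 turbidity units. (RMSE weights large errors more heavily, so it is not exactly the average error.) The R-squared of 0.49 means the model explains about 49% of the variance in turbidity on the test set.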

In [23]:
# Every column of the merged frame is offered both as a predictor and as a target
column_names = data.columns

# Multi-select list for the predictors, pre-selecting the first column
predictor_selector = widgets.SelectMultiple(
    options=column_names,
    value=[column_names[0]],
    description='Predictors',
)

# Single-choice dropdown for the target; it defaults to the second column so that
# it does not start out equal to the default predictor
target_selector = widgets.Dropdown(
    options=column_names,
    value=column_names[1],
    description='Target',
)

# Button that triggers a model fit and evaluation
evaluate_button = widgets.Button(description="Evaluate Model")

# Output area in which the evaluation results are printed
output = widgets.Output()

# Define the function to handle button clicks
def evaluate_model(b):
    """Fit a linear regression on the current widget selections and print test-set metrics.

    Reads the chosen predictors from ``predictor_selector`` and the target from
    ``target_selector``, splits ``data`` 70/30 with a fixed seed, fits a
    ``LinearRegression`` and prints R^2 and MSE into the ``output`` widget.

    Parameters
    ----------
    b : object
        The argument supplied by ``Button.on_click``; it is not used.

    Returns
    -------
    None
        Results (or a validation message) are printed inside ``output``.
    """
    with output:
        clear_output(wait=True)  # replace whatever the previous click printed

        selected_predictors = list(predictor_selector.value)
        target = target_selector.value

        # Nothing selected: there is nothing to fit on
        if not selected_predictors:
            print("Select at least one predictor")
            return

        # A target that is also a predictor would make the fit trivial
        if target in selected_predictors:
            print("Target variable must not be in the predictors")
            return

        # Prepare the data
        X = data[selected_predictors]
        Y = data[target]

        # Split the data into training and testing sets
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.3, random_state=42
        )

        # Create and fit the model
        model = LinearRegression()
        model.fit(X_train, Y_train)

        # Score THIS model's predictions. The previous code scored the notebook-level
        # `Y_pred` left over from an earlier cell, so the metrics never reflected the
        # selected predictors/target (and failed on a fresh kernel without that cell).
        y_pred = model.predict(X_test)
        r2 = r2_score(Y_test, y_pred)
        mse = mean_squared_error(Y_test, y_pred)

        # Display R2 and MSE
        print(f"R^2: {r2:.4f}")
        print(f"MSE: {mse:.4f}")


# Register the click handler first so the button is live as soon as it is shown
evaluate_button.on_click(evaluate_model)

# Render the selectors, the button and the results area, in that order
display(predictor_selector, target_selector, evaluate_button, output)

SelectMultiple(description='Predictors', index=(0,), options=('albany_do', 'norrie_do', 'albany_rainfall', 'no…

Dropdown(description='Target', index=1, options=('albany_do', 'norrie_do', 'albany_rainfall', 'norrie_rainfall…

Button(description='Evaluate Model', style=ButtonStyle())

Output()