In [1]:
# Import necessary libraries

import altair as alt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.model_selection import GridSearchCV, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [2]:
"""
Part 1: Using KNN Regression
"""

'\nPart 1: Using KNN Regression\n'

In [3]:
# Task 1: Load the marathon dataset from CSV and show the resulting frame
# The path is relative, so it resolves against the current working directory.
file_path = 'data/marathon.csv'
marathon = pd.read_csv(filepath_or_buffer=file_path)
marathon

Unnamed: 0,age,bmi,female,footwear,group,injury,mf_d,mf_di,mf_ti,max,sprint,mf_s,time_hrs
0,35,23.592323,0,2,1,2,42195,4,10295,60.0,1,4.098592,2.859722
1,33,22.518295,0,2,2,2,42195,3,12292,50.0,0,3.432720,3.414444
2,38,25.560312,0,2,3,1,42195,4,10980,65.0,0,3.842896,3.050000
3,34,22.607931,0,2,1,1,42195,3,10694,88.0,1,3.945670,2.970556
4,39,24.974836,0,2,1,1,42195,2,13452,51.0,0,3.136708,3.736667
...,...,...,...,...,...,...,...,...,...,...,...,...,...
924,23,23.277760,1,2,2,1,42195,3,15660,18.0,0,2.694444,4.350000
925,30,24.489796,0,2,2,1,42195,2,16110,45.0,0,2.619181,4.475000
926,44,24.237617,0,2,3,1,42195,2,12289,63.0,1,3.433558,3.413611
927,34,21.249750,0,2,3,1,42195,3,12602,32.0,0,3.348278,3.500556


In [4]:
# Task 2: Work with a small subset of the data
# Draw 50 runners at random; the fixed random_state makes the draw repeatable.
marathon_50 = marathon.sample(n=50, random_state=300)

# Scatterplot of weekly max distance against race time for the subset.
# zero=False lets the y-axis start near the data instead of at 0.
answer2 = (
    alt.Chart(marathon_50)
    .properties(title='Max Distance vs Race Time (50 runners)')
    .mark_circle(size=60)
    .encode(
        x=alt.X('max', title='Max Distance per Week (miles)'),
        y=alt.Y('time_hrs', scale=alt.Scale(zero=False), title='Race Time (hours)'),
    )
)

answer2

In [6]:
# Task 3: Visualization of the 4 nearest neighbors to 100 miles/week
# Absolute gap between each runner's weekly max and the query point (100);
# the 4 rows with the smallest gap are the neighbours to highlight.
marathon_50['distance_from_100'] = marathon_50['max'].sub(100).abs()
nearest_neighbors = marathon_50.sort_values(by='distance_from_100').iloc[:4]

# Reference line marking the query point x = 100
vertical = (
    alt.Chart(pd.DataFrame({'x': [100]}))
    .mark_rule(color='black')
    .encode(x='x')
)

# Base layer: every runner in the 50-person subset
points = (
    alt.Chart(marathon_50)
    .mark_circle()
    .encode(
        x=alt.X('max', title='Max Distance Ran per Week'),
        y=alt.Y('time_hrs', scale=alt.Scale(zero=False), title='Race Time (hours)'),
    )
)

# The 4 nearest runners, drawn larger and in orange
highlight = (
    alt.Chart(nearest_neighbors)
    .mark_circle(color='orange', size=70)
    .encode(x='max', y='time_hrs')
)

# Horizontal segments joining each neighbour to x = 100.
# transform_calculate adds a constant field `x` that serves as the far end (x2).
connectors = (
    alt.Chart(nearest_neighbors)
    .transform_calculate(x='100')
    .mark_rule(color='orange')
    .encode(
        x=alt.X('max:Q'),
        x2=alt.X2('x:Q'),
        y='time_hrs:Q',
        y2='time_hrs:Q',
    )
)

# Stack the four layers into one titled chart
answer3_plot = (points + highlight + connectors + vertical).properties(
    title='KNN: 4 Nearest Neighbors to 100 miles/week'
)

answer3_plot


NameError: name 'nearest_neighbors' is not defined

In [None]:
# Manually predict time_hrs for max = 100 using 4 nearest neighbors

# How far each runner's weekly max lies from the 100-mile query point
marathon_50['distance_from_100'] = marathon_50['max'].sub(100).abs()

# Keep the 4 runners with the smallest gap
nearest_neighbors = marathon_50.sort_values(by='distance_from_100').iloc[:4]

# The prediction is the plain average of those neighbours' race times
answer3 = nearest_neighbors.loc[:, 'time_hrs'].mean()
answer3

In [None]:
# Task 4: Cross-validation on the training data to choose K
# Set random seed per spec
np.random.seed(2019)

# Hold out 25% of the runners as a test set. The explicit random_state fixes
# the split; Part 2 repeats this call with the same arguments.
# train_test_split comes from the import cell at the top of the notebook,
# so it is not re-imported here.
marathon_training, marathon_testing = train_test_split(
    marathon,
    test_size=0.25,
    random_state=2000
)

# Features and labels: a single predictor (max) and the response (time_hrs).
# The double brackets keep X_train as a DataFrame rather than a Series.
X_train = marathon_training[['max']]
y_train = marathon_training['time_hrs']

# Pipeline: standardize the predictor, then KNN regression with the
# estimator's default settings
marathon_pipe = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor()
)

# 5-fold CV on the training set. The scorer is the *negated* RMSE, so the
# reported scores are negative; return_train_score=True asks for training
# scores alongside the validation scores.
marathon_cv = pd.DataFrame(cross_validate(
    marathon_pipe,
    X_train,
    y_train,
    cv=5,
    scoring='neg_root_mean_squared_error',
    return_train_score=True
))

marathon_cv

In [None]:
# Task 5: Tune and fit your model
# Set seed per spec
np.random.seed(2019)

# Candidate neighbour counts: every K from 1 through 200.
# The key targets the n_neighbors parameter of the KNN step in marathon_pipe.
param_grid = {'kneighborsregressor__n_neighbors': range(1, 201)}

# 5-fold grid search over K, scored by negated RMSE; n_jobs=-1 runs the
# fits in parallel
marathon_tuned = GridSearchCV(
    estimator=marathon_pipe,
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)

# Run the search on the training data
marathon_tuned.fit(X_train, y_train)

# Tabulate the per-candidate cross-validation results and show them
marathon_results = pd.DataFrame(marathon_tuned.cv_results_)
marathon_results

In [None]:
# Task 6: Find the best K
# Winning parameter setting from the grid search (a dict keyed by parameter name)
marathon_min = marathon_tuned.best_params_

# The search scored with negated RMSE, so flip the sign to report a positive RMSPE
marathon_best_RMSPE = -1 * marathon_tuned.best_score_

# Show the pair together
(marathon_min, marathon_best_RMSPE)


In [None]:
# Task 7: Access the model
# Set seed per spec
np.random.seed(1234)

# Held-out response and predictor
y_test = marathon_testing['time_hrs']
X_test = marathon_testing[['max']]

# Predictions from the tuned model for the test runners
marathon_prediction = marathon_tuned.predict(X_test)

# Test RMSPE: square root of the mean squared prediction error
marathon_summary = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=marathon_prediction)
)
marathon_summary

In [None]:
# Task 8: Visualize the relationship between max and time_hrs
# Predictions from the tuned KNN model for the training runners
knn_predictions = marathon_tuned.predict(X_train)

# Training data with the model's predictions attached as a new column
# (assign returns a new frame; marathon_training itself is not modified)
marathon_preds = marathon_training.assign(predictions=knn_predictions)

# Scatterplot of the observed race times
scatter = alt.Chart(marathon_preds).mark_circle(opacity=0.4).encode(
    x=alt.X('max', title='Max Distance per Week (miles)'),
    y=alt.Y('time_hrs', title='Actual Race Time (hrs)', scale=alt.Scale(zero=False))
)

# Line through the model's own predictions. No smoothing transform is applied:
# a LOESS pass would draw a curve fitted *to* the predictions rather than the
# predictions themselves, hiding the shape of the KNN fit.
line = alt.Chart(marathon_preds).mark_line(color='black').encode(
    x='max',
    y='predictions'
)

# Overlay the prediction line on the scatterplot
marathon_plot = (scatter + line).properties(
    title='KNN Regression: Max Distance vs Predicted Race Time'
)

marathon_plot


In [None]:
"""
Part 2: Using simple regression
"""

In [None]:
# Task 1: Load the marathon data
# Re-read the CSV so Part 2 starts from the file on disk
marathon = pd.read_csv(filepath_or_buffer='data/marathon.csv')

# Show the frame
marathon

In [None]:
# Task 2: Display the relationship between race time and max distance run per week
# Hold out 25% of the runners for testing; random_state fixes the split
marathon_training, marathon_testing = train_test_split(
    marathon, test_size=0.25, random_state=2000
)

# Single predictor (max, kept as a DataFrame by the double brackets) and
# response (time_hrs) for each split
X_train, X_test = marathon_training[['max']], marathon_testing[['max']]
y_train, y_test = marathon_training['time_hrs'], marathon_testing['time_hrs']

# Scatterplot of the training runners only
marathon_scatter = (
    alt.Chart(marathon_training)
    .properties(title='Linear Regression Training Data: Max Distance vs Race Time')
    .mark_point(opacity=0.4)
    .encode(
        x=alt.X('max', title='Max Distance per Week (miles)'),
        y=alt.Y('time_hrs', scale=alt.Scale(zero=False), title='Race Time (hours)'),
    )
)

marathon_scatter

In [None]:
# Task 3: Build a linear regression model
# Create linear regression model
# The estimator is only constructed here, with default settings; it is fitted
# to the training data in the next cell.
lm = LinearRegression()
# Bare name as the last expression so the notebook displays the estimator
lm

In [None]:
# Fit the linear model on the training predictor and response.
# fit() hands back the estimator it was called on, so lm_fit and lm refer to
# the same fitted object.
lm_fit = lm.fit(X=X_train, y=y_train)
lm_fit

In [None]:
# Task 4: Visualize the model predictions on the training data
# Fitted values from the linear model for the training runners
linear_predictions = lm.predict(X_train)

# Training data with the fitted values attached as a `predictions` column
marathon_preds = marathon_training.assign(predictions=linear_predictions)

# Observed race times
scatter = (
    alt.Chart(marathon_preds)
    .mark_circle(opacity=0.4)
    .encode(
        x=alt.X('max', title='Max Distance per Week (miles)'),
        y=alt.Y('time_hrs', scale=alt.Scale(zero=False), title='Race Time (hours)'),
    )
)

# Fitted regression line
line = (
    alt.Chart(marathon_preds)
    .mark_line(color='black')
    .encode(x='max', y='predictions')
)

# Overlay the line on the points and title the result
marathon_plot = (scatter + line).properties(
    title='Linear Regression: Max Distance vs Predicted Race Time (Training Data)'
)

marathon_plot


In [None]:
# Task 5: Visualize the model predictions over test data
# Predictions from the fitted linear model for the held-out runners
test_predictions = lm.predict(X_test)

# Test data with the predictions attached as a `predictions` column
marathon_test_preds = marathon_testing.assign(predictions=test_predictions)

# Observed race times in the test set
scatter_test = (
    alt.Chart(marathon_test_preds)
    .mark_circle(opacity=0.4)
    .encode(
        x=alt.X('max', title='Max Distance per Week (miles)'),
        y=alt.Y('time_hrs', scale=alt.Scale(zero=False), title='Race Time (hours)'),
    )
)

# Model predictions drawn as a line
line_test = (
    alt.Chart(marathon_test_preds)
    .mark_line(color='black')
    .encode(x='max', y='predictions')
)

# Overlay the line on the points and title the result
marathon_plot_test = (scatter_test + line_test).properties(
    title='Linear Regression: Max Distance vs Predicted Race Time (Test Data)'
)

marathon_plot_test


In [None]:
"""
Part 3: Reflection questions
"""
# Each reflection question is printed first, followed by its answer.
# Every answer string ends in a newline so the print leaves a blank line
# before the next question.
print("Question 1: When might KNN regression be more appropriate than simple linear regression, and vice versa?")
print("KNN regression is more appropriate when the relationship between the predictor and the response variable is non-linear and we don't want to assume a specific functional form. Simple linear regression is more appropriate when there is a clear linear relationship between the predictor and the response.  \n")

print("Question 2: How does KNN regression differ from KNN classification, and what challenges arise when predicting continuous (vs. categorical) targets?")
print("KNN classification predicts the most common class label among nearest neighbors whereas KNN regression predicts the average value of the target variable among the neighbors. Predicting continuous values can be more sensitive to outliers and noise since the average can be skewed. Assessing performance is also tricky because there is no direct answer like classification, so the errors are continuous. \n")

print("Question 3: Why is the linearity assumption in simple linear regression important, and what happens if it's violated?")
print("The linearity assumption is important because linear regression fits a straight line to model the relationship between the input and output. If the actual relationship is not linear, the model can underfit and cause inaccurate predictions. Violating it can result in poor model performance and invalid conclusions/predictions about the relationship between variables. \n")
