In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
from datetime import datetime
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Execute the starter notebook inside this kernel's namespace.
# `traffic_merged` is expected to be defined by it (TODO confirm -- it is not
# defined anywhere in this notebook).
# NOTE(review): the saved output of this cell is a "file not found" error, so
# StarterCode.ipynb has to be in the working directory for a clean Run All.
%run StarterCode.ipynb
# Work on a copy so the cleaning steps below do not modify `traffic_merged`.
local_traffic_merged = traffic_merged.copy()

Exception: File `'StarterCode.ipynb.py'` not found.

Data cleaning (copied from Kathi)

In [None]:
# Column headers can carry stray leading/trailing whitespace; normalise them
# first so the names listed below match exactly.
local_traffic_merged.columns = local_traffic_merged.columns.str.strip()

# Columns that are removed before any further processing.
unused_columns = [
    'BUS COUNT', 'MESSAGE COUNT', 'datetime', 'name', 'precipprob',
    'snow', 'windspeed', 'winddir', 'feelslike', 'sealevelpressure',
    'solarenergy', 'uvindex', 'severerisk', 'icon', 'stations',
]
local_traffic_merged.drop(columns=unused_columns, inplace=True)

In [None]:
# Fill missing values column by column.
# Gaps are filled from neighbouring rows (forward fill, then backward fill for
# leading gaps); the column's most common value is only a last-resort fallback.
for column in local_traffic_merged.columns:
    nan_count_before = local_traffic_merged[column].isnull().sum()
    if nan_count_before > 0:
        print(f"Processing column: {column}")

        filled = local_traffic_merged[column].ffill().bfill()

        # mode() is empty when the column holds no non-NaN value at all; indexing
        # it blindly would fail with an unrelated lookup error instead of the
        # explicit exception raised further down.
        modes = filled.mode()
        most_common_value = modes.iloc[0] if not modes.empty else None

        nan_count_after = filled.isnull().sum()
        if nan_count_after > 0 and most_common_value is not None:
            filled = filled.fillna(most_common_value)

        # Write the result back with a plain column assignment. Calling
        # fillna(..., inplace=True) on `local_traffic_merged[column]` acts on a
        # temporary selection and is not guaranteed to update the frame.
        local_traffic_merged[column] = filled

        if local_traffic_merged[column].isnull().any():
            raise Exception(f"NaN values still present in {column} after filling!")

        unique_values = local_traffic_merged[column].unique()[:10]
        nan_filled = nan_count_before - nan_count_after

        # NOTE: `nan_filled` counts the NaNs removed by ffill/bfill, and
        # `most_common_value` is the mode of the column after that fill.
        print(f"Used '{most_common_value}' to fill {nan_filled} NaNs in {column}")
        print(f"First 10 unique values in {column} after filling NaNs: {unique_values}\n")

Processing column: temp
Used '33.9' to fill 126557 NaNs in temp
First 10 unique values in temp after filling NaNs: [67.2 69.6 72.1 74.9 78.2 79.1 79.7 77.8 74.7 72.7]

Processing column: dew
Used '38.0' to fill 126557 NaNs in dew
First 10 unique values in dew after filling NaNs: [29.4 28.9 29.  27.5 27.2 25.5 24.1 25.2 26.4 23.8]

Processing column: humidity
Used '95.55' to fill 126557 NaNs in humidity
First 10 unique values in humidity after filling NaNs: [24.24 22.35 20.53 18.23 16.46 15.01 14.54 13.85 13.62 15.77]

Processing column: precip
Used '0.0' to fill 126557 NaNs in precip
First 10 unique values in precip after filling NaNs: [0.    0.013 0.008 0.002 0.011 0.018 0.015 0.001 0.005 0.003]

Processing column: preciptype
Used 'rain' to fill 2306808 NaNs in preciptype
First 10 unique values in preciptype after filling NaNs: ['rain' 'rain,snow' 'snow']

Processing column: snowdepth
Used '0.0' to fill 126557 NaNs in snowdepth
First 10 unique values in snowdepth after filling NaNs: [

In [None]:
# Sanity check: stop at the first column (in column order) that still has NaNs.
first_nan_column = next(
    (name for name in local_traffic_merged.columns
     if local_traffic_merged[name].isnull().any()),
    None,
)
if first_nan_column is not None:
    raise Exception(f"NaN values found in column {first_nan_column}")

In [None]:
# Parse the 12-hour 'TIME' strings (e.g. "04/01/2018 12:01:06 AM") into pandas
# datetimes and keep them in a new 'accurate_time' column.
parsed_time = pd.to_datetime(local_traffic_merged['TIME'], format='%m/%d/%Y %I:%M:%S %p')
local_traffic_merged['accurate_time'] = parsed_time

# Put the rows in chronological order (oldest first).
local_traffic_merged = local_traffic_merged.sort_values(by="accurate_time")

# Show the raw and parsed values side by side to confirm the parse.
print(local_traffic_merged.loc[:, ['TIME', 'accurate_time']].head())

                           TIME       accurate_time
2577600  04/01/2018 12:01:06 AM 2018-04-01 00:01:06
2577571  04/01/2018 12:01:06 AM 2018-04-01 00:01:06
2577570  04/01/2018 12:01:06 AM 2018-04-01 00:01:06
2577569  04/01/2018 12:01:06 AM 2018-04-01 00:01:06
2577568  04/01/2018 12:01:06 AM 2018-04-01 00:01:06


In [None]:
# Order rows by timestamp, then by segment within a timestamp.
# sort_values keeps the original index labels, so after this step the index is
# no longer in positional order.
local_traffic_merged = local_traffic_merged.sort_values(['accurate_time', 'SEGMENTID'], ascending=True)

In [None]:
# Derive calendar features from the timestamp: weekday number and clock time.
timestamps = pd.to_datetime(local_traffic_merged['accurate_time'])
local_traffic_merged['accurate_time'] = timestamps

# .dt.weekday numbers the days from Monday=0 to Sunday=6.
local_traffic_merged['day_of_week'] = timestamps.dt.weekday

# .dt.time yields datetime.time objects, not numbers.
local_traffic_merged['time_of_day'] = timestamps.dt.time

In [None]:
# Preview the frame, now including the day_of_week and time_of_day columns.
local_traffic_merged

Unnamed: 0,TIME,SEGMENTID,SPEED,temp,dew,humidity,precip,preciptype,snowdepth,windgust,cloudcover,visibility,solarradiation,conditions,accurate_time,day_of_week,time_of_day
2577550,04/01/2018 12:01:06 AM,1,25,32.9,14.3,45.72,0.0,rain,0.0,13.2,78.4,9.9,496.0,Partially cloudy,2018-04-01 00:01:06,6,00:01:06
2577551,04/01/2018 12:01:06 AM,2,20,32.9,14.3,45.72,0.0,rain,0.0,13.2,78.4,9.9,496.0,Partially cloudy,2018-04-01 00:01:06,6,00:01:06
2577552,04/01/2018 12:01:06 AM,7,32,32.9,14.3,45.72,0.0,rain,0.0,13.2,78.4,9.9,496.0,Partially cloudy,2018-04-01 00:01:06,6,00:01:06
2577553,04/01/2018 12:01:06 AM,8,27,32.9,14.3,45.72,0.0,rain,0.0,13.2,78.4,9.9,496.0,Partially cloudy,2018-04-01 00:01:06,6,00:01:06
2577554,04/01/2018 12:01:06 AM,9,28,32.9,14.3,45.72,0.0,rain,0.0,13.2,78.4,9.9,496.0,Partially cloudy,2018-04-01 00:01:06,6,00:01:06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,04/30/2018 11:50:28 PM,1305,20,67.2,29.4,24.24,0.0,rain,0.0,27.7,24.2,9.9,0.0,Partially cloudy,2018-04-30 23:50:28,0,23:50:28
80,04/30/2018 11:50:28 PM,1307,25,67.2,29.4,24.24,0.0,rain,0.0,27.7,24.2,9.9,0.0,Partially cloudy,2018-04-30 23:50:28,0,23:50:28
81,04/30/2018 11:50:28 PM,1307,25,67.2,29.4,24.24,0.0,rain,0.0,27.7,24.2,9.9,0.0,Partially cloudy,2018-04-30 23:50:28,0,23:50:28
82,04/30/2018 11:50:28 PM,1309,29,67.2,29.4,24.24,0.0,rain,0.0,27.7,24.2,9.9,0.0,Partially cloudy,2018-04-30 23:50:28,0,23:50:28


In [None]:
# One 0/1 indicator column per distinct value of 'conditions'.
one_hot_enc = pd.get_dummies(local_traffic_merged['conditions'], dtype=int)

# Append the indicator columns to the right of the existing ones.
local_traffic_merged = pd.concat((local_traffic_merged, one_hot_enc), axis='columns')
local_traffic_merged

Unnamed: 0,TIME,SEGMENTID,SPEED,temp,dew,humidity,precip,preciptype,snowdepth,windgust,...,conditions,accurate_time,day_of_week,time_of_day,Clear,Overcast,Partially cloudy,"Rain, Overcast","Snow, Overcast","Snow, Rain, Overcast"
2577550,04/01/2018 12:01:06 AM,1,25,32.9,14.3,45.72,0.0,rain,0.0,13.2,...,Partially cloudy,2018-04-01 00:01:06,6,00:01:06,0,0,1,0,0,0
2577551,04/01/2018 12:01:06 AM,2,20,32.9,14.3,45.72,0.0,rain,0.0,13.2,...,Partially cloudy,2018-04-01 00:01:06,6,00:01:06,0,0,1,0,0,0
2577552,04/01/2018 12:01:06 AM,7,32,32.9,14.3,45.72,0.0,rain,0.0,13.2,...,Partially cloudy,2018-04-01 00:01:06,6,00:01:06,0,0,1,0,0,0
2577553,04/01/2018 12:01:06 AM,8,27,32.9,14.3,45.72,0.0,rain,0.0,13.2,...,Partially cloudy,2018-04-01 00:01:06,6,00:01:06,0,0,1,0,0,0
2577554,04/01/2018 12:01:06 AM,9,28,32.9,14.3,45.72,0.0,rain,0.0,13.2,...,Partially cloudy,2018-04-01 00:01:06,6,00:01:06,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,04/30/2018 11:50:28 PM,1305,20,67.2,29.4,24.24,0.0,rain,0.0,27.7,...,Partially cloudy,2018-04-30 23:50:28,0,23:50:28,0,0,1,0,0,0
80,04/30/2018 11:50:28 PM,1307,25,67.2,29.4,24.24,0.0,rain,0.0,27.7,...,Partially cloudy,2018-04-30 23:50:28,0,23:50:28,0,0,1,0,0,0
81,04/30/2018 11:50:28 PM,1307,25,67.2,29.4,24.24,0.0,rain,0.0,27.7,...,Partially cloudy,2018-04-30 23:50:28,0,23:50:28,0,0,1,0,0,0
82,04/30/2018 11:50:28 PM,1309,29,67.2,29.4,24.24,0.0,rain,0.0,27.7,...,Partially cloudy,2018-04-30 23:50:28,0,23:50:28,0,0,1,0,0,0


In [None]:
# Remove the raw timestamp, the segment id and the text columns
# ('conditions' has already been expanded into indicator columns).
local_traffic_merged = local_traffic_merged.drop(
    columns=['TIME', 'SEGMENTID', 'conditions', 'preciptype', 'accurate_time']
)

In [None]:
# Preview the remaining columns.
local_traffic_merged

Unnamed: 0,SPEED,temp,dew,humidity,precip,snowdepth,windgust,cloudcover,visibility,solarradiation,day_of_week,time_of_day,Clear,Overcast,Partially cloudy,"Rain, Overcast","Snow, Overcast","Snow, Rain, Overcast"
2577550,25,32.9,14.3,45.72,0.0,0.0,13.2,78.4,9.9,496.0,6,00:01:06,0,0,1,0,0,0
2577551,20,32.9,14.3,45.72,0.0,0.0,13.2,78.4,9.9,496.0,6,00:01:06,0,0,1,0,0,0
2577552,32,32.9,14.3,45.72,0.0,0.0,13.2,78.4,9.9,496.0,6,00:01:06,0,0,1,0,0,0
2577553,27,32.9,14.3,45.72,0.0,0.0,13.2,78.4,9.9,496.0,6,00:01:06,0,0,1,0,0,0
2577554,28,32.9,14.3,45.72,0.0,0.0,13.2,78.4,9.9,496.0,6,00:01:06,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,20,67.2,29.4,24.24,0.0,0.0,27.7,24.2,9.9,0.0,0,23:50:28,0,0,1,0,0,0
80,25,67.2,29.4,24.24,0.0,0.0,27.7,24.2,9.9,0.0,0,23:50:28,0,0,1,0,0,0
81,25,67.2,29.4,24.24,0.0,0.0,27.7,24.2,9.9,0.0,0,23:50:28,0,0,1,0,0,0
82,29,67.2,29.4,24.24,0.0,0.0,27.7,24.2,9.9,0.0,0,23:50:28,0,0,1,0,0,0


In [None]:
# Predictors: everything except the target and the two calendar columns.
features = local_traffic_merged.drop(columns=['SPEED', 'day_of_week', 'time_of_day'])

# Standardise each predictor to zero mean / unit variance.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split further down -- confirm that this is intended.
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

# The new frame gets a fresh 0..n-1 index: row i here is the i-th row (by
# position) of local_traffic_merged, not the row labelled i.
scaled_df = pd.DataFrame(data=scaled_features, columns=features.columns)
scaled_df

Unnamed: 0,temp,dew,humidity,precip,snowdepth,windgust,cloudcover,visibility,solarradiation,Clear,Overcast,Partially cloudy,"Rain, Overcast","Snow, Overcast","Snow, Rain, Overcast"
0,-0.965505,-1.268557,-0.433830,-0.198592,-0.485575,-0.728890,0.369227,0.393445,0.755904,-0.369103,-0.551208,0.917051,-0.264332,-0.028528,-0.198095
1,-0.965505,-1.268557,-0.433830,-0.198592,-0.485575,-0.728890,0.369227,0.393445,0.755904,-0.369103,-0.551208,0.917051,-0.264332,-0.028528,-0.198095
2,-0.965505,-1.268557,-0.433830,-0.198592,-0.485575,-0.728890,0.369227,0.393445,0.755904,-0.369103,-0.551208,0.917051,-0.264332,-0.028528,-0.198095
3,-0.965505,-1.268557,-0.433830,-0.198592,-0.485575,-0.728890,0.369227,0.393445,0.755904,-0.369103,-0.551208,0.917051,-0.264332,-0.028528,-0.198095
4,-0.965505,-1.268557,-0.433830,-0.198592,-0.485575,-0.728890,0.369227,0.393445,0.755904,-0.369103,-0.551208,0.917051,-0.264332,-0.028528,-0.198095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2577596,2.101662,0.305848,-1.431237,-0.198592,-0.485575,1.396107,-1.172105,0.393445,-0.887734,-0.369103,-0.551208,0.917051,-0.264332,-0.028528,-0.198095
2577597,2.101662,0.305848,-1.431237,-0.198592,-0.485575,1.396107,-1.172105,0.393445,-0.887734,-0.369103,-0.551208,0.917051,-0.264332,-0.028528,-0.198095
2577598,2.101662,0.305848,-1.431237,-0.198592,-0.485575,1.396107,-1.172105,0.393445,-0.887734,-0.369103,-0.551208,0.917051,-0.264332,-0.028528,-0.198095
2577599,2.101662,0.305848,-1.431237,-0.198592,-0.485575,1.396107,-1.172105,0.393445,-0.887734,-0.369103,-0.551208,0.917051,-0.264332,-0.028528,-0.198095


In [None]:
# Fixed seed so the shuffle below, and therefore the split, is reproducible.
np.random.seed(569)

test_proportion = 0.2

# Shuffle the row positions once, then cut the shuffled order into a test head
# and a train tail.
n_rows = len(local_traffic_merged)
n_test = int(test_proportion * n_rows)
permutation = np.random.permutation(range(n_rows))
test_indices = permutation[:n_test]
train_indices = permutation[n_test:]

# The two splits together must account for every row.
assert len(test_indices) + len(train_indices) == n_rows

# test_indices / train_indices are row positions (0..n_rows-1), not index
# labels, hence .iloc.
# Our test dataframe is test_df, and the train dataframe is train_df
test_df = local_traffic_merged.iloc[test_indices]
train_df = local_traffic_merged.iloc[train_indices]

In [None]:
# Target values, in the same row order as scaled_df.
y = local_traffic_merged['SPEED']

# The L1-penalised fit is used for feature selection only: keep the columns
# whose coefficient was not shrunk to exactly zero.
lasso = Lasso(alpha=0.1).fit(scaled_df, y)
nonzero_mask = lasso.coef_ != 0
lasso_features = scaled_df.loc[:, nonzero_mask]

In [None]:
# train_indices / test_indices hold row POSITIONS (a permutation of
# 0..n_rows-1), so both features and target must be selected with .iloc.
# local_traffic_merged still carries its pre-sort index labels, so selecting the
# target with .loc would pick rows by label and pair each feature row with the
# speed of a different observation.
X_train_lasso = lasso_features.iloc[train_indices]
X_test_lasso = lasso_features.iloc[test_indices]

speed = local_traffic_merged['SPEED']
y_train = speed.iloc[train_indices]
y_test = speed.iloc[test_indices]

In [None]:
# Preview the scaled columns whose Lasso coefficient is non-zero.
lasso_features

Unnamed: 0,temp,windgust,cloudcover,visibility,solarradiation,Clear,"Rain, Overcast"
0,-0.965505,-0.728890,0.369227,0.393445,0.755904,-0.369103,-0.264332
1,-0.965505,-0.728890,0.369227,0.393445,0.755904,-0.369103,-0.264332
2,-0.965505,-0.728890,0.369227,0.393445,0.755904,-0.369103,-0.264332
3,-0.965505,-0.728890,0.369227,0.393445,0.755904,-0.369103,-0.264332
4,-0.965505,-0.728890,0.369227,0.393445,0.755904,-0.369103,-0.264332
...,...,...,...,...,...,...,...
2577596,2.101662,1.396107,-1.172105,0.393445,-0.887734,-0.369103,-0.264332
2577597,2.101662,1.396107,-1.172105,0.393445,-0.887734,-0.369103,-0.264332
2577598,2.101662,1.396107,-1.172105,0.393445,-0.887734,-0.369103,-0.264332
2577599,2.101662,1.396107,-1.172105,0.393445,-0.887734,-0.369103,-0.264332


In [None]:
# Ordinary least squares on the Lasso-selected columns only.
model_lasso = LinearRegression().fit(X_train_lasso, y_train)

# Evaluate on the held-out rows: R^2 plus squared / absolute error.
pred = model_lasso.predict(X_test_lasso)
r_sqr = model_lasso.score(X_test_lasso, y_test)
mse = mean_squared_error(y_test, pred)
mae = mean_absolute_error(y_test, pred)

print("For MLR + LASSO:")
print(f'R^2: {r_sqr}')
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")

For MLR + LASSO:
R^2: 0.0012412499911143016
Mean Squared Error: 59.911935701250364
Mean Absolute Error: 5.658658749737852


In [None]:
# L2-penalised fit on all scaled predictors. Ridge does not zero coefficients
# out, so features are selected by coefficient magnitude instead.
ridge = Ridge(alpha=0.1).fit(scaled_df, y)

# Cut-off on |coefficient|; the value is ad hoc (TODO: justify or tune).
threshold = 0.05

coef_magnitude = np.abs(ridge.coef_)
ridge_features = scaled_df.loc[:, coef_magnitude > threshold]
ridge_features

Unnamed: 0,temp,dew,humidity,precip,snowdepth,windgust,visibility,solarradiation,Clear,Overcast,Partially cloudy,"Rain, Overcast"
0,-0.965505,-1.268557,-0.433830,-0.198592,-0.485575,-0.728890,0.393445,0.755904,-0.369103,-0.551208,0.917051,-0.264332
1,-0.965505,-1.268557,-0.433830,-0.198592,-0.485575,-0.728890,0.393445,0.755904,-0.369103,-0.551208,0.917051,-0.264332
2,-0.965505,-1.268557,-0.433830,-0.198592,-0.485575,-0.728890,0.393445,0.755904,-0.369103,-0.551208,0.917051,-0.264332
3,-0.965505,-1.268557,-0.433830,-0.198592,-0.485575,-0.728890,0.393445,0.755904,-0.369103,-0.551208,0.917051,-0.264332
4,-0.965505,-1.268557,-0.433830,-0.198592,-0.485575,-0.728890,0.393445,0.755904,-0.369103,-0.551208,0.917051,-0.264332
...,...,...,...,...,...,...,...,...,...,...,...,...
2577596,2.101662,0.305848,-1.431237,-0.198592,-0.485575,1.396107,0.393445,-0.887734,-0.369103,-0.551208,0.917051,-0.264332
2577597,2.101662,0.305848,-1.431237,-0.198592,-0.485575,1.396107,0.393445,-0.887734,-0.369103,-0.551208,0.917051,-0.264332
2577598,2.101662,0.305848,-1.431237,-0.198592,-0.485575,1.396107,0.393445,-0.887734,-0.369103,-0.551208,0.917051,-0.264332
2577599,2.101662,0.305848,-1.431237,-0.198592,-0.485575,1.396107,0.393445,-0.887734,-0.369103,-0.551208,0.917051,-0.264332


In [None]:
# Fitted Ridge coefficients, one per column of scaled_df (same order).
ridge.coef_

array([-0.11179942, -0.07963898,  0.21088986,  0.08168596, -0.13414046,
       -0.10663699, -0.02474402,  0.19189796, -0.31512139,  0.19896781,
       -0.12320966,  0.0909055 , -0.21416485,  0.04977254, -0.03324339])

In [None]:
# train_indices / test_indices are row positions, hence .iloc.
# y_train / y_test are the targets built for the same split in an earlier cell.
X_train_ridge = ridge_features.iloc[train_indices]
X_test_ridge = ridge_features.iloc[test_indices]

# Ordinary least squares on the Ridge-selected columns only.
model_ridge = LinearRegression()
model_ridge.fit(X_train_ridge, y_train)

print("For MLR + Ridge:")
r_sqr_ridge = model_ridge.score(X_test_ridge, y_test)
# Report this model's own score; printing `r_sqr` here would repeat the R^2 of
# the Lasso-selected model.
print(f'R^2: {r_sqr_ridge}')

pred_ridge = model_ridge.predict(X_test_ridge)
mse_ridge = mean_squared_error(y_test, pred_ridge)
mae_ridge = mean_absolute_error(y_test, pred_ridge)

print(f"Mean Squared Error: {mse_ridge}")
print(f"Mean Absolute Error: {mae_ridge}")

For MLR + Ridge:
R^2: 0.0012412499911143016
Mean Squared Error: 59.83195910823365
Mean Absolute Error: 5.652522162059441
