In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np

In [3]:
# Load the dataset
data1 = pd.read_csv('traffic_weather.csv')
data1.head(20)

Unnamed: 0,Intersection Name,Month,Day,Year,Hour,Day of Week,Holiday,Direction,Speed,Volume,Full Date,date,tavg,tmin,tmax,prcp,snow,wspd,temperature
0,Azie MortonBarton Springs,7,9,2019,1,2,0,NB,28.0,9.0,2019-07-09 01:00:00,7/9/19,29.5,23.9,36.1,0.0,0,7.2,23.9
1,Azie MortonBarton Springs,7,9,2019,1,2,0,SB,24.0,8.0,2019-07-09 01:00:00,7/9/19,29.5,23.9,36.1,0.0,0,7.2,23.9
2,Azie MortonBarton Springs,7,9,2019,0,2,0,NB,28.0,25.0,2019-07-09 00:00:00,7/9/19,29.5,23.9,36.1,0.0,0,7.2,23.9
3,Azie MortonBarton Springs,7,9,2019,0,2,0,SB,28.0,36.0,2019-07-09 00:00:00,7/9/19,29.5,23.9,36.1,0.0,0,7.2,23.9
4,Azie MortonBarton Springs,7,8,2019,23,1,0,NB,27.0,26.0,2019-07-08 23:00:00,7/8/19,28.1,22.2,36.1,9.7,0,6.1,33.227611
5,Azie MortonBarton Springs,7,8,2019,23,1,0,SB,28.0,54.0,2019-07-08 23:00:00,7/8/19,28.1,22.2,36.1,9.7,0,6.1,33.227611
6,Azie MortonBarton Springs,7,8,2019,22,1,0,NB,27.0,82.0,2019-07-08 22:00:00,7/8/19,28.1,22.2,36.1,9.7,0,6.1,34.237753
7,Azie MortonBarton Springs,7,8,2019,22,1,0,SB,28.0,107.0,2019-07-08 22:00:00,7/8/19,28.1,22.2,36.1,9.7,0,6.1,34.237753
8,Azie MortonBarton Springs,7,8,2019,21,1,0,NB,26.0,99.0,2019-07-08 21:00:00,7/8/19,28.1,22.2,36.1,9.7,0,6.1,35.041926
9,Azie MortonBarton Springs,7,8,2019,21,1,0,SB,27.0,159.0,2019-07-08 21:00:00,7/8/19,28.1,22.2,36.1,9.7,0,6.1,35.041926


## Creating lagged variables

In [5]:
# Convert 'Full Date' to datetime
data1['Full Date'] = pd.to_datetime(data1['Full Date'], errors='coerce')
data1 = data1.dropna(subset=['Full Date'])  # Drop invalid dates

# Define morning and afternoon hours
morning_hours = [6, 7, 8, 9]  # Morning rush hours
afternoon_hours = [15, 16, 17, 18]  # Afternoon rush hours

# Compute Morning_Traffic
morning_traffic = data1[data1['Hour'].isin(morning_hours)].groupby(data1['Full Date'].dt.date)['Volume'].sum()
data1['Morning_Traffic'] = data1['Full Date'].dt.date.map(morning_traffic)

# Compute Afternoon_Traffic
afternoon_traffic = data1[data1['Hour'].isin(afternoon_hours)].groupby(data1['Full Date'].dt.date)['Volume'].sum()
data1['Afternoon_Traffic'] = data1['Full Date'].dt.date.map(afternoon_traffic)

# Drop rows with missing Morning_Traffic or Afternoon_Traffic
data1 = data1.dropna(subset=['Morning_Traffic', 'Afternoon_Traffic'])

# Add lagged variables
data1['Lag_Morning_Traffic'] = data1['Morning_Traffic'].shift(1)  # Previous day morning traffic
data1['Lag_Afternoon_Traffic'] = data1['Afternoon_Traffic'].shift(1)  # Previous day afternoon traffic
data1 = data1.dropna(subset=['Lag_Morning_Traffic', 'Lag_Afternoon_Traffic'])


# Basically I will be using that particular day' morning traffic along with lagged (historical) traffic of morning,afternoon to predict the upcoming afternoon traffic

In [7]:
# Encode categorical variables
label_encoder = LabelEncoder()
data1['Intersection Name'] = label_encoder.fit_transform(data1['Intersection Name'])
data1['Direction'] = label_encoder.fit_transform(data1['Direction'])

In [9]:
# Define predictors (including encoded categorical variables)
features = [
    'Morning_Traffic', 'Lag_Morning_Traffic', 'Lag_Afternoon_Traffic',
    'Intersection Name', 'Direction', 'Month', 'Holiday', 'Speed', 'prcp', 'wspd', 'temperature'
]
X = data1[features]
y = data1['Afternoon_Traffic']

# Scale predictors and target
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1))

In [11]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

In [13]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Fit SARIMA model
sarima_model = SARIMAX(
    y_train.flatten(),  # Target variable
    exog=X_train,  # Exogenous predictors
    order=(1, 0, 0),  # ARIMA components (p, d, q)
    seasonal_order=(0, 0, 0, 7)  # Weekly seasonality
)
sarima_result = sarima_model.fit(disp=False)

In [15]:
# Predict on test data
y_pred_scaled = sarima_result.predict(start=len(y_train), end=len(y_train) + len(y_test) - 1, exog=X_test)

# Rescale predictions and true values
y_pred_rescaled = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1))
y_test_rescaled = scaler_y.inverse_transform(y_test)

# Evaluate the model
from sklearn.metrics import mean_squared_error, r2_score
rmse = np.sqrt(mean_squared_error(y_test_rescaled, y_pred_rescaled))
r2 = r2_score(y_test_rescaled, y_pred_rescaled)

print(f"SARIMA RMSE: {rmse}")
print(f"SARIMA R²: {r2}")

SARIMA RMSE: 1653.5306847456836
SARIMA R²: 0.9970849527183726
