In [29]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.model_selection import train_test_split
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error

In [13]:
data = pd.read_csv('/content/metro_station_peak_hour_data.csv')

In [14]:
data.head()

Unnamed: 0,Time,Day,SpecialEvent,Weather,People Count
0,01-01-2024 00:00,Monday,False,Rainy,10
1,01-01-2024 00:10,Monday,False,Rainy,14
2,01-01-2024 00:20,Monday,False,Cloudy,11
3,01-01-2024 00:30,Monday,False,Rainy,10
4,01-01-2024 00:40,Monday,False,Sunny,10


In [15]:
data.isnull().sum()

Unnamed: 0,0
Time,0
Day,0
SpecialEvent,0
Weather,0
People Count,0


### One HOT Encoding

In [16]:
ohe = OneHotEncoder(sparse_output=False, drop='first')
encoded_data = ohe.fit_transform(data[['Day','SpecialEvent','Weather']])

In [17]:
encoded_df = pd.DataFrame(encoded_data, columns=ohe.get_feature_names_out(['Day','SpecialEvent','Weather']))
data = pd.concat([data, encoded_df], axis=1)
data.drop(['Day','SpecialEvent','Weather'], axis=1, inplace=True)
data.head()


Unnamed: 0,Time,People Count,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,SpecialEvent_True,Weather_Rainy,Weather_Sunny
0,01-01-2024 00:00,10,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,01-01-2024 00:10,14,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,01-01-2024 00:20,11,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,01-01-2024 00:30,10,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,01-01-2024 00:40,10,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [19]:
data.set_index('Time',inplace=True)


In [20]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [23]:
# Function to check stationarity
from statsmodels.tsa.stattools import adfuller # import the adfuller function
def check_stationarity(series):
    result = adfuller(series)
    print(f"ADF Statistic: {result[0]}, p-value: {result[1]}")
    if result[1] > 0.05:
        print("Series is NOT stationary. Applying differencing...")
        return series.diff().dropna()
    else:
        print("Series is stationary.")
        return series

# Apply stationarity check
y_train = check_stationarity(y_train)

ADF Statistic: -28.300036634831585, p-value: 0.0
Series is stationary.


In [25]:
# !pip install pmdarima
from pmdarima.arima import auto_arima # Import the auto_arima function

# Scale exogenous variables
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Find optimal order using Auto-ARIMA
auto_model = auto_arima(y_train, exogenous=x_train_scaled, seasonal=True, m=24,
                         stepwise=True, trace=True, suppress_warnings=True, n_jobs=-1)

best_order = auto_model.order
best_seasonal_order = auto_model.seasonal_order

Collecting pmdarima
  Downloading pmdarima-2.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (7.8 kB)
Downloading pmdarima-2.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pmdarima
Successfully installed pmdarima-2.0.4




Performing stepwise search to minimize aic
 ARIMA(2,0,2)(1,0,1)[24] intercept   : AIC=9888.376, Time=28.74 sec
 ARIMA(0,0,0)(0,0,0)[24] intercept   : AIC=9885.332, Time=0.05 sec




 ARIMA(1,0,0)(1,0,0)[24] intercept   : AIC=9887.217, Time=1.86 sec




 ARIMA(0,0,1)(0,0,1)[24] intercept   : AIC=9887.163, Time=3.90 sec
 ARIMA(0,0,0)(0,0,0)[24]             : AIC=9884.401, Time=0.18 sec




 ARIMA(0,0,0)(1,0,0)[24] intercept   : AIC=9887.261, Time=1.16 sec




 ARIMA(0,0,0)(0,0,1)[24] intercept   : AIC=9887.260, Time=1.37 sec




 ARIMA(0,0,0)(1,0,1)[24] intercept   : AIC=9889.242, Time=1.72 sec
 ARIMA(1,0,0)(0,0,0)[24] intercept   : AIC=9885.294, Time=0.13 sec




 ARIMA(0,0,1)(0,0,0)[24] intercept   : AIC=9885.242, Time=0.29 sec




 ARIMA(1,0,1)(0,0,0)[24] intercept   : AIC=9887.135, Time=0.83 sec

Best model:  ARIMA(0,0,0)(0,0,0)[24]          
Total fit time: 40.277 seconds


In [30]:
# Define the optimized SARIMAX model
model = SARIMAX(y_train, exog=x_train_scaled, order=best_order, seasonal_order=best_seasonal_order)

# Fit the model
model_fit = model.fit(disp=False)

# Make predictions
y_pred = model_fit.predict(start=len(y_train), end=len(y_train) + len(y_test) - 1, exog=x_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

# Print model summary
print(model_fit.summary())

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


RMSE: 0.8279954886124425
MAE: 0.6131896768704402
                               SARIMAX Results                                
Dep. Variable:           People Count   No. Observations:                 3456
Model:                        SARIMAX   Log Likelihood               -4368.751
Date:                Mon, 03 Feb 2025   AIC                           8757.502
Time:                        05:55:27   BIC                           8818.981
Sample:                             0   HQIC                          8779.457
                               - 3456                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.0112      0.019      0.577      0.564      -0.027       0.049
x2             0.0149      0.019      0.770      0.441      -0.023       0.053
x3 

  return get_prediction_index(
  return get_prediction_index(
