In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from statsmodels.tsa.arima.model import ARIMA


In [2]:
bus_df = pd.read_csv("municipality_bus_utilization.csv", header=None, names=["timestamp", "municipality_id", "usage", "total_capacity"])

In [3]:
for i in range(bus_df.shape[1]):
    a = pd.isna(bus_df[bus_df.columns[i]])
    while a is False: 
        print("NaN value found")

In [4]:
bus_df = bus_df.iloc[1:]

In [5]:
labels = []
for i in range(1, len(bus_df)):
    if bus_df["timestamp"][i][12] == "7":
        labels.append(i)

In [6]:
bus_df = bus_df.drop(labels=labels, axis=0)

In [7]:
bus_df

Unnamed: 0,timestamp,municipality_id,usage,total_capacity
11,2017-06-04 08:25:42,2,311,697
12,2017-06-04 08:25:42,9,487,1332
13,2017-06-04 08:25:42,6,687,3113
14,2017-06-04 08:25:42,7,789,2019
15,2017-06-04 08:25:42,4,1476,3893
...,...,...,...,...
13066,2017-08-19 16:30:35,2,548,697
13067,2017-08-19 16:30:35,8,1193,2947
13068,2017-08-19 16:30:35,7,1354,2019
13069,2017-08-19 16:30:35,6,1680,3113


In [8]:
bus_df["timestamp"] = pd.to_datetime(bus_df["timestamp"])

In [9]:
bus_df.set_index("timestamp", inplace=True)

In [10]:
bus_df_usage = bus_df.groupby([pd.Grouper(freq="H"), "municipality_id"])["usage"].max()
bus_df_capacity = bus_df.groupby([pd.Grouper(freq="H"), "municipality_id"])["total_capacity"].max()

In [11]:
bus_df_usage_updated = bus_df_usage.unstack()
bus_df_capacity_updated = bus_df_capacity.unstack()

In [12]:
# bus_df.head()
bus_df_usage_updated.head()
# bus_df_capacity_updated.head()

municipality_id,0,1,2,3,4,5,6,7,8,9
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-06-04 08:00:00,332,164,355,967,2016,89,801,873,857,527
2017-06-04 09:00:00,485,245,494,1263,2811,159,941,1151,1172,666
2017-06-04 10:00:00,583,317,582,1396,3178,228,1219,1373,1363,800
2017-06-04 11:00:00,614,341,643,1456,3257,268,1297,1479,1399,845
2017-06-04 12:00:00,613,343,657,1469,3260,275,1306,1493,1406,845


In [13]:
df_training_usage = bus_df_usage_updated[:"2017-08-04"]
df_test_usage = bus_df_usage_updated["2017-08-04": ]

df_training_capacity = bus_df_capacity_updated[:"2017-08-04"]
df_test_capacity = bus_df_capacity_updated["2017-08-04": ]

In [14]:
df_training_usage_4complex = bus_df_usage[:"2017-08-04"]
df_test_usage_4complex = bus_df_usage["2017-08-04": ]

df_training_capacity_4complex = bus_df_capacity[:"2017-08-04"]
df_test_capacity_4complex = bus_df_capacity["2017-08-04": ]

In [15]:
def format_test(test_set):
    count = 0
    real_value = []
    real_id = []
    timestamp =[]
    for i in range(10): # number of municipality  
        for j in range(9): # number of hours in a day
            count = 0
            for k in range(14): # number of days in list
                val = int(test_set[str(i)][9*k+j])
            
                real_id.append(i)
                timestamp.append((j+8.00))
                real_value.append(val)
    d = {'municipality_id': real_id, 'timestamp': timestamp, "usage": real_value}
    df = pd.DataFrame(data=d)
    return df

df_test_usage_formatted = format_test(df_test_usage)
df_test_capacity_formatted = format_test(df_test_capacity)


### simple model starts here !

In [16]:
def simple_model(training_set):
    count = 0
    mean = []
    man_id = []
    timestamp =[]
    for i in range(10): # number of municipality  
        for j in range(9): # number of hours in a day
            count = 0
            for k in range(56): # number of days in list
                count += int(training_set[str(i)][9*k+j])
            
            man_id.append(i)
            timestamp.append((j+8.00))
            mean.append(round(count/(k+1),2))
    d = {'municipality_id': man_id, 'timestamp': timestamp, "usage": mean}
    df = pd.DataFrame(data=d)
    return df

In [17]:
simple_model_usage = simple_model(df_training_usage)

simple_model_usage_formatted = pd.concat([simple_model_usage]*int((len(df_test_usage_formatted)/len(simple_model_usage))))
MAE = mean_absolute_error(simple_model_usage_formatted, df_test_usage_formatted)
print("MAE for usage: ", round(MAE, 2))
MSE = mean_squared_error(simple_model_usage_formatted, df_test_usage_formatted)
print("MSE for usage: ", round(MSE, 2))
R2 = r2_score(simple_model_usage_formatted, df_test_usage_formatted)
print("R2 for usage: ", round(R2, 2))

MAE for usage:  231.52
MSE for usage:  290756.01
R2 for usage:  -1.03


In [18]:
simple_model_capacity = simple_model(df_training_capacity)

simple_model_capacity_formatted = pd.concat([simple_model_capacity]*int((len(df_test_capacity_formatted)/len(simple_model_capacity))))
MAE = mean_absolute_error(simple_model_capacity_formatted, df_test_capacity_formatted)
print("MAE for capacity: ", round(MAE, 2))
MSE = mean_squared_error(simple_model_capacity_formatted, df_test_capacity_formatted)
print("MSE for capacity: ", round(MSE, 2))
R2 = r2_score(simple_model_capacity_formatted, df_test_capacity_formatted)
print("R2 for capacity: ", round(R2, 2))

MAE for capacity:  435.95
MSE for capacity:  862643.58
R2 for capacity:  -0.9


### complex model starts here !

In [19]:
def complex_model(training_set): 
    model = ARIMA(training_set.astype(float), order=(5,1,0))
    model_fit = model.fit()
    print(model_fit.summary())
    residuals = pd.DataFrame(model_fit.resid)
    print(residuals.describe())
    return model, model_fit

In [20]:
complexx, complexx_fit = complex_model(df_training_usage_4complex)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                               SARIMAX Results                                
Dep. Variable:                  usage   No. Observations:                 5130
Model:                 ARIMA(5, 1, 0)   Log Likelihood              -40963.661
Date:                Thu, 09 Mar 2023   AIC                          81939.321
Time:                        14:05:53   BIC                          81978.577
Sample:                             0   HQIC                         81953.062
                               - 5130                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.8081      0.014    -57.798      0.000      -0.836      -0.781
ar.L2         -0.7398      0.023    -32.301      0.000      -0.785      -0.695
ar.L3         -0.6091      0.039    -15.806      0.0

In [21]:
predictions = []
for t in range(len(df_test_usage_4complex)):
    model = complexx
    model_fit = complexx_fit
    output = model_fit.forecast()
    predicted = output
    predictions.append(predicted)
    obs = df_test_usage_4complex[t]


  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return

  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return

  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return

  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return

  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return

  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return

  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return

  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return

  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return

  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return

In [22]:
MAE = mean_absolute_error(df_test_usage_4complex, predictions)
print("MAE for capacity: ", round(MAE, 2))
MSE = mean_squared_error(df_test_usage_4complex, predictions)
print("MSE for capacity: ", round(MSE, 2))
R2 = r2_score(df_test_usage_4complex, predictions)
print("R2 for capacity: ", round(R2, 2))

MAE for capacity:  574.78
MSE for capacity:  552214.66
R2 for capacity:  -0.06
