#Sckit-learn linear regression

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Load the dataset
df = pd.read_csv('municipality_bus_utilization.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Resample the data to hourly frequency, taking the max value for each hour
df = df.set_index('timestamp')
df = df.groupby(['municipality_id', pd.Grouper(freq='H')]).max().reset_index()

# Pivot the DataFrame so that each row represents a timestamp and each column represents a municipality
df = df.pivot(index='timestamp', columns='municipality_id', values='usage')

# Interpolate missing values using linear interpolation
df = df.interpolate(method='linear', axis=0)

# Convert the timestamps to Unix timestamps
df['timestamp'] = df.index.astype(int) // 10**9

# Split the data into training and test sets. Use the period from 2017-06-19 to 2017-08-04 as the training set
train = df.loc[df.index < '2017-08-05']
test = df.loc[(df.index >= '2017-08-05') & (df.index < '2017-08-19')]

# Train a linear regression model for each municipality using the training data
models = {}

for col in train.columns:
    model = LinearRegression()
    model.fit(train['timestamp'].values.reshape(-1, 1), train[col])
    models[col] = model

# Make predictions for the test set using each model
predictions = pd.DataFrame(columns=test.columns)

for col in test.columns:
    model = models[col]
    pred = model.predict(test['timestamp'].values.reshape(-1, 1))
    predictions[col] = pred

# Calculate the mean absolute error for the test set
error = mean_absolute_error(test, predictions)

print('MAE for test set:', error)



MAE for test set: 208.38002820097657


#XGBoost

In [2]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

# Load the dataset
df = pd.read_csv("/content/municipality_bus_utilization.csv")

# Aggregate the two measurements for each hour by taking the maximum value
df['timestamp'] = pd.to_datetime(df['timestamp'])
df.set_index('timestamp', inplace=True)
df = df.groupby([pd.Grouper(freq='H'), 'municipality_id']).max().unstack()

# Interpolate any missing values
df.interpolate(method='linear', inplace=True)

# Split the data into training and test sets
train = df.loc[df.index < '2017-08-05']
test = df.loc[(df.index >= '2017-08-05') & (df.index < '2017-08-19')]

# Interpolate any missing values in the test set
test.interpolate(method='linear', inplace=True)

# Remove any rows in test that still have missing values
test.dropna(inplace=True)

# Define the features and target
X_train = train.index.to_series().dt.hour.values.reshape(-1, 1)
X_test = test.index.to_series().dt.hour.values.reshape(-1, 1)
y_train = train.values.reshape(-1)
y_test = test.values.reshape(-1)

# Train the XGBoost model
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, max_depth=6, learning_rate=0.1)
model.fit(X_train, y_train)

# Make predictions for the test set
y_pred = model.predict(X_test)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.interpolate(method='linear', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.dropna(inplace=True)


In [3]:
test

Unnamed: 0_level_0,usage,usage,usage,usage,usage,usage,usage,usage,usage,usage,total_capacity,total_capacity,total_capacity,total_capacity,total_capacity,total_capacity,total_capacity,total_capacity,total_capacity,total_capacity
municipality_id,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
2017-08-05 08:00:00,252,158,327,781,2132,23,571,651,572,333,2813,397,697,1930,3893,587,3113,2019,2947,1332
2017-08-05 09:00:00,393,216,405,1121,2814,48,755,820,847,424,2813,397,697,1930,3893,587,3113,2019,2947,1332
2017-08-05 10:00:00,517,300,483,1289,3123,96,903,1038,1059,564,2813,397,697,1930,3893,587,3113,2019,2947,1332
2017-08-05 11:00:00,653,397,567,1557,3447,157,1057,1319,1249,761,2813,397,697,1930,3893,587,3113,2019,2947,1332
2017-08-05 12:00:00,655,398,584,1569,3461,170,1082,1374,1264,799,2813,397,697,1930,3893,587,3113,2019,2947,1332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-18 12:00:00,1287,412,440,861,1723,224,1531,1255,805,824,2813,397,697,1930,3893,587,3113,2019,2947,1332
2017-08-18 13:00:00,1286,411,485,930,1819,267,1595,1334,847,858,2813,397,697,1930,3893,587,3113,2019,2947,1332
2017-08-18 14:00:00,1286,411,576,987,1893,319,1699,1470,928,905,2813,397,697,1930,3893,587,3113,2019,2947,1332
2017-08-18 15:00:00,1286,411,557,916,1764,302,1670,1423,944,875,2813,397,697,1930,3893,587,3113,2019,2947,1332


In [4]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(test.values.ravel(), y_pred.ravel())
print("MAE: ", mae)

MAE:  86.18221218424632


#CNN

In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense

# Load the dataset
df = pd.read_csv("/content/municipality_bus_utilization.csv")

# Aggregate the two measurements for each hour by taking the maximum value
df['timestamp'] = pd.to_datetime(df['timestamp'])
df.set_index('timestamp', inplace=True)
df = df.groupby([pd.Grouper(freq='H'), 'municipality_id']).max().unstack()

# Interpolate any missing values
df.interpolate(method='linear', inplace=True)

# Split the data into training and test sets
train = df.loc[df.index < '2017-08-05']
test = df.loc[(df.index >= '2017-08-05') & (df.index < '2017-08-19')]

# Interpolate any missing values in the test set
test.interpolate(method='linear', inplace=True)

# Remove any rows in test that still have missing values
test.dropna(inplace=True)

# Define the features and target
X_train = train.values.reshape(-1, train.shape[1], 1)
X_test = test.values.reshape(-1, test.shape[1], 1)
y_train = train.values.reshape(-1, train.shape[1])
y_test = test.values.reshape(-1, test.shape[1])

# Define the CNN model architecture
model = Sequential()
model.add(Conv1D(64, 3, activation='relu', input_shape=(train.shape[1], 1)))
model.add(MaxPooling1D(2))
model.add(Conv1D(32, 3, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Flatten())
model.add(Dense(train.shape[1]))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=100, batch_size=16, verbose=0)

# Make predictions for the test set
y_pred = model.predict(X_test)

# Compute the mean absolute error
mae = mean_absolute_error(y_test.ravel(), y_pred.ravel())
print("MAE: ", mae)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.interpolate(method='linear', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.dropna(inplace=True)


MAE:  21.68852417814104
