In [1]:
# MLP Benchmark Model

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler

# Load the data
merged_downtown_data = pd.read_csv('./merged_downtown_data.csv')
ridesourcing_data = pd.read_csv("./Ridesourcing_CensusCount_ALL_0_Filled.csv")

# Renaming and setting index for merging
merged_downtown_data = merged_downtown_data.rename(columns={"TractID": "index"})
merged_downtown_data.set_index("index", inplace=True)
ridesourcing_data.set_index("index", inplace=True)

# Merging the dataframes
merged_df = merged_downtown_data.join(ridesourcing_data, how='inner')

# Dropping the unnecessary columns
columns_to_drop = ["Unnamed: 0", "X", "Y"]
merged_df = merged_df.drop(columns=columns_to_drop)

# Create the "is_downtown" column
downtown_areas = [
    '17031839000', '17031080202', '17031833000', '17031833100', '17031839100', 
    '17031081201', '17031080201', '17031081202', '17031842200', '17031838300', 
    '17031081401', '17031081403', '17031081300', '17031081100', '17031080300', 
    '17031080100', '17031080400', '17031081000', '17031320400', '17031320600', 
    '17031081402', '17031320100', '17031081900', '17031081500', '17031081800', 
    '17031081600', '17031081700', '17031280100', '17031281900'
]

downtown_areas = [int(tract_id) for tract_id in downtown_areas]

merged_df['is_downtown'] = merged_df.index.isin(downtown_areas).astype(int)

# Verify the 'is_downtown' column
print("Is Downtown Column Distribution:\n", merged_df['is_downtown'].value_counts())

# Aggregate the travel demand by summing across all time columns
ridesourcing_data_aggregated = ridesourcing_data.sum(axis=1)

# Splitting the data into features and aggregated target
features = merged_df.drop(columns=ridesourcing_data.columns)
target = ridesourcing_data_aggregated

# Create MinMaxScaler instances for features and target
feature_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()

# Fit and transform the features and target
features_scaled = feature_scaler.fit_transform(features)
target_scaled = target_scaler.fit_transform(target.values.reshape(-1, 1))

# Ensure deterministic split with stratification based on "is_downtown" column
X_train, X_test, y_train, y_test, index_train, index_test = train_test_split(features_scaled, target_scaled, features.index, test_size=0.2, stratify=merged_df['is_downtown'], random_state=42)

# Convert data to PyTorch tensors
X_train = torch.from_numpy(X_train).float()
X_test = torch.from_numpy(X_test).float()
y_train = torch.from_numpy(y_train).float()
y_test = torch.from_numpy(y_test).float()

# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
n_features = X_train.shape[1]
seq_len = X_train.shape[1]
pre_len = 1
n_neurons = 64
batch_size = 32
num_epochs = 100
learning_rate = 0.001

# Define the MLP model
class Net(nn.Module):
    def __init__(self, n_features, n_neurons, device, seq_len, pre_len):
        super(Net, self).__init__()
        self.flatten = nn.Flatten()
        self.device = device
        self.n_features = n_features
        self.n_neurons = n_neurons
        self.seq_len = seq_len
        self.pre_len = pre_len
        
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(seq_len, n_neurons),
            nn.ReLU(),
            nn.Dropout(p=0.01),
            nn.Linear(n_neurons, pre_len),
            nn.ReLU(),
            nn.Dropout(p=0.01),
        )

    '''
    def forward(self, x):
        # x is a 2D tensor (batch_size, n_features)
        x = x.reshape((-1, self.seq_len))
        pred = self.linear_relu_stack(x).to(self.device)
        pred = pred.reshape((-1, self.n_neurons))
        return pred.float()
    '''
    def forward(self, x):
        b_size = x.size(0)
        x = x.reshape((-1, self.seq_len))
        pred = self.linear_relu_stack(x).to(self.device)
        pred = pred.reshape((b_size, -1))  # Reshape to (batch_size, output_features)
        return pred.float()

# Create the model instance
model = Net(n_features, n_neurons, device, seq_len, pre_len).to(device)

# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    for i in range((X_train.size(0) - 1) // batch_size + 1):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, X_train.size(0))
        
        inputs = X_train[start_idx:end_idx, :].to(device)
        targets = y_train[start_idx:end_idx].to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    
    if (epoch + 1) % 10 == 0:
        with torch.no_grad():
            y_pred = model(X_test.to(device))
            mse = criterion(y_pred, y_test.to(device))
            print(f'Epoch [{epoch+1}/{num_epochs}], MSE: {mse.item():.4f}')

# Inverse transform the scaled predictions and actual values
y_pred_original = target_scaler.inverse_transform(y_pred.cpu().numpy().reshape(-1, 1))
y_test_original = target_scaler.inverse_transform(y_test.numpy().reshape(-1, 1))

# Calculate MSE and MAPE
mse = mean_squared_error(y_test_original, y_pred_original)
mape = mean_absolute_percentage_error(y_test_original, y_pred_original)
print(f"\nMean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.4f}")

# Create a dataframe with actual and predicted values
results_df = pd.DataFrame({'Actual': y_test_original.flatten(), 'Predicted': y_pred_original.flatten()}, index=index_test)

# Print the results for all downtown areas and some other areas
print("\nResults for Downtown Areas:")
print(results_df[results_df.index.isin(downtown_areas)])

print("\nResults for Some Other Areas:")
other_areas = results_df[~results_df.index.isin(downtown_areas)].sample(5).index
print(results_df[results_df.index.isin(other_areas)])

Is Downtown Column Distribution:
 is_downtown
0    742
1     29
Name: count, dtype: int64
Epoch [10/100], MSE: 0.0021
Epoch [20/100], MSE: 0.0018
Epoch [30/100], MSE: 0.0014
Epoch [40/100], MSE: 0.0013
Epoch [50/100], MSE: 0.0012
Epoch [60/100], MSE: 0.0012
Epoch [70/100], MSE: 0.0011
Epoch [80/100], MSE: 0.0011
Epoch [90/100], MSE: 0.0009
Epoch [100/100], MSE: 0.0008

Mean Squared Error (MSE): 2197058560.0000
Mean Absolute Percentage Error (MAPE): 0.8588

Results for Downtown Areas:
                    Actual      Predicted
index                                    
17031081900   53560.996094  187097.265625
17031080202  179386.000000  271374.343750
17031081402  225875.000000  410526.968750
17031320400  397826.000000  689930.562500
17031080100  135940.000000  292310.250000
17031839000  288007.000000  529001.812500

Results for Some Other Areas:
               Actual    Predicted
index                             
17031250800    9818.0      50.0000
17031400500   17087.0      50.0000
1703

In [2]:
# Linear Regression Model

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler

# Load the data
merged_downtown_data = pd.read_csv('./merged_downtown_data.csv')
ridesourcing_data = pd.read_csv("./Ridesourcing_CensusCount_ALL_0_Filled.csv")

# Renaming and setting index for merging
merged_downtown_data = merged_downtown_data.rename(columns={"TractID": "index"})
merged_downtown_data.set_index("index", inplace=True)
ridesourcing_data.set_index("index", inplace=True)

# Merging the dataframes
merged_df = merged_downtown_data.join(ridesourcing_data, how='inner')

# Dropping the unnecessary columns
columns_to_drop = ["Unnamed: 0", "X", "Y"]
merged_df = merged_df.drop(columns=columns_to_drop)

# Create the "is_downtown" column
downtown_areas = [
    '17031839000', '17031080202', '17031833000', '17031833100', '17031839100', 
    '17031081201', '17031080201', '17031081202', '17031842200', '17031838300', 
    '17031081401', '17031081403', '17031081300', '17031081100', '17031080300', 
    '17031080100', '17031080400', '17031081000', '17031320400', '17031320600', 
    '17031081402', '17031320100', '17031081900', '17031081500', '17031081800', 
    '17031081600', '17031081700', '17031280100', '17031281900'
]

downtown_areas = [int(tract_id) for tract_id in downtown_areas]

merged_df['is_downtown'] = merged_df.index.isin(downtown_areas).astype(int)

# Verify the 'is_downtown' column
print("Is Downtown Column Distribution:\n", merged_df['is_downtown'].value_counts())

# Aggregate the travel demand by summing across all time columns
ridesourcing_data_aggregated = ridesourcing_data.sum(axis=1)

# Splitting the data into features and aggregated target
features = merged_df.drop(columns=ridesourcing_data.columns)
target = ridesourcing_data_aggregated

# Create MinMaxScaler instances for features and target
feature_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()

# Fit and transform the features and target
features_scaled = feature_scaler.fit_transform(features)
target_scaled = target_scaler.fit_transform(target.values.reshape(-1, 1)).flatten()

# Ensure deterministic split with stratification based on "is_downtown" column
X_train, X_test, y_train, y_test, index_train, index_test = train_test_split(features_scaled, target_scaled, features.index, test_size=0.2, stratify=merged_df['is_downtown'], random_state=42)

# Create and train the Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lr_model.predict(X_test)

# Inverse transform the scaled predictions and actual values
y_pred_original = target_scaler.inverse_transform(y_pred.reshape(-1, 1)).flatten()
y_test_original = target_scaler.inverse_transform(y_test.reshape(-1, 1)).flatten()

# Calculate MSE and MAPE
mse = mean_squared_error(y_test_original, y_pred_original)
mape = mean_absolute_percentage_error(y_test_original, y_pred_original)
print(f"\nMean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.4f}")

# Create a dataframe with actual and predicted values
results_df = pd.DataFrame({'Actual': y_test_original, 'Predicted': y_pred_original}, index=index_test)

# Print the results for all downtown areas and some other areas
print("\nResults for Downtown Areas:")
print(results_df[results_df.index.isin(downtown_areas)])

print("\nResults for Some Other Areas:")
other_areas = results_df[~results_df.index.isin(downtown_areas)].sample(5).index
print(results_df[results_df.index.isin(other_areas)])

Is Downtown Column Distribution:
 is_downtown
0    742
1     29
Name: count, dtype: int64

Mean Squared Error (MSE): 3923321098.2496
Mean Absolute Percentage Error (MAPE): 16.3147

Results for Downtown Areas:
               Actual      Predicted
index                               
17031081900   53561.0  344741.405281
17031080202  179386.0  473668.883472
17031081402  225875.0  393177.677720
17031320400  397826.0  674698.192450
17031080100  135940.0  401206.169146
17031839000  288007.0  616844.185044

Results for Some Other Areas:
              Actual      Predicted
index                              
17031200300   2130.0  -33245.086603
17031030103  20203.0    6862.516583
17031050200  82031.0   50829.303819
17031841700   1249.0  -17376.685568
17031831100  99091.0  107825.826000


In [3]:
# Random Forest Model

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler

# Load the data
merged_downtown_data = pd.read_csv('./merged_downtown_data.csv')
ridesourcing_data = pd.read_csv("./Ridesourcing_CensusCount_ALL_0_Filled.csv")

# Renaming and setting index for merging
merged_downtown_data = merged_downtown_data.rename(columns={"TractID": "index"})
merged_downtown_data.set_index("index", inplace=True)
ridesourcing_data.set_index("index", inplace=True)

# Merging the dataframes
merged_df = merged_downtown_data.join(ridesourcing_data, how='inner')

# Dropping the unnecessary columns
columns_to_drop = ["Unnamed: 0", "X", "Y"]
merged_df = merged_df.drop(columns=columns_to_drop)

# Create the "is_downtown" column
downtown_areas = [
    '17031839000', '17031080202', '17031833000', '17031833100', '17031839100', 
    '17031081201', '17031080201', '17031081202', '17031842200', '17031838300', 
    '17031081401', '17031081403', '17031081300', '17031081100', '17031080300', 
    '17031080100', '17031080400', '17031081000', '17031320400', '17031320600', 
    '17031081402', '17031320100', '17031081900', '17031081500', '17031081800', 
    '17031081600', '17031081700', '17031280100', '17031281900'
]

downtown_areas = [int(tract_id) for tract_id in downtown_areas]

merged_df['is_downtown'] = merged_df.index.isin(downtown_areas).astype(int)

# Verify the 'is_downtown' column
print("Is Downtown Column Distribution:\n", merged_df['is_downtown'].value_counts())

# Aggregate the travel demand by summing across all time columns
ridesourcing_data_aggregated = ridesourcing_data.sum(axis=1)

# Splitting the data into features and aggregated target
features = merged_df.drop(columns=ridesourcing_data.columns)
target = ridesourcing_data_aggregated

# Create MinMaxScaler instances for features and target
feature_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()

# Fit and transform the features and target
features_scaled = feature_scaler.fit_transform(features)
target_scaled = target_scaler.fit_transform(target.values.reshape(-1, 1)).flatten()

# Ensure deterministic split with stratification based on "is_downtown" column
X_train, X_test, y_train, y_test, index_train, index_test = train_test_split(features_scaled, target_scaled, features.index, test_size=0.2, stratify=merged_df['is_downtown'], random_state=42)

# Create and train the Random Forest Regressor model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Inverse transform the scaled predictions and actual values
y_pred_original = target_scaler.inverse_transform(y_pred.reshape(-1, 1)).flatten()
y_test_original = target_scaler.inverse_transform(y_test.reshape(-1, 1)).flatten()

# Calculate MSE and MAPE
mse = mean_squared_error(y_test_original, y_pred_original)
mape = mean_absolute_percentage_error(y_test_original, y_pred_original)
print(f"\nMean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.4f}")

# Create a dataframe with actual and predicted values
results_df = pd.DataFrame({'Actual': y_test_original, 'Predicted': y_pred_original}, index=index_test)

# Print the results for all downtown areas and some other areas
print("\nResults for Downtown Areas:")
print(results_df[results_df.index.isin(downtown_areas)])

print("\nResults for Some Other Areas:")
other_areas = results_df[~results_df.index.isin(downtown_areas)].sample(5).index
print(results_df[results_df.index.isin(other_areas)])

Is Downtown Column Distribution:
 is_downtown
0    742
1     29
Name: count, dtype: int64

Mean Squared Error (MSE): 2711255094.9553
Mean Absolute Percentage Error (MAPE): 2.7825

Results for Downtown Areas:
               Actual  Predicted
index                           
17031081900   53561.0  249805.66
17031080202  179386.0  296901.04
17031081402  225875.0  271395.47
17031320400  397826.0  737212.16
17031080100  135940.0  284684.28
17031839000  288007.0  591158.98

Results for Some Other Areas:
              Actual  Predicted
index                          
17031150200  56225.0   32081.07
17031842000  38639.0   46896.93
17031710500  14672.0   23826.02
17031400300  18902.0   20411.15
17031190702   6251.0   14428.21
