In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler

# Fix the random seed for reproducibility
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Set a fixed random seed
set_seed(42)

# Load the data
merged_downtown_data = pd.read_csv('./merged_downtown_data.csv')
ridesourcing_data = pd.read_csv("./Ridesourcing_CensusCount_ALL_0_Filled.csv")

# Renaming and setting index for merging
merged_downtown_data = merged_downtown_data.rename(columns={"TractID": "index"})
merged_downtown_data.set_index("index", inplace=True)
ridesourcing_data.set_index("index", inplace=True)

# Merging the dataframes
merged_df = merged_downtown_data.join(ridesourcing_data, how='inner')

# Dropping the unnecessary columns
columns_to_drop = ["Unnamed: 0", "X", "Y"]
merged_df = merged_df.drop(columns=columns_to_drop)

# Create the "is_downtown" column
downtown_areas = [
    '17031839000', '17031080202', '17031833000', '17031833100', '17031839100', 
    '17031081201', '17031080201', '17031081202', '17031842200', '17031838300', 
    '17031081401', '17031081403', '17031081300', '17031081100', '17031080300', 
    '17031080100', '17031080400', '17031081000', '17031320400', '17031320600', 
    '17031081402', '17031320100', '17031081900', '17031081500', '17031081800', 
    '17031081600', '17031081700', '17031280100', '17031281900'
]

downtown_areas = [int(tract_id) for tract_id in downtown_areas]
merged_df['is_downtown'] = merged_df.index.isin(downtown_areas).astype(int)

# Verify the 'is_downtown' column
print("Is Downtown Column Distribution:\n", merged_df['is_downtown'].value_counts())

# Aggregate the travel demand by summing across all time columns
ridesourcing_data_aggregated = ridesourcing_data.sum(axis=1)

# Splitting the data into features and aggregated target
features = merged_df.drop(columns=ridesourcing_data.columns)
target = ridesourcing_data_aggregated

# Create MinMaxScaler instances for features and target
feature_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()

# Fit and transform the features and target
features_scaled = feature_scaler.fit_transform(features)
target_scaled = target_scaler.fit_transform(target.values.reshape(-1, 1)).flatten()

# Use stratified sampling to split the data
train_indices, test_indices = train_test_split(range(len(features_scaled)), test_size=0.2, stratify=merged_df['is_downtown'])
X_train, X_test = features_scaled[train_indices], features_scaled[test_indices]
y_train, y_test = target_scaled[train_indices], target_scaled[test_indices]
is_downtown_train = merged_df['is_downtown'].iloc[train_indices].to_numpy().astype(float)
is_downtown_test = merged_df['is_downtown'].iloc[test_indices].to_numpy().astype(float)

# Define the PyTorch dataset and dataloader
class TravelDataset(Dataset):
    def __init__(self, X, y, is_downtown):
        self.X = X
        self.y = y
        self.is_downtown = is_downtown

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        x = self.X[idx, :].astype(np.float32)
        y = np.array(self.y[idx], dtype=np.float32)
        is_downtown = np.array(self.is_downtown[idx], dtype=np.float32)
        return torch.from_numpy(x), torch.from_numpy(y), torch.from_numpy(is_downtown)

train_dataset = TravelDataset(X_train, y_train, is_downtown_train)
test_dataset = TravelDataset(X_test, y_test, is_downtown_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Define the Net model
class Net(nn.Module):
    def __init__(self, n_features, n_neurons, device, seq_len, pre_len):
        super(Net, self).__init__()
        self.flatten = nn.Flatten()
        self.device = device
        self.n_features = n_features
        self.n_neurons = n_neurons
        self.seq_len = seq_len
        self.pre_len = pre_len
        
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(n_features, n_neurons),
            nn.ReLU(),
            nn.Dropout(p=0.01),
            nn.Linear(n_neurons, pre_len),
            nn.ReLU(),
            nn.Dropout(p=0.01),
        )

    def forward(self, x):
        pred = self.linear_relu_stack(x).to(self.device)
        return pred.squeeze()

# Simple MSE loss function
criterion = nn.MSELoss()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_epochs = 100

model = Net(n_features=features.shape[1], n_neurons=128, device=device, seq_len=1, pre_len=1)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

best_val_loss = float('inf')
best_model_state_dict = None

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for inputs, targets, _ in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    train_loss /= len(train_loader)

    model.eval()
    with torch.no_grad():
        val_loss = 0.0
        for inputs, targets, _ in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item()

        val_loss /= len(test_loader)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_state_dict = model.state_dict()

# Load the best model based on the validation loss
model.load_state_dict(best_model_state_dict)

# Evaluate the best model on the test set
model.eval()
with torch.no_grad():
    y_true = []
    y_pred = []
    for inputs, targets, _ in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        y_true.extend(targets.cpu().numpy())
        y_pred.extend(outputs.cpu().numpy())  

y_true = np.array(y_true)
y_pred = np.array(y_pred)
y_true = target_scaler.inverse_transform(y_true.reshape(-1, 1)).flatten()
y_pred = target_scaler.inverse_transform(y_pred.reshape(-1, 1)).flatten()
mape = mean_absolute_percentage_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)

print(f"MAPE: {mape:.4f}, MSE: {mse:.4f}")

Is Downtown Column Distribution:
 is_downtown
0    742
1     29
Name: count, dtype: int64
MAPE: 0.9094, MSE: 2110705664.0000


In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler

# Fix the random seed for reproducibility
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Load the data
merged_downtown_data = pd.read_csv('./merged_downtown_data.csv')
ridesourcing_data = pd.read_csv("./Ridesourcing_CensusCount_ALL_0_Filled.csv")

# Renaming and setting index for merging
merged_downtown_data = merged_downtown_data.rename(columns={"TractID": "index"})
merged_downtown_data.set_index("index", inplace=True)
ridesourcing_data.set_index("index", inplace=True)

# Merging the dataframes
merged_df = merged_downtown_data.join(ridesourcing_data, how='inner')

# Dropping the unnecessary columns
columns_to_drop = ["Unnamed: 0", "X", "Y"]
merged_df = merged_df.drop(columns=columns_to_drop)

# Create the "is_downtown" column
downtown_areas = [
    '17031839000', '17031080202', '17031833000', '17031833100', '17031839100', 
    '17031081201', '17031080201', '17031081202', '17031842200', '17031838300', 
    '17031081401', '17031081403', '17031081300', '17031081100', '17031080300', 
    '17031080100', '17031080400', '17031081000', '17031320400', '17031320600', 
    '17031081402', '17031320100', '17031081900', '17031081500', '17031081800', 
    '17031081600', '17031081700', '17031280100', '17031281900'
]

downtown_areas = [int(tract_id) for tract_id in downtown_areas]
merged_df['is_downtown'] = merged_df.index.isin(downtown_areas).astype(int)

# Verify the 'is_downtown' column
print("Is Downtown Column Distribution:\n", merged_df['is_downtown'].value_counts())

# Aggregate the travel demand by summing across all time columns
ridesourcing_data_aggregated = ridesourcing_data.sum(axis=1)

# Splitting the data into features and aggregated target
features = merged_df.drop(columns=ridesourcing_data.columns)
target = ridesourcing_data_aggregated

# Create MinMaxScaler instances for features and target
feature_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()

# Fit and transform the features and target
features_scaled = feature_scaler.fit_transform(features)
target_scaled = target_scaler.fit_transform(target.values.reshape(-1, 1)).flatten()

# Use stratified sampling to split the data
train_indices, test_indices = train_test_split(range(len(features_scaled)), test_size=0.2, stratify=merged_df['is_downtown'])
X_train, X_test = features_scaled[train_indices], features_scaled[test_indices]
y_train, y_test = target_scaled[train_indices], target_scaled[test_indices]
is_downtown_train = merged_df['is_downtown'].iloc[train_indices].to_numpy().astype(float)
is_downtown_test = merged_df['is_downtown'].iloc[test_indices].to_numpy().astype(float)

# Define the PyTorch dataset and dataloader
class TravelDataset(Dataset):
    def __init__(self, X, y, is_downtown):
        self.X = X
        self.y = y
        self.is_downtown = is_downtown

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        x = self.X[idx, :].astype(np.float32)
        y = np.array(self.y[idx], dtype=np.float32)
        is_downtown = np.array(self.is_downtown[idx], dtype=np.float32)
        return torch.from_numpy(x), torch.from_numpy(y), torch.from_numpy(is_downtown)

train_dataset = TravelDataset(X_train, y_train, is_downtown_train)
test_dataset = TravelDataset(X_test, y_test, is_downtown_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Define the Net model
class Net(nn.Module):
    def __init__(self, n_features, n_neurons, device, seq_len, pre_len):
        super(Net, self).__init__()
        self.flatten = nn.Flatten()
        self.device = device
        self.n_features = n_features
        self.n_neurons = n_neurons
        self.seq_len = seq_len
        self.pre_len = pre_len
        
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(n_features, n_neurons),
            nn.ReLU(),
            nn.Dropout(p=0.01),
            nn.Linear(n_neurons, pre_len),
            nn.ReLU(),
            nn.Dropout(p=0.01),
        )

    def forward(self, x):
        pred = self.linear_relu_stack(x).to(self.device)
        return pred.squeeze()

# Simple MSE loss function
criterion = nn.MSELoss()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_epochs = 100
num_runs = 10

all_results = {'mape': [], 'mse': []}

for run in range(num_runs):
    set_seed(run)
    
    model = Net(n_features=features.shape[1], n_neurons=128, device=device, seq_len=1, pre_len=1)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    best_val_loss = float('inf')
    best_model_state_dict = None

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for inputs, targets, _ in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        train_loss /= len(train_loader)

        model.eval()
        with torch.no_grad():
            val_loss = 0.0
            for inputs, targets, _ in test_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                val_loss += loss.item()

            val_loss /= len(test_loader)

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model_state_dict = model.state_dict()

    # Load the best model based on the validation loss
    model.load_state_dict(best_model_state_dict)

    # Evaluate the best model on the test set
    model.eval()
    with torch.no_grad():
        y_true = []
        y_pred = []
        for inputs, targets, _ in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            y_true.extend(targets.cpu().numpy())
            y_pred.extend(outputs.cpu().numpy())

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    y_true = target_scaler.inverse_transform(y_true.reshape(-1, 1)).flatten()
    y_pred = target_scaler.inverse_transform(y_pred.reshape(-1, 1)).flatten()
    mape = mean_absolute_percentage_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)

    all_results['mape'].append(mape)
    all_results['mse'].append(mse)

    print(f"Run {run + 1}/{num_runs} - MAPE: {mape:.4f}, MSE: {mse:.4f}")

# Compute average results
avg_mape = np.mean(all_results['mape'])
avg_mse = np.mean(all_results['mse'])
std_mape = np.std(all_results['mape'])
std_mse = np.std(all_results['mse'])

print(f"Benchmark Model - Average MAPE: {avg_mape:.4f} ± {std_mape:.4f}, Average MSE: {avg_mse:.4f} ± {std_mse:.4f}")

Is Downtown Column Distribution:
 is_downtown
0    742
1     29
Name: count, dtype: int64
Run 1/10 - MAPE: 0.8336, MSE: 2780583936.0000
Run 2/10 - MAPE: 0.9697, MSE: 15673330688.0000
Run 3/10 - MAPE: 0.8492, MSE: 2845880064.0000
Run 4/10 - MAPE: 0.8375, MSE: 2646804224.0000
Run 5/10 - MAPE: 0.9697, MSE: 15673330688.0000
Run 6/10 - MAPE: 0.9697, MSE: 15673330688.0000
Run 7/10 - MAPE: 0.9697, MSE: 15673330688.0000
Run 8/10 - MAPE: 0.8782, MSE: 3299755776.0000
Run 9/10 - MAPE: 0.9697, MSE: 15673330688.0000
Run 10/10 - MAPE: 0.8530, MSE: 2722555904.0000
Benchmark Model - Average MAPE: 0.9100 ± 0.0607, Average MSE: 9266223104.0000 ± 6409168896.0000
