In [2]:
import pandas as pd
import glob

# Load all CSV files from data folder
data_files = glob.glob('data/*.csv')
dataframes = [pd.read_csv(file) for file in data_files]

# Combine all datasets
combined_data = pd.concat(dataframes, ignore_index=True)

  dataframes = [pd.read_csv(file) for file in data_files]


In [3]:
combined_data.head(1000)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2016-02-25 17:24:20,2016-02-25 17:27:20,2,0.70,-73.947250,40.763771,1,N,-73.992012,40.735390,2.0,5.0,0.0,0.5,0.00,0.0,0.3,5.80
1,2,2016-02-25 23:10:50,2016-02-25 23:31:50,2,5.52,-73.983017,40.750992,1,N,-73.988586,40.758839,2.0,20.0,0.5,0.5,0.00,0.0,0.3,21.30
2,2,2016-02-01 00:00:01,2016-02-01 00:10:52,6,1.99,-73.992340,40.758202,1,N,-73.964355,40.757977,1.0,9.5,0.5,0.5,0.70,0.0,0.3,11.50
3,1,2016-02-01 00:00:04,2016-02-01 00:05:16,1,1.50,-73.981453,40.749722,1,N,-73.982323,40.763985,2.0,6.5,0.5,0.5,0.00,0.0,0.3,7.80
4,2,2016-02-01 00:00:05,2016-02-01 00:20:59,1,5.60,-74.000603,40.729755,1,N,-73.951324,40.669834,1.0,20.0,0.5,0.5,4.00,0.0,0.3,25.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2,2016-02-19 20:51:22,2016-02-19 20:53:51,1,0.56,-73.991707,40.769863,1,N,-73.996605,40.763168,1.0,4.0,0.5,0.5,1.59,0.0,0.3,6.89
996,1,2016-02-19 20:51:23,2016-02-19 21:08:46,1,4.20,-73.949875,40.822170,1,N,-73.954147,40.776306,2.0,16.0,0.0,0.5,0.00,0.0,0.3,16.80
997,1,2016-02-19 20:51:23,2016-02-19 21:00:04,1,1.60,-73.977966,40.752289,1,N,-74.002617,40.756081,1.0,8.0,0.5,0.5,0.00,0.0,0.3,9.30
998,1,2016-02-19 20:51:23,2016-02-19 20:57:24,1,1.00,-73.995850,40.764332,1,N,-73.991554,40.755051,1.0,6.0,0.5,0.5,1.45,0.0,0.3,8.75


In [4]:
# Create meaningful features
combined_data['pickup_datetime'] = pd.to_datetime(combined_data['tpep_pickup_datetime'])
combined_data['dropoff_datetime'] = pd.to_datetime(combined_data['tpep_dropoff_datetime'])
combined_data['trip_duration'] = (combined_data['dropoff_datetime'] - combined_data['pickup_datetime']).dt.total_seconds() / 60
combined_data['hour'] = combined_data['pickup_datetime'].dt.hour
combined_data['day_of_week'] = combined_data['pickup_datetime'].dt.dayofweek
combined_data['speed'] = combined_data['trip_distance'] / (combined_data['trip_duration'] / 60)

# Select features for correlation analysis
features = ['trip_distance', 'trip_duration', 'passenger_count', 'fare_amount',
           'tip_amount', 'tolls_amount', 'total_amount', 'hour', 'day_of_week', 'speed', 'RateCodeID']


# Data preparation

In [5]:
# Remove duplicates and handle missing values
data = combined_data.drop_duplicates()
data = data.dropna(subset=['pickup_longitude', 'pickup_latitude', 'trip_distance', 'fare_amount', 'RateCodeID'])



# Remove invalid coordinates (outside NYC bounds)
data = data[(data['pickup_longitude'] >= -74.3) & (data['pickup_longitude'] <= -73.7)]
data = data[(data['pickup_latitude'] >= 40.5) & (data['pickup_latitude'] <= 40.9)]


In [6]:
# Convert datetime to numerical features
data['pickup_datetime'] = pd.to_datetime(data['tpep_pickup_datetime'])
data['hour'] = data['pickup_datetime'].dt.hour
data['day_of_week'] = data['pickup_datetime'].dt.dayofweek

# Calculate trip duration
data['dropoff_datetime'] = pd.to_datetime(data['tpep_dropoff_datetime'])
data['trip_duration'] = (data['dropoff_datetime'] - data['pickup_datetime']).dt.total_seconds() / 60

In [7]:
# Remove extreme outliers that are clearly data errors
data = data[data['trip_distance'] > 0]
data = data[data['fare_amount'] > 0]
data = data[data['trip_duration'] > 0]
data = data[data['trip_duration'] < 300]  # Less than 5 hours


In [8]:
# Select numerical features for anomaly detection
nn_features = [
    'trip_distance', 'fare_amount', 'trip_duration',
    'pickup_longitude', 'pickup_latitude',
    'hour', 'day_of_week', 'passenger_count', 'RateCodeID'
]

# 1. Remove extreme outliers
y = data['fare_amount']
X = data[nn_features]

# Filter out extreme outliers (keep 95% of data)
fare_q95 = y.quantile(0.99)
fare_q05 = y.quantile(0.01)
mask = (y >= fare_q05) & (y <= fare_q95)
X = X[mask]
y = y[mask]

print(f"Removed {(~mask).sum()} extreme outliers ({(~mask).mean()*100:.1f}%)")
print(f"New fare range: ${y.min():.2f} - ${y.max():.2f}")
print(f"New fare std: ${y.std():.2f}")

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
X_scaler = StandardScaler()
X[nn_features] = X_scaler.fit_transform(X[nn_features])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Removed 57115 extreme outliers (1.0%)
New fare range: $3.50 - $52.00
New fare std: $9.10


In [9]:
print(len(X_train))
X_train.head()

4452822


Unnamed: 0,trip_distance,fare_amount,trip_duration,pickup_longitude,pickup_latitude,hour,day_of_week,passenger_count,RateCodeID
1520956,-0.001328,-0.334741,-0.301993,0.045044,0.148907,1.161736,0.12643,-0.49517,-0.07606
122211,-7.1e-05,0.269707,0.395274,0.196537,1.879661,-0.603129,-1.579052,-0.49517,-0.07606
1553074,-0.000228,0.104858,-0.009141,-0.416846,0.228574,1.322179,0.12643,-0.49517,-0.07606
447257,-0.001611,-0.38969,-0.301993,0.106295,0.105634,-0.121802,1.831913,2.54067,-0.07606
4152941,-0.00227,-0.664439,-0.610534,-0.214782,-0.004266,-0.282244,-1.579052,2.54067,-0.07606


In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

class TaxiModel(nn.Module):
    def __init__(self, input_size=9):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1)
        )
    
    def forward(self, x):
        return self.layers(x)

# Initialize model with proper configuration
model = TaxiModel()
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)

# Convert data to tensors with proper shapes
X_train_tensor = torch.FloatTensor(X_train.values)
y_train_tensor = torch.FloatTensor(y_train.values).unsqueeze(1)
X_test_tensor = torch.FloatTensor(X_test.values)
y_test_tensor = torch.FloatTensor(y_test.values).unsqueeze(1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=512, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=512, shuffle=False)


In [None]:
def train_model(model, train_loader, test_loader, epochs=10):
    train_losses, test_losses = [], []
    
    for epoch in range(epochs):
        # Training
        model.train(True)
        train_loss = 0
        for i, (batch_x, batch_y) in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = loss_fn(outputs, batch_y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            if i % 1000 == 0:
                print(f'  Batch {i}: Loss {loss.item():.4f}')
                print(f'  Batch {i}: Absolute Loss {nn.L1Loss()(outputs, batch_y).item():.4f}')

        model.train(False)

        # Validation
        model.eval()
        test_loss = 0
        with torch.no_grad():
            for batch_x, batch_y in test_loader:
                outputs = model(batch_x)
                loss = loss_fn(outputs, batch_y)
                test_loss += loss.item()
        
        train_loss /= len(train_loader)
        test_loss /= len(test_loader)
        train_losses.append(train_loss)
        test_losses.append(test_loss)
        
        scheduler.step(test_loss)
        
        if epoch % 10 == 0:
            print(f'Epoch {epoch}: Train RMSE: {train_loss**0.5:.3f}, Test RMSE: {test_loss**0.5:.3f}')
    
    return train_losses, test_losses

# Train the model
train_losses, test_losses = train_model(model, train_dataloader, test_dataloader)


  Batch 0: Loss 230.9985
  Batch 0: Absolute Loss 12.2792
  Batch 1000: Loss 1.9665
  Batch 1000: Absolute Loss 0.9783
  Batch 2000: Loss 1.7200
  Batch 2000: Absolute Loss 0.8634
  Batch 3000: Loss 1.7349
  Batch 3000: Absolute Loss 0.8133
  Batch 4000: Loss 2.2250
  Batch 4000: Absolute Loss 0.8841
  Batch 5000: Loss 1.7938
  Batch 5000: Absolute Loss 0.7732
  Batch 6000: Loss 1.3281
  Batch 6000: Absolute Loss 0.6471
  Batch 7000: Loss 1.6757
  Batch 7000: Absolute Loss 0.6869
  Batch 8000: Loss 1.2257
  Batch 8000: Absolute Loss 0.5427
Epoch 0: Train RMSE: 6.712, Test RMSE: 1.144


In [13]:
# Evaluate model performance
from sklearn.metrics import mean_squared_error, r2_score

model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor).numpy()
    y_true = y_test_tensor.numpy()

rmse = mean_squared_error(y_true, y_pred)**0.5
r2 = r2_score(y_true, y_pred)

print(f'Final RMSE: ${rmse:.2f}')
print(f'R² Score: {r2:.3f}')
print(f'Mean fare: ${y_true.mean():.2f}')


Final RMSE: $1.14
R² Score: 0.984
Mean fare: $12.04
