In [42]:
import pandas as pd
import glob
import os
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from tab_transformer_pytorch import TabTransformer
import numpy as np
from itertools import product

Note: Data is from https://opendata.dc.gov/ and the dataset from September 2023 is missing
Metadata: https://www.arcgis.com/sharing/rest/content/items/17d73d958f8247e19a4885a4d8bce9dd/info/metadata/metadata.xml?format=default&output=html

In [35]:
def load_parking_violation_data(data_folder):
    files = os.listdir(data_folder)
    print(files)
    all_csvs = glob.glob(os.path.join(data_folder, "*.csv"))
    dfs = []
    for c in all_csvs:
        try:
          df = pd.read_csv(c)
          dfs.append(df)
          print(f"successfully loaded {c}")
        except Exception as e:
          print(f"error opening {c}: {e}")
    return pd.concat(dfs, ignore_index=True)

In [36]:
df = load_parking_violation_data("../CleanData")

['cleaned_parking_violations_v2.csv']


  df = pd.read_csv(c)


successfully loaded ../CleanData/cleaned_parking_violations_v2.csv


Checking the length of the data frame

In [37]:
len(df)

1094297

In [38]:
df.head()

Unnamed: 0,objectid,ticket_number,issuing_agency_code,issuing_agency_name,issuing_agency_short,violation_code,location,plate_state,disposition_code,disposition_type,...,violation_type_desc,issue_datetime,year,month,day,day_of_week,hour,is_weekend,Unnamed: 29,is_fleet_gov_or_rental
0,84395696,275257905,50,FEDERAL PROTECTIVE SERVICES (US GV),FPS,P055,SIDE 1201 CONSTITUTION AVE NW,,134,Dismissed,...,,2025-06-17 12:28:00,2025,6,17,Tuesday,12,False,UNKNOWN,True
1,84395698,275697181,4,METROPOLITAN POLICE DPT-DISTRICT 4,MPD-4D,P170,600 BLK OTIS PL NW,,0,Other,...,,2025-06-20 23:38:00,2025,6,20,Friday,23,False,UNKNOWN,True
2,84395699,275697203,4,METROPOLITAN POLICE DPT-DISTRICT 4,MPD-4D,P170,600 BLK OTIS PL NW,,0,Other,...,,2025-06-20 23:54:00,2025,6,20,Friday,23,False,UNKNOWN,True
3,84395717,276503242,50,FEDERAL PROTECTIVE SERVICES (US GV),FPS,P269,NS 600 BLK MARYLAND AVE SW,,0,Other,...,,2025-06-04 07:45:00,2025,6,4,Wednesday,7,False,UNKNOWN,True
4,84395718,276503253,50,FEDERAL PROTECTIVE SERVICES (US GV),FPS,P168,NS 600 BLK MARYLAND AVE SW,,0,Other,...,,2025-06-04 07:45:00,2025,6,4,Wednesday,7,False,UNKNOWN,True


In [39]:
# parse datetime
df['issue_datetime'] = pd.to_datetime(df['issue_datetime'])
cell = 0.001
# =========================
# 2) SPATIOTEMPORAL BINNING
# =========================
# hourly bins for time
df['hour_bin'] = df['issue_datetime'].dt.floor('1H')

# spatial bins (~100m cells)
cell = 0.001  # adjust for desired grid size
df['lat_bin'] = (df['latitude'] // cell) * cell
df['lon_bin'] = (df['longitude'] // cell) * cell

  df['hour_bin'] = df['issue_datetime'].dt.floor('1H')


In [43]:
# =========================
# 3) CREATE FULL GRID
# =========================
hours = pd.date_range(df['hour_bin'].min(), df['hour_bin'].max(), freq='1H')
lat_bins = df['lat_bin'].unique()
lon_bins = df['lon_bin'].unique()

grid = pd.DataFrame(product(hours, lat_bins, lon_bins),
                    columns=['hour_bin', 'lat_bin', 'lon_bin'])

# label positive if ≥ 1 ticket
hits = df.groupby(['hour_bin','lat_bin','lon_bin']).size().rename('ticket_count').reset_index()
X = grid.merge(hits, on=['hour_bin','lat_bin','lon_bin'], how='left').fillna({'ticket_count': 0})
X['label'] = (X['ticket_count'] > 0).astype(int)

# =========================
# 4) FEATURES FOR MODEL
# =========================
X['dayofweek'] = X['hour_bin'].dt.dayofweek
X['hour']      = X['hour_bin'].dt.hour
X['hour_sin']  = np.sin(2*np.pi*X['hour']/24)
X['hour_cos']  = np.cos(2*np.pi*X['hour']/24)

cat_cols  = ['dayofweek']                 # could also bucket lat/lon into discrete categories
cont_cols = ['hour_sin','hour_cos','lat_bin','lon_bin']
target_col = 'label'

  hours = pd.date_range(df['hour_bin'].min(), df['hour_bin'].max(), freq='1H')


KeyboardInterrupt: 

In [None]:
# =========================
# 5) ENCODE & NORMALIZE
# =========================
cat_cols  = ['day_of_week', 'year', 'month', 'hour', 'issue_datetime', 'latitude', 'longitude']

category_sizes = []
for col in cat_cols:
    X[col] = X[col].astype('category')
    category_sizes.append(len(X[col].cat.categories))

cont_means = X[cont_cols].mean().values
cont_stds  = X[cont_cols].std(ddof=0).replace(0, 1).values

def df_to_tensors(frame):
    x_categ = torch.tensor(frame[cat_cols].apply(lambda s: s.cat.codes).values, dtype=torch.long)
    x_cont  = torch.tensor(((frame[cont_cols].values - cont_means) / cont_stds), dtype=torch.float32)
    y       = torch.tensor(frame[target_col].values, dtype=torch.float32).unsqueeze(1)
    return x_categ, x_cont, y

x_categ, x_cont, y = df_to_tensors(X)

NameError: name 'cat_cols' is not defined

In [None]:


model = TabTransformer(
    categories=tuple(category_sizes),
    num_continuous=len(cont_cols),
    dim=32,
    depth=4,
    heads=4,
    attn_dropout=0.1,
    ff_dropout=0.1,
    mlp_hidden_mults=(4, 2),
    mlp_act=nn.ReLU(),
    dim_out=1
)