In [4]:
# pip install torch_geometric

In [5]:
import pandas as pd
import numpy as np
import requests
import torch
from torch_geometric.data import Data
# from itertools import combinations
from torch.utils.data import Dataset, DataLoader
from torch_geometric.nn import GATConv
import torch.nn.functional as F
import mlflow

In [6]:
mlflow.set_experiment('GAT_Rodent_Complaints')

<Experiment: artifact_location='mlflow-artifacts:/3', creation_time=1745732747616, experiment_id='3', last_update_time=1745732747616, lifecycle_stage='active', name='GAT_Rodent_Complaints', tags={}>

In [7]:
import requests

url = "https://data.cityofnewyork.us/resource/3q43-55fe.json"

params = {
    "$limit": 300000,  # Number of records
    "complaint_type": "Rodent",
    "city": "BROOKLYN"
}

response = requests.get(url, params=params)

if response.status_code == 200:
    data = response.json()

    print(f"Retrieved {len(data)} records.")
else:
    print(f"Failed to retrieve data: {response.status_code}")


Retrieved 96101 records.


In [8]:
complaints_data = pd.DataFrame(data)
complaints_data['created_date'] = pd.to_datetime(complaints_data['created_date'])
complaints_data['zipcode'] = complaints_data['incident_zip'].astype(str)
complaints_data['month'] = complaints_data['created_date'].dt.to_period('M')

In [9]:
rodent_counts = complaints_data.groupby(['zipcode', 'month']).size().reset_index(name='rodent_complaints')

In [10]:
rodent_counts

Unnamed: 0,zipcode,month,rodent_complaints
0,10458,2011-07,1
1,11201,2010-01,4
2,11201,2010-02,4
3,11201,2010-03,2
4,11201,2010-04,3
...,...,...,...
6643,11385,2024-02,1
6644,11385,2024-10,1
6645,11416,2010-05,2
6646,11426,2018-01,1


In [11]:
params = {
    "$limit": 300000,  # Number of records
    "boro" : "Brooklyn"
}

In [12]:
url_health = "https://data.cityofnewyork.us/resource/43nn-pn8j.json"
response_health = requests.get(url_health, params = params)
health_data = pd.DataFrame(response_health.json())

In [13]:
health_data['score'] = pd.to_numeric(health_data['score'], errors='coerce')
health_data['zipcode'] = health_data['zipcode'].astype(str)
health_data['inspection_date'] = pd.to_datetime(health_data['grade_date'])
health_data['month'] = health_data['inspection_date'].dt.to_period('M')
health_data['key'] = health_data['dba'] + health_data['building']

In [14]:
avg_health_score = health_data.groupby(['zipcode', 'month'])['score'].mean().reset_index()
avg_health_score.rename(columns={'score': 'avg_health_score'}, inplace=True)

# Merge both datasets on ZIP and month
df = pd.merge(rodent_counts, avg_health_score, on=["zipcode", "month"], how="left")

In [15]:
df.drop(columns = ['avg_health_score'], inplace = True)
df = df.sort_values(by=['zipcode', 'month'])

In [16]:
zipcodes = df['zipcode'].unique()
months = df['month'].unique()

all_combinations = pd.MultiIndex.from_product([zipcodes, months], names=['zipcode', 'month'])

df.set_index(['zipcode', 'month'], inplace=True)

df = df.reindex(all_combinations, fill_value=0)

df.reset_index(inplace=True)
df = df[df.zipcode != 'N/A']

In [17]:
df

Unnamed: 0,zipcode,month,rodent_complaints
0,10458,2011-07,1
1,10458,2010-01,0
2,10458,2010-02,0
3,10458,2010-03,0
4,10458,2010-04,0
...,...,...,...
8275,11426,2024-12,0
8276,11426,2025-01,0
8277,11426,2025-02,0
8278,11426,2025-03,0


In [18]:
import torch

window_size = 3

# Dictionary for rodent complaints for each (zipcode, month)
data_dict = {(row['zipcode'], row['month']): row['rodent_complaints'] for _, row in df.iterrows()}

unique_months = sorted(df['month'].unique())
zipcodes = sorted(df['zipcode'].unique())

# Split the months into training and testing months
train_months = unique_months[:-9]  # Use all months except the last two for training
test_months = unique_months[-9:]  # Keep the last two months for testing

# Create training samples
training_samples = []

for t in range(window_size, len(train_months) - 1):
    current_month = train_months[t]
    next_month = train_months[t+1]

    x_features = []
    y_targets = []
    available_zipcodes = []

    for zipcode in zipcodes:
        features = []
        valid = True

        for past_t in range(t - window_size, t):
            month_val = train_months[past_t]

            if (zipcode, month_val) in data_dict:
                features.append(data_dict[(zipcode, month_val)])
            else:
                valid = False
                break
        if valid and ((zipcode, next_month) in data_dict):
            x_features.append(features)
            y_targets.append(data_dict[(zipcode, next_month)])
            available_zipcodes.append(zipcode)

    if len(x_features) == 0:
        continue

    x_tensor = torch.tensor(x_features, dtype=torch.float)
    y_tensor = torch.tensor(y_targets, dtype=torch.float)

    sample = {
        'x': x_tensor,
        'y': y_tensor,
        'zipcodes': available_zipcodes,
        'current_time': current_month,
        'next_time': next_month
    }
    training_samples.append(sample)

print(f"Created {len(training_samples)} training samples.")

# Create test samples (same structure as training samples, but for the test months)
test_samples = []


for t in range(window_size, len(test_months) - 1):
    current_month = test_months[t]
    next_month = test_months[t+1]

    x_features = []
    y_targets = []
    available_zipcodes = []
    # print(zipcodes)
    for zipcode in zipcodes:
        features = []
        valid = True

        for past_t in range(t - window_size, t):
            month_val = test_months[past_t]
            # print(month_val)

            if (zipcode, month_val) in data_dict:
                features.append(data_dict[(zipcode, month_val)])
            else:
                valid = False
                break
        if valid and ((zipcode, next_month) in data_dict):
            x_features.append(features)
            y_targets.append(data_dict[(zipcode, next_month)])
            available_zipcodes.append(zipcode)

    if len(x_features) == 0:
        continue

    x_tensor = torch.tensor(x_features, dtype=torch.float)
    y_tensor = torch.tensor(y_targets, dtype=torch.float)

    sample = {
        'x': x_tensor,
        'y': y_tensor,
        'zipcodes': available_zipcodes,
        'current_time': current_month,
        'next_time': next_month
    }
    test_samples.append(sample)

print(f"Created {len(test_samples)} test samples.")


Created 171 training samples.
Created 5 test samples.


In [19]:
import torch
from torch_geometric.data import Data

def get_edge_index(zipcodes_list):

    num_nodes = len(zipcodes_list)
    edge_list = []

    # Connect each node to its left and right neighbor in the sorted list
    for i in range(num_nodes):
        if i > 2:
            edge_list.append([i, i - 1])
            edge_list.append([i, i - 2])
            edge_list.append([i, i - 3])
        if i < num_nodes - 3:
            edge_list.append([i, i + 1])
            edge_list.append([i, i + 2])
            edge_list.append([i, i + 3])

    edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()
    return edge_index

sample = training_samples[0]
edge_index = get_edge_index(sample['zipcodes'])


data = Data(x=sample['x'], y=sample['y'], edge_index=edge_index)

print("Edge index tensor:")
print(edge_index)
print("\nGraph Data Object:")
print(data)


Edge index tensor:
tensor([[ 0,  0,  0,  1,  1,  1,  2,  2,  2,  3,  3,  3,  3,  3,  3,  4,  4,  4,
          4,  4,  4,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  7,  7,  7,
          7,  7,  7,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9, 10, 10, 10,
         10, 10, 10, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 13, 13, 13,
         13, 13, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 16, 16, 16,
         16, 16, 16, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 19, 19, 19,
         19, 19, 19, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 22, 22, 22,
         22, 22, 22, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 25, 25, 25,
         25, 25, 25, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 28, 28, 28,
         28, 28, 28, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 30, 31, 31, 31,
         31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34,
         34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 37, 37, 37,
         

In [20]:
class GAT(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, heads=4):

        super(GAT, self).__init__()
        # First GAT layer: Outputs concatenated features from multiple heads.
        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads, concat=True)
        # Second GAT layer: Aggregates information to output a single value per node.
        self.conv2 = GATConv(hidden_channels * heads, out_channels, heads=1, concat=False)

    def forward(self, x, edge_index):

        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = self.conv2(x, edge_index)
        return x.squeeze()


In [21]:
hidden_channels = 4
out_channels = 1
heads = 2
lr = 0.01
num_epochs = 100

In [None]:
import torch
import torch.nn.functional as F
import mlflow
import mlflow.pytorch

# Set up MLflow logging
mlflow.set_experiment('GAT_Rodent_Complaints')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Assuming GAT model and parameters (hidden_channels, out_channels, heads, etc.) are already defined
model = GAT(in_channels=window_size, hidden_channels=hidden_channels, out_channels=out_channels, heads=heads).to(device)

# Define optimizer and number of epochs
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
num_epochs = num_epochs

# Log hyperparameters and model configuration
with mlflow.start_run():
    mlflow.log_param("window_size", window_size)
    mlflow.log_param("hidden_channels", hidden_channels)
    mlflow.log_param("out_channels", out_channels)
    mlflow.log_param("heads", heads)
    mlflow.log_param("learning_rate", lr)
    mlflow.log_param("num_epochs", num_epochs)

    # Training loop
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0.0
        for sample in training_samples:
            # Build graph connectivity
            edge_index = get_edge_index(sample['zipcodes']).to(device)
            x = sample['x'].to(device)
            y = sample['y'].to(device)

            optimizer.zero_grad()

            # Forward pass
            out = model(x, edge_index)
            
            # Compute MSE loss
            loss = F.mse_loss(out, y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(training_samples)

        # Log training loss to MLflow
        mlflow.log_metric("train_loss", avg_train_loss, step=epoch)

        # Test part - Compute test loss
        model.eval()  # Set the model to evaluation mode
        total_test_loss = 0.0
        with torch.no_grad():
            for sample in test_samples:
                edge_index = get_edge_index(sample['zipcodes']).to(device)
                x = sample['x'].to(device)
                y = sample['y'].to(device)

                # Forward pass on test data
                out = model(x, edge_index)
                
                # Compute MSE loss for test data
                test_loss = F.mse_loss(out, y)
                total_test_loss += test_loss.item()

        avg_test_loss = total_test_loss / len(test_samples)

        # Log test loss to MLflow
        mlflow.log_metric("test_loss", avg_test_loss, step=epoch)

        # Print both training and test loss
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Test Loss: {avg_test_loss:.4f}")

    # Log the final model after training
    mlflow.pytorch.log_model(model, "model")

    print("Training finished and model logged to MLflow!")


Epoch 1/100, Train Loss: 143.8269, Test Loss: 106.9844
Epoch 2/100, Train Loss: 127.8149, Test Loss: 164.3982
Epoch 3/100, Train Loss: 120.2522, Test Loss: 97.7150
Epoch 4/100, Train Loss: 117.5446, Test Loss: 103.0806
Epoch 5/100, Train Loss: 114.7482, Test Loss: 97.2210
Epoch 6/100, Train Loss: 115.6971, Test Loss: 95.4029
Epoch 7/100, Train Loss: 115.4759, Test Loss: 96.2067
Epoch 8/100, Train Loss: 114.4364, Test Loss: 99.2976
Epoch 9/100, Train Loss: 116.0736, Test Loss: 94.3362
Epoch 10/100, Train Loss: 110.9584, Test Loss: 96.6450
Epoch 11/100, Train Loss: 112.0713, Test Loss: 99.1697
Epoch 12/100, Train Loss: 111.8682, Test Loss: 98.7909
Epoch 13/100, Train Loss: 110.7951, Test Loss: 99.1131
Epoch 14/100, Train Loss: 112.2371, Test Loss: 105.7800
Epoch 15/100, Train Loss: 117.9026, Test Loss: 100.6273
Epoch 16/100, Train Loss: 111.4012, Test Loss: 96.1079
Epoch 17/100, Train Loss: 110.3655, Test Loss: 100.9759
Epoch 18/100, Train Loss: 109.0423, Test Loss: 101.4615
Epoch 19/100

In [None]:
print("Actual vs Predicted for all test samples:")

actual_vs_predicted = []

for sample in test_samples:
    edge_index = get_edge_index(sample['zipcodes']).to(device)
    x = sample['x'].to(device)
    y = sample['y'].to(device)


    model.eval()
    with torch.no_grad():
        predicted = model(x, edge_index)

    for i, zipcode in enumerate(sample['zipcodes']):
        actual_value = y[i].item()  # Actual rodent complaints value
        predicted_value = predicted[i].item()  # Predicted rodent complaints value
        actual_vs_predicted.append((zipcode, actual_value, predicted_value))

for zipcode, actual, predicted in actual_vs_predicted:
    print(f"Zipcode: {zipcode}, Actual: {actual:.4f}, Predicted: {predicted:.4f}")
