Machine Learning Component -- Cassie Chou and Julia Martin

In [3]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [4]:
# Read in Data
calls = pd.read_csv("911_Calls_for_Service_2024.csv")

In [5]:
# Select relevant columns
calls = calls[['callDateTime', 'priority', 'description', 'incidentLocation', 'Neighborhood', 'district', 'Community_Statistical_Areas']]

# Convert callDateTime to Month, Day, time format
calls['callDateTime'] = pd.to_datetime(calls['callDateTime'], format="%Y/%m/%d %H:%M:%S%z")
calls["month"] = calls["callDateTime"].dt.month       # 1 (int)
calls["day"] = calls["callDateTime"].dt.day           # 1 (int)
calls["time"] = calls["callDateTime"].dt.strftime('%H%M%S').astype(int)

# Encode categorical variables
le_priority = LabelEncoder()
calls['priority'] = le_priority.fit_transform(calls['priority'])
le_csa = LabelEncoder()
calls['Community_Statistical_Areas'] = le_csa.fit_transform(calls['Community_Statistical_Areas'])

# Remove NA Values
calls.dropna(inplace = True)
calls.head(5)

Unnamed: 0,callDateTime,priority,description,incidentLocation,Neighborhood,district,Community_Statistical_Areas,month,day,time
0,2024-01-01 06:16:00+00:00,4,NOISE COMPLAINT,3600 PULASKI HY,Baltimore Highlands,SE,41,1,1,61600
1,2024-01-01 06:33:00+00:00,4,Private Tow,5000 DENMORE AV,Central Park Heights,ND,44,1,1,63300
2,2024-01-01 08:01:00+00:00,4,PRKG COMPLAINT,1400 ANCHOR ST,Riverside,CD,29,1,1,80100
3,2024-01-01 08:23:00+00:00,4,PRKG COMPLAINT,1900 EASTERN AV,Fells Point,CD,15,1,1,82300
4,2024-01-01 10:53:00+00:00,2,CHECK WELL-BEING,1600 MARSHALL ST,South Baltimore,SD,29,1,1,105300


Prepare Data for Machine Learning

In [6]:
n = calls.shape[0]

# Split into Train/Test Fraction
trainFraction = 0.6
sample = np.random.uniform(size = n) < trainFraction
train_data = calls[sample]
test_data = calls[~sample]

# X Train/Test
features = ['month', 'day', 'time', 'Community_Statistical_Areas']
X_train = torch.from_numpy(train_data[features].astype(np.float32).to_numpy())
X_test = torch.from_numpy(test_data[features].astype(np.float32).to_numpy())

# Y Train/Test
Y_train = torch.tensor(train_data['priority'].values, dtype=torch.long)
Y_test = test_data['priority'].values

In [44]:
# Prep Model
class PriorityPredictor(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.layer1 = nn.Linear(input_size, 50)  # 1 hidden layer, 50 nodes
        self.layer2 = nn.Linear(50, 20)  # 1 hidden layer, 50 nodes
        self.output = nn.Linear(20, 6)           # Output layer (priority prediction)
        
    def forward(self, x):
        x = torch.relu(self.layer1(x))  # ReLU activation
        x = torch.relu(self.layer2(x))  # ReLU activation
        x = self.output(x)
        return x

# Initialize Model
model = PriorityPredictor(input_size=X_train.shape[1])
print(model)

PriorityPredictor(
  (layer1): Linear(in_features=4, out_features=50, bias=True)
  (layer2): Linear(in_features=50, out_features=20, bias=True)
  (output): Linear(in_features=20, out_features=6, bias=True)
)


In [46]:
# Loss and optimizer
criterion = torch.nn.CrossEntropyLoss() # Cross Entropy for Classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Train for 10 epochs
epochs = 500
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, Y_train)
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 10 == 0:
        print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')


Epoch 10, Loss: 2994.1970
Epoch 20, Loss: 2405.6611
Epoch 30, Loss: 1882.8325
Epoch 40, Loss: 1893.5414
Epoch 50, Loss: 1742.5376
Epoch 60, Loss: 1639.6698
Epoch 70, Loss: 1553.6205
Epoch 80, Loss: 1476.5256
Epoch 90, Loss: 1400.9222
Epoch 100, Loss: 1330.1730
Epoch 110, Loss: 1265.8051
Epoch 120, Loss: 1204.4348
Epoch 130, Loss: 1142.5413
Epoch 140, Loss: 1079.8585
Epoch 150, Loss: 1020.0089
Epoch 160, Loss: 963.0158
Epoch 170, Loss: 900.1150
Epoch 180, Loss: 841.3658
Epoch 190, Loss: 783.5968
Epoch 200, Loss: 726.9319
Epoch 210, Loss: 665.1627
Epoch 220, Loss: 602.8495
Epoch 230, Loss: 546.2654
Epoch 240, Loss: 486.7184
Epoch 250, Loss: 431.7891
Epoch 260, Loss: 372.8779
Epoch 270, Loss: 306.5300
Epoch 280, Loss: 250.4577
Epoch 290, Loss: 192.4089
Epoch 300, Loss: 130.5642
Epoch 310, Loss: 66.4200
Epoch 320, Loss: 10.0705
Epoch 330, Loss: 13.8150
Epoch 340, Loss: 9.7341
Epoch 350, Loss: 7.8528
Epoch 360, Loss: 5.4890
Epoch 370, Loss: 4.2396
Epoch 380, Loss: 9.2644
Epoch 390, Loss: 11

In [47]:
model.eval()
with torch.no_grad():
    logits = model(X_test)
    probs = torch.softmax(logits, dim = 1)
    preds = torch.argmax(probs, dim = 1)


In [48]:
# Get Proportions of Each Prediction Class
preds_labeled = pd.DataFrame(le_priority.inverse_transform(preds.numpy()))
print(preds_labeled.value_counts())

# Get Proportions of Each Prediction Class
counts_test = pd.DataFrame(le_priority.inverse_transform(Y_test))
print(counts_test.value_counts())

# Proportions of Each Class in Original Data
category_counts = pd.DataFrame(le_priority.inverse_transform(calls['priority'])).value_counts()
print(category_counts)

0            
Non-Emergency    662060
Low                 271
Name: count, dtype: int64
0             
Non-Emergency     443242
Low               111805
Medium             85537
High               21577
Emergency            115
Out of Service        55
Name: count, dtype: int64
0             
Non-Emergency     1108450
Low                278775
Medium             213464
High                53525
Emergency             257
Out of Service        106
Name: count, dtype: int64


In [49]:
np.mean(preds.numpy() == Y_test)

0.6689646113499141