Machine Learning Component -- Cassie Chou and Julia Martin

In [327]:
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
import pickle


In [212]:
# Read in Data
calls = pd.read_csv("911_Calls_for_Service_2024.csv")
calls.head(5)

Unnamed: 0,callKey,callDateTime,priority,district,description,callNumber,incidentLocation,location,Neighborhood,PoliceDistrict,...,CouncilDistrict,SheriffDistricts,Community_Statistical_Areas,Census_Tracts,VRIZones,ZIPCode,NeedsSync,IsDeleted,HashedRecord,ESRI_OID
0,10CC9DA965929F12,2024/01/01 06:16:00+00,Non-Emergency,SE,NOISE COMPLAINT,P240011042,3600 PULASKI HY,3600 PULASKI HY BALTIMORE MD,Baltimore Highlands,Southeastern,...,1.0,D6,Orangeville/East Highlandtown,Census Tract 2604.04,,21224.0,0,0,Kitomt4UpJkCoRg0jBbL41PKw1//fHtb9aCjz23s2A0=,1
1,10CC9DA96592A321,2024/01/01 06:33:00+00,Non-Emergency,ND,Private Tow,P240011065,5000 DENMORE AV,5000 DENMORE AV BALTIMORE MD,Central Park Heights,Northwestern,...,6.0,D1,Pimlico/Arlington/Hilltop,Census Tract 2718.02,Northwestern,21215.0,0,0,9AmT5HCrkqTZ8webGYDs+VbQDpTXZ0+lJyL3O+iQ5v4=,2
2,10CC9DA96592B7B9,2024/01/01 08:01:00+00,Non-Emergency,CD,PRKG COMPLAINT,P240011183,1400 ANCHOR ST,1400 ANCHOR ST BALTIMORE MD,Riverside,Southern,...,11.0,D7,Inner Harbor/Federal Hill,Census Tract 2402,,21230.0,0,0,WHIVLB5k+BOGyUmlCPUzCKKr4Bg4f6S4NwrvbgqMkZ0=,3
3,10CC9DA96592BCBA,2024/01/01 08:23:00+00,Non-Emergency,CD,PRKG COMPLAINT,P240011224,1900 EASTERN AV,1900 EASTERN AV BALTIMORE MD,Fells Point,Southeastern,...,1.0,D6,Fells Point,Census Tract 203,,21231.0,0,0,Bvouy/qh83Ejj/1V3Q0AuFs3tnChlZAY4lYsW0Korrs=,4
4,10CC9DA96592DFE2,2024/01/01 10:53:00+00,Low,SD,CHECK WELL-BEING,P240011605,1600 MARSHALL ST,1600 MARSHALL ST BALTIMORE MD,South Baltimore,Southern,...,11.0,D7,Inner Harbor/Federal Hill,Census Tract 2302,,21230.0,0,0,Jo2YzEoa5g+q9cmpQdYzipvP3sQxBYSXg95Rd/97RdM=,5


In [214]:
# Select relevant columns
calls = calls[['callDateTime', 'priority', 'description', 'incidentLocation', 'Neighborhood', 'district', 'Community_Statistical_Areas']]

# Encode categorical variables
priority_order = ['Non-Emergency', 'Low', 'Medium', 'High', 'Emergency', 'Out of Service']
le_priority = LabelEncoder()
le_priority.classes_ = np.array(priority_order)

calls['priority'] = le_priority.transform(calls['priority'])
le_csa = LabelEncoder()
calls['Community_Statistical_Areas'] = le_csa.fit_transform(calls['Community_Statistical_Areas'])

# Convert callDateTime to Month, Day, time format
calls['callDateTime'] = pd.to_datetime(calls['callDateTime'], format="%Y/%m/%d %H:%M:%S%z")
calls["month"] = calls["callDateTime"].dt.month       # 1 (int)
calls["day"] = calls["callDateTime"].dt.day           # 1 (int)
calls["time"] = calls["callDateTime"].dt.strftime('%H%M%S').astype(int)
calls["day_of_week"] = calls["callDateTime"].dt.dayofweek

# Remove NA Values
calls.dropna(inplace = True)
calls.head(5)

Unnamed: 0,callDateTime,priority,description,incidentLocation,Neighborhood,district,Community_Statistical_Areas,month,day,time,day_of_week
0,2024-01-01 06:16:00+00:00,0,NOISE COMPLAINT,3600 PULASKI HY,Baltimore Highlands,SE,41,1,1,61600,0
1,2024-01-01 06:33:00+00:00,0,Private Tow,5000 DENMORE AV,Central Park Heights,ND,44,1,1,63300,0
2,2024-01-01 08:01:00+00:00,0,PRKG COMPLAINT,1400 ANCHOR ST,Riverside,CD,29,1,1,80100,0
3,2024-01-01 08:23:00+00:00,0,PRKG COMPLAINT,1900 EASTERN AV,Fells Point,CD,15,1,1,82300,0
4,2024-01-01 10:53:00+00:00,1,CHECK WELL-BEING,1600 MARSHALL ST,South Baltimore,SD,29,1,1,105300,0


Prepare Data for Machine Learning

In [313]:
n = calls.shape[0]

# Split into Train/Test Fraction
trainFraction = 0.6
sample = np.random.uniform(size = n) < trainFraction
train_data = calls[sample]
test_data = calls[~sample]

valFraction = 0.5
valSample = np.random.uniform(size = test_data.shape[0]) < valFraction
val_data = test_data[valSample]
test_data = test_data[~valSample]

# Standardize Data
features = ['month', 'day', 'time']
scaler = StandardScaler()
train_data[features] = scaler.fit_transform(train_data[features])
test_data[features] = scaler.transform(test_data[features])
val_data[features] = scaler.transform(val_data[features])

# X Train/Test/Val
predict_features = ['month', 'time', 'day', 'day_of_week', 'Community_Statistical_Areas']
X_train = torch.from_numpy(train_data[predict_features].values.astype(np.float32))
X_test = torch.from_numpy(test_data[predict_features].values.astype(np.float32))
X_val = torch.from_numpy(val_data[predict_features].values.astype(np.float32))

# Y Train/Test/Val
Y_train = torch.tensor(train_data['priority'].values, dtype=torch.long)
Y_val = torch.tensor(val_data['priority'].values, dtype=torch.long)
Y_test = test_data['priority'].values

# Create Weights
class_counts = np.bincount(Y_train.numpy())
weights = 1 / np.sqrt(class_counts + 1)  # +1 avoids division by zero

# Normalize weights
weights = weights / weights.sum() * len(weights)
class_weights_tensor = torch.tensor(weights, dtype=torch.float32)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[features] = scaler.fit_transform(train_data[features])


In [319]:
# Prep Model
'''
class PriorityPredictor(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.layer1 = nn.Linear(input_size, 100)  # 1 hidden layer, 50 nodes
        self.layer2 = nn.Linear(100, 50)  # 1 hidden layer, 50 nodes
        self.output = nn.Linear(50, 6)           # Output layer (priority prediction)
        
    def forward(self, x):
        x = torch.relu(self.layer1(x))  # ReLU activation
        x = torch.relu(self.layer2(x))  # ReLU activation
        x = self.output(x)
        return x

# Initialize Model
model = PriorityPredictor(input_size=X_train.shape[1])
print(model)
'''
# Define Model (3 Layers, 256, 128, 64 nodes each)
class EnhancedPredictor(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_size, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            
            nn.Linear(256, 128),
            nn.LayerNorm(128),
            nn.LeakyReLU(0.2),
            
            nn.Linear(128, 6)
        )
        
    def forward(self, x):
        return self.net(x)
    
model = EnhancedPredictor(input_size=X_train.shape[1])

In [320]:
# Define Focal Loss for Unbalanced Data
class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=1.0):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        # Standard cross entropy
        ce_loss = F.cross_entropy(inputs, targets, 
                                weight=self.alpha, 
                                reduction='none')
        
        # Focal Loss adjustment
        pt = torch.exp(-ce_loss)  # Softmax probability of true class
        focal_loss = (1 - pt)**self.gamma * ce_loss
        
        return focal_loss.mean()

In [321]:
# Loss and optimizer
criterion = FocalLoss(alpha=class_weights_tensor, gamma=1.0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=3, factor=0.1) 

set_seed = 42
torch.manual_seed(set_seed)
# Train for 10 epochs
epochs = 25
for epoch in range(epochs):
    # Training
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, Y_train)
    loss.backward()
    optimizer.step()
    
    # Validation
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad(): 
        val_outputs = model(X_val)
        val_loss = criterion(val_outputs, Y_val)
        predicted = torch.argmax(val_outputs, dim = 1)
        correct = (predicted == Y_val).sum().item()
        total = Y_val.size(0)
    val_acc = correct / total

    # Slow Learning Rate if Validation Loss does not improve
    scheduler.step(val_loss) 
    
    
    print(f'Epoch {epoch+1}, Loss: {loss.item():.4f},  Val Loss = {val_loss:.4f}, Val Acc = {val_acc:.4f}')


Epoch 1, Loss: 0.0113,  Val Loss = 0.0091, Val Acc = 0.1429
Epoch 2, Loss: 0.0097,  Val Loss = 0.0088, Val Acc = 0.2223
Epoch 3, Loss: 0.0090,  Val Loss = 0.0085, Val Acc = 0.5625
Epoch 4, Loss: 0.0087,  Val Loss = 0.0083, Val Acc = 0.5728
Epoch 5, Loss: 0.0085,  Val Loss = 0.0082, Val Acc = 0.4750
Epoch 6, Loss: 0.0084,  Val Loss = 0.0083, Val Acc = 0.2357
Epoch 7, Loss: 0.0084,  Val Loss = 0.0083, Val Acc = 0.2438
Epoch 8, Loss: 0.0085,  Val Loss = 0.0083, Val Acc = 0.2900
Epoch 9, Loss: 0.0085,  Val Loss = 0.0083, Val Acc = 0.3985
Epoch 10, Loss: 0.0084,  Val Loss = 0.0083, Val Acc = 0.4050
Epoch 11, Loss: 0.0084,  Val Loss = 0.0083, Val Acc = 0.4103
Epoch 12, Loss: 0.0084,  Val Loss = 0.0083, Val Acc = 0.4153
Epoch 13, Loss: 0.0084,  Val Loss = 0.0083, Val Acc = 0.4194
Epoch 14, Loss: 0.0084,  Val Loss = 0.0083, Val Acc = 0.4183
Epoch 15, Loss: 0.0084,  Val Loss = 0.0083, Val Acc = 0.4178
Epoch 16, Loss: 0.0084,  Val Loss = 0.0083, Val Acc = 0.4174
Epoch 17, Loss: 0.0084,  Val Loss

In [None]:
# Make Predictions on Test Data
model.eval()
with torch.no_grad():
    logits = model(X_test)
    probs = torch.softmax(logits, dim = 1)
    preds = torch.argmax(probs, dim = 1)



In [324]:
# Get Counts of Each Prediction Class
preds_labeled = pd.DataFrame(le_priority.inverse_transform(preds.numpy()))
print(preds_labeled.value_counts())

# Get Counts of Each Class in Test Data
counts_test = pd.DataFrame(le_priority.inverse_transform(Y_test))
print(counts_test.value_counts())

# Counts of Each Class in Original Data
category_counts = pd.DataFrame(le_priority.inverse_transform(calls['priority'])).value_counts()
print(category_counts)

0            
Low              148125
Non-Emergency    143038
Medium            32643
High               7208
Name: count, dtype: int64
0             
Non-Emergency     221585
Low                55828
Medium             42835
High               10701
Emergency             49
Out of Service        16
Name: count, dtype: int64
0             
Non-Emergency     1108450
Low                278775
Medium             213464
High                53525
Emergency             257
Out of Service        106
Name: count, dtype: int64


In [325]:
# Get Accuracy on Testing Set
np.mean(preds.numpy() == Y_test)

0.41559269396460574

In [None]:
# Export Model and Preprocessors
with open('priority_model.pkl', 'rb') as f:
    pickle.dump({'model': model}, f)

with open('preprocessors.pkl', 'wb') as f:
    pickle.dump({'scaler': scaler, 'le_priority': le_priority, 'le_csa' : le_csa}, f)