In [43]:
import sys

import numpy as np
import pytorch_lightning as pl
import torch
import torch.nn as nn
from torch.utils import data
from sklearn.metrics import classification_report

In [44]:
# Fix the global random seed before any model or dataloader is created so that
# repeated runs of this notebook start from the same random state.
# The call returns the seed it applied, which the notebook echoes as cell output.
pl.seed_everything(42)

42

In [45]:
def reshape_by_catagory(array, category, SEQ=2):
    """Slice one flat feature vector into the feature group named by ``category``.

    The vector is read as three consecutive segments::

        [ time-variant values | 14 values | 100 values ]

    Parameters
    ----------
    array : numpy.ndarray
        Flat (1-D) feature vector of a single sample.
    category : str
        ``'NLP'`` returns the trailing 100 values.
        ``'TimeVariant'`` returns everything before the trailing 114 values,
        folded into ``SEQ`` equally sized rows.
        Any other value returns the 14 values that sit between those two
        segments.
    SEQ : int, default 2
        Number of rows the time-variant segment is folded into. Only used
        when ``category == 'TimeVariant'``.

    Returns
    -------
    numpy.ndarray
        A 2-D array of shape ``(SEQ, n // SEQ)`` for ``'TimeVariant'``,
        otherwise a 1-D slice of ``array``.

    Raises
    ------
    ValueError
        If the time-variant segment cannot be split evenly into ``SEQ`` rows.
    """
    # Widths of the two fixed-size segments at the end of the vector.
    nlp_width = 100
    static_width = 14
    tail_width = nlp_width + static_width

    if category == 'NLP':
        return array[-nlp_width:]

    if category == 'TimeVariant':
        time_variant = array[0:-tail_width]
        # Integer arithmetic avoids the float round-trip of int(len / SEQ) and
        # lets us report an uneven split with a readable message.
        row_width, remainder = divmod(time_variant.shape[0], SEQ)
        if remainder:
            raise ValueError(
                f"time-variant segment of length {time_variant.shape[0]} "
                f"cannot be split evenly into SEQ={SEQ} rows"
            )
        return time_variant.reshape((SEQ, row_width))

    return array[-tail_width:-nlp_width]

In [46]:
class AccidentDataset(data.Dataset):
    """Map-style dataset built from one or more ``.npy`` feature/label files.

    The feature files are concatenated row-wise, and so are the label files.
    For each row, the last column is read as ``geo_code``; the remaining
    columns are split by ``reshape_by_catagory`` into a time-variant block
    (folded into ``SEQ_LEN`` rows), a ``geohash2vec`` block and a ``poi`` block.
    """

    # Number of rows the time-variant block of every sample is folded into.
    SEQ_LEN = 8

    def __init__(self, X_npy_files: [str], y_npy_files: [str]):
        """Load and concatenate the given feature and label files.

        Parameters
        ----------
        X_npy_files : list of str
            Paths of the ``.npy`` feature matrices (one row per sample).
        y_npy_files : list of str
            Paths of the matching ``.npy`` label arrays, in the same order.
        """
        # NOTE(security): allow_pickle=True unpickles object arrays, which can
        # execute arbitrary code. Only point this at files you trust.
        self.X_data = np.concatenate([np.load(f, allow_pickle=True) for f in X_npy_files], axis=0)
        self.y_data = np.concatenate([np.load(f, allow_pickle=True) for f in y_npy_files], axis=0)

    def __len__(self):
        """Return the number of samples (length of the label array)."""
        return self.y_data.shape[0]

    @staticmethod
    def _as_float_tensor(values):
        """Convert a NumPy array to a ``torch.float`` tensor."""
        # np.float was only an alias of the builtin float (float64) and has
        # been removed from NumPy; np.float64 is the exact equivalent.
        return torch.tensor(values.astype(np.float64), dtype=torch.float)

    def __getitem__(self, item):
        """Return sample ``item`` as a dict of tensors.

        Keys: ``time_variant``, ``geohash2vec``, ``poi`` (all ``torch.float``),
        ``geo_code`` and ``y`` (dtype inferred by ``torch.tensor``).
        """
        features = self.X_data[item, :-1]
        geo_code = self.X_data[item, -1]

        time_variant = reshape_by_catagory(features, "TimeVariant", SEQ=self.SEQ_LEN)
        geohash2vec = reshape_by_catagory(features, "NLP")
        poi = reshape_by_catagory(features, "geohash")
        label = self.y_data[item]

        return {
            'time_variant': self._as_float_tensor(time_variant),
            'geohash2vec': self._as_float_tensor(geohash2vec),
            'poi': self._as_float_tensor(poi),
            'geo_code': torch.tensor(geo_code),
            'y': torch.tensor(label),
        }


In [47]:
class DeepNeuralNetwork(pl.LightningModule):
    """Fully connected two-class classifier over per-sample feature vectors.

    The network input is the concatenation of the last row of the
    time-variant block, the ``geohash2vec`` vector and the ``poi`` vector;
    the first linear layer expects that concatenation to be 139 wide.
    The module also owns its data: it builds the train/validation dataloaders
    from the per-city ``.npy`` files of the cities passed to the constructor.
    """

    def __init__(self, cities: [str], *args, **kwargs):
        """
        Parameters
        ----------
        cities : list of str
            City names used to fill in the dataset file-name templates.
        *args, **kwargs
            Forwarded unchanged to ``pl.LightningModule``.
        """
        super().__init__(*args, **kwargs)
        self.cities = cities
        self.final_dense_layers = nn.Sequential(
            nn.Linear(139, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 2)
        )
        self.loss = nn.CrossEntropyLoss()

    def forward(self, tv, g2v, poi, gcode):
        """Return the class logits for a batch.

        Only the last row of ``tv`` (``tv[:, -1]``) is used; ``gcode`` is
        accepted for interface symmetry with the batch dict but ignored.
        """
        fc_inp = torch.cat((tv[:, -1], g2v, poi), dim=1)
        return self.final_dense_layers(fc_inp)

    def configure_optimizers(self):
        """Adam with lr=1e-3 and weight decay 1e-5 over all parameters."""
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001, weight_decay=1e-5)
        return [optimizer]

    def _make_dataset(self, split):
        """Build the ``AccidentDataset`` for ``split`` ('train' or 'test') over ``self.cities``."""
        return AccidentDataset(
            X_npy_files=[f"../data_files/dataset/X_{split}_{city}.npy" for city in self.cities],
            y_npy_files=[f"../data_files/dataset/y_{split}_{city}.npy" for city in self.cities],
        )

    def train_dataloader(self):
        """Shuffled loader over the ``train`` files, batch size 1024."""
        return data.DataLoader(
            self._make_dataset("train"),
            batch_size=1024,
            shuffle=True,
            num_workers=4,
            pin_memory=True,
        )

    def val_dataloader(self):
        """Unshuffled loader over the ``test`` files, batch size 128.

        Note that validation is run on the ``X_test_*`` / ``y_test_*`` files.
        """
        return data.DataLoader(
            self._make_dataset("test"),
            batch_size=128,
            num_workers=4,
            pin_memory=True,
        )

    def _logits_for(self, batch):
        """Run the network on a batch dict as produced by ``AccidentDataset``."""
        return self(
            tv=batch['time_variant'],
            poi=batch['poi'],
            g2v=batch['geohash2vec'],
            gcode=batch['geo_code'],
        )

    def training_step(self, batch, batch_nb):
        """Compute and log the cross-entropy loss of one training batch."""
        loss = self.loss(self._logits_for(batch), batch['y'])
        self.log("train_loss", loss, on_epoch=True, prog_bar=True, logger=True)
        return {"loss": loss}

    def validation_step(self, batch, batch_nb):
        """Compute and log loss and accuracy of one validation batch."""
        out = self._logits_for(batch)
        # Fraction of samples whose arg-max class equals the label.
        acc = (torch.argmax(out, dim=1) == batch['y']).float().mean()
        loss = self.loss(out, batch['y'])
        self.log("val_loss", loss, on_epoch=True, prog_bar=False, logger=True)
        self.log("val_acc", acc, on_epoch=True, prog_bar=True, logger=True)
        return {"val_batch_loss": loss, "val_batch_acc": acc}

    def on_train_end(self):
        """Print a classification report for the validation data once training stops."""
        y_pred, y_true = self.predict(return_labels=True)
        print("Train Stopped: Printing F1 Score (of Validation) ...")
        print(classification_report(y_true, y_pred))

    @torch.no_grad()
    def predict(self, return_labels=True):
        """Predict classes for every sample of the validation dataloader.

        Parameters
        ----------
        return_labels : bool, default True
            When true, also return the ground-truth labels.

        Returns
        -------
        list of int, or (list of int, list of int)
            Predicted classes, and the true labels if ``return_labels``.
        """
        predicted = []
        actual = []

        # Inference should run in eval mode; the previous mode is restored
        # afterwards so calling this mid-training does not leave the module
        # in the wrong state.
        was_training = self.training
        self.eval()
        try:
            for batch in self.val_dataloader():
                y_hat = self(
                    tv=batch['time_variant'].to(self.device),
                    poi=batch['poi'].to(self.device),
                    g2v=batch['geohash2vec'].to(self.device),
                    gcode=batch['geo_code'].to(self.device),
                )
                predicted.extend(torch.argmax(y_hat, dim=1).tolist())
                # Labels are only converted to a Python list, so they do not
                # need to be moved to the model's device first.
                actual.extend(batch['y'].tolist())
        finally:
            self.train(was_training)

        if return_labels:
            return predicted, actual

        return predicted

In [48]:
# Train a single model on the combined data of all listed cities.
all_cities = ["Atlanta", "Austin", "Charlotte", "Dallas", "Houston", "LosAngeles"]
trainer_options = dict(
    gpus=1,
    num_nodes=1,
    deterministic=True,
    max_epochs=10,
    progress_bar_refresh_rate=0,  # remove this entry to enable the progress bar
)

print("Begin training ...")
model = DeepNeuralNetwork(cities=all_cities)
trainer = pl.Trainer(**trainer_options)
trainer.fit(model)
print("Completed training!")

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type             | Params
--------------------------------------------------------
0 | final_dense_layers | Sequential       | 219 K 
1 | loss               | CrossEntropyLoss | 0     
--------------------------------------------------------
219 K     Trainable params
0         Non-trainable params
219 K     Total params


Begin training ...
Train Stopped: Printing F1 Score (of Validation) ...
              precision    recall  f1-score   support

           0       0.87      0.97      0.92     26137
           1       0.71      0.30      0.42      5341

    accuracy                           0.86     31478
   macro avg       0.79      0.63      0.67     31478
weighted avg       0.84      0.86      0.83     31478

Completed training!


In [49]:
# Train a separate, freshly initialised model for each city in turn.
per_city_trainer_options = dict(
    gpus=1,
    num_nodes=1,
    deterministic=True,
    max_epochs=10,
    progress_bar_refresh_rate=0,  # remove this entry to enable the progress bar
)

for city in ("Atlanta", "Austin", "Charlotte", "Dallas", "Houston", "LosAngeles"):
    print(f"Begin training ... Dataset: {city}")
    model = DeepNeuralNetwork(cities=[city])
    trainer = pl.Trainer(**per_city_trainer_options)
    trainer.fit(model)

# Printed once, after the last city has finished.
print("Completed training!")


GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type             | Params
--------------------------------------------------------
0 | final_dense_layers | Sequential       | 219 K 
1 | loss               | CrossEntropyLoss | 0     
--------------------------------------------------------
219 K     Trainable params
0         Non-trainable params
219 K     Total params
GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type             | Params
--------------------------------------------------------
0 | final_dense_layers | Sequential       | 219 K 
1 | loss               | CrossEntropyLoss | 0     
--------------------------------------------------------
219 K     Trainable params
0         Non-trainable params
219 K     Total params
GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LO

Begin training ... Dataset: Atlanta
Train Stopped: Printing F1 Score (of Validation) ...
              precision    recall  f1-score   support

           0       0.85      0.96      0.90      1984
           1       0.72      0.36      0.48       531

    accuracy                           0.84      2515
   macro avg       0.78      0.66      0.69      2515
weighted avg       0.82      0.84      0.81      2515

Begin training ... Dataset: Austin
Train Stopped: Printing F1 Score (of Validation) ...
              precision    recall  f1-score   support

           0       0.92      0.94      0.93      3863
           1       0.67      0.62      0.64       801

    accuracy                           0.88      4664
   macro avg       0.80      0.78      0.79      4664
weighted avg       0.88      0.88      0.88      4664

Begin training ... Dataset: Charlotte
Train Stopped: Printing F1 Score (of Validation) ...
              precision    recall  f1-score   support

           0       0.88