In [None]:
# Import libraries.
import pandas as pd
import matplotlib.pyplot as plt

# Display settings: show text columns at full width (no truncation).
pd.options.display.max_colwidth = None

# Read the scrape data. For now this is the sample file generated
# by p2000_brandweer_create_data.r.
scrapes = pd.read_csv(filepath_or_buffer="../data/all_sample_scrapes.csv")

# Uncomment to confirm the load looks right.
#scrapes.head()

In [None]:
# Rename columns.
scrapes.rename(columns={'X1': 'code' ,
                        'X2': 'times',
                        'X3': 'dates',
                        'X4': 'sign',
                        'X5': 'info'},
               inplace=True)

In [None]:
# Check it worked.
scrapes.columns

In [None]:
# Count missings.
scrapes.isna().sum()

In [None]:
# Missings are an artefact of the scrape, separating incidents.
scrapes_nm = scrapes.dropna(inplace = False)

In [None]:
# Due to scrape frequency, the same incident is captured many times.
# Collapse each group of duplicates (same time, date and info string) to a
# single row. keep='first' retains one copy per incident; keep=False would
# instead throw away *every* row of any incident that was scraped more than
# once, silently losing those incidents altogether.
scrapes_nm_dd = scrapes_nm.drop_duplicates(subset = ['times', 'dates', 'info'],
                                           keep = 'first')

In [None]:
# Compare (rows, columns) before and after de-duplication.
raw_dim, dd_dim = scrapes_nm.shape, scrapes_nm_dd.shape
print(raw_dim, dd_dim)

In [None]:
# Load in CAP data.
cap = pd.read_csv("../data/capcodelijst_source_in_header.csv", skiprows=1, delimiter=';')

In [None]:
# Check contents.
cap.head()

In [None]:
# Join by code. 
scrapes_wcodes = scrapes_nm_dd.merge(cap, on = "code", how = "left")

In [None]:
# Check structure now.
scrapes_wcodes.head()

In [None]:
# Flag occurences to ambulance or AMBU (we know these are commonly used).
scrapes_wcodes['info'].str.contains('Ambulance|AMBU').value_counts()

In [None]:
# Add flag to the data frame.
scrapes_wcodes = scrapes_wcodes.assign(ambu_flag = scrapes_wcodes['info'].str.contains('Ambulance|AMBU'))

In [None]:
# Check it worked.
scrapes_wcodes['ambu_flag'].value_counts()

In [None]:
# Compare the existing CAP flag with the character string filter.
pd.crosstab(index = scrapes_wcodes.cap_service, columns = scrapes_wcodes.ambu_flag)

The above demonstrates that the flag works nicely for KNRM and Politie, and almost always for Brandweer. Manual inspection of those Brandweer 'True' flags indicates that the CAP code list may genuinely miss them, for some reason, because the info string definitely states AMBU or Ambulance.

In [None]:
disgaree = scrapes_wcodes[(scrapes_wcodes['cap_service'] == "Ambulance") &
                          (scrapes_wcodes['ambu_flag'] == False) ]

In [None]:
# Inspect manually.
disgaree.head()

For now, we decide to create a hybrid flag: if it's identified via the character string, or the CAP code, it's an ambulance! Feel free to explore this more or try something different.

In [None]:
scrapes_wcodes = scrapes_wcodes.assign(hyb_ambu_flag = (scrapes_wcodes['cap_service'] == "Ambulance")  | (scrapes_wcodes['ambu_flag'] == True))

In [None]:
# Check that it worked as expected.
pd.crosstab(index = scrapes_wcodes.hyb_ambu_flag, columns = scrapes_wcodes.cap_service)

In [None]:
# Now we make the filter for ambulance-only.
ambu_only = scrapes_wcodes[scrapes_wcodes.hyb_ambu_flag == True]

In [None]:
ambu_only.head()

In [None]:
# Pull out the priority codes using first three characters, then remove any whitespace.
ambu_only = ambu_only.assign(prio = ambu_only['info'].str[:3].str.replace(" ", ""))

In [None]:
# Check resulting categories. 
ambu_only['prio'].value_counts()

As per information on P2000 hobby websites (e.g., [112 Zuidland](https://www.112-zuidland.nl/p1-en-a1-wat-betekend-dat-eigenlijk/)), we're only really interested in the A1 and A2 priority codes. The vast majority of info strings carry one of these anyway. To avoid capturing non-dispatch P2000 incidents, we now keep only those incidents that have such a code.

In [None]:
# Keep only A1/A2 dispatches. Take an explicit copy so that later column
# assignments on ambu_disp act on an independent frame rather than on a
# slice of ambu_only (which would raise SettingWithCopyWarning).
ambu_disp = ambu_only[(ambu_only.prio == "A1")  | (ambu_only.prio == "A2")].copy()

In [None]:
# Frequency counts of prios.
prio_counts = ambu_disp['prio'].value_counts()
print(prio_counts)

In [None]:
# Bar plot of prios nationwide.
prio_counts.plot.bar()

In [None]:
# Frequency counts of incidents, nationwide.
region_counts = ambu_disp['region'].value_counts()
print(region_counts)

In [None]:
# Bar plot of regional counts.
region_counts.plot.bar()

In [None]:
ambu_disp.columns

In [None]:
# Create time variable.
ambu_disp.loc[:, 'times'] = pd.to_datetime(ambu_disp['times'], format='%H:%M:%S').dt.time

In [None]:
print(type(ambu_disp['times'][0]))

In [None]:
ambu_disp.columns

In [None]:
print(ambu_disp['times'])

In [None]:
# Extract hours.
hours_result = []

for h in ambu_disp['times']:
    hours_result.append(h.hour)

In [None]:
# Check type. 
type(hours_result)

In [None]:
# Assign back to data frame. 
ambu_disp.loc[:, 'day_hours'] = pd.to_numeric(hours_result, downcast = 'signed')

print(ambu_disp['day_hours'])

In [None]:
# Freq count of dispatches per hour of day.
# Name both output columns explicitly: a bare value_counts().reset_index()
# yields ('index', 'day_hours') on pandas < 2.0 but ('day_hours', 'count') on
# pandas >= 2.0, and the plot below relies on 'day_hours' and 'count'.
hourly_counts = (
    ambu_disp['day_hours']
    .value_counts()
    .rename_axis('day_hours')
    .reset_index(name='count')
)
print(hourly_counts)

In [None]:
# Scatter of dispatch count against hour of day.
hourly_counts.plot(kind='scatter', x='day_hours', y='count')