In [1]:
# Import libraries.
import pandas as pd

# Settings.
pd.set_option('display.max_colwidth', None)

# Load in data. For now this is the sample generated
# in p2000_brandweer_create_data.r.
scrapes = pd.read_csv("../data/all_sample_scrapes.csv")

# Check load correctly.
scrapes.head()

Unnamed: 0,scrape_number,X1,X2,X3,X4,X5
0,2573,126158.0,22:34:30,17-02-23,GROUP,A1 12158 Rit 23268 Adriaan Morrienpad Driehuis NHAmbulance Kennemerland (Ambulance 12-158 Witte Kruis)
1,25660,,,,,
2,33315,1420054.0,23:50:19,24-04-22,GROUP,A1 AMBU 17154 1e Opbouwstraat 3076PR Rotterdam ROTTDM bon 60012CPA Rijnmond AZRR Ambulance 17-154
3,20392,,,,,
4,24735,1520999.0,05:05:00,26-05-23,GROUP,A1 Robertaland SGRAVH : 15118MKA Haaglanden (Monitorcode ALS)


In [2]:
# Rename columns.
scrapes.rename(columns={'X1': 'code' ,
                        'X2': 'times',
                        'X3': 'dates',
                        'X4': 'sign',
                        'X5': 'info'},
               inplace=True)

In [3]:
# Check it worked.
scrapes.columns

Index(['scrape_number', 'code', 'times', 'dates', 'sign', 'info'], dtype='object')

In [4]:
# Count missings.
scrapes.isna().sum()

scrape_number      0
code             346
times            346
dates            346
sign             346
info             346
dtype: int64

In [5]:
# Missings are an artefact of the scrape, separating incidents.
scrapes_nm = scrapes.dropna(inplace = False)

In [6]:
# Due to scrape frequency, there are loads of duplicates. Remove them.
scrapes_nm_dd = scrapes_nm.drop_duplicates(subset = ['times', 'dates', 'info'],
                                           keep = False)

In [7]:
# Check it worked. ***Re-check with full data**
raw_dim = scrapes_nm.shape
dd_dim  = scrapes_nm_dd.shape
print(raw_dim, dd_dim)

(654, 6) (654, 6)


In [8]:
# Load in CAP data.
cap = pd.read_csv("../data/capcodelijst_source_in_header.csv", skiprows=1, delimiter=';')

In [9]:
# Check contents.
cap.head()

Unnamed: 0,code,cap_service,region,dorp,unit_type,extra
0,100000,Brandweer,Amsterdam-Amstelland,,Proefalarm,
1,100001,Brandweer,Amsterdam-Amstelland,Aalsmeer,Bevelvoerders,
2,100004,Brandweer,Amsterdam-Amstelland,Aalsmeer,Korpsalarm,
3,100005,Brandweer,Amsterdam-Amstelland,Aalsmeer,Officier van Dienst Aalsmeer/UitHoorn,
4,100007,Brandweer,Amsterdam-Amstelland,Aalsmeer,Chauffeurs,


In [10]:
# Join by code. 
scrapes_wcodes = scrapes_nm_dd.merge(cap, on = "code", how = "left")

In [None]:
# Check structure now.
scrapes_wcodes.head()

In [24]:
# Count 'ambulance' occurence after making all lower.
str(scrapes_wcodes['info']).find('Ambulance', 0, len(scrapes_wcodes['info']) )

56

In [26]:
# Flag by row.
scrapes_wcodes['info'].str.contains('Ambulance').value_counts()

info
False    380
True     276
Name: count, dtype: int64

In [27]:
# Add flag to the data frame.
scrapes_wcodes = scrapes_wcodes.assign(ambu_flag = scrapes_wcodes['info'].str.contains('Ambulance'))

In [28]:
# Check it worked.
scrapes_wcodes['ambu_flag'].value_counts()

ambu_flag
False    380
True     276
Name: count, dtype: int64

In [31]:
# Crosstab to check agreement.
pd.crosstab(index = scrapes_wcodes.cap_service, columns = scrapes_wcodes.ambu_flag)

ambu_flag,False,True
cap_service,Unnamed: 1_level_1,Unnamed: 2_level_1
Ambulance,73,186
Brandweer,121,1
KNRM,1,0
Politie,22,0


In [59]:
# Create conditions for when there is disagreement.
con1 = scrapes_wcodes['cap_service'] == "Ambulance"
con2 = scrapes_wcodes['ambu_flag'] == True

