In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from shapely.geometry import MultiPolygon, Point

In [2]:
# Load the raw interventions table from its gzip-compressed parquet file.
interventions_df = pd.read_parquet(path='interventions_df.parquet.gzip')

In [3]:
# Preview the first five raw rows (5 is also the default of ``head``).
interventions_df.head(n=5)

Unnamed: 0,missionid,eventlevel,eventtype,latitude,longitude,moment,dataframe
0,20222490011,N5,P033 - Trauma,5085139.0,436918.0,2022-09-06 11:49:21.586859+02:00,interventions_bxl
1,20222490011,N5,P033 - Trauma,5085139.0,436918.0,2022-09-06 11:49:21.586859+02:00,interventions_bxl
2,20222490012,N5,P059 - Dizziness - Nausea,5083336.0,434504.0,2022-09-06 11:55:35.793679+02:00,interventions_bxl
3,20222490015,N5,P019 - Unconscious - syncope,5085076.0,436359.0,2022-09-06 12:39:23.433732+02:00,interventions_bxl
4,20222490019,N5,P033 - Trauma,508561.0,443169.0,2022-09-06 13:26:48.337914+02:00,interventions_bxl


In [4]:
# Hold out 25% of the raw rows as a test set; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is done on raw rows, before InterventionsCleaning
# collapses rows to one per missionid, so rows of the same mission may end up
# in both splits -- confirm this is acceptable.
interventions_train, interventions_test = train_test_split(
    interventions_df,
    test_size=0.25,
    random_state=123,
)

In [3]:
# Mapping from an AED-need label to the regular-expression patterns that select
# it. Patterns are matched (case-insensitively, via ``str.contains``) against
# the combined code built by InterventionsCleaning.cleaning, which has the form
# 'N<level>P<3-digit type>' (e.g. 'N5P033').
#
# Order matters: InterventionsCleaning.create_AEDneeded_column walks the keys
# in insertion order and every match overwrites the previous label, so a code
# matched under several keys keeps the label of the LAST matching key. For
# example 'N1P016' matches 'N1P01[6-9]' under 'Requires AED' but is finally
# labelled 'Does not require AED' through 'N[1-8]P016'.
# Codes that match no pattern get no label and are dropped by that method.
eventleveltype_dict = {'Requires AED': ['N1P00[0-9]', 'N1P01[0-5]', 'N1P01[6-9]',
                                        'N1P0[2-6][0-9]', 'N1P07[0-5]', 'N1P07[7-9]',
                                        'N1P0[8-9][0-9]',
                                        'N0P0[0-9][0-9]'
                                        ],
             # Exact codes first, then codes valid over a range of levels.
             'Maybe requires AED': ['N2P001', 'N2P002', 'N2P004',
                                    'N2P017', 'N2P023', 'N2P024', 
                                    'N2P029', 'N2P030', 'N2P033', 
                                    'N2P034', 'N2P036', 'N2P071',
                                    'N2P072', 'N2P073',
                                    'N[2-3]P007', 'N[2-3]P014', 'N[2-3]P021', 
                                    'N[2-3]P028', 
                                    'N[2-4]P009', 'N[2-4]P019', 'N[2-4]P020',
                                    'N[2-4]P022', 'N[2-4]P026', 'N[2-4]P039',
                                    'N[2-6]P011', 
                                    'N[2-8]P008', 'N[2-8]P038'
                                    ],
             # Listed last, so these patterns win over any earlier match.
             'Does not require AED': ['N[1-8]P016', 'N[1-8]P076',
                                      'N[2-8]P003', 'N[2-8]P005', 'N[2-8]P006', 
                                      'N[2-8]P010', 'N[2-8]P012', 'N[2-8]P013',
                                      'N[2-8]P015', 'N[2-8]P018', 'N[2-8]P025',
                                      'N[2-8]P027', 'N[2-8]P031', 'N[2-8]P032', 
                                      'N[2-8]P035', 'N[2-8]P059', 'N[2-8]P060', 
                                      'N[2-8]P061', 'N[2-8]P063', 'N[2-8]P064',
                                      'N[2-8]P065', 'N[2-8]P066', 'N[2-8]P067', 
                                      'N[2-8]P068', 'N[2-8]P069', 'N[2-8]P070',  
                                      'N[2-8]P074', 'N[2-8]P075', 'N[2-8]P077',
                                      'N[3-8]P001', 'N[3-8]P002', 'N[3-8]P004', 
                                      'N[3-8]P017', 'N[3-8]P023', 'N[3-8]P024', 
                                      'N[3-8]P029', 'N[3-8]P030', 'N[3-8]P033', 
                                      'N[3-8]P034', 'N[3-8]P036', 'N[3-8]P071', 
                                      'N[3-8]P072', 'N[3-8]P073',
                                      'N[4-8]P007', 'N[4-8]P014', 'N[4-8]P021', 
                                      'N[4-8]P028', 
                                      'N[5-8]P009', 'N[5-8]P019', 'N[5-8]P020', 
                                      'N[5-8]P022', 'N[5-8]P026', 'N[5-8]P039',
                                      'N[7-8]P011',
                                      ],
            }

In [4]:
class InterventionsCleaning:
    """Clean raw intervention records and label them by AED need.

    Instantiating the class runs the whole pipeline and stores every
    intermediate frame as an attribute:

    * ``dataset_noNA`` / ``dataset_NA`` -- rows without / with missing values
    * ``dataset_dropped`` -- cleaned columns plus ``AEDneeded_cleaned``
    * ``gdf_dataset`` / ``gdf_bel_dataset`` -- point geometries, all / in Belgium
    * ``df_sorted`` / ``df_cleaned`` -- one row per ``missionid``
    * ``indicators`` / ``final_cleaned`` -- weekday dummies / final frame

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide the columns ``missionid``, ``eventlevel``, ``eventtype``,
        ``latitude``, ``longitude`` and ``moment``.
    eventleveltype_dict : dict[str, list[str]]
        Label -> regex patterns. Keys are processed in insertion order and a
        later match overwrites an earlier one.
    boundary_path : str or path-like, optional
        File with the Belgian country boundary. Defaults to
        ``belgium_Belgium_Country_Boundary.geojson`` inside the directory of
        the same name, resolved relative to the working directory.
    """

    # Weekday labels returned by ``Timestamp.day_name()``, in the alphabetical
    # order ``pd.get_dummies`` uses for its columns.
    DAY_NAMES = ('Friday', 'Monday', 'Saturday', 'Sunday',
                 'Thursday', 'Tuesday', 'Wednesday')

    def __init__(self, dataset, eventleveltype_dict, boundary_path=None):
        # Local import: only needed to build the default path portably.
        import os

        self.dataset = dataset
        self.eventleveltype_dict = eventleveltype_dict
        if boundary_path is None:
            # os.path.join yields the original backslash path on Windows and a
            # valid forward-slash path everywhere else.
            boundary_path = os.path.join(
                "belgium_Belgium_Country_Boundary.geojson",
                "belgium_Belgium_Country_Boundary.geojson",
            )
        self.belgium_gdf = gpd.read_file(boundary_path)
        self.belgium_polygon = self.belgium_gdf['geometry']
        self.drop_missing()
        self.cleaning()
        self.create_AEDneeded_column()
        self.select_belgium()
        self.select_interventions()
        self.add_indicators()

    def drop_missing(self):
        """Split the input into complete rows and rows with any missing value."""
        self.dataset_noNA = self.dataset.dropna().copy()
        self.dataset_NA = self.dataset[self.dataset.isna().any(axis=1)]

    def extract_eventtype(self, value):
        """Return the first ``P`` + three-digit code in ``value``, else None."""
        if isinstance(value, str):
            match = re.search(r'P\d{3}', value)
            return match.group(0) if match else None
        return None

    def extract_eventlevel(self, value):
        """Return the first ``N`` + 1-2 digit level in ``value``, else None.

        Leading zeros are removed, so ``'N05'`` becomes ``'N5'``.
        """
        if isinstance(value, str):
            match = re.search(r'N\d{1,2}', value)
            if match:
                return 'N' + str(int(match.group(0)[1:]))
        return None

    def moment_cleaning_day(self, moment):
        """Return the weekday name of a timestamp."""
        return moment.day_name()

    def moment_cleaning_time(self, moment):
        """Return the time-of-day part of a timestamp."""
        return moment.time()

    def format_latitude(self, value):
        """Rescale ``value`` so two digits precede the decimal point.

        E.g. ``5085139.0 -> 50.85139``. ``0`` is returned unchanged; negative
        input yields NaN because of the logarithm.
        """
        if value == 0:
            return 0
        return value / 10**(np.floor(np.log10(value)) - 1)

    def format_longitude(self, value):
        """Rescale ``value`` so one digit precedes the decimal point.

        E.g. ``436918.0 -> 4.36918``. ``0`` is returned unchanged; negative
        input yields NaN because of the logarithm.
        """
        if value == 0:
            return 0
        return value / 10**(np.floor(np.log10(value)))

    def cleaning(self):
        """Derive the ``*_cleaned`` columns and drop the raw ones."""
        frame = self.dataset_noNA
        # Combined code such as 'N5P033'; rows where either part could not be
        # extracted become missing and are removed by the dropna() below.
        frame.loc[:, 'eventleveltype_cleaned'] = (
            frame.loc[:, 'eventlevel'].apply(self.extract_eventlevel)
            + frame.loc[:, 'eventtype'].apply(self.extract_eventtype)
        )
        frame.loc[:, 'day_cleaned'] = frame.loc[:, 'moment'].apply(self.moment_cleaning_day)
        frame.loc[:, 'time_cleaned'] = frame.loc[:, 'moment'].apply(self.moment_cleaning_time)
        frame.loc[:, 'latitude_cleaned'] = frame.loc[:, 'latitude'].apply(self.format_latitude)
        frame.loc[:, 'longitude_cleaned'] = frame.loc[:, 'longitude'].apply(self.format_longitude)
        self.dataset_dropped = frame.drop(columns=['eventlevel', 'eventtype', 'latitude', 'longitude'])
        self.dataset_dropped = self.dataset_dropped.dropna().copy()

    def create_AEDneeded_column(self):
        """Label every row via ``eventleveltype_dict``; drop unlabelled rows.

        Labels are applied in dictionary order, so when a code matches patterns
        of several labels the last label wins.
        """
        self.dataset_dropped['AEDneeded_cleaned'] = None
        codes = self.dataset_dropped['eventleveltype_cleaned']
        for key, value_list in self.eventleveltype_dict.items():
            if not value_list:
                # An empty alternation would match every row.
                continue
            # One pass per label instead of one per pattern: a row gets the
            # label if any of the label's patterns matches.
            pattern = '|'.join(f'(?:{value})' for value in value_list)
            mask = codes.str.contains(pattern, na=False, case=False)
            self.dataset_dropped.loc[mask, 'AEDneeded_cleaned'] = key
        self.dataset_dropped = self.dataset_dropped.dropna().copy()

    def select_belgium(self):
        """Build point geometries and keep the rows located inside Belgium."""
        self.gdf_dataset = gpd.GeoDataFrame(
            self.dataset_dropped,
            geometry=gpd.points_from_xy(self.dataset_dropped.longitude_cleaned,
                                        self.dataset_dropped.latitude_cleaned),
            crs="EPSG:4326",
        )
        # Merge all boundary rows into one geometry so the check is correct for
        # multi-row boundary files and can be vectorised.
        # NOTE(review): assumes the boundary file is also in EPSG:4326 -- confirm.
        boundary_parts = self.belgium_polygon
        if hasattr(boundary_parts, 'union_all'):  # geopandas >= 1.0
            boundary = boundary_parts.union_all()
        else:
            boundary = boundary_parts.unary_union
        # point.within(polygon) is the same predicate as polygon.contains(point).
        self.gdf_dataset['in_Belgium'] = self.gdf_dataset['geometry'].within(boundary)
        self.gdf_bel_dataset = self.gdf_dataset[self.gdf_dataset['in_Belgium']]

    def select_interventions(self):
        """Keep one row per ``missionid``.

        Rows are ordered by label in descending string order and then by
        ``moment`` ascending, and the first row of each mission is kept. With
        the labels 'Requires AED' / 'Maybe requires AED' / 'Does not require
        AED' the descending order puts 'Requires AED' first.
        """
        self.df_sorted = self.gdf_bel_dataset.sort_values(
            by=['missionid', 'AEDneeded_cleaned', 'moment'],
            ascending=[True, False, True],
        )
        self.df_cleaned = self.df_sorted.groupby('missionid').first().reset_index()

    def add_indicators(self):
        """Append 0/1 weekday indicator columns to the cleaned frame.

        All seven weekday columns are always present, even for weekdays that
        do not occur in the data, so separately cleaned frames share the same
        columns.
        """
        dummies = pd.get_dummies(self.df_cleaned['day_cleaned'])
        columns = sorted(set(self.DAY_NAMES) | set(dummies.columns))
        self.indicators = dummies.reindex(columns=columns, fill_value=0).astype(int)
        self.final_cleaned = pd.concat([self.df_cleaned, self.indicators], axis=1)

In [5]:
# Run the full cleaning pipeline on the complete (unsplit) interventions table.
all_clean = InterventionsCleaning(
    dataset=interventions_df,
    eventleveltype_dict=eventleveltype_dict,
)

In [None]:
        self.areas = {'Anderlecht': 17.74,'Oudergem':9.03,'Sint-Agatha-Berchem':2.95, 'Bruxelles':32.61,'Etterbeek':3.15,'Evere': 5.02,'Forest': 6.25, 'Ganshoren': 2.46,
         'Ixelles': 6.34,'Jette': 5.04,'Koekelberg': 1.17,'Sint-Jans-Molenbeek': 5.89,'Saint-Gilles':2.52,'Sint-Joost-ten-Node': 1.14,'Schaerbeek': 8.14,
         'Uccle': 22.91,'Watermael-Boitsfort':12.93,'Sint-Lambrechts-Woluwe': 7.22,'Sint-Pieters-Woluwe': 8.85,'Antwerpen': 204.51,'Brugge': 138.40,
         'Gent': 156.18,'Hasselt': 102.24,'Leuven': 56.63,  'Mons': 146.53,'Liège': 69.39,'Charleroi': 102.08, 'Namur': 175.69, 'Arlon': 118.64}

In [10]:
# Number of cleaned interventions per unit of covered area.
# The 29 terms are the per-municipality area figures, added left to right.
total_area = (
    17.74 + 9.03 + 2.95 + 32.61 + 3.15 + 5.02 + 6.25 + 2.46
    + 6.34 + 5.04 + 1.17 + 5.89 + 2.52 + 1.14 + 8.14 + 22.91
    + 12.93 + 7.22 + 8.85 + 204.51 + 138.40 + 156.18 + 102.24
    + 56.63 + 146.53 + 69.39 + 102.08 + 175.69 + 118.64
)
len(all_clean.final_cleaned) / total_area

504.72531694199

In [7]:
# Clean the training split. The label mapping is a required argument; it and
# the closing parenthesis had been commented out, leaving the call unclosed.
train_clean = InterventionsCleaning(dataset=interventions_train,
                                    eventleveltype_dict=eventleveltype_dict)


In [8]:
# Clean the test split. The label mapping is a required argument; it and the
# closing parenthesis had been commented out, leaving the call unclosed.
test_clean = InterventionsCleaning(dataset=interventions_test,
                                   eventleveltype_dict=eventleveltype_dict)

In [9]:
# Final cleaned frames (one row per mission, with weekday indicators).
train_df, test_df = train_clean.final_cleaned, test_clean.final_cleaned

In [10]:
# Persist the cleaned training frame as gzip-compressed parquet.
train_df.to_parquet(path='train_df.parquet.gzip', compression='gzip')

In [11]:
# Persist the cleaned test frame as gzip-compressed parquet.
test_df.to_parquet(path='test_df.parquet.gzip', compression='gzip')

In [12]:
# Preview the first five cleaned test rows (5 is also the default of ``head``).
test_df.head(n=5)

Unnamed: 0,missionid,moment,dataframe,eventleveltype_cleaned,day_cleaned,time_cleaned,latitude_cleaned,longitude_cleaned,AEDneeded_cleaned,geometry,in_Belgium,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,10221520002,2022-06-01 02:03:51+02:00,interventions1,N1P068,Wednesday,02:03:51,51.30626,4.40502,Requires AED,POINT (4.40502 51.30626),True,0,0,0,0,0,0,1
1,10221520006,2022-06-01 02:14:38+02:00,interventions1,N4P033,Wednesday,02:14:38,51.33113,4.79938,Does not require AED,POINT (4.79938 51.33113),True,0,0,0,0,0,0,1
2,10221520010,2022-06-01 02:26:53+02:00,interventions1,N5P033,Wednesday,02:26:53,51.17138,4.47345,Does not require AED,POINT (4.47345 51.17138),True,0,0,0,0,0,0,1
3,10221520017,2022-06-01 03:34:37+02:00,interventions1,N5P010,Wednesday,03:34:37,51.24724,4.45954,Does not require AED,POINT (4.45954 51.24724),True,0,0,0,0,0,0,1
4,10221520022,2022-06-01 04:29:56+02:00,interventions1,N1P099,Wednesday,04:29:56,51.33432,4.92224,Requires AED,POINT (4.92224 51.33432),True,0,0,0,0,0,0,1
