In [1]:
#Import libraries
import pandas as pd
import geopandas as gpd
import re
import pickle

In [2]:
df_aed = pd.read_csv('all_aed_coordinates.csv')

In [3]:
class AEDCleaning:
    def __init__(self, dataset, region):
        self.dataset = dataset
        self.region = region
        self.cleaning_municipalities()
        self.select_city()
        self.clean_locations()
        self.get_opening_times()
        self.access24h()
        self.assign_access()
        self.dropping_columns()

    def string_cleaning(self,df, columnname, old_and_new_labels):
        repl = {
            r'\/': None,
            r'\.': None,
            r'\. ': None,
            r'\ ': None,
            r'\;': None
        }
        new_column = f'{columnname}_cleaned'
        df[new_column] = df[columnname].replace(repl, regex=True)
        for key, input_list in old_and_new_labels.items():
            for word in input_list:
                df.loc[df[columnname].str.contains(word, na=False, case=False), new_column] = key
        return df

    def cleaning_municipalities(self):
        self.df_all_mun = self.dataset.dropna(subset=['municipality'])
        self.df_all_mun['municipality'] = self.df_all_mun['municipality'].str.lower()
        cleaning_cities = {
            'Antwerp': ['anvers', 'antwerpen', 'antwerp'],
            'Bruges': ['brugge', 'bruges'],
            'Brussels': ['bruxelles', 'bxl','brussel','anderlecht', 'Elsene', 'Etterbeek', 'Evere', 'Ganshoren', 'Jette', 'Koekelberg', 'Oudergem', 'Schaarbeek', 'Sint-Agatha-Berchem', 'Sint-Gillis', 'Sint-Jans-Molenbeek', 'Sint-Joost-ten-Node', 'Sint-Lambrechts-Woluwe', 'Sint-Pieters-Woluwe', 'Ukkel', 'Vorst', 'Watermaal-Bosvoorde', 'Auderghem', 'Berchem-Sainte-Agathe', 'Bruxelles-ville', 'Evere', 'Forest', 'Ixelles', 'Molenbeek-Saint-Jean', 'Saint-Gilles', 'Saint-Josse-ten-Noode', 'Schaerbeek', 'Uccle', 'Watermael-Boitsfort', 'Woluwe-Saint-Lambert', 'Woluwe-Saint-Pierre'],
            'Ghent': ['gent', 'ghent', 'gand'],
            'Hasselt': ['hasselt'],
            'Leuven': ['leuv', 'louvain', 'leuven'],
            'Liege': ['luik', 'liège', 'lieg'],
            'Mons': ['mons', 'bergen'],
            'Namur': ['namen', 'namur'],
            'Charleroi': ['charleroi'],
            'Arlon': ['arlon', 'aarlen'],
        }

        replacement_dict = {synonym: city for city, synonyms in cleaning_cities.items() for synonym in synonyms}
        self.df_all_mun_cleaned = pd.DataFrame()
        self.df_all_mun_cleaned['municipality_cleaned'] = self.df_all_mun['municipality'].replace(replacement_dict)
        self.df_all_mun_cleaned = pd.concat([self.df_all_mun,self.df_all_mun_cleaned], axis =1)
        self.df_all_mun_cleaned = self.df_all_mun_cleaned.drop(columns=['municipality'])

    def select_city(self):
        self.df_city = self.df_all_mun_cleaned[self.df_all_mun_cleaned['municipality_cleaned'] == self.region]
    
    def clean_locations(self):
        renaming_locations = {'outside public access': ['Extérieur','buit','Façade', 'facade','gevel', 'straat', 'veld','rue','veranda','muur', 'pied',
                                   'voorkant','mur','parking','pont', 'chemin', 'parc','plein','strand','tuin','jardin','fit-o-meter', 'hoek',
                                   'haute tension', 'thv','t.h.v', 'Loods', 'Militar','Dépôt','depot','economaat','park','Schelde','terras','Bois','port',
                                   'cour','pierre','bus', 'terrain'], 
                      'inside access': ['Accueil', 'Acceuil','accueuil','inkom','ing','onth','recep','récep','balie','comptoir','entr',
                                    'garde','keet', 'secrétar', 'quichets','secretar', 'caisse', 'sorti', 'portier', 'lobby', 'lobie', 'foyer', 'gardien',
                                    'info','securi', 'chantier', 'koer 2','klantendienst','wacht','kassa', 'self', 'TIKKLOK', 'pointeuse','poort','accès', 
                                    'préau', 'rez', 'réz','glvl','Gelijkvloers','rdc','r.d.c', '0','geschoss', 'geijkvloers', 'einpfang'],
                      'inside less accessible': ['sous-sol','sous sol','1er', '1e', '°', 'etage', 'ètage','étage','étg' ,'3de', '2de', '4de', '2e','1ste', 
                                    'verd', 'niv', '18th', '5i','5e', 'quai', 'spoor', 'perron','club', 'dojo', 'petan','tennis','voetbal','fiets', 'sport', 
                                    'athlét', 'athlet', 'zwem', 'football', 'hockey','jeux', 'fitness','Atletiek','speel','speler','foot','ski','padel', 'récré', 'redder',
                                 'sauveteurs','natation','pattinoire', 'sal','ascenseur','ASCENCEUR', 'Escalier' ,'local','lokaal', 'caf', 'bure', 'hal', 'gang', 'buvette', 
                                 'vestia','zaal','couloir','binnen','réfectoire',
                                    'keuken', 'cuisine','biblio','bar','infirmerie', 'ruimte', 'secret','maga','ateli', 'garage','cantine', 'rentré',
                                    'kantine', 'sacristie', 'kantoor', 'production', 'kleedkamer', 'lift', 'Intérieur','bât', 'bat', 'bouw', 'house',
                                    'shop', 'station', 'piscine', 'werkplaats', 'vestaire', 'restaurant','tandarts','praktijk', 'kabinet', 'refter',
                                    'winkel', 'toilet', 'room','maison', 'sanitair', 'office', 'lab','kamer','deur', 'school', 'facult', 
                                    'gym', 'accomodat','ecole', 'école', 'kerk', 'bloc', 'blok', 'Collège','eglise','trap','aula','huis',
                                    'restauration','catering','wc','hangar', 'kazerne','kabine', 'casernes', 'terminal', 'kai','fabriek','flats',
                                    'pharma','complexe', 'hulppost', 'onder','afspraak', 'bellen', 'facility', 'loge', 'pavillon', 'toonbank',
                                    'Fours', 'Enseignement', 'JBC', 'Refectoire', 'imprimante','Kast','Uitpunt','auditoraat','administratif','Recycl',
                                    'photocop', 'usine','chateau','class','afd','galeri', 'cc ', 'kinderboerderij', 'babygroup', 'Geriaterie', 'cabinet',
                                    'hébergement','bassin','Ortho','neuro','SP Dienst','panoptique', 'mobi','nomade','dispatch','volant','portable', 
                                    'Brandweerwagen', 'wagen','pas de véhicule fixe', 'non fixé','pas véhicule fixe',
                                 'leen', 'bouillet', 'fond réfectaire', 'operateur four', 'kinderopvang MIKI', 'espace client', 'ouvriers', 
                                 'schuin over medische dienst', 'all-ranks', 'ancienne boulangerie', 'jeu de balle', 'anciennes papeteries', 'graaf jansdijk', 
                                 'nouvelle tribune', '"chant des oiseaux' 'brasserie tinto', 'du coté gauche,près des gradins' 'Jeu de balle', 'piece de vie', 'omkleedse', 
                                 'chiro jongens SSW', 'autorail', 'milka', 'service travaux', 'ensachage', 'tour', 'voie humide', 'oiseaux', 'piece de vie', 
                                 'boverie', 'service population', 'locaux sociaux', 'espace vital', 'OC Lauwe', 'ecluse', 'poste de commande', 'medic','préfecture', 
                                 'bornavie', 'usage professionnel','chariot', 'Flegado', 'expédition', 'CDC', 'DCU', 'magneet', 'play 2 move','supermarkt', 'testarea', 
                                 'roekhout', 'sellerie', 'espace public','Leie', 'laugh', 'stad', 'leveren locatie', 'TEn HOve', 'CBRN', 'Events', 'rochehaut', 
                                 'Munsterbilzen', 'brasserie', 'Buanderie','Premiers Soins','EHBO','camion','un sac','secours','reserve', 'Afrika', 'définier', 'rechange', 
                                 'vervangt','aucun', 'section', 'corner','preau', 'dessus lance d Incendie', 'commande poste', 'zone de titre éditable', 'poste de commandes',
                                 'sacoche', 'plaine des sapins', 'boulodrôme', 'péniche', 'stand de tir', 'pièce de confidentialité', 'zijde spaarbekken', 'untergeschassPlakettenaum', 
                                 'expéditions', 'coté gauche','sambre', 'Woluwe', 'Villers', 'au sein du site prod', 'site du CTA', 'oc de klakeye', '0', '1', '2', '3', '4', '5', '6',  ]
                      }
        self.df_city_loc_cleaned = self.string_cleaning(self.df_city, 'location', renaming_locations)
        self.df_city_loc_cleaned['location_cleaned'] = self.df_city_loc_cleaned['location_cleaned'].fillna('inside less accessible')

    def extract_time_info(self, row):
        available = str(row['available']).lower()
        hours = str(row['hours']).lower()
        combined_info = f"{available} {hours}"
        return combined_info
        
    def get_opening_times(self):
        self.df_cleaning_opening = self.df_city_loc_cleaned.copy()
        self.df_cleaning_opening['combined_info'] = self.df_cleaning_opening.apply(self.extract_time_info, axis=1)
        self.df_cleaning_opening['opening_days'] = self.df_cleaning_opening['combined_info'].apply(self.find_days)
        self.df_cleaning_opening['opening_hour'], self.df_cleaning_opening['closing_hour'] = zip(*self.df_cleaning_opening['combined_info'].apply(self.extract_time_info_from_combined_info))
        
    def find_days(self,info):
        day_mapping = {
                r'\b(?:sun(?:day)?)\w*': 'Sun',
                r'\b(?:mon(?:day)?)\w*': 'Mon',
                r'\b(?:tue(?:sday)?)\w*': 'Tue',
                r'\b(?:wed(?:nesday)?)\w*': 'Wed',
                r'\b(?:thu(?:rsday)?)\w*': 'Thu',
                r'\b(?:fri(?:day)?)\w*': 'Fri',
                r'\b(?:sat(?:urday)?)\w*': 'Sat',
                r'\b(?:zon(?:dag)?)\w*': 'Sun',
                r'\b(?:maan(?:dag)?)\w*': 'Mon',
                r'\b(?:din(?:sdag)?)\w*': 'Tue',
                r'\b(?:woe(?:nsdag)?)\w*': 'Wed',
                r'\b(?:do(?:nderdag)?)\w*': 'Thu',
                r'\b(?:vr(?:ijdag)?)\w*': 'Fri',
                r'\b(?:za(?:terdag)?)\w*': 'Sat',
                r'\b(?:dim(?:anche)?)\w*': 'Sun',
                r'\b(?:lun(?:di)?)\w*': 'Mon',
                r'\b(?:mar(?:di)?)\w*': 'Tue',
                r'\b(?:mer(?:credi)?)\w*': 'Wed',
                r'\b(?:jeu(?:di)?)\w*': 'Thu',
                r'\b(?:ven(?:dredi)?)\w*': 'Fri',
                r'\b(?:sam(?:edi)?)\w*': 'Sat'
            }
        for pattern, day in day_mapping.items():
            info = re.sub(pattern, day, info, flags=re.IGNORECASE)
        pattern = r'(\b(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)\b)\s*(?:-\s*|\b(?:to|a|à|au|tot|tem|t\.e\.m)\s*)\s*(\b(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)\b)'
        matches = re.findall(pattern, info)
        found_pattern = False
        day_list =[]
        days = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun','Mon','Tue','Wed','Thu','Fri','Sat']
        short_days = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']
        if matches:
            for match in matches: 
                if (match[0] in short_days and match[1] in short_days):
                    found_pattern= True
                    start = match[0]   
                    end =   match[1]
                    in_between = False
                    for day in days:
                        if (day == start or in_between==True):
                            day_list.append(day)
                            in_between= True
                            if day == end:
                                in_between=False
                                break 
            if found_pattern==False:       
                for day in short_days:  
                    matches2 = re.findall(day, info)
                    if matches2:
                        day_list.append(matches2[0])
        else:  
            for day in short_days:  
                matches2 = re.findall(day, info)
                if matches2:
                    day_list.append(matches2[0])
        if day_list:
            returnvalue =''
            for day in day_list:
                returnvalue = day + ' ,' +returnvalue
            return returnvalue
        else:
            None
    
    
    def extract_time_info_from_combined_info(self,combined_info):
        time_pattern = r'(\b\d{1,2}[:h\.]?\d{2}\b|\b\d{1,2}[-:\s]?\d{1,2}[:h\.]?\d{2}\b|\b\d{1,2}\s?[hH]?[rs]?\b)(?![^\s]*\/[^\s]*)'
        opening_hour, closing_hour = None, None
        times = re.findall(time_pattern, combined_info)
        if len(times) == 2:
            hours = []
            for time in times:
                hour_pattern = r'(\d{1,2})[:.]?\d{0,2}[a-zA-Z]?'
                match = re.match(hour_pattern, time)
                if match:
                    hour = int(match.group(1))
                    hours.append(hour)
            if len(hours) == 2:        
                opening_hour, closing_hour = hours[0], hours[1]
        return opening_hour, closing_hour
    
    def contains_only_24_or_7(self, text):
        numbers = re.findall(r'\d+', text)
        return all(num in {'24', '7'} for num in numbers)
    
    def access24h(self):
        rows_with_24 = self.df_cleaning_opening[self.df_cleaning_opening['combined_info'].str.contains('24', case=False, na=False)]
        mask_combined_24 = rows_with_24['combined_info'].apply(self.contains_only_24_or_7)
        self.df_cleaning_opening.loc[(mask_combined_24 & self.df_cleaning_opening['opening_hour'].isna()), 'opening_hour'] = 0
        self.df_cleaning_opening.loc[(mask_combined_24 & self.df_cleaning_opening['closing_hour'].isna()), 'closing_hour'] = 23
        self.df_cleaning_opening.loc[(mask_combined_24 & self.df_cleaning_opening['opening_days'].isna()), 'opening_days'] = 'Mon ,Tue ,Wed ,Thu ,Fri, Sat ,Sun'
        
    def assign_access(self):
        self.assign_value('inside access','opening_hour', int(9))
        self.assign_value('inside access','closing_hour', int(16))
        self.assign_value('inside access','opening_days', 'Mon ,Tue ,Wed ,Thu ,Fri')
        self.assign_value('inside less accessible','opening_hour', int(9))
        self.assign_value('inside less accessible','closing_hour', int(16))
        self.assign_value('inside less accessible','opening_days', 'Mon ,Tue ,Wed ,Thu ,Fri')
        self.assign_value('outside public access','opening_hour', int(0))
        self.assign_value('outside public access','closing_hour', int(23))
        self.assign_value('outside public access','opening_days', 'Mon ,Tue ,Wed ,Thu ,Fri, Sat, Sun')


    def assign_value(self,label, column, value): 
        mask = (self.df_cleaning_opening['location_cleaned'] == label) & (
            self.df_cleaning_opening[column].isnull())
        self.df_cleaning_opening.loc[mask, column] =value
        
    def dropping_columns(self):
        self.final_df = self.df_cleaning_opening.drop(columns=['id','type','address','number', 'postal_code', 
                                                               'province', 'location', 'public', 'available', 
                                                               'hours', 'full_address', 'combined_info','municipality_cleaned'])
        self.final_df = self.final_df.dropna().reset_index(drop=True)

        


In [4]:
cities =['Antwerp', 'Bruges', 'Brussels', 'Ghent','Hasselt','Leuven','Liege','Mons','Namur','Charleroi','Arlon']

In [5]:
dic_all_cities ={}
for city in cities:
    cleaner = AEDCleaning(df_aed, city)
    dic_all_cities[city] = cleaner.final_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df_all_mun['municipality'] = self.df_all_mun['municipality'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[new_column] = df[columnname].replace(repl, regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df_city_loc_cleaned['location_cleaned'] = self.df_city_loc_clean

In [7]:
days = ['Mon' ,'Tue' ,'Wed' ,'Thu' ,'Fri', 'Sat', 'Sun']
all_aeds ={}
for city in dic_all_cities.keys():
    days_per_city= {}
    df = dic_all_cities[city]
    for day in days:
        df.loc[~df['opening_days'].str.contains(day), ['opening_hour', 'closing_hour']] = 0
        df_final = df.copy()
        df_final = df_final.drop(columns = ['opening_days'])
        days_per_city[day]= df_final
    all_aeds[city] =days_per_city



In [8]:
all_aeds['Brussels']

{'Mon':             lat       lon        location_cleaned  opening_hour  closing_hour
 0     50.842828  4.384426  inside less accessible           9.0          16.0
 1     50.882965  4.335196  inside less accessible           9.0          16.0
 2     50.875724  4.324338  inside less accessible           9.0          16.0
 3     50.831942  4.328980  inside less accessible           9.0          16.0
 4     50.845150  4.369893  inside less accessible           9.0          16.0
 ...         ...       ...                     ...           ...           ...
 1445  50.865428  4.378332  inside less accessible           9.0          16.0
 1446  50.865428  4.378332  inside less accessible           9.0          16.0
 1447  50.849238  4.363506  inside less accessible           9.0          16.0
 1448  50.815654  4.295099  inside less accessible           9.0          16.0
 1449  50.821723  4.362921  inside less accessible           9.0          16.0
 
 [1450 rows x 5 columns],
 'Tue':          

In [9]:
with open('all_aeds.pkl', 'wb') as f:
    pickle.dump(all_aeds,f)

In [10]:
with open('all_aeds.pkl', 'rb') as f:
    loaded_all_aeds =pickle.load(f)

In [11]:
loaded_all_aeds['Brussels']

{'Mon':             lat       lon        location_cleaned  opening_hour  closing_hour
 0     50.842828  4.384426  inside less accessible           9.0          16.0
 1     50.882965  4.335196  inside less accessible           9.0          16.0
 2     50.875724  4.324338  inside less accessible           9.0          16.0
 3     50.831942  4.328980  inside less accessible           9.0          16.0
 4     50.845150  4.369893  inside less accessible           9.0          16.0
 ...         ...       ...                     ...           ...           ...
 1445  50.865428  4.378332  inside less accessible           9.0          16.0
 1446  50.865428  4.378332  inside less accessible           9.0          16.0
 1447  50.849238  4.363506  inside less accessible           9.0          16.0
 1448  50.815654  4.295099  inside less accessible           9.0          16.0
 1449  50.821723  4.362921  inside less accessible           9.0          16.0
 
 [1450 rows x 5 columns],
 'Tue':          