In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
# Names of the datasets to combine, in the order they are concatenated.
# Each one is read from '<name>.parquet.gzip' by InterventionProcessor.load_dfs.
datasets = [
    'interventions_bxl',
    'interventions_bxl2',
    'interventions1',
    'interventions2',
    'interventions3',
    'cad9',
]

# Datasets whose raw timestamp is parsed with the explicit
# "%d%b%y:%H:%M:%S" pattern (see InterventionProcessor.format_times).
timeformat1 = ['interventions_bxl2', 'interventions1', 'interventions2', 'interventions3']

# Datasets whose raw timestamp is handed to pd.to_datetime without a format.
timeformat2 = ['interventions_bxl', 'cad9']

# Name of the raw timestamp column in each dataset: only 'interventions_bxl'
# uses the lower-case spelling.
column_mapping_moment = {
    name: ('t0' if name == 'interventions_bxl' else 'T0')
    for name in datasets
}

# Source column -> canonical column shared by every dataset that follows the
# "Mission ID / EventLevel Trip / ..." header layout.
_standard_column_mapping = {
    'Mission ID': 'missionid',
    'EventLevel Trip': 'eventlevel',
    'EventType Trip': 'eventtype',
    'Latitude intervention': 'latitude',
    'Longitude intervention': 'longitude',
    'formatted_t0': 'moment',
}

# Per-dataset mapping of source column -> canonical column. A list value means
# the same source column is copied into several canonical columns.
column_mapping = {
    'interventions_bxl': {
        'mission_id': 'missionid',
        'eventlevel_trip': 'eventlevel',
        'eventtype_trip': 'eventtype',
        'latitude_intervention': 'latitude',
        'longitude_intervention': 'longitude',
        'formatted_t0': 'moment',
    },
    'interventions_bxl2': {
        'Mission ID': 'missionid',
        'EventType and EventLevel': ['eventlevel', 'eventtype'],
        'Latitude intervention': 'latitude',
        'Longitude intervention': 'longitude',
        'formatted_t0': 'moment',
    },
    # Each dataset gets its own copy so the entries stay independent objects.
    **{
        name: dict(_standard_column_mapping)
        for name in ('interventions1', 'interventions2', 'interventions3', 'cad9')
    },
}

In [4]:
class InterventionProcessor:
    """Load several intervention datasets and stack them into one frame.

    Constructing an instance runs the whole pipeline: every dataset is read
    from ``'<name>.parquet.gzip'``, its raw timestamp column is parsed into
    ``formatted_t0``, the configured source columns are copied to canonical
    names, and the canonical columns of all datasets are concatenated into
    ``self.df_interventions``.

    Each loaded frame is stored as an instance attribute named after its
    dataset (``self.<name>``), so a dataset name that equals an existing
    attribute or method name would overwrite it.

    Parameters
    ----------
    datasets : list of str
        Dataset names to load; also the concatenation order.
    list_timeformat1 : list of str
        Datasets whose timestamp is parsed with ``TIMEFORMAT1_PATTERN``.
    list_timeformat2 : list of str
        Datasets whose timestamp is parsed by ``pd.to_datetime`` without an
        explicit format.
    column_mapping_moment : dict
        Dataset name -> name of its raw timestamp column.
    column_mapping : dict
        Dataset name -> {source column: canonical column or list of
        canonical columns}.
    """

    # Canonical columns kept in the combined frame, in output order.
    OUTPUT_COLUMNS = ('missionid', 'eventlevel', 'eventtype',
                      'latitude', 'longitude', 'moment', 'dataframe')

    # strptime pattern used for the datasets listed in ``list_timeformat1``.
    TIMEFORMAT1_PATTERN = "%d%b%y:%H:%M:%S"

    def __init__(self, datasets, list_timeformat1, list_timeformat2, column_mapping_moment, column_mapping):
        self.datasets = datasets
        self.list_timeformat1 = list_timeformat1
        self.list_timeformat2 = list_timeformat2
        self.column_mapping_moment = column_mapping_moment
        self.column_mapping = column_mapping
        # Order matters: each step relies on columns/attributes created by
        # the previous one.
        self.load_dfs()
        self.format_times()
        self.copy_columns()
        self.concatenate_dataframes()

    def load_dfs(self):
        """Read every dataset from ``'<name>.parquet.gzip'`` into ``self.<name>``."""
        for set_name in self.datasets:
            setattr(self, set_name, pd.read_parquet(f'{set_name}.parquet.gzip'))

    def format_times(self):
        """Add a parsed ``formatted_t0`` column to every listed dataset.

        Raises
        ------
        KeyError
            If a listed dataset has no entry in ``column_mapping_moment``.
        AttributeError
            If a listed dataset was not loaded by ``load_dfs``.
        """
        for name in self.list_timeformat1:
            df = getattr(self, name)
            column_name = self.column_mapping_moment[name]
            df['formatted_t0'] = pd.to_datetime(df[column_name], format=self.TIMEFORMAT1_PATTERN)

        for name in self.list_timeformat2:
            df = getattr(self, name)
            column_name = self.column_mapping_moment[name]
            # No explicit format here, pandas has to infer it.
            # NOTE(review): the recorded notebook output appears to show a
            # pandas warning pointing at this call; consider passing an
            # explicit format once the raw timestamp layout is confirmed.
            df['formatted_t0'] = pd.to_datetime(df[column_name])

    def copy_columns(self):
        """Copy source columns to their canonical names and tag the origin.

        The loaded frames are modified in place. A list of targets copies the
        same source column into each target unchanged.
        NOTE(review): for a combined source column (e.g. one holding both
        event type and level) both targets receive the full combined value —
        confirm whether it should be split.
        """
        for name, columns in self.column_mapping.items():
            df = getattr(self, name)
            for old_col, new_col in columns.items():
                targets = new_col if isinstance(new_col, list) else [new_col]
                for target in targets:
                    df[target] = df[old_col]
            # Remember which dataset each row came from.
            df['dataframe'] = name

    def concatenate_dataframes(self):
        """Stack the canonical columns of all datasets into ``self.df_interventions``.

        With no datasets configured the result is an empty frame that still
        carries the canonical columns (``pd.concat`` would raise on an empty
        list).
        """
        columns = list(self.OUTPUT_COLUMNS)
        frames = [getattr(self, name)[columns] for name in self.datasets]
        if not frames:
            self.df_interventions = pd.DataFrame(columns=columns)
            return
        self.df_interventions = pd.concat(frames, ignore_index=True)
        




In [5]:
# Build the processor from the configuration defined above. Construction runs
# the whole pipeline (load, parse timestamps, copy columns, concatenate), so
# the '<name>.parquet.gzip' files must be readable when this cell executes.
# Arguments are given in the order of InterventionProcessor.__init__.
interventions = InterventionProcessor(
    datasets,
    timeformat1,
    timeformat2,
    column_mapping_moment,
    column_mapping,
)

  df['formatted_t0'] = pd.to_datetime(df[column_name])


In [6]:
# Short handle on the combined frame built by the processor (same object, not a copy).
interventions_df = interventions.df_interventions

In [7]:
# Preview the first rows of the combined frame to check the unified columns.
# NOTE(review): the recorded output shows rows 0 and 1 as identical records —
# confirm whether duplicates in the source data are expected.
interventions_df.head()

Unnamed: 0,missionid,eventlevel,eventtype,latitude,longitude,moment,dataframe
0,20222490011,N5,P033 - Trauma,5085139.0,436918.0,2022-09-06 11:49:21.586859800+02:00,interventions_bxl
1,20222490011,N5,P033 - Trauma,5085139.0,436918.0,2022-09-06 11:49:21.586859800+02:00,interventions_bxl
2,20222490012,N5,P059 - Dizziness - Nausea,5083336.0,434504.0,2022-09-06 11:55:35.793679100+02:00,interventions_bxl
3,20222490015,N5,P019 - Unconscious - syncope,5085076.0,436359.0,2022-09-06 12:39:23.433732400+02:00,interventions_bxl
4,20222490019,N5,P033 - Trauma,508561.0,443169.0,2022-09-06 13:26:48.337914700+02:00,interventions_bxl


In [8]:
# Total number of rows in the combined frame (all datasets stacked).
len(interventions_df)

1045549

In [9]:
# Persist the combined frame as a gzip-compressed parquet file; the path is
# relative, so it is written wherever the notebook is executed from.
interventions_df.to_parquet('interventions_df.parquet.gzip',
              compression='gzip')