In [1]:
import pandas as pd
import hashlib

In [73]:
# Synthetic diagnosis records, one entry per (patient, concept, time) observation.
# P4 has records but no admission at all.
diag_data = {
    'PID': [
        'P1', 'P1', 'P1',
        'P2', 'P2', 'P2',
        'P3', 'P3',
        'P4', 'P4', 'P4',
    ],
    'CONCEPT': [
        'C0', 'C1', 'C2',
        'C3', 'C4', 'C3',
        'C5', 'C6',
        'C7', 'C8', 'C0',
    ],
    'TIMESTAMP': [
        '2022-01-01 10:00:00', '2023-01-01 10:00:00', '2023-01-01 11:00:00',
        '2023-02-01 12:00:00', '2023-02-01 13:00:00', '2023-02-01 19:00:00',
        '2023-03-01 14:00:00', '2023-03-01 15:00:00',
        '2023-04-01 16:00:00', '2023-04-01 17:00:00', '2023-05-01 17:00:00',
    ],
}

# Synthetic admissions. P1's two admissions overlap, and P6 has an admission
# but no diagnosis record.
adm_data = {
    'PID': ['P1', 'P1', 'P2', 'P2', 'P3', 'P6'],
    'TIMESTAMP_START': [
        '2023-01-01 08:00:00', '2023-01-01 09:00:00',
        '2023-02-01 10:00:00', '2023-02-01 18:00:00',
        '2023-03-01 13:00:00', '2023-03-01 13:00:00',
    ],
    'TIMESTAMP_END': [
        '2023-01-01 12:00:00', '2023-01-01 13:00:00',
        '2023-02-01 15:00:00', '2023-02-01 22:00:00',
        '2023-03-01 17:00:00', '2023-03-01 17:00:00',
    ],
    'ADMISSION_ID': [1, 2, 6, 3, 4, 5],
}

# Build the frames and parse the string timestamps into datetimes in one pass.
diag = pd.DataFrame(diag_data).assign(
    TIMESTAMP=lambda frame: pd.to_datetime(frame['TIMESTAMP'])
)
adm = pd.DataFrame(adm_data).assign(
    TIMESTAMP_START=lambda frame: pd.to_datetime(frame['TIMESTAMP_START']),
    TIMESTAMP_END=lambda frame: pd.to_datetime(frame['TIMESTAMP_END']),
    ADMISSION_ID=lambda frame: frame['ADMISSION_ID'].astype(int),
)

In [47]:
def filter_records_within_admission(concept_df, adm_df):
    """Return the (record, admission) pairs where the record lies strictly inside the admission.

    Every record is paired with every admission of the same PID via an outer
    merge, so a record that sits inside several overlapping admissions appears
    once per admission. Records whose TIMESTAMP equals TIMESTAMP_START or
    TIMESTAMP_END are excluded (both bounds are strict). The returned frame
    keeps the row labels of the merged frame, not those of ``concept_df``.
    """
    paired = concept_df.merge(adm_df, how='outer', on='PID')
    after_start = paired['TIMESTAMP'].gt(paired['TIMESTAMP_START'])
    before_end = paired['TIMESTAMP'].lt(paired['TIMESTAMP_END'])
    # Unmatched rows carry NaT on one side; both comparisons are then False.
    return paired.loc[after_start & before_end]

def assign_admission_id(df):
    """Derive a hashed ADMISSION_ID for each record from its PID and a per-patient episode counter.

    Records are ordered by PID and TIMESTAMP. Within a patient the counter
    starts at 0 and increases whenever the gap to the previous record is
    strictly more than 24 hours. The ID is the SHA-256 hex digest of
    "<PID>_<counter>". The sorted frame is returned with an ADMISSION_ID
    column added; the input frame is not modified.
    """
    seconds_per_day = 24 * 60 * 60

    def exceeds_one_day(gap):
        # For a patient's first record the gap is NaT, whose total_seconds()
        # is NaN, so the comparison is False.
        return gap.total_seconds() > seconds_per_day

    def sha256_hex(text):
        return hashlib.sha256(text.encode()).hexdigest()

    labelled = (
        df.sort_values(by=['PID', 'TIMESTAMP'])
        .assign(TimeDiff=lambda frame: frame.groupby('PID')['TIMESTAMP'].diff())
        .assign(NewID=lambda frame: frame['TimeDiff'].apply(exceeds_one_day))
        .assign(ID=lambda frame: frame.groupby('PID')['NewID'].cumsum().astype(int))
        .assign(
            ID=lambda frame: (
                frame['PID'].astype(str) + '_' + frame['ID'].astype(str)
            ).apply(sha256_hex)
        )
    )
    return labelled.drop(columns=['TimeDiff', 'NewID']).rename(columns={'ID': 'ADMISSION_ID'})

def combine_dataframes(df1, df2):
    """Stack ``df1`` on top of ``df2`` after stripping the admission bounds from ``df2``.

    ``df2`` must contain the TIMESTAMP_START and TIMESTAMP_END columns; they are
    removed before concatenation. Row labels of both inputs are kept as they are.
    """
    admission_bounds = ['TIMESTAMP_START', 'TIMESTAMP_END']
    without_bounds = df2.drop(admission_bounds, axis=1)
    return pd.concat((df1, without_bounds))

def add_admission_id(concept_df, adm_df):
    """
    Add an ADMISSION_ID to every record.

    Records that fall inside an admission (as decided by
    ``filter_records_within_admission``) take that admission's ID. All other
    records get an ID generated by ``assign_admission_id``.

    Parameters
    ----------
    concept_df : DataFrame with at least PID and TIMESTAMP columns.
    adm_df : DataFrame with PID, TIMESTAMP_START, TIMESTAMP_END and ADMISSION_ID.

    Returns
    -------
    DataFrame with the out-of-admission records first, followed by the
    in-admission records. ``concept_df`` itself is not modified.
    """
    # The frame returned by the filter comes out of a merge, so its row labels
    # have nothing to do with concept_df's index. Comparing the two indexes
    # would split the records wrongly, so each record carries an explicit key
    # through the merge instead.
    row_key = '_SOURCE_ROW'
    tagged = concept_df.copy()
    tagged[row_key] = range(len(tagged))

    # Split the records into those inside and outside an admission.
    in_adm = filter_records_within_admission(tagged, adm_df)
    out_adm = tagged[~tagged[row_key].isin(in_adm[row_key])]

    # Records outside every admission get a generated ID.
    out_adm = assign_admission_id(out_adm)

    # Stack both parts and remove the bookkeeping key again.
    result_df = combine_dataframes(out_adm, in_adm)
    return result_df.drop(columns=[row_key])



In [86]:
import pandas as pd
import hashlib

def filter_records_within_admission(concept_df, adm_df):
    """
    Keep the records that fall within an admission and attach the closest such admission.

    A record is within an admission of the same PID when
    TIMESTAMP_START <= TIMESTAMP <= TIMESTAMP_END (both bounds inclusive).
    When several admissions contain the record, the one whose start is closest
    to the record, i.e. the one that started last, is chosen, so every record
    appears at most once.

    Containment is checked before the closest admission is picked. Picking the
    admission with the nearest start first (``merge_asof`` with
    ``direction="nearest"``) and checking containment afterwards drops records
    that lie late in one admission but closer to the start of the next one,
    and records inside a long admission that has a shorter one nested in it.

    Parameters
    ----------
    concept_df : DataFrame with at least PID and TIMESTAMP columns.
    adm_df : DataFrame with PID, TIMESTAMP_START, TIMESTAMP_END and ADMISSION_ID.

    Returns
    -------
    DataFrame with the columns of ``concept_df`` followed by the non-PID columns
    of ``adm_df``, ordered by TIMESTAMP. Row labels are the record's position
    in the timestamp-sorted ``concept_df``. ADMISSION_ID keeps the dtype it has
    in ``adm_df``. Neither input frame is modified.
    """
    row_key = "__record_pos__"

    # The position in timestamp order doubles as a unique per-record key.
    records = concept_df.sort_values("TIMESTAMP", kind="stable").reset_index(drop=True)
    records[row_key] = records.index

    # Pair each record with every admission of the same patient, then keep
    # only the pairs where the admission really contains the record.
    pairs = records.merge(adm_df, on="PID", how="inner")
    contained = pairs[
        (pairs["TIMESTAMP"] >= pairs["TIMESTAMP_START"])
        & (pairs["TIMESTAMP"] <= pairs["TIMESTAMP_END"])
    ]

    # Every remaining admission started at or before the record, so the
    # closest one is the one with the latest start.
    closest = contained.sort_values(
        [row_key, "TIMESTAMP_START"], ascending=[True, False]
    ).drop_duplicates(subset=row_key, keep="first")

    return closest.set_index(row_key).rename_axis(None)

def assign_admission_id(df, max_gap=pd.Timedelta(hours=24)):
    """
    Assign generated admission IDs to records based on PID and time difference.

    Records are ordered by TIMESTAMP. Within each PID a counter starts at 0 and
    is incremented whenever the gap to that patient's previous record is
    strictly greater than ``max_gap``. The ID is the SHA-256 hex digest of
    "<PID>_<counter>", so it is reproducible for the same set of records.

    Parameters
    ----------
    df : DataFrame with at least PID and TIMESTAMP columns. Not modified.
    max_gap : anything ``pd.Timedelta`` accepts (Timedelta, timedelta, string
        such as "12h"). Largest gap between consecutive records of one patient
        that still counts as the same admission. Defaults to 24 hours.

    Returns
    -------
    The timestamp-sorted frame with an ADMISSION_ID column holding the digests.
    """
    gap_limit = pd.Timedelta(max_gap)
    df_sorted = df.sort_values(['TIMESTAMP'])

    # Time since the same patient's previous record; NaT for a patient's first record.
    time_since_previous = df_sorted.groupby('PID')['TIMESTAMP'].diff()
    # NaT compares as False, so a patient's first record never opens a new episode.
    starts_new_episode = (time_since_previous > gap_limit).astype(int)
    episode_number = starts_new_episode.groupby(df_sorted['PID']).cumsum()

    labels = df_sorted['PID'].astype(str) + '_' + episode_number.astype(str)
    admission_ids = labels.map(lambda label: hashlib.sha256(label.encode()).hexdigest())
    return df_sorted.assign(ADMISSION_ID=admission_ids)

def combine_dataframes(df1, df2):
    """
    Concatenate ``df1`` with ``df2`` once the admission bounds are removed from ``df2``.

    TIMESTAMP_START and TIMESTAMP_END must be present in ``df2``; both are
    dropped so that only the shared record columns remain. The rows of ``df1``
    come first and the row labels of both frames are left unchanged.
    """
    bound_columns = ['TIMESTAMP_START', 'TIMESTAMP_END']
    records_in_admission = df2.drop(bound_columns, axis=1)
    frames = (df1, records_in_admission)
    return pd.concat(frames)

def add_admission_id(concept_df, adm_df):
    """
    Add an ADMISSION_ID to every record.

    Records that fall within an admission (as decided by
    ``filter_records_within_admission``) take that admission's ID. All other
    records get an ID generated by ``assign_admission_id`` from PID and
    timestamp.

    Parameters
    ----------
    concept_df : DataFrame with at least PID and TIMESTAMP columns. It is not
        modified; the EVENT_ID column is added to a copy.
    adm_df : DataFrame with PID, TIMESTAMP_START, TIMESTAMP_END and ADMISSION_ID.

    Returns
    -------
    DataFrame with a fresh 0..n-1 index: the out-of-admission records first,
    then the in-admission records. The EVENT_ID column (the record's position
    in ``concept_df``) is kept in the result.
    """
    # Number the records on a copy so the caller's frame does not silently
    # gain an EVENT_ID column.
    events = concept_df.assign(EVENT_ID=range(len(concept_df)))

    # Split the records into those inside and outside an admission, matching
    # on EVENT_ID because the filter returns a frame with its own row labels.
    in_adm = filter_records_within_admission(events, adm_df)
    out_adm = events.loc[~events["EVENT_ID"].isin(in_adm["EVENT_ID"])]

    # Records outside every admission get a generated ID.
    out_adm = assign_admission_id(out_adm)

    result_df = combine_dataframes(out_adm, in_adm)
    return result_df.reset_index(drop=True)


In [88]:
# Display the admission frame for reference.
adm

Unnamed: 0,PID,TIMESTAMP_START,TIMESTAMP_END,ADMISSION_ID
0,P1,2023-01-01 08:00:00,2023-01-01 12:00:00,1
1,P1,2023-01-01 09:00:00,2023-01-01 13:00:00,2
2,P2,2023-02-01 10:00:00,2023-02-01 15:00:00,6
3,P2,2023-02-01 18:00:00,2023-02-01 22:00:00,3
4,P3,2023-03-01 13:00:00,2023-03-01 17:00:00,4
5,P6,2023-03-01 13:00:00,2023-03-01 17:00:00,5


In [87]:
# Apply add_admission_id to the sample diagnosis and admission frames and display the result.
add_admission_id(diag, adm)

Unnamed: 0,PID,CONCEPT,TIMESTAMP,EVENT_ID,ADMISSION_ID
0,P1,C0,2022-01-01 10:00:00,0,a432666a91e37f7e13acf30ccba5e4203940eb3be022a2...
1,P4,C7,2023-04-01 16:00:00,8,c2ffb3808fb1be8192c7acf9dda697317dd6cf50f9d190...
2,P4,C8,2023-04-01 17:00:00,9,c2ffb3808fb1be8192c7acf9dda697317dd6cf50f9d190...
3,P4,C0,2023-05-01 17:00:00,10,9561fa0af1ae6c8e2cf37deac9a87353556eb640fecfa0...
4,P1,C1,2023-01-01 10:00:00,1,2.0
5,P1,C2,2023-01-01 11:00:00,2,2.0
6,P2,C3,2023-02-01 12:00:00,3,6.0
7,P2,C4,2023-02-01 13:00:00,4,6.0
8,P2,C3,2023-02-01 19:00:00,5,3.0
9,P3,C5,2023-03-01 14:00:00,6,4.0
