## Load merged data tables, index and lookback dates

In [39]:
import pandas as pd
import numpy as np

pos_index_lookback = pd.read_csv("Mock_data/results/pos_index_lookback.csv", 
                                 low_memory=False, index_col=0, parse_dates=["DATE_INDEX", "DATE_LOOKBACK"])
neg_index_lookback = pd.read_csv("Mock_data/results/neg_index_lookback.csv", 
                                 low_memory=False, index_col=0, parse_dates=["DATE_INDEX", "DATE_LOOKBACK"])
pos_data = pd.read_csv("Mock_data/merged_pos.csv", low_memory=False, index_col=0, parse_dates=["DATE"])
neg_data = pd.read_csv("Mock_data/merged_neg.csv", low_memory=False, index_col=0, parse_dates=["DATE"])

## Extract positive and negative datasets

- Get patients who are in the `pos_index_lookback` and `neg_index_lookback`
- Get events and spells that are within these timeframes
  - To make this efficient we duplicate the index and lookback date for each patient for each event
  - Then we simply filter rows that are between index and lookback date
  - Filter out rows that are before lookback date or after index date

In [40]:
# add index and lookback date to pos and neg data - inner join: only keep patients who have index and lookback dates
pos_data = pos_data.set_index("ENCRYPTED_HESID")
pos_data = pos_index_lookback.join(pos_data, how="inner")
neg_data = neg_data.set_index("ENCRYPTED_HESID")
neg_data = neg_index_lookback.join(neg_data, how="inner")

# filter out rows that fall outside of lookback period
pos_rows_to_keep = ((pos_data.DATE_INDEX >= pos_data.DATE) & (pos_data.DATE >= pos_data.DATE_LOOKBACK))
pos_data = pos_data[pos_rows_to_keep]
pos_data.reset_index(inplace=True)
neg_rows_to_keep = ((neg_data.DATE_INDEX >= neg_data.DATE) & (neg_data.DATE >= neg_data.DATE_LOOKBACK))
neg_data = neg_data[neg_rows_to_keep]
neg_data.reset_index(inplace=True)

## Aggregate primary and secondary DIAGs and OPERTNs

#### Aggregate primary and secondary diagnoses on patient level

- First and foremost all ICD codes in the `DIAG_` columns have to be cleaned to only first four characters.
- Then all `DIAG_` columns need to be trunkated to either 1, 2, 3, or 4 characters.
- These are categorical variables so they need to be one hot encoded first. 
- ICD cdes of `DIAG_01` should become `DIAG_1_ICD`
- ICD codes from either of `DIAG_02`-`DIAG_20` columns have to be merged into `DIAG_2_ICD`. 

Following the generation of dummy variables for the primary and secondary ICD codes we need t aggregate them
- Each patient has several episodes (marked by `EPIKEY` columns). 
- Within each episode there could be several spells. 
- We want to aggregate ICD diagnoses codes on the patient level from all episodes but without counting them multiple times on the spell level. 
- So we will first aggregate on the episode level, if any ICD code has a count > 1 we set it to 1, then aggregate on patient level.

In [6]:
# function to clean and trim ICD codes to fixed length
def get_first_n_chars(df, cols, n):
    df = df.copy()
    def converter(col):
        code = (col
                .str.replace("\.", "")
                .str.replace("-", "")
                .str[:n])
        return code
    df[cols] = df[cols].apply(converter)
    return df

# function to aggregate codes to primary and secondary levels across episodes
def create_aggregated_table(data, trim_level, cols_to_agg, primary_col, secondary_col):
    # make trimmed ICD code data 
    data = data.pipe(get_first_n_chars, cols_to_agg, trim_level)

    # generate dummy vars from primary diagnosis codes
    diag_1_df = pd.get_dummies(data[primary_col], prefix=primary_col)

    # generate dummy vars from primary diagnosis codes: DIAG_02
    diag_2_df = pd.get_dummies(data[cols_to_agg[1]], prefix=secondary_col)

    # generate dummies from and add all other secondary diagnoses codes DIAG_03 - DIAG_20 to DIAG_02
    diag_2_df = diag_2_df.astype(dtype=np.float64)
    for col in cols_to_agg[2:]:
        # print(col)
        dummy_tmp = pd.get_dummies(data[col], prefix=secondary_col)
        dummy_tmp = dummy_tmp.astype(dtype=np.float64)
        diag_2_df = diag_2_df.add(dummy_tmp, fill_value=0)

    # create table of patient ids with episode ids as index
    ids = data[["ENCRYPTED_HESID", "EPIKEY"]]
    ids = ids.set_index("EPIKEY")

    # concatenate episode keys with the icd diagnoses tables
    diag_df = pd.concat([data["EPIKEY"], diag_1_df, diag_2_df], axis=1)

    # aggregate on episode level, then correct if there were multiple spells
    epi_level_agg = diag_df.groupby("EPIKEY").sum()
    #ela_larger_than_one_mask = (epi_level_agg > 1).values
    epi_level_agg[epi_level_agg > 1] = 1

    # get patient ids for each of the aggregated episode keys
    ids = ids.loc[epi_level_agg.index]

    # join ICD tables with patient ids series, on episode ids 
    epi_level_agg = ids.join(epi_level_agg)

    # unstack episode ids as index in the aggregated ICD table
    epi_level_agg.reset_index(level=0, inplace=True)

    # group on patient level, we need to sum diag vars, but count epikeys
    agg_funcs = {c:"sum" if c!="EPIKEY" else "count" for c in epi_level_agg.columns}
    
    # delete the patient ids from the dict as we'll group by that variable
    agg_funcs.pop('ENCRYPTED_HESID', None)
    
    # get aggregates on patient level
    patient_level_agg = epi_level_agg.groupby('ENCRYPTED_HESID').agg(agg_funcs)

    # unstack patient ids as index 
    patient_level_agg.reset_index(level=0, inplace=True)

    return patient_level_agg

In [7]:
# create list of ICD diagnoses columns
diag_cols = ["DIAG_0" + str(i+1) for i in range(9)] + ["DIAG_" + str(i+10) for i in range(11)]

# create list of operation columns
oper_cols = ["OPERTN_0" + str(i+1) for i in range(9)] + ["OPERTN_" + str(i+10) for i in range(15)]

# define function to apply create_aggregated_table to any table (APC, AE, OP)
def get_diag_oper_codes(pos_data, neg_data, trim_level, table_to_extract, file_prefix):
    # extract table from positive and negative datasets
    p_data = pos_data[pos_data.TABLE.isin(table_to_extract)]
    n_data = neg_data[neg_data.TABLE.isin(table_to_extract)]
    
    # apply diag and oper extraction function
    pos_diag = create_aggregated_table(p_data, trim_level, diag_cols, "DIAG_01", "DIAG_02")
    pos_oper = create_aggregated_table(p_data, trim_level, oper_cols, "OPERTN_01", "OPERTN_02")
    neg_diag = create_aggregated_table(n_data, trim_level, diag_cols, "DIAG_01", "DIAG_02")
    neg_oper = create_aggregated_table(n_data, trim_level, oper_cols, "OPERTN_01", "OPERTN_02")
    
    # save resulting tables
    pos_diag.to_csv("Mock_data/results/pos_diag_" + str(trim_level) + "_" + file_prefix + ".csv")
    pos_oper.to_csv("Mock_data/results/pos_oper_" + str(trim_level) + "_" + file_prefix + ".csv")
    neg_diag.to_csv("Mock_data/results/neg_diag_" + str(trim_level) + "_" + file_prefix + ".csv")
    neg_oper.to_csv("Mock_data/results/neg_oper_" + str(trim_level) + "_" + file_prefix + ".csv")

## Produce DIAG and OPER tables

For each trim level (1, 2, 3, 4), for each table type (apc, ae, op, all), for both datasets (positives, negatives) create DIAG and OPER tables for primary and secondary procedures.

In [144]:
get_diag_oper_codes(pos_data, neg_data, 4, ["apc"], "apc")
get_diag_oper_codes(pos_data, neg_data, 4, ["ae"], "ae")
get_diag_oper_codes(pos_data, neg_data, 4, ["op"], "op")
get_diag_oper_codes(pos_data, neg_data, 4, ["apc", "ae", "op"], "all")

get_diag_oper_codes(pos_data, neg_data, 3, ["apc"], "apc")
get_diag_oper_codes(pos_data, neg_data, 3, ["ae"], "ae")
get_diag_oper_codes(pos_data, neg_data, 3, ["op"], "op")
get_diag_oper_codes(pos_data, neg_data, 3, ["apc", "ae", "op"], "all")

get_diag_oper_codes(pos_data, neg_data, 2, ["apc"], "apc")
get_diag_oper_codes(pos_data, neg_data, 2, ["ae"], "ae")
get_diag_oper_codes(pos_data, neg_data, 2, ["op"], "op")
get_diag_oper_codes(pos_data, neg_data, 2, ["apc", "ae", "op"], "all")

get_diag_oper_codes(pos_data, neg_data, 1, ["apc"], "apc")
get_diag_oper_codes(pos_data, neg_data, 1, ["ae"], "ae")
get_diag_oper_codes(pos_data, neg_data, 1, ["op"], "op")
get_diag_oper_codes(pos_data, neg_data, 1, ["apc", "ae", "op"], "all")

## Extract any date for any field

This is a tricky one, that requires us to reformat the table quite a bit. For this I wrote a function that turns each row into multiple rows (as many as many ICD or OPERTN codes we have in it). For each new row we duplicate the dates, epikey, patient ID. The resulting table then could be filtered in many ways to answer questions, like what's the first exposure date for patient X for ICD code Y.

The function performs the following steps:
- Define the columns that are needed to identify an event (patient ID, dates, table, epikey)
- Define the primary and secondary codes to act on
- Trim the codes to the required length
- Extract primary column
- Collapse the secondary codes into a single column by stacking them across multiple rows
- Join the primary, secondary columns back with the other columns so for each code we have a patient ID, dates, epikey, etc

In [8]:
# function to collapse any number of columns across a row into a single one
def collapse_cols_into_one(df, cols, stacked_label):
    # extract columns, stack them into a single column
    cols_stacked = df[cols].stack()
    # reset the second level of the index (introduced by stacking)
    cols_stacked = cols_stacked.reset_index(level=1)
    # drop the second level of index which is now a column
    cols_stacked = cols_stacked.drop("level_1", axis=1)
    # rename stacked column to the user defined one
    cols_stacked.rename(columns={0: stacked_label}, inplace=True)
    return cols_stacked

# function to get single column as dataframe
def get_single_column_df(df, col, label):
    col = df[col]
    col = pd.DataFrame(col.values, columns=[label], index=col.index)
    col = col[~pd.isnull(col).values]
    return col

# function to reformat wide format to long format with dates
def wide_to_long_with_dates(df, id_date_cols, 
                            cols_to_act_on, primary_cols, 
                            secondary_cols, trim_level):
    data = df.copy()
    # define data
    data = data[id_date_cols + cols_to_act_on]

    # trim codes to predetermined number of letters
    data = get_first_n_chars(data, cols_to_act_on, trim_level)

    # get primary and secondary dataframes
    primary_cols = get_single_column_df(data, primary_cols, "primary")
    secondary_cols = collapse_cols_into_one(data, secondary_cols, "secondary")

    # merge primary and secondary dataframes
    primary_secondary_df = primary_cols.join(secondary_cols, how="left")
    # stack them, so for each code we have precisely one row
    primary_secondary_df = primary_secondary_df.stack(dropna=True)
    # reset index from stacking
    primary_secondary_df = primary_secondary_df.reset_index([0, 1])
    # drop duplicates, i.e. rows of primary codes that are repeating
    primary_secondary_df = primary_secondary_df.drop_duplicates()
    # reintroduce index so we can join on it with dates later
    primary_secondary_df.set_index("level_0", inplace=True)
    # rename columns
    primary_secondary_df.columns = ["LEVEL", "CODE"]
    
    # merge them with date and id cols 
    id_date_df = data[id_date_cols]
    
    return primary_secondary_df.join(id_date_df, how="left")

## Extract count of MAINSPEF 

- For the positives on the index date
- For the positives prior to the index date, i.e. referral center

In [11]:
# define cols to identify each event precisely
id_date_cols = ["ENCRYPTED_HESID", "EPIKEY", "DATE", "DATE_INDEX", "TABLE"]
pos_mainspef = wide_to_long_with_dates(pos_data, id_date_cols, 
                                       ["MAINSPEF"], "MAINSPEF", 
                                       ["MAINSPEF"], 4)

In [7]:
# find mainspefs on the index date
index_date_mainspef = pos_mainspef.loc[(pos_mainspef.DATE == pos_mainspef.DATE_INDEX),:]
# make sure to only keep one per patient
index_date_mainspef = index_date_mainspef.groupby("ENCRYPTED_HESID").first()
# count mainspefs on index date
index_date_mainspef.groupby("CODE").size()

CODE
100      1
101      1
110    111
120      1
130      1
140      1
160      1
180      2
190      2
192      1
300    164
301      1
302      1
303      1
320     47
330      1
340      1
350      1
361      1
370      1
420     35
430      5
800     15
810      2
960      2
dtype: int64

In [8]:
referral_date_mainspef = pos_mainspef.sort_values("DATE").groupby("ENCRYPTED_HESID").nth(n=1)
referral_date_mainspef.groupby("CODE").size()

CODE
&       1
100    20
101     6
110    41
120     5
130    14
141     1
160     3
180     4
190     2
300    75
301     7
302     2
303     5
305     1
310     1
314     2
320    14
330     5
340     4
350     1
361     3
370     1
400     6
410     5
420    18
421     1
430     5
501     2
502     1
600     1
710     2
800     4
810     1
901     1
950     1
960     1
dtype: int64

## Extract first exposure dates for DIAG and OPERTN codes

For a given dataset (positives or negatives) 
- build a long formatted dataframe with the dates of all primary and secondary codes (ICD or OPERTN) 
- which were trimmed to the specified level,
- then using these tables, 
  - extract the first exposure date for each of these codes 
  - on a primary and secondary level separately 
  - for each patient.

In [15]:
def get_first_exp_dates(df, level, prefix):
    long_df = df.copy()
    # filter down long dataframe to primary or secondary codes
    long_df = long_df[long_df.LEVEL==level]
    # find date of first code for each patient
    long_df = long_df.sort_values("DATE").groupby(["ENCRYPTED_HESID", "CODE"]).first()
    # reset index of groupby operation and make them columns again
    long_df = long_df["DATE"].reset_index([0,1])
    # rename columns
    long_df.columns = ["ENCRYPTED_HESID", "CODE", "DATE"]
    # add prefix to the CODE variables 
    long_df.CODE = prefix + long_df.CODE
    # pivot table to turn codes into columns
    return long_df.pivot(index="ENCRYPTED_HESID", columns="CODE", values="DATE")

def get_first_exp_dates_wrapper(df, id_date_cols, 
                                cols_to_act_on, trim_level, 
                                prefix):
    data = df.copy()
    # get date dataframe with dates for all codes long formatted 
    long_dates = wide_to_long_with_dates(data, id_date_cols, 
                                         cols_to_act_on, cols_to_act_on[0], 
                                         cols_to_act_on[1:], trim_level)
    
    # extract primary and secondary first exposure dates
    primary_first_exp = get_first_exp_dates(long_dates, "primary", prefix + "1_")
    secondary_first_exp = get_first_exp_dates(long_dates, "secondary", prefix + "2_")
    
    # merge them 
    return primary_first_exp.join(secondary_first_exp, how="outer")

#### Get first exp date of ICD codes for positives

In [16]:
get_first_exp_dates_wrapper(pos_data, id_date_cols,
                        diag_cols, 3, "DIAG")

CODE,DIAG1_03,DIAG1_05,DIAG1_053,DIAG1_06,DIAG1_102,DIAG1_14,DIAG1_144,DIAG1_148,DIAG1_172,DIAG1_178,...,DIAG2_Z89,DIAG2_Z90,DIAG2_Z91,DIAG2_Z92,DIAG2_Z93,DIAG2_Z94,DIAG2_Z95,DIAG2_Z96,DIAG2_Z98,DIAG2_Z99
ENCRYPTED_HESID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0017E9B890C74E54AF5B93AB663D2F65,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
006EAF56DD6A4A04950B5E3787FA392B,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
014F5A15EB2E4393B9DE138B35C997B5,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
0179E0F45D5D41DBA73593FDF31B4768,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
01CDC87EEC6740A1843010FDE7DBBA96,NaT,2007-11-11,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
024D2F127EB94E81A6A3A79D6BD6D89C,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
044E3A98BC6D4E63B74B0FD25F751629,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
046B06A93C5C459597D9BF07AA033599,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,2017-01-18,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
049BC400721B49E2BC870BF33333A120,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
04B6C21B666F46E5828BE2F7AEBB6A1E,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT


#### Get first exp date of OPER codes for negatives

In [11]:
get_first_exp_dates_wrapper(neg_data, id_date_cols,
                        oper_cols, 4, "OPER")

CODE,OPER1_,OPER1_&,OPER1_A054,OPER1_A081,OPER1_A113,OPER1_A124,OPER1_A201,OPER1_A203,OPER1_A261,OPER1_A331,...,OPER2_Z976,OPER2_Z981,OPER2_Z983,OPER2_Z984,OPER2_Z985,OPER2_Z988,OPER2_Z989,OPER2_Z991,OPER2_Z992,OPER2_Z993
ENCRYPTED_HESID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7F0DE8C328D64E90AFB94068712AAD88,2005-10-15,2005-10-15,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
7F41E94853DB4957840854C0741DD14D,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
7F6E899DA45E497AB5A2B1E2B59D196F,2016-12-18,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
7F9A5451759E46DFB36C88ED796B781C,2016-10-17,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
7FA50A20E10C4E8FA1470A5423C7B690,2015-02-03,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
7FAC8C5776A9414F969A62F7D1D3608F,2014-07-31,2006-08-22,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
7FE42E4C88E24DBA8F09509FC5D6FB63,2006-08-16,2006-11-02,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
8032E82BA88D40E2AD632A4E81D74362,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
813ADB232BCD4AEFAD15235511BA98D7,2014-10-12,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
814BAF31825A414FB4B39016F35A2BC8,2011-07-20,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT


## Extract count of DIAG and OPERTN within set timeframe

These functions do the following:
- take a long formatted dataframe with dates of codes and filter it to a certain period
- within this filtered period, they open up the code into primary and secondary codes
- for each patient count the number of times the code happened for both primary and secondary codes
- do this for all the time intervals
- join the resulting tables into a single one

In [18]:
# filter down long dates df to a given time interval from index date
def filter_dates_to_interval(df, month1, month2):
    long_df = df.copy()
    # create date limits by adding month1 and month2 to index date
    date1 = long_df.DATE_INDEX - np.timedelta64(month1, 'M')
    date2 = long_df.DATE_INDEX - np.timedelta64(month2, 'M')
    long_df.insert(0, "DATE1", date1)
    long_df.insert(0, "DATE2", date2)
    # filter table and return it
    long_df = long_df[(long_df.DATE <= long_df.DATE1) & (long_df.DATE > long_df.DATE2)]
    # drop date1 and date2 and return df
    long_df.drop("DATE1", axis=1, inplace=True)
    long_df.drop("DATE2", axis=1, inplace=True)
    return long_df

# extract primary and secondary codes for a time interval
def get_interval_counts_prim_secon(df, prefix, time_prefix):
    long_df = df.copy()
    primary_interval_dates = get_interval_counts(long_df, "primary", prefix + "1_", time_prefix)
    secondary_interval_dates = get_interval_counts(long_df, "secondary", prefix + "2_", time_prefix)
    return primary_interval_dates.join(secondary_interval_dates, how="outer")

# extract codes for a time interval for a given level (prim or secon)
def get_interval_counts(df, level, prefix, time_prefix):
    long_df = df.copy()
    # filter down long dataframe to primary or secondary codes
    long_df = long_df[long_df.LEVEL==level]
    # add prefix and time_prefix to the CODE variables
    long_df.CODE = time_prefix + long_df.CODE
    long_df.CODE = prefix + long_df.CODE
    # count number of dates for each code/patient combo
    long_df = long_df.groupby(["CODE", "ENCRYPTED_HESID"]).size()
    # reset groupby index to get a df
    long_df = long_df.reset_index()
    # add count column name
    long_df.rename(columns={0: "COUNT"}, inplace=True)
    # pivot table to turn codes into columns
    return long_df.pivot(index="ENCRYPTED_HESID", columns="CODE", values="COUNT")

# wrapper that segments all code vars into 1, 2, rest years
def get_interval_counts_wrapper(df, id_date_cols, 
                                cols_to_act_on, trim_level, 
                                prefix):
    data = df.copy()
    # get date dataframe with dates for all codes long formatted 
    long_dates = wide_to_long_with_dates(data, id_date_cols, 
                                         cols_to_act_on, cols_to_act_on[0], 
                                         cols_to_act_on[1:], trim_level)
    
    # get time interval tables - THIS NEEDS TO BE SETUP IN THE PROJECT!
    first_year_dates = filter_dates_to_interval(long_dates, 0, 12)
    second_year_dates = filter_dates_to_interval(long_dates, 12, 24)
    rest_year_dates = filter_dates_to_interval(long_dates, 24, 200)
    
    # extract primary and secondary codes within these intervals
    
    first_year_interval_counts = get_interval_counts_prim_secon(first_year_dates, prefix, "Y1_")
    second_year_interval_counts = get_interval_counts_prim_secon(second_year_dates, prefix, "Y2_")
    rest_year_interval_counts = get_interval_counts_prim_secon(rest_year_dates, prefix, "YR_")
    
    # merge them 
    first_second = first_year_interval_counts.join(second_year_interval_counts, how="outer")
    return first_second.join(rest_year_interval_counts, how="outer")

#### Get DIAG codes for negatives in 3 time buckets

# NOTE ! 
are nans zeros for sure?

In [19]:
get_interval_counts_wrapper(neg_data, id_date_cols, diag_cols, 4, "DIAG")

CODE,DIAG1_Y1_01 0,DIAG1_Y1_01 3,DIAG1_Y1_0132,DIAG1_Y1_0220,DIAG1_Y1_03 1,DIAG1_Y1_03 2,DIAG1_Y1_03 3,DIAG1_Y1_0531,DIAG1_Y1_0532,DIAG1_Y1_0533,...,DIAG2_YR_Z968,DIAG2_YR_Z970,DIAG2_YR_Z974,DIAG2_YR_Z975,DIAG2_YR_Z978,DIAG2_YR_Z980,DIAG2_YR_Z981,DIAG2_YR_Z992,DIAG2_YR_Z993,DIAG2_YR_Z998
ENCRYPTED_HESID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7F0DE8C328D64E90AFB94068712AAD88,,,,,,,,,,1.0,...,,,,,,,,,,
7F41E94853DB4957840854C0741DD14D,,,,,,,,,,,...,,,,,,,,,,
7F6E899DA45E497AB5A2B1E2B59D196F,,,,,,,,,,,...,,,,,,,,,,
7F9A5451759E46DFB36C88ED796B781C,,,,,,,,,,,...,,,,,,,,,,
7FA50A20E10C4E8FA1470A5423C7B690,,,,,,,,,,,...,,,,,,,,,,
7FAC8C5776A9414F969A62F7D1D3608F,,,,,,,,,,,...,,,,,,,,,,
7FE42E4C88E24DBA8F09509FC5D6FB63,,,,,,,,,,,...,,,,,,,,,,
8032E82BA88D40E2AD632A4E81D74362,,,,,,,,,,,...,,,,,,,,,,
813ADB232BCD4AEFAD15235511BA98D7,,,,,,,,,,,...,,,,,,,,,,
814BAF31825A414FB4B39016F35A2BC8,,,,,,,,,,,...,,,,,,,,,,


## Episode duration 

Have three variables 
- one that is the count of bed days from the APC table
- another that is the count of episodes in the OP table 
- and a final variable that counts the number of A&E visits. 

In [17]:
pos_data[pos_data.TABLE=="apc"].groupby("EPIKEY").first()[["ENCRYPTED_HESID", "EPIDUR"]].reset_index(0)

Unnamed: 0,EPIKEY,ENCRYPTED_HESID,EPIDUR
0,1.050385e+11,589899F2768D4174927B4825A46AAA73,3.0
1,1.050385e+11,589899F2768D4174927B4825A46AAA73,1.0
2,1.050388e+11,6E55E5A4960C485CBF1FFA3B974528CE,0.0
3,1.050390e+11,2E6C7ED001F34031943DF05B448B53B9,42.0
4,1.050393e+11,6F945C4F8507416F98CF6C9A34D6AEC6,0.0
5,1.050398e+11,40712607658A46198A68C13A43226991,1.0
6,1.050398e+11,3E9CD86B589542D285B99A0A5A46833A,1.0
7,1.050398e+11,3E9CD86B589542D285B99A0A5A46833A,0.0
8,1.050398e+11,3E9CD86B589542D285B99A0A5A46833A,4.0
9,1.050401e+11,57228802A4624F3194AC66890D2CC6F4,5.0


In [18]:
pos_data[pos_data.TABLE=="op"].groupby(["ENCRYPTED_HESID"]).count()["EPIKEY"].reset_index(0)

Unnamed: 0,ENCRYPTED_HESID,EPIKEY
0,0017E9B890C74E54AF5B93AB663D2F65,6
1,006EAF56DD6A4A04950B5E3787FA392B,19
2,014F5A15EB2E4393B9DE138B35C997B5,1
3,0179E0F45D5D41DBA73593FDF31B4768,3
4,01CDC87EEC6740A1843010FDE7DBBA96,2
5,024D2F127EB94E81A6A3A79D6BD6D89C,3
6,046B06A93C5C459597D9BF07AA033599,2
7,049BC400721B49E2BC870BF33333A120,4
8,04B6C21B666F46E5828BE2F7AEBB6A1E,4
9,0504B69B7E974C47AA9CC6C1340EA80A,1


In [19]:
pos_data[pos_data.TABLE=="ae"].groupby(["ENCRYPTED_HESID"]).count()["EPIKEY"].reset_index(0)

Unnamed: 0,ENCRYPTED_HESID,EPIKEY
0,0017E9B890C74E54AF5B93AB663D2F65,0
1,01CDC87EEC6740A1843010FDE7DBBA96,0
2,046B06A93C5C459597D9BF07AA033599,2
3,07EA38E87C314432AFC6ECB42B1D4AA8,0
4,098C62946E8541FC87396327B876050E,0
5,0A24A301A01442848EE59FA886AD531E,0
6,0A7CBE11D9A04EAEBCE570486983D382,1
7,0C8739BBCC104C5F99A47C82F2F461C6,1
8,0E89B02548464B45ABE28DDFED1B7C3B,0
9,0EEDB206C4F04353883053E59DCDD1DB,0


## Extract age at index date

Extreme values (7000+) are neonatal ages. Everything above 150 can be discarded.

In [20]:
pos_age = pos_data.sort_values(by="DATE").groupby("ENCRYPTED_HESID").first()["AGE"]
neg_age = neg_data.sort_values(by="DATE").groupby("ENCRYPTED_HESID").first()["AGE"]
pos_age[pos_age > 150] = 1
neg_age[neg_age > 150] = 1

In [21]:
pos_age.describe()

count    368.000000
mean      55.141304
std       23.281283
min        0.000000
25%       44.750000
50%       62.000000
75%       71.250000
max       92.000000
Name: AGE, dtype: float64

In [22]:
neg_age.describe()

count    449.000000
mean      54.376392
std       23.148848
min        0.000000
25%       43.000000
50%       61.000000
75%       71.000000
max       95.000000
Name: AGE, dtype: float64

## Extract SES at index date

In [23]:
pos_ses = pos_data.sort_values(by="DATE").groupby("ENCRYPTED_HESID").first()["IMD04RK"]
neg_ses = neg_data.sort_values(by="DATE").groupby("ENCRYPTED_HESID").first()["IMD04RK"]

In [24]:
pos_ses.describe()

count      396.000000
mean     14860.588384
std       9305.250327
min         45.000000
25%       6632.250000
50%      14907.000000
75%      22636.250000
max      32347.000000
Name: IMD04RK, dtype: float64

In [25]:
neg_ses.describe()

count      450.000000
mean     14917.351111
std       9635.815696
min         42.000000
25%       6701.250000
50%      13458.000000
75%      23230.500000
max      32460.000000
Name: IMD04RK, dtype: float64

## Extract rural/urban var at index date

In [26]:
pos_ru = pos_data.sort_values(by="DATE").groupby("ENCRYPTED_HESID").first()["RURURB_IND"]
neg_ru = neg_data.sort_values(by="DATE").groupby("ENCRYPTED_HESID").first()["RURURB_IND"]

In [27]:
pos_ru.describe()

count    398.000000
mean       5.170854
std        0.651257
min        2.000000
25%        5.000000
50%        5.000000
75%        5.000000
max        9.000000
Name: RURURB_IND, dtype: float64

In [28]:
neg_ru.describe()

count    450.000000
mean       5.155556
std        0.548558
min        3.000000
25%        5.000000
50%        5.000000
75%        5.000000
max        8.000000
Name: RURURB_IND, dtype: float64