## Unify the columns of the 3 datasets

The APC, AE and OP datasets have different columns, so their column names must be compared and aligned first.

In [1]:
import pandas as pd
import numpy as np

# Load the three tab-separated mock extracts (APC, AE, OP).
# low_memory=False parses each file in one go instead of in chunks, so that
# pandas infers a single dtype per column.
apc = pd.read_csv("Mock_data/HES_Sample_APC.txt", sep="\t", low_memory=False)
ae = pd.read_csv("Mock_data/HES_Sample_AE.txt", sep="\t", low_memory=False)
op = pd.read_csv("Mock_data/HES_Sample_OP.txt", sep="\t", low_memory=False)

# Put the column names of the three tables side by side, one table per column.
# Row i holds the i-th column name of each table; tables with fewer columns
# are padded with NaN at the bottom.
apc_cols = pd.Series(apc.columns)
ae_cols = pd.Series(ae.columns)
op_cols = pd.Series(op.columns)
dataset_cols = pd.concat([apc_cols, ae_cols, op_cols], axis=1, keys=["APC", "AE", "OP"])

# Write the comparison table without the row index so it can be annotated by hand.
dataset_cols.to_csv("dataset_columns.csv", index=False)

`dataset_columns.csv` was then edited in Excel so that each dataset has a `_FINAL` column listing which columns to keep and the names they should be renamed to. The edited mapping is read back in below from `Mock_data/final_columns.csv`.

In [2]:
# Load the hand-edited column mapping (index_col=None is the read_csv default).
# NOTE(review): later cells match rows of this table to table columns by
# position, so the row order must not have been changed while editing — confirm.
dataset_cols = pd.read_csv("Mock_data/final_columns.csv", index_col=None)

In [45]:
# Display the mapping table to check the *_FINAL columns by eye.
dataset_cols

Unnamed: 0,APC,APC_FINAL,AE,AE_FINAL,OP,OP_FINAL
0,Table_Year,,Table_Year,,Table_Year,
1,ADMIAGE,AGE,ACTIVAGE,,ADMINCAT,
2,ADMIDATE,DATE,AEARRIVALMODE,,APPTAGE,AGE
3,ADMIMETH,ADMIMETH,AEATTENDCAT,,APPTDATE,DATE
4,ADMINCAT,,AEATTENDDISP,,ATENTYPE,
5,ADMISORC,,AEDEPTTYPE,,ATTENDED,
6,ADMISTAT,,AEINCLOCTYPE,,ATTENDKEY,EPIKEY
7,AEKEY,,AEKEY,,BABYAGE,
8,ANAGEST,,AEPATGROUP,,CARERSI,
9,ANASDATE,,AEREFSOURCE,,CSNUM,


Load in all 3 mock datasets, filter them down to the `_FINAL` columns, unify column names, then concatenate them and save the resulting merged data frame.

In [3]:
def keep_final_columns(table, final_names):
    """Keep only the mapped columns of ``table`` and give them their final names.

    Parameters
    ----------
    table : pandas.DataFrame
        Table to filter. Its columns are matched to ``final_names`` by
        position, not by name.
    final_names : pandas.Series
        A ``*_FINAL`` column of ``dataset_cols``. Entry ``i`` is the new name
        for column ``i`` of ``table``, or NaN if that column is to be dropped.

    Returns
    -------
    pandas.DataFrame
        The selected columns of ``table``, in their original order, renamed.
    """
    keep_ix = np.flatnonzero(final_names.notna().to_numpy())
    selected = table.iloc[:, keep_ix]
    # Assign plain values rather than the Series itself; otherwise the Series
    # name (e.g. "APC_FINAL") becomes the name of the columns index.
    selected.columns = final_names.to_numpy()[keep_ix]
    return selected


# Apply the same filter-and-rename step to each of the three tables.
apc = keep_final_columns(apc, dataset_cols.APC_FINAL)
ae = keep_final_columns(ae, dataset_cols.AE_FINAL)
op = keep_final_columns(op, dataset_cols.OP_FINAL)

In [47]:
# (rows, columns) of the APC table
apc.shape

(19112, 95)

In [48]:
# (rows, columns) of the AE table
ae.shape

(8734, 31)

In [49]:
# (rows, columns) of the OP table
op.shape

(19112, 58)

## Merge 3 tables

- Once the columns are filtered and renamed to match across the 3 tables, stack the tables row-wise. If a column is missing from one of the tables, `pd.concat` fills it with missing values (NaN) for that table's rows
- Convert the date column to a datetime type that pandas understands
- Order by HESID and split data into pos and neg datasets

In [4]:
# Row position at which the patient-sorted table is cut into the "pos" and
# "neg" parts. It was picked by hand after inspecting ENCRYPTED_HESID around
# this position so that the cut does not fall in the middle of one patient's
# rows; re-check it whenever the input data changes.
SPLIT_ROW = 45097

# Tag every row with its source table so we can later tell which event comes
# from which table. insert() raises if the column already exists, so the guard
# keeps this cell re-runnable.
for label, table in (("apc", apc), ("ae", ae), ("op", op)):
    if "TABLE" not in table.columns:
        table.insert(table.shape[1], "TABLE", label)

# Stack the three tables by rows. Columns absent from a table are filled with
# NaN; ignore_index=True gives the result a fresh 0..n-1 index.
merged_df = pd.concat([apc, ae, op], ignore_index=True)

# Parse DATE so pandas treats it as a datetime, and move it to position 1.
date_col = pd.to_datetime(merged_df["DATE"])
merged_df = merged_df.drop(columns="DATE")
merged_df.insert(1, "DATE", date_col)

# Mock data doesn't have a PROCODE field, so fabricate one. Despite the name,
# the values are not random: they cycle 0, 1, ..., 9 down the rows.
# (np.int was removed in NumPy 1.24, hence the plain modulo.)
n = merged_df.shape[0]
random_procode = np.arange(n) % 10
merged_df.insert(1, "PROCODE", random_procode)

# Sort by patient ID so each patient's rows are contiguous.
merged_df = merged_df.sort_values("ENCRYPTED_HESID")

# Split the sorted table at the hand-picked row.
pos_data = merged_df.iloc[:SPLIT_ROW]
neg_data = merged_df.iloc[SPLIT_ROW:]

# Save the full table and both parts; the pre-sort row number is written as
# the first CSV column.
merged_df.to_csv("Mock_data/merged_mock.csv")
pos_data.to_csv("Mock_data/merged_pos.csv")
neg_data.to_csv("Mock_data/merged_neg.csv")