# HES Tables - scripts for initial merging

In this script I prepare some functions for:

* separating positive and negative cohort, for each table

* merging the initial tables by year, for each dataset

In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
data_path="./sample_HES/"
file1= "HES_Sample_APC.txt"
filename=data_path+file1

In [63]:
#create mock list of positive patient IDs
df=pd.read_csv(filename, sep="\t", low_memory=False)
sample_rows = random.sample(list(df.ENCRYPTED_HESID), 50)
mock_pos = df["ENCRYPTED_HESID"].ix[sample_rows]
mock_pos.to_csv("./sample_HES/HES_APC_PosIDs.txt", sep="\t")

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  after removing the cwd from sys.path.


In [3]:
#function that creates appropriate names for new files, maintaing the same extension
#it tries to take in account that the filename might be a full path with initial ./
def outfile_name(filename, suffix):
    name=filename.split(".")
    if len(name)==2 :
        nameout=[name[0]+suffix, name[1]]   
    else:
        nameout=["",name[1]+suffix, name[2]]
    return ".".join(nameout)

In [4]:
#function that extract positive cohort (for each table)
def create_positive_cohort(filename, IDcol_label, pos_IDs, sep="\t", suffix_out="_pos"):
    """
    --Parameters--
    filename: str 
              File path of the table
    IDcol_label: str 
                 Name of the column that contains the patient IDS (different for each type of dataset)
    pos_IDs: str 
             File path of the positive IDs file
    sep: str 
         File separator for read_csv
    suffix_out: str
         Suffix that defines the positive cohort files
    --Returns--
    """
    df=pd.read_csv(filename, sep=sep, low_memory=False)
    pos_IDs=pd.read_csv(pos_IDs, sep="\t", low_memory=False)
    pos_df=df[df[IDcol_label].isin(pos_IDs)]
    fileout=outfile_name(filename, suffix_out)
    pos_df.to_csv(fileout,sep=sep,index=False)
    return 

In [86]:
outfile_name(filename, suffix="_pos")

'./sample_HES/HES_Sample_APC_pos.txt'

In [87]:
create_positive_cohort (filename, "ENCRYPTED_HESID", "./sample_HES/HES_APC_PosIDs.txt", sep="\t", suffix_out="_pos")

./sample_HES/HES_Sample_APC.txt


In [5]:
#function that filter out positive cohort (for each table)
def create_negative_cohort (filename, IDcol_label, pos_IDs, sep="\t", suffix_out="_neg"):
    """
    --Parameters--
    filename: str 
              File path of the table
    IDcol_label: str 
                 Name of the column that contains the patient IDS (different for each type of dataset)
    pos_IDs: str 
             File path of the positive IDs file
    sep: str 
         File separator for read_csv
    suffix_out: str
         Suffix that defines the negative cohort files
    --Returns--
    """
    df=pd.read_csv(filename, sep=sep, low_memory=False)
    pos_IDs=pd.read_csv(pos_IDs, sep="\t", low_memory=False)
    neg_df=df[~df[IDcol_label].isin(pos_IDs)]
    fileout=outfile_name(filename, suffix_out)
    neg_df.to_csv(fileout,sep=sep,index=False)
    return 

In [90]:
create_negative_cohort (filename, "ENCRYPTED_HESID", "./sample_HES/HES_APC_PosIDs.txt", sep="\t", suffix_out="_neg")

In [6]:
#Test: inner join of positive and negative cohorts must be empty
pos=pd.read_csv("./sample_HES/HES_Sample_APC_pos.txt", sep="\t", low_memory=False)
neg=pd.read_csv("./sample_HES/HES_Sample_APC_neg.txt", sep="\t", low_memory=False)

merged_inner = neg.merge(pos, left_on='ENCRYPTED_HESID', right_on='ENCRYPTED_HESID', how='inner')

# what's the size of the output data?
merged_inner.shape

(0, 475)