# HES Tables - scripts for initial merging/chunking by patient ID

In this script I prepare some functions for:

* separating positive and negative cohort, for each table

* merging the initial tables by year, for each dataset

In [11]:
#Loading libraries
import pandas as pd
import numpy as np
import random
import glob, os

In [8]:
#just setting paths to test functions
data_path="./sample_HES/"
file1= "HES_Sample_APC.txt"
filename=data_path+file1

In [63]:
#creating mock list of positive patient IDs
df=pd.read_csv(filename, sep="\t", low_memory=False)
sample_rows = random.sample(list(df.ENCRYPTED_HESID), 50)
mock_pos = df["ENCRYPTED_HESID"].ix[sample_rows]
mock_pos.to_csv("./sample_HES/HES_APC_PosIDs.txt", sep="\t")

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  after removing the cwd from sys.path.


# Output filenames

In [3]:
#function that creates appropriate names for new files, maintaing the same extension
#it tries to take in account that the filename might be a full path with initial ./
def outfile_name(filename, suffix):
    name=filename.split(".")
    if len(name)==2 :
        nameout=[name[0]+suffix, name[1]]   
    else:
        nameout=["",name[1]+suffix, name[2]]
    return ".".join(nameout)

# Positive and Negative cohorts

In [4]:
#function that extract positive cohort (for each table)
def create_positive_cohort(filename, IDcol_label, pos_IDs, sep="\t", suffix_out="_pos"):
    """
    --Parameters--
    filename: str 
              File path of the table
    IDcol_label: str 
                 Name of the column that contains the patient IDS (different for each type of dataset)
    pos_IDs: str 
             File path of the positive IDs file
    sep: str 
         File separator for read_csv
    suffix_out: str
         Suffix that defines the positive cohort files
    --Returns--
    """
    df=pd.read_csv(filename, sep=sep, low_memory=False)
    pos_IDs=pd.read_csv(pos_IDs, sep="\t", low_memory=False)
    pos_df=df[df[IDcol_label].isin(pos_IDs)]
    fileout=outfile_name(filename, suffix_out)
    pos_df.to_csv(fileout,sep=sep,index=False)
    return 

In [86]:
outfile_name(filename, suffix="_pos")

'./sample_HES/HES_Sample_APC_pos.txt'

In [87]:
create_positive_cohort (filename, "ENCRYPTED_HESID", "./sample_HES/HES_APC_PosIDs.txt", sep="\t", suffix_out="_pos")

./sample_HES/HES_Sample_APC.txt


In [5]:
#function that filter out positive cohort (for each table)
def create_negative_cohort (filename, IDcol_label, pos_IDs, sep="\t", suffix_out="_neg"):
    """
    --Parameters--
    filename: str 
              File path of the table
    IDcol_label: str 
                 Name of the column that contains the patient IDS (different for each type of dataset)
    pos_IDs: str 
             File path of the positive IDs file
    sep: str 
         File separator for read_csv
    suffix_out: str
         Suffix that defines the negative cohort files
    --Returns--
    """
    df=pd.read_csv(filename, sep=sep, low_memory=False)
    pos_IDs=pd.read_csv(pos_IDs, sep="\t", low_memory=False)
    neg_df=df[~df[IDcol_label].isin(pos_IDs)]
    fileout=outfile_name(filename, suffix_out)
    neg_df.to_csv(fileout,sep=sep,index=False)
    return 

In [90]:
create_negative_cohort (filename, "ENCRYPTED_HESID", "./sample_HES/HES_APC_PosIDs.txt", sep="\t", suffix_out="_neg")

In [6]:
#Test: inner join of positive and negative cohorts must be empty
pos=pd.read_csv("./sample_HES/HES_Sample_APC_pos.txt", sep="\t", low_memory=False)
neg=pd.read_csv("./sample_HES/HES_Sample_APC_neg.txt", sep="\t", low_memory=False)

merged_inner = neg.merge(pos, left_on='ENCRYPTED_HESID', right_on='ENCRYPTED_HESID', how='inner')

# what's the size of the output data?
merged_inner.shape

(0, 475)

# Concatenate table years

In [34]:
#Not sure we will have enough memory for this approach

In [42]:
#creating mock datasets to play with
df=pd.read_csv("./sample_HES/HES_Sample_APC_neg.txt", sep="\t", low_memory=False)
sub1=df.sample(frac=0.3,random_state=200)
df1=df.drop(sub1.index)
sub2=df1.sample(frac=0.5,random_state=200)
sub3=df1.drop(sub2.index)
sub1.to_csv("./sample_HES/HES_Sample_APC_neg_1.txt", sep="\t",index=False)
sub2.to_csv("./sample_HES/HES_Sample_APC_neg_2.txt", sep="\t",index=False)
sub3.to_csv("./sample_HES/HES_Sample_APC_neg_3.txt",sep="\t",index=False)

In [156]:
#listing files with same root (hopefully different years will have similar names)
#on Linux no need to change backlash in forwardlash
subfiles_name=glob.glob("./sample_HES/HES_Sample_APC_neg_*.txt")
print(subfiles_name)
subfiles_name_OS=[]
for name in subfiles_name:
    subfiles_name_OS.append(name.replace('\\', '/'))
print(subfiles_name_OS)

['./sample_HES\\HES_Sample_APC_neg_1.txt', './sample_HES\\HES_Sample_APC_neg_2.txt', './sample_HES\\HES_Sample_APC_neg_3.txt']
['./sample_HES/HES_Sample_APC_neg_1.txt', './sample_HES/HES_Sample_APC_neg_2.txt', './sample_HES/HES_Sample_APC_neg_3.txt']


In [163]:
#function that given a list of files ALL WITH THE SAME FORMAT, produce a concatenated file of the same format
def concatenate_tables(list_tables, fileout, sep="\t"):
    """
    --Parameters--
    list_tables: list of strings 
              List of files to be concatenated
    fileout: str 
             Name of the concatenated file in output
    sep: str 
         Optional. File separator for read_csv.
    --Returns--
    """
    df=pd.read_csv(list_tables[0], sep=sep, low_memory=False)
    for chunk in list_tables[1:]:
        new_df=pd.read_csv(chunk, sep="\t", low_memory=False)
        df=pd.concat([df, new_df])
     #write the concatenated table
    print('Writing {:s}'.format(fileout))
    df.to_csv(fileout, index=False, sep=sep)

In [164]:
concatenate_tables(subfiles_name_OS, sep="\t", fileout='./sample_HES/HES_Sample_APC_neg_all.txt')

Writing ./sample_HES/HES_Sample_APC_neg_all.txt


In alternative we can paste directly on a file, if everything fit in memory (they are separate tables), it is worthy to do more operations while in memory

# Chunk by Patient ID

In [151]:
#function that given a csv file, chunk it in N csv files such that each patient history is contained in one chunk
#For each csv file another txt file containing the list of patient IDs contained in that chunk is produced
def chunk_by_patient(filename, N_chunk, ID_label, path_ID_files='./') :
    """
    --Parameters--
    filename: str 
             File path of the table to be chunked
    N_chunk: integer
             Number of desired chunks
    ID_label: str 
             Name of the column that contains the patient IDS (different for each type of dataset)
    path_ID_files: str
             Optional. Output folder for IDs lists. Default is the current path.
    --Returns--
    """          
    df=pd.read_csv(filename,  low_memory=False)
    #dividing by number of patients might produce files of quite different size, but it is the easiest solution
    IDs_by_chunk= round(len(df[ID_label].unique())/N_chunk)
    ID_initial=0
    for i in range(0, N_chunk):
        #print(i)
        if i != N_chunk-1:
            ID_chunk=(i+1)*IDs_by_chunk
            #IDs are sorted before chunking. This might be convenient for following analysis but not necessary
            chunk=df[df[ID_label].isin(df[ID_label].sort_values().unique()[ID_initial:ID_chunk])]
            ID_initial= ID_chunk
        else:
            #since we are rounding last chunk might have less ID than IDs_by_chunk
            chunk=df[df[ID_label].isin(df[ID_label].sort_values().unique()[ID_initial:])]
        suffix='_chunk'+str(i)
        #The chunks will have the same root of the initial file and will be placed in the same folder
        fileout=outfile_name(filename, suffix=suffix)
        root_IDfile=ID_label
        #By default the list od IDS are outputed in the current path folder, this can be regulated by path_ID_files
        fileoutID=path_ID_files + root_IDfile + suffix +'.txt'
        #write the nth chunk
        print('Writing {:s}'.format(fileout))
        chunk.to_csv(fileout, index=False)
        #write the list of IDs contained in the nth chunk
        print('Writing {:s}'.format(fileoutID))
        chunk[ID_label].to_csv(fileoutID, index=False)
    return

In [155]:
filename='./sample_HES/HES_Sample_APC_neg_all.csv'
N_chunk=3
ID_label='ENCRYPTED_HESID'
path_ID_files='./sample_HES/'
chunk_by_patient(filename, N_chunk, ID_label,path_ID_files=path_ID_files)

Writing ./sample_HES/HES_Sample_APC_neg_all_chunk0.csv
Writing ./sample_HES/ENCRYPTED_HESID_chunk0.txt
Writing ./sample_HES/HES_Sample_APC_neg_all_chunk1.csv
Writing ./sample_HES/ENCRYPTED_HESID_chunk1.txt
Writing ./sample_HES/HES_Sample_APC_neg_all_chunk2.csv
Writing ./sample_HES/ENCRYPTED_HESID_chunk2.txt


In [153]:
#Test IDs contained in chunk 0 must not be contained in chunk 1 and so on
chunk0=pd.read_csv('./sample_HES/HES_Sample_APC_neg_all_chunk0.csv', low_memory=False)
chunk1=pd.read_csv('./sample_HES/HES_Sample_APC_neg_all_chunk1.csv', low_memory=False)
chunk2=pd.read_csv('./sample_HES/HES_Sample_APC_neg_all_chunk2.csv', low_memory=False)
IDs_chunk0=chunk0[ID_label].unique()
IDs_chunk1=chunk1[ID_label].unique()
IDs_chunk2=chunk2[ID_label].unique()
print(IDs_chunk0.shape)
print(IDs_chunk1.shape)
print(IDs_chunk2.shape)

(308,)
(308,)
(307,)


In [154]:
all(~chunk2[ID_label].isin(chunk1[ID_label]))

True