# Space

In [None]:
# Notebook bootstrap: imports, workspace discovery, import-path setup and logging.
import os
import logging
import pandas as pd 
from IPython.display import display, HTML
# The workspace root is the part of the current working directory up to and
# including the first occurrence of KEY.
# NOTE(review): if KEY does not occur in the cwd, split() returns the whole
# path and KEY is appended to it, so the chdir below would target a path that
# probably does not exist — confirm the notebook is always run inside the workspace.
KEY = 'WorkSpace'
WORKSPACE_PATH = os.getcwd().split(KEY)[0] + KEY
print(WORKSPACE_PATH)
# Make every relative path used later resolve against the workspace root.
os.chdir(WORKSPACE_PATH)

import sys
# The workspace root must be on sys.path before `proj_space` can be imported.
sys.path.append(WORKSPACE_PATH)
from proj_space import PROJECT, TaskName, SPACE
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH
# SPACE['CODE_FN'] is added to the import path as well; the `recfldtkn`
# imports in later cells presumably resolve from there — verify.
sys.path.append(SPACE['CODE_FN'])

# Module-level logger plus a root configuration that prints level, timestamp,
# file name, line number and logger name in front of each message.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')


In [None]:
from recfldtkn.configfn import load_cohort_args

# Read the cohort configuration from the `config_recfldtkn/` folder that sits
# under the code directory registered in SPACE.
recfldtkn_config_path = os.path.join(SPACE['CODE_FN'], 'config_recfldtkn/')
cohort_args = load_cohort_args(recfldtkn_config_path, SPACE)

# Pull out the three ID-related settings that the rest of the notebook uses.
RawRootID, RootID, RootIDLength = (
    cohort_args[setting] for setting in ('RawRootID', 'RootID', 'RootIDLength')
)
print(RawRootID, RootID, RootIDLength)

In [None]:
from recfldtkn.configfn import get_rec_related_size

# Name of the record handled by this notebook, taken from the cohort configuration.
P2RecNumName = cohort_args['RecName']
# Look up the three size-related settings returned for that record and print them.
# NOTE(review): the meaning of each returned value is defined in
# recfldtkn.configfn, which is not visible here.
RFT_GROUP_SIZE, idx_group_size, usebucket = get_rec_related_size(P2RecNumName, cohort_args)
print(RFT_GROUP_SIZE, idx_group_size, usebucket)

# Cohort: Pick A Cohort

In [None]:
import argparse

# Command-line interface used to choose which cohort this notebook/script processes.
my_parser = argparse.ArgumentParser(description='Process Input.')

# One row per option: (option name, default value, help text).
# Every option is an optional string flag whose metavar equals its name.
_ARGUMENT_SPECS = (
    ('cohort_name', None, 'the cohort_name to process'),
    ('cohort_label', None, 'the label for cohort_name to process'),
    ('list_cohort_name', None, 'the groupname_ids to process'),
)

for _opt_name, _opt_default, _opt_help in _ARGUMENT_SPECS:
    my_parser.add_argument(
        f'--{_opt_name}',
        metavar=_opt_name,
        default=_opt_default,
        type=str,
        help=_opt_help,
    )

In [None]:
################### in notebook ###################
# Inside a notebook there is no real command line, so the argument list is
# spelled out here and handed to the parser explicitly.
args_information = [
    '--cohort_label', '1',
    '--list_cohort_name', 'false'
]

# All values are parsed as strings (the parser declares type=str for each option).
args = my_parser.parse_args(args_information)
###################################################

# Helper Function

In [None]:
import inspect

######################### You may want to change these functions to make it run.
# File suffixes (without the dot) of the raw source files that are picked up
# when a cohort folder is scanned.
selected_source_file_suffix_list = ['csv']
# Record names (pivoted column names) that are left out when a patient's
# total record count is computed.
excluded_cols = []

def get_id_column(columns):
    """Return the name of the patient-ID column found in ``columns``.

    Args:
        columns: Any container of column names that supports ``in``
            (a list, a ``pandas.Index``, ...).

    Returns:
        The string ``'patient_id_encoded'``.

    Raises:
        ValueError: If ``'patient_id_encoded'`` is not among ``columns``.
            Previously this case failed with an ``UnboundLocalError`` that
            did not say which column was missing.
    """
    if 'patient_id_encoded' in columns:
        return 'patient_id_encoded'
    raise ValueError(
        f"no patient id column ('patient_id_encoded') found in columns: {list(columns)}"
    )

def get_tablename_from_file(file_path):
    """Derive the table name from a raw-file path.

    The table name is the part of the file name (the text after the last
    ``/``) that precedes the first ``_df_`` marker. If the marker is absent,
    the whole file name is returned.
    """
    base_name = file_path.rsplit('/', 1)[-1]
    table_name, _, _ = base_name.partition('_df_')
    return table_name

def read_column_value_counts_by_chunk(RawRootID, chunk_size, file_path, rawdf = None):
    """Count how many records each patient has in one raw table.

    Args:
        RawRootID: Column name given to the patient-ID column in the result.
        chunk_size: Number of rows read per chunk when the CSV at
            ``file_path`` has to be streamed from disk.
        file_path: Path of the raw file. It is always used to derive the
            table name, and it is read as CSV when ``rawdf`` is not a DataFrame.
        rawdf: Optional, already loaded DataFrame. When given, the counts are
            computed from it and the file is not read.

    Returns:
        A DataFrame with three columns: ``RawRootID`` (the patient ID),
        ``'RecNum'`` (number of rows for that ID) and ``'RecName'`` (the
        table name from ``get_tablename_from_file``).
    """
    if isinstance(rawdf, pd.DataFrame):
        id_column = get_id_column(rawdf.columns)
        result = rawdf[id_column].value_counts()
    else:
        # Read only the header first to find the ID column, then stream just
        # that one column so large files never have to fit in memory.
        id_column = get_id_column(pd.read_csv(file_path, nrows=0).columns)
        li = [chunk[id_column].value_counts() for chunk in pd.read_csv(file_path,
                                                                       usecols = [id_column],
                                                                       chunksize=chunk_size,
                                                                       low_memory=False)]
        # The same ID can show up in several chunks: add the partial counts.
        result = pd.concat(li).groupby(level=0).sum()

    # Name the index and the values explicitly instead of renaming whatever
    # reset_index() produces: the default names of a value_counts() result
    # differ between pandas versions ('count' only since pandas 2.0), and
    # relying on them would mislabel the columns on older versions.
    result = result.rename_axis(RawRootID).reset_index(name='RecNum')
    result['RecName'] = get_tablename_from_file(file_path)
    return result
#########################


# Store each helper's own source code on the function object as `fn_string`.
for _helper_fn in (get_id_column, get_tablename_from_file, read_column_value_counts_by_chunk):
    _helper_fn.fn_string = inspect.getsource(_helper_fn)

In [None]:
from recfldtkn.loadtools import convert_variables_to_pystirng, load_module_variables

# Serialize the user-tunable settings and helper functions into one Python
# source string and write it to the path configured as cohort_args['pypath'].
# `prefix` lists the import lines passed along for the generated source.
prefix = ['import pandas as pd', 'import numpy as np']
iterative_variables = [selected_source_file_suffix_list, excluded_cols]
fn_variables = [get_id_column, get_tablename_from_file, read_column_value_counts_by_chunk]
# NOTE(review): how convert_variables_to_pystirng recovers the variable names
# is not visible here — keep these assignments and the call form unchanged.
pycode = convert_variables_to_pystirng(iterative_variables = iterative_variables, 
                                       fn_variables = fn_variables, 
                                       prefix = prefix)
pypath = cohort_args['pypath']
# print(pypath)
# Overwrites any existing file at pypath.
with open(pypath, 'w') as file: file.write(pycode)
# Create a HTML link and display it
full_path = os.path.join(WORKSPACE_PATH, pypath)
display(HTML(f'{pypath} <a href="{full_path}" target="_blank">Open File</a>'))

# Process dfHumanRec

In [None]:
# from recfldtkn.pipeline_record import get_cohort_level_record_number_counts

def get_cohort_level_record_number_counts(cohort_name, cohort_label, cohort_args, filepath_to_rawdf = None):
    """Build the per-patient record-count table (``df_Human``) for one cohort.

    Args:
        cohort_name: Key into ``cohort_args['CohortInfo']`` selecting the cohort.
        cohort_label: Label of the cohort; only used in the log header. The
            label written into the result comes from the cohort's config entry.
        cohort_args: Cohort configuration. The keys read here are ``'pypath'``,
            ``'RawRootID'``, ``'RootID'``, ``'RootIDLength'`` and ``'CohortInfo'``.
        filepath_to_rawdf: Optional mapping ``file path -> DataFrame or None``.
            When omitted, the cohort's ``FolderPath`` is scanned for files whose
            suffix is in ``selected_source_file_suffix_list`` and each is mapped
            to ``None`` (meaning "read it from disk").

    Returns:
        A DataFrame with one row per patient that has at least one record in
        the non-excluded tables. The generated ``RootID`` column comes first,
        followed by ``RawRootID``, one count column per table, ``'TotalRecNum'``
        and ``'CohortLabel'``.

    Raises:
        ValueError: If no input produced any counts, or if a file that is not
            given as a DataFrame has an unsupported suffix.
    """
    ###############################
    # The cohort-specific helpers and settings are loaded from the generated
    # module at ``pypath``.
    module = load_module_variables(cohort_args['pypath'])
    pid_recnum_result_fn = module.read_column_value_counts_by_chunk
    excluded_cols = module.excluded_cols
    selected_source_file_suffix_list = module.selected_source_file_suffix_list
    ###############################

    RawRootID = cohort_args['RawRootID']
    RootID = cohort_args['RootID']
    RootIDLength = cohort_args['RootIDLength']
    cohort_config = cohort_args['CohortInfo'][cohort_name]
    FolderPath = cohort_config['FolderPath']
    chunk_size = 100000

    logger.info(f'=======cohort_label-{cohort_label}: cohort_name-{cohort_name}=======')
    logger.info(cohort_config)

    if filepath_to_rawdf is None:
        file_list = [i for i in os.listdir(FolderPath) if i.split('.')[-1] in selected_source_file_suffix_list]
        fullfile_list = [os.path.join(FolderPath, i) for i in file_list]
        logger.info(f'{FolderPath} <-- FolderPath')
        logger.info(f'{len(fullfile_list)} <--- fullfile_list')
        filepath_to_rawdf = {filepath: None for filepath in fullfile_list}

    L = []
    for file_path, rawdf in filepath_to_rawdf.items():
        suffix = file_path.split('.')[-1]
        if isinstance(rawdf, pd.DataFrame):
            if len(rawdf) == 0: continue
            result = pid_recnum_result_fn(RawRootID, chunk_size, file_path, rawdf)
        elif suffix == 'csv':
            if os.stat(file_path).st_size == 0:
                logger.info(f"'{file_path}' # emtpy file"); continue
            try:
                result = pid_recnum_result_fn(RawRootID, chunk_size, file_path, rawdf)
            except Exception:
                # Best effort: an unreadable CSV is skipped, but the cause is
                # logged so the problem can be diagnosed. KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                logger.warning(f"'{file_path}' # error file", exc_info=True); continue
        else:
            if suffix == 'parquet':
                rawdf = pd.read_parquet(file_path)
            elif suffix == 'p':
                # NOTE(review): unpickling executes arbitrary code — only use
                # with files from a trusted source.
                rawdf = pd.read_pickle(file_path)
            else:
                raise ValueError(f'file type not supported: {file_path}')
            result = pid_recnum_result_fn(RawRootID, chunk_size, file_path, rawdf)
        logger.info(f"'{file_path}' # {result.shape}")
        L.append(result)

    logger.info(f'{len(L)} <---- types of dfRec so far')
    if not L:
        raise ValueError(f'no record counts could be computed for cohort {cohort_name!r}')

    df_all = pd.concat(L, ignore_index=True)
    # Several files can map to the same table name; add their counts together
    # so the pivot below never sees a duplicated (patient, table) pair.
    df_all = df_all.groupby([RawRootID, 'RecName'], as_index=False)['RecNum'].sum()
    df_pivot = df_all.pivot(index=RawRootID, columns='RecName', values='RecNum').reset_index()

    recname_cols = [i for i in df_pivot.columns if i != RawRootID]
    included_cols = [i for i in recname_cols if i not in excluded_cols]
    rec_count = df_pivot[included_cols].sum(axis = 1)

    # Keep only patients that have at least one record in a non-excluded table.
    df_Human = df_pivot[rec_count > 0].reset_index(drop = True)
    df_Human['TotalRecNum'] = df_Human[included_cols].sum(axis = 1)
    logger.info(len(df_Human))

    # RootID = cohort label followed by the 1-based row number, zero-padded to
    # RootIDLength digits, converted to int.
    CohortLabel = cohort_config['cohort_label']
    df_Human[RootID] = range(1, len(df_Human) + 1)
    df_Human[RootID] = df_Human[RootID].apply(lambda x: int(str(CohortLabel) + str(x).zfill(RootIDLength)))
    df_Human['CohortLabel'] = CohortLabel

    # Put the generated ID first. The configured RootID name is used here
    # instead of a hard-coded 'PID', which failed for any other configured name.
    cols = [RootID] + [i for i in df_Human.columns if i != RootID]
    df_Human = df_Human[cols].reset_index(drop = True)

    return df_Human

In [None]:
# Resolve the cohort whose configured label equals the one given on the command line.
cohort_label = int(args.cohort_label)
matching_cohort_infos = [
    info for info in cohort_args['CohortInfo'].values()
    if info['cohort_label'] == cohort_label
]
cohort_name = matching_cohort_infos[0]['cohort_name']

print(f'=============== {cohort_name}: {cohort_label} ======================')
print(cohort_name, cohort_label)

In [None]:
# Compute the record-count table for the selected cohort; the source files are
# discovered from the cohort's folder because no filepath_to_rawdf is passed.
df_Human = get_cohort_level_record_number_counts(cohort_name, cohort_label, cohort_args)
# Bare expression: shows the table as the cell output.
df_Human

In [None]:
import datasets
# Convert the pandas table into a `datasets.Dataset` so it can be saved to disk later.
ds_HumanRec = datasets.Dataset.from_pandas(df_Human)
print(ds_HumanRec)

In [None]:
# Show the configured record name; it is part of the save path built below.
cohort_args['RecName']

In [None]:
print(SPACE['DATA_RFT'])

# Cohort folder name: "<label>-<name>".
full_cohort_name = '{}-{}'.format(cohort_label, cohort_name)
print(full_cohort_name)
# ------------------------------------------------------------------------- # 
# The dataset is stored at <DATA_RFT>/<label>-<name>/<RecName>_data.
data_folder_name = cohort_args['RecName'] + '_data'
path = os.path.join(SPACE['DATA_RFT'], full_cohort_name, data_folder_name)
print(path)
ds_HumanRec.save_to_disk(path)
print(ds_HumanRec)

# Select Patients with PID

In [None]:
# from recfldtkn.loadtools import load_ds_rec_and_info

def load_ds_rec_and_info(record_name, cohort_args, cohort_label_list = None):
    """Load one record's datasets from every cohort folder and concatenate them.

    Each cohort lives in ``<DATA_RFT>/<label>-<name>/``. The record's data is
    expected in ``<record_name>_data`` and its optional info in
    ``<record_name>_info`` inside that folder.

    Args:
        record_name: Name of the record to load.
        cohort_args: Cohort configuration; only ``cohort_args['SPACE']['DATA_RFT']``
            is read here.
        cohort_label_list: Optional iterable of cohort labels. When given, only
            folders whose name starts with one of these labels (the text before
            the first ``-``) are loaded.

    Returns:
        ``(ds_rec, ds_rec_info)``: the concatenated data dataset and the
        concatenated info dataset, or ``None`` for the latter when no cohort
        has an info folder.

    Raises:
        ValueError: If no cohort folder holding ``<record_name>_data`` is found.
    """
    SPACE = cohort_args['SPACE']
    data_root = SPACE['DATA_RFT']

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the concatenated dataset reproducible.
    cohort_list = sorted(os.listdir(data_root))
    if cohort_label_list is not None:
        wanted_labels = {str(i) for i in cohort_label_list}
        cohort_list = [i for i in cohort_list if i.split('-')[0] in wanted_labels]

    # Stray entries (hidden files, cohorts that do not hold this record yet)
    # have no data folder; report and skip them instead of crashing on load.
    available = []
    for cohort_full_name in cohort_list:
        data_folder = os.path.join(data_root, cohort_full_name, record_name + '_data')
        if os.path.isdir(data_folder):
            available.append(cohort_full_name)
        else:
            print(f'skip {cohort_full_name}: no {record_name}_data folder')
    cohort_list = available
    print(cohort_list)

    if not cohort_list:
        raise ValueError(f'no cohort folder with {record_name}_data found in {data_root}')

    data_parts = []
    info_parts = []
    for cohort_full_name in cohort_list:
        data_folder = os.path.join(data_root, cohort_full_name, record_name + '_data')
        # logger.info(f'Load from disk: {data_folder} ...')
        data_parts.append(datasets.Dataset.load_from_disk(data_folder))
        info_folder = os.path.join(data_root, cohort_full_name, record_name + '_info')
        if os.path.exists(info_folder):
            info_parts.append(datasets.Dataset.load_from_disk(info_folder))

    ds_rec = datasets.concatenate_datasets(data_parts)
    ds_rec_info = datasets.concatenate_datasets(info_parts) if info_parts else None
    return ds_rec, ds_rec_info

In [None]:
# path = os.path.join(SPACE['DATA_RFT'], cohort_args['RecName'], cohort_name)
# print(path)
# ds_HumanRec = datasets.load_from_disk(path)
# print(ds_HumanRec)
# df_Human = ds_HumanRec.to_pandas()
# df_Human.head()

# Reload the saved record table for all cohorts found on disk (no label filter
# is passed); the info dataset is not needed here and is discarded.
ds_Human, _ = load_ds_rec_and_info(cohort_args['RecName'], cohort_args)
df_Human = ds_Human.to_pandas()
# Bare expression: shows the table as the cell output.
df_Human


In [None]:
# Column names of the generated patient ID and of the raw patient ID.
RootID = cohort_args['RootID']
RawRootID = cohort_args['RawRootID']

In [None]:
# Hand-picked RootID values of the patients whose raw records are exported below.
PID_list = [1013405, 1002538, 1022279, 1004432, 1016032, 1032308, 1031363, 1001133, 1007343, 1026067]
print(PID_list)

In [None]:
def get_patient_records_Ri(RawRootID_sample, RawRootID, cohort_args, cohort = None, chunk_size = 100000):
    """Collect every raw CSV row that belongs to one patient.

    Args:
        RawRootID_sample: Raw patient ID value to look for.
        RawRootID: Name of the column in the raw CSV files that holds the ID.
        cohort_args: Cohort configuration; ``cohort_args['CohortInfo']`` is
            read to find the cohort's ``FolderPath``.
        cohort: Optional cohort name. When omitted, the notebook-level
            ``cohort_name`` variable is used, as before.
        chunk_size: Number of rows read per chunk from each CSV file.

    Returns:
        A dict mapping file name -> DataFrame of that patient's rows. Files
        without any matching row are left out.
    """
    # Existing three-argument calls keep using the global cohort selection.
    selected_cohort = cohort_name if cohort is None else cohort
    FolderPath = cohort_args['CohortInfo'][selected_cohort]['FolderPath']

    d = {}
    for file in sorted(os.listdir(FolderPath)):
        # Select by suffix, the same rule used when the cohort's files are
        # counted; a substring test would also match names such as 'csv_notes.txt'.
        if file.split('.')[-1] != 'csv': continue
        full_file = os.path.join(FolderPath, file)
        if os.stat(full_file).st_size == 0:
            # pandas cannot parse a zero-byte file; skip it.
            logger.info(f"'{full_file}' # empty file")
            continue
        li = [chunk[chunk[RawRootID] == RawRootID_sample]
              for chunk in pd.read_csv(full_file, chunksize=chunk_size, low_memory=False)]
        if not li: continue  # nothing was read, so there is nothing to concatenate
        result = pd.concat(li)
        logger.info(f'{result.shape}: {file}')
        if len(result) == 0: continue
        d[file] = result

    return d

In [None]:
# For every selected patient, write that patient's rows from each raw table
# into <DATA_RAW>/patient_sample/<PID>/<original file name>.
for PID_sample in PID_list:
    print('\n======== PID_sample:', PID_sample, '========')
    # Translate the generated ID back to the raw ID used in the source files.
    matching_rows = df_Human[df_Human[RootID] == PID_sample]
    PIDInfo_dict = matching_rows.iloc[0].to_dict()
    RawRootID_sample = PIDInfo_dict[RawRootID]
    d = get_patient_records_Ri(RawRootID_sample, RawRootID, cohort_args)
    folder = os.path.join(SPACE['DATA_RAW'], 'patient_sample', str(PID_sample))
    if not os.path.exists(folder):
        os.makedirs(folder)
    for file, df in d.items():
        output_file = os.path.join(folder, file)
        df.to_csv(output_file, index = False)
        print(file, df.shape)