# Space

In [None]:
import os
import logging
import pandas as pd 
from IPython.display import display, HTML
# Locate the workspace root: keep everything in the current working directory
# up to and including the first occurrence of KEY. If KEY does not occur in the
# path, the result is simply cwd + KEY.
KEY = 'WorkSpace'
WORKSPACE_PATH = os.getcwd().split(KEY)[0] + KEY
print(WORKSPACE_PATH)
# Make the workspace root the current directory for the rest of the notebook.
os.chdir(WORKSPACE_PATH)

import sys
# The workspace root is appended to sys.path before the import below;
# presumably proj_space lives in that directory -- confirm.
sys.path.append(WORKSPACE_PATH)
from proj_space import PROJECT, TaskName, SPACE
# Record the resolved root in SPACE and expose the code folder to imports.
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH
sys.path.append(SPACE['CODE_FN'])

# Directory handed to load_cohort_args in the next cell.
recfldtkn_config_path = os.path.join(SPACE['CODE_FN'], 'config_recfldtkn/')

# Module-level logger; records at INFO and above are emitted with level,
# timestamp, file name, line number and logger name in front of the message.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')

In [None]:
from recfldtkn.configfn import load_cohort_args

# Load the cohort settings from the config directory. The result is indexed
# like a mapping in later cells (keys read in this notebook: 'RecName',
# 'pypath', 'CohortInfo', 'RootID', 'RawRootID').
cohort_args = load_cohort_args(recfldtkn_config_path, SPACE)

In [None]:
from recfldtkn.configfn import get_rec_related_size

# Record name this notebook works on.
P2RecNumName = cohort_args['RecName']
# get_rec_related_size returns three values, unpacked and printed here; their
# exact meaning is defined in recfldtkn.configfn (not visible from this file).
RFT_GROUP_SIZE, idx_group_size, usebucket = get_rec_related_size(P2RecNumName, cohort_args)
print(RFT_GROUP_SIZE, idx_group_size, usebucket)

# Cohort: Pick A Cohort

In [None]:
import argparse

# Command-line interface: two optional string options, each of which is None
# when it is not supplied.
my_parser = argparse.ArgumentParser(description='Process Input.')

# Name of the cohort to process.
my_parser.add_argument(
    '--cohort_name',
    type=str,
    default=None,
    metavar='cohort_name',
    help='the cohort_name to process',
)

# Label of the cohort to process (kept as a string by the parser).
my_parser.add_argument(
    '--cohort_label',
    type=str,
    default=None,
    metavar='cohort_label',
    help='the label for cohort_name to process',
)

In [None]:
################### in notebook ###################
# argv-style list used in place of real command-line arguments.
args_information = ['--cohort_label', '1']

# Passing the list explicitly makes argparse parse it instead of sys.argv.
args = my_parser.parse_args(args_information)
###################################################

# Helper Function

In [None]:
######################### You may want to change these settings to make it run.
# Both variables are placed in `iterative_variables` further down and written
# into the generated module file.
# Presumably the file suffixes of the raw source files to load -- confirm
# against the code that consumes the generated module.
selected_source_file_suffix_list = ['csv']
excluded_cols = [] # excluding raw_record_names. 
#########################

In [None]:
import inspect

#########################
def get_id_column(columns):
    """Return the name of the patient identifier column found in ``columns``.

    Args:
        columns: Container of column names supporting ``in`` (for example a
            list of strings or ``DataFrame.columns``).

    Returns:
        The string ``'patient_id_encoded'``, the only identifier column
        recognised by this function.

    Raises:
        ValueError: If ``'patient_id_encoded'`` is not among ``columns``.
    """
    id_column = 'patient_id_encoded'
    if id_column not in columns:
        # Previously this path reached ``return id_column`` with the name
        # never assigned and failed with an UnboundLocalError; fail with an
        # explicit message instead. Only builtins are used so the function
        # stays self-contained when its source text is exported.
        raise ValueError(
            f"no id column found: expected '{id_column}' in {list(columns)}"
        )
    return id_column

# Attach the function's own source text to the function object; the function
# is later passed in `fn_variables` to convert_variables_to_pystirng.
get_id_column.fn_string = inspect.getsource(get_id_column)
#########################

In [None]:
#########################
def get_tablename_from_file(file_path):
    """Derive a table name from the path of a raw file.

    The name is the last '/'-separated component of ``file_path``, cut off
    just before the first '_df_'. When '_df_' does not occur, the whole last
    component is returned unchanged.
    """
    # Everything after the final '/', or the whole string when there is none.
    _, _, file_name = file_path.rpartition('/')
    # Text in front of the first '_df_' marker.
    table_name, _, _ = file_name.partition('_df_')
    return table_name

# Attach the function's own source text to the function object; the function
# is later passed in `fn_variables` to convert_variables_to_pystirng.
get_tablename_from_file.fn_string = inspect.getsource(get_tablename_from_file)
#########################

In [None]:
# NOTE: load_module_variables is imported here but not used in this cell.
from recfldtkn.loadtools import convert_variables_to_pystirng, load_module_variables

# Inputs for the generated module: import lines to put first, the plain
# settings variables, and the helper functions defined above.
prefix = ['import pandas as pd', 'import numpy as np']
iterative_variables = [selected_source_file_suffix_list, excluded_cols]
fn_variables = [get_id_column, get_tablename_from_file]
# Presumably renders the inputs as Python source text (inferred from the
# helper's name -- confirm in recfldtkn.loadtools).
pycode = convert_variables_to_pystirng(iterative_variables = iterative_variables, 
                                       fn_variables = fn_variables, 
                                       prefix = prefix)
pypath = cohort_args['pypath']
# print(pypath)
# Overwrite the target file with the generated text. pypath is opened relative
# to the current working directory unless it is absolute.
with open(pypath, 'w') as file: file.write(pycode)
# Create a HTML link and display it
full_path = os.path.join(WORKSPACE_PATH, pypath)
display(HTML(f'{pypath} <a href="{full_path}" target="_blank">Open File</a>'))

# Process dfHumanRec

In [None]:
from recfldtkn.pipeline_record import get_cohort_level_record_number_counts

# --cohort_label is parsed as a string; convert it for the comparison below.
# int() raises TypeError if the option was omitted (its default is None).
cohort_label = int(args.cohort_label)
# Take 'cohort_name' from the first CohortInfo entry whose 'cohort_label'
# equals cohort_label; the [0] raises IndexError when no entry matches.
cohort_name = [v for k, v in cohort_args['CohortInfo'].items() 
               if v['cohort_label'] == cohort_label][0]['cohort_name']

print(f'=============== {cohort_name}: {cohort_label} ======================')
print(cohort_name, cohort_label)

# Result is passed to datasets.Dataset.from_pandas in the next cell.
df_Human = get_cohort_level_record_number_counts(cohort_name, cohort_label, cohort_args)
# Bare expression: shown as the cell output in a notebook.
df_Human

In [None]:
import datasets
# Wrap the DataFrame in a `datasets.Dataset` so it can be saved to disk below.
ds_HumanRec = datasets.Dataset.from_pandas(df_Human)
print(ds_HumanRec)

In [None]:
# Bare expression: shows the record name as the cell output in a notebook.
cohort_args['RecName']

In [None]:
print(SPACE['DATA_RFT'])

# Folder name for this cohort: "<cohort_label>-<cohort_name>".
full_cohort_name = f'{cohort_label}-{cohort_name}'
print(full_cohort_name)
# ------------------------------------------------------------------------- # 
# Target directory: <DATA_RFT>/<full_cohort_name>/<RecName>_data
path = os.path.join(SPACE['DATA_RFT'], full_cohort_name, cohort_args['RecName'] + '_data')
print(path)
ds_HumanRec.save_to_disk(path)
print(ds_HumanRec)

# Select Patients with PID

In [None]:
from recfldtkn.loadtools import load_ds_rec_and_info
# load_ds_rec_and_info returns two values; only the first (the dataset) is
# kept, the second is discarded.
ds_Human, _ = load_ds_rec_and_info(cohort_args['RecName'], cohort_args)
# Rebinds df_Human, replacing the frame built in the earlier cell.
df_Human = ds_Human.to_pandas()
# Bare expression: shown as the cell output in a notebook.
df_Human

In [None]:
# Column names: RootID is used to filter df_Human, and RawRootID to filter the
# raw CSV files in the cells below.
RootID = cohort_args['RootID']
RawRootID = cohort_args['RawRootID']
# Hard-coded sample of RootID values whose raw records are exported below.
PID_list = [1013405, 1002538, 1022279, 1004432, 1016032, 1032308, 1031363, 1001133, 1007343, 1026067]
print(PID_list)

In [None]:
def get_patient_records_Ri(RawRootID_sample, RawRootID, cohort_args,
                           cohort_key=None, chunk_size=100000):
    """Collect one patient's rows from every raw CSV file of a cohort.

    Each file in the cohort's ``FolderPath`` whose name contains ``'csv'`` is
    read in chunks, and the rows whose ``RawRootID`` column equals
    ``RawRootID_sample`` are kept.

    Args:
        RawRootID_sample: Identifier value to look for.
        RawRootID: Name of the column holding that identifier in the raw
            files. Every selected file must contain this column; otherwise
            the lookup raises ``KeyError``.
        cohort_args: Mapping whose ``'CohortInfo'`` entry maps a cohort key to
            a config mapping that contains ``'FolderPath'``.
        cohort_key: Key into ``cohort_args['CohortInfo']``. When ``None``
            (the default) the notebook-level global ``cohort_name`` is used,
            which is what this function always did before.
        chunk_size: Number of rows read per chunk.

    Returns:
        dict mapping file name -> DataFrame of the matching rows. Files
        without any matching row are left out.
    """
    if cohort_key is None:
        # Backward-compatible default: fall back to the notebook global the
        # function used to read implicitly.
        cohort_key = cohort_name
    FolderPath = cohort_args['CohortInfo'][cohort_key]['FolderPath']

    records_by_file = {}
    # Substring match kept on purpose so that the set of selected files is
    # unchanged (any name containing 'csv' qualifies).
    csv_files = [name for name in sorted(os.listdir(FolderPath)) if 'csv' in name]
    for file in csv_files:
        full_file = os.path.join(FolderPath, file)
        # Filter chunk by chunk so the whole file is never held in memory.
        reader = pd.read_csv(full_file, chunksize=chunk_size, low_memory=False)
        matches = [chunk[chunk[RawRootID] == RawRootID_sample] for chunk in reader]
        result = pd.concat(matches)
        logger.info(f'{result.shape}: {file}')
        if len(result) == 0:
            continue
        records_by_file[file] = result

    return records_by_file

In [None]:
# For every sampled patient: look up the raw identifier in df_Human, pull the
# patient's rows from each raw CSV file, and write them to
# <DATA_RAW>/patient_sample/<PID_sample>/<original file name>.
for PID_sample in PID_list:
    print('\n======== PID_sample:', PID_sample, '========')
    patient_rows = df_Human[df_Human[RootID] == PID_sample]
    if len(patient_rows) == 0:
        # `.iloc[0]` on an empty selection raises an opaque IndexError and
        # would abort the remaining patients; report and move on instead.
        print('PID_sample not found in df_Human, skipped:', PID_sample)
        continue
    PIDInfo_dict = patient_rows.iloc[0].to_dict()
    RawRootID_sample = PIDInfo_dict[RawRootID]
    d = get_patient_records_Ri(RawRootID_sample, RawRootID, cohort_args)
    folder = os.path.join(SPACE['DATA_RAW'], 'patient_sample', str(PID_sample))
    # Creates missing parents and is a no-op when the folder already exists.
    os.makedirs(folder, exist_ok=True)
    for file, df in d.items():
        df.to_csv(os.path.join(folder, file), index = False)
        print(file, df.shape)