### Sort Data Script 

Data under data/raw_data looks like: 
    
    raw_data ----- sample_id1 --- .tsv file 
              |___ sample_id2 --- .tsv file 
              |____ ....
              .
              .___ sample_idn --- .tsv file 

This script sorts the .tsv files by storing each file under the case ID (i.e. patient ID) that its sample belongs to. 
The processed data files are therefore stored under data/processed_data like this: 

    processed_data ------ case_id1 ----- .tsv file 1 
                      |           |_____ ..... 
                      |           . 
                      |           ._____ .tsv file n 
                      |
                      |
                      |
                      |__ case_id2 ----- .tsv file 1 
                      |           |_____ ..... 
                      |           . 
                      |           ._____ .tsv file n 
                      |
                      . 
                      .
                      .___ case_idx ----- .tsv file 1 
                                  |_____ ..... 
                                  . 
                                  ._____ .tsv file n 



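NOTE: This script queries the GDC API `cases` endpoint (https://api.gdc.cancer.gov/cases) to look up the case ID for each sample ID, so it needs network access.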
                      


In [1]:
import pandas as pd
import requests
import os
from tqdm import tqdm
import shutil

In [5]:
# GET FOLDER NAMES (SAMPLE IDS)
def get_folder_names(current_directory):
    """Return the names of the sub-directories directly inside a directory.

    Args:
        current_directory: Path of the directory to inspect.

    Returns:
        A list of entry names (not full paths) that are directories, in the
        order reported by ``os.listdir``. Regular files are left out.
    """
    folder_names = []
    for entry in os.listdir(current_directory):
        entry_path = os.path.join(current_directory, entry)
        if os.path.isdir(entry_path):
            folder_names.append(entry)
    return folder_names

# Each sub-folder of the raw-data directory is named after a sample ID, so
# listing the folders gives the sample IDs to process.
sample_ids = get_folder_names('../data/raw_data/')
sample_ids  # bare expression so the notebook displays the list


['5158a1a5-4505-46da-abaf-ad97b93ad9c3',
 'cf1bc221-0793-45ac-87a4-f30b1359751f',
 '91c3b05d-6e9f-4f10-b57e-d218e68cc6d5',
 '3be162b2-3231-48d1-8148-2055f5f97e98',
 '88962ea5-7761-4a8f-bba6-3a63662af1a3',
 'a6045e1f-fc18-439d-8f80-f2ed1c5a0933',
 '86739be0-532d-42ad-84df-c08e2e969d00',
 '55efb570-0a68-4fb7-bda8-90903880625d',
 '3d4fd1a8-5bfe-45df-8ce9-f21272e8ffbc',
 'a2ba8d48-5d76-4c58-85bb-8724388d8820',
 '73a2b51c-f34e-4918-939e-250a27ad91e6',
 '95bae4c3-9915-4a02-b11e-2fc29a14b58b',
 'da98fbe2-1b42-4c97-a52b-959e73be24ae',
 'b016540a-57d9-4c95-9ff6-b3b8b19c2e05',
 '75cfa36d-325f-4fd3-a908-ef17980ed614',
 '5bec2b19-8027-46c0-afa6-f03adf8b1a2f',
 '7e5e23ed-7693-49c7-9225-7588fadc8974',
 'e87863d9-e00a-44d8-8331-bb9214b5554a',
 'acdb7bb8-f21c-4572-85c2-38d7408ec3db',
 '985cc5cd-d610-44e2-95b5-6844ccd67306',
 '557ca0cb-0afe-42e4-be2a-26d2ebc13300',
 'aafa0414-48bd-461b-828e-24de12186b1f',
 'e41bd510-0f59-438a-8db2-cf6042032329',
 '9d6ae543-b48f-4c0c-b1cf-073583c240ed',
 '6204391c-3276-

In [6]:
# FUNCTION TO GET CASE ID ASSOCIATED WITH SAMPLE ID 
def get_case_id(file_id, timeout=30):
    """Look up the GDC case ID (i.e. patient ID) associated with a file ID.

    Sends a POST request to the GDC ``cases`` endpoint, filtering on
    ``files.file_id`` and requesting only the ``case_id`` field.

    Args:
        file_id: ID to search for. This notebook passes the raw-data folder
            names (sample IDs) here.
        timeout: Seconds to wait for the GDC API before giving up. Without a
            timeout a single stalled connection would block the whole run.

    Returns:
        The ``case_id`` of the first matching case, or ``None`` when the
        request fails, the API answers with a non-200 status, or no case
        contains the file. A message is printed in each of those cases.
    """
    # Construct the API request URL to search for cases containing the file
    api_url = "https://api.gdc.cancer.gov/cases"

    # Define the query payload to search for cases containing the file
    query = {
        "filters": {
            "op": "=",
            "content": {
                "field": "files.file_id",
                "value": [file_id]
            }
        },
        "fields": "case_id"
    }

    # Network problems (including the timeout) are reported like any other
    # failed lookup, so one bad request does not abort a long batch.
    try:
        response = requests.post(api_url, json=query, timeout=timeout)
    except requests.RequestException as err:
        print(f"Failed to retrieve information for file {file_id}: {err}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve information for file {file_id}")
        return None

    data = response.json()["data"]

    if data["pagination"]["total"] > 0:
        # Only the first case found is returned, even if several match.
        return data["hits"][0]["case_id"]

    print(f"No cases found containing file {file_id}")
    return None


# ORGANIZE SAMPLE IDS UNDER CASE ID
# case_to_sample_id maps each case ID to the list of sample IDs that resolved
# to it. Samples whose lookup failed (get_case_id returned None) are kept in
# unresolved_sample_ids instead of being grouped under a shared None key,
# which would mix unrelated patients together.
case_to_sample_id = {}
unresolved_sample_ids = []
for sample_id in tqdm(sample_ids):
    case_id = get_case_id(sample_id)

    if case_id is None:
        unresolved_sample_ids.append(sample_id)
        continue

    case_to_sample_id.setdefault(case_id, []).append(sample_id)

if unresolved_sample_ids:
    print(f"WARNING: {len(unresolved_sample_ids)} sample(s) had no case ID and were skipped")

case_to_sample_id

  0%|          | 0/1500 [00:00<?, ?it/s]

100%|██████████| 1500/1500 [06:46<00:00,  3.69it/s]


{'0ec70e40-07df-461d-8ff9-351240a0d454': ['5158a1a5-4505-46da-abaf-ad97b93ad9c3',
  'dafa26d7-9d2f-4b49-8ef9-1147c257a077'],
 '33919e92-4ea9-47e5-a6f4-36e51845d50f': ['cf1bc221-0793-45ac-87a4-f30b1359751f',
  '740be0d3-572e-40ae-9a18-48b6bd8c8cce'],
 'b8a44fdf-9cb9-4123-9ab0-4bc198921fee': ['91c3b05d-6e9f-4f10-b57e-d218e68cc6d5',
  'c14d2ca2-0121-4a0d-9d6b-57d5e51a1bac'],
 '298a1710-deb4-4064-a0f2-da32d43aa217': ['3be162b2-3231-48d1-8148-2055f5f97e98',
  '5618dd81-d68f-48f9-8b78-814cc9b6c4b3',
  '946631c8-2285-4aab-8df2-9e3cda48019c'],
 '2bcacfd1-6a45-4b96-9b80-2ad569b1ab45': ['88962ea5-7761-4a8f-bba6-3a63662af1a3',
  '229219e2-9b56-4882-8741-cee697c4b714'],
 '4d120899-9a80-4736-b097-285e9e261bcb': ['a6045e1f-fc18-439d-8f80-f2ed1c5a0933',
  'c01eb946-e57a-4733-bd49-eec2bd1a7a5e'],
 '972447aa-4332-47e7-bddd-2eb699dbb664': ['86739be0-532d-42ad-84df-c08e2e969d00',
  '4331b944-a95c-4b55-b8dd-ff0da300e956'],
 '8cd1ba0b-ead9-4661-a796-6c9dbf1c42cd': ['55efb570-0a68-4fb7-bda8-90903880625d',
 

In [7]:
#### FUNCTION: COPIES OVER ALL TSV FILES FOR GIVEN CASE ID 
def copy_all_files(case_id, case_to_sample_id,
                   raw_dir="../data/raw_data/",
                   processed_dir="../data/processed_data/"):
    """Copy every .tsv file of a case's samples into the case's output folder.

    For each sample ID listed under ``case_id``, the .tsv files found in
    ``<raw_dir>/<sample_id>`` are copied to ``<processed_dir>/<case_id>``.
    The output folder is created on demand. A file that already exists there
    under the same name is overwritten.

    Args:
        case_id: Key into ``case_to_sample_id``; also the output folder name.
        case_to_sample_id: Mapping of case ID to a list of sample IDs.
        raw_dir: Directory holding one folder per sample ID.
        processed_dir: Directory under which the per-case folders are created.

    Raises:
        KeyError: If ``case_id`` is not a key of ``case_to_sample_id``.
        FileNotFoundError: If a sample folder does not exist under ``raw_dir``.
    """
    target_dir = os.path.join(processed_dir, str(case_id))

    for sample_id in case_to_sample_id[case_id]:
        sample_dir = os.path.join(raw_dir, str(sample_id))

        # Sorted so the copy order does not depend on directory listing order.
        tsv_names = sorted(f for f in os.listdir(sample_dir) if f.endswith(".tsv"))

        if not tsv_names:
            # Report and move on instead of failing on an empty list.
            print(f"ERROR: no .tsv file found in folder {sample_id}")
            continue

        os.makedirs(target_dir, exist_ok=True)
        for file_name in tsv_names:
            source_path = os.path.join(sample_dir, file_name)
            target_path = os.path.join(target_dir, file_name)
            shutil.copy(source_path, target_path)

# ITERATE THRU DICTIONARY TO COPY ALL FILES
for case_id in tqdm(case_to_sample_id):
    # A None key holds samples whose case lookup failed; copying them would
    # pool unrelated samples into a single "None" folder, so skip them.
    if case_id is None:
        print(f"WARNING: skipping {len(case_to_sample_id[case_id])} sample(s) with no case ID")
        continue
    copy_all_files(case_id, case_to_sample_id)


100%|██████████| 902/902 [00:21<00:00, 42.77it/s]
