### Sort Data Script 

Data under raw_data looks like: 
    
    raw_data ----- sample_id1 --- .tsv file 
              |___ sample_id2 --- .tsv file 
              |____ ....
              .
              .___ sample_idn --- .tsv file 

This script sorts the .tsv files by copying each one into a folder named after its case ID (i.e. patient ID), so the processed data folder will look like this:

    processed_data ------ case_id1 ----- .tsv file 1 
                      |           |_____ ..... 
                      |           . 
                      |           ._____ .tsv file n 
                      |
                      |
                      |
                      |__ case_id2 ----- .tsv file 1 
                      |           |_____ ..... 
                      |           . 
                      |           ._____ .tsv file n 
                      |
                      . 
                      .
                      .___ case_idx ----- .tsv file 1 
                                  |_____ ..... 
                                  . 
                                  ._____ .tsv file n 



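NOTE: This script queries the GDC API `cases` endpoint (https://api.gdc.cancer.gov/cases) to look up the case ID for each sample ID, so it needs network access.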
                      


In [1]:
import pandas as pd
import requests
import os
from tqdm import tqdm
import shutil

In [5]:
# GET FOLDER NAMES (SAMPLE ID'S)
def get_folder_names(current_directory):
    """Return the names of the sub-directories directly inside a directory.

    Args:
        current_directory: Path of the directory to scan.

    Returns:
        A list of entry names (not full paths) that are directories, in the
        order ``os.listdir`` yields them. Plain files are left out.
    """
    folder_names = []
    for entry in os.listdir(current_directory):
        entry_path = os.path.join(current_directory, entry)
        if os.path.isdir(entry_path):
            folder_names.append(entry)
    return folder_names

# Every sub-folder name directly under data/raw_data/ is used as one sample ID.
sample_ids = get_folder_names('data/raw_data/')
sample_ids  # bare expression: shows the list as the cell output


['266fd576-2577-4294-bdaf-2d2d34a272b4',
 '9db51366-c3ea-4510-904f-883f1a316cfe',
 '791bbb6f-def9-47aa-b189-297205b68d99',
 'e26abc28-15b0-4832-bd02-43c5dd0b55e8',
 '66c5fda2-ef4c-4a89-88a1-43403ed9cc7c',
 'b17e24bc-425a-4851-82fd-d344ad582f9c',
 '62853e7e-71d0-484e-a58f-0db54e794112',
 '6a83b19d-999d-4f5c-909f-47b9006009f2',
 '7dc99125-5937-483b-a75a-f0493e352442',
 '8a767f95-2109-43bb-80b0-b17ff79f6a1c',
 '64825c6e-8074-4adf-8f54-462875593f95',
 'fc62b2b6-e1fd-4ede-901f-bacba311f07b',
 '04ce4521-9595-475c-af18-22f1d396dde5',
 '8f67d704-bfa8-4a26-a4fd-62dc18170e42',
 '206ed23b-2fdc-4a89-8904-cb1c5b9a4fb0',
 'c236e62d-3526-49f0-a2a2-216e4a6653d6',
 '6a082522-6730-4359-923b-f05f7e6c2b70',
 '85f40e06-5eb6-4487-8115-b13827276362',
 '525d4ed4-c65f-4f76-aad7-e2e2001ce86e',
 '10360021-a197-46aa-9c8e-feb3f542164d']

In [6]:
# FUNCTION TO GET CASE ID ASSOCIATED WITH SAMPLE ID 
def get_case_id(file_id, timeout=30):
    """Look up the GDC case ID of the case that contains a given file.

    Sends a POST request to the GDC ``cases`` endpoint, filtering on
    ``files.file_id`` and asking only for the ``case_id`` field.

    Args:
        file_id: ID to search for in ``files.file_id``.
        timeout: Seconds to wait for the GDC API before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The ``case_id`` of the first hit, or ``None`` (after printing a
        message) when the request does not return HTTP 200 or no case
        contains the file.

    Raises:
        requests.exceptions.RequestException: Network-level failures,
            including timeouts, are not caught here and propagate to the
            caller.
    """
    api_url = "https://api.gdc.cancer.gov/cases"

    # Query payload: match cases whose files include file_id, return case_id only.
    query = {
        "filters": {
            "op": "=",
            "content": {
                "field": "files.file_id",
                "value": [file_id]
            }
        },
        "fields": "case_id"
    }

    response = requests.post(api_url, json=query, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve information for file {file_id}")
        return None

    data = response.json()["data"]
    hits = data["hits"]

    # Also guard against an empty hit list so hits[0] below cannot fail.
    if data["pagination"]["total"] <= 0 or not hits:
        print(f"No cases found containing file {file_id}")
        return None

    # Only the first matching case is reported.
    return hits[0]["case_id"]


# ORGANIZE SAMPLE IDS UNDER CASE ID
# Maps each case ID to the list of sample IDs that belong to it.
case_to_sample_id = {}
for sample_id in tqdm(sample_ids):
    case_id = get_case_id(sample_id)

    # get_case_id returns None (after printing a message) when the lookup
    # fails. Skip those samples so unrelated samples are not lumped together
    # under a None key, which the copy step would turn into a bogus
    # "processed_data/None" folder.
    if case_id is None:
        continue

    case_to_sample_id.setdefault(case_id, []).append(sample_id)

case_to_sample_id

100%|██████████| 20/20 [00:05<00:00,  3.53it/s]


{'f81ac8a2-4ce6-439e-b027-1c0bfc88ceaa': ['266fd576-2577-4294-bdaf-2d2d34a272b4'],
 '316625b8-a217-4829-90d1-dfd9cb7e4bc3': ['9db51366-c3ea-4510-904f-883f1a316cfe'],
 'd093173f-08ab-4138-bf3c-399c45a6e163': ['791bbb6f-def9-47aa-b189-297205b68d99'],
 'c31900a4-5dcd-4022-97ac-638e86e889e4': ['e26abc28-15b0-4832-bd02-43c5dd0b55e8',
  'c236e62d-3526-49f0-a2a2-216e4a6653d6'],
 '33b7bf50-53f4-4839-aa83-9665f85671d8': ['66c5fda2-ef4c-4a89-88a1-43403ed9cc7c',
  'b17e24bc-425a-4851-82fd-d344ad582f9c'],
 '0130d616-885e-4a6c-9d03-2f17dd692a05': ['62853e7e-71d0-484e-a58f-0db54e794112'],
 'fe2cd610-aa52-4789-ac62-7683281bb22f': ['6a83b19d-999d-4f5c-909f-47b9006009f2'],
 'fa4e082a-d213-412e-8d91-842a6f4bef12': ['7dc99125-5937-483b-a75a-f0493e352442'],
 '37242f5a-25ae-4b1f-9ce6-09ce1dc92539': ['8a767f95-2109-43bb-80b0-b17ff79f6a1c'],
 '35bd694d-1dd2-466f-ab27-03320614b40e': ['64825c6e-8074-4adf-8f54-462875593f95'],
 'f55dd73d-8c36-440b-84e5-9aae53107775': ['fc62b2b6-e1fd-4ede-901f-bacba311f07b'],
 'f

In [11]:
#### FUNCTION: COPIES OVER ALL TSV FILES FOR GIVEN CASE ID
def copy_all_files(case_id, case_to_sample_id):
    """Copy every .tsv file of a case's samples into the case's output folder.

    For each sample ID listed under ``case_id``, the .tsv files found in
    ``./data/raw_data/<sample_id>/`` are copied to
    ``./data/processed_data/<case_id>/``, keeping their file names.

    Args:
        case_id: Key into ``case_to_sample_id``; also the output folder name.
        case_to_sample_id: Mapping from case ID to a list of sample IDs.

    Raises:
        KeyError: If ``case_id`` is not in ``case_to_sample_id``.
        FileNotFoundError: If a sample folder does not exist under raw_data.
    """
    target_dir = os.path.join("./data/processed_data", str(case_id))

    for sample_id in case_to_sample_id[case_id]:
        source_dir = os.path.join("./data/raw_data", str(sample_id))

        # Sorted so the copy order does not depend on os.listdir ordering.
        tsv_files = sorted(f for f in os.listdir(source_dir) if f.endswith(".tsv"))

        if not tsv_files:
            # Report and move on instead of failing the whole run on one folder.
            print(f"ERROR: no .tsv file found in folder {sample_id}")
            continue

        os.makedirs(target_dir, exist_ok=True)
        for file_name in tsv_files:
            # NOTE(review): two samples of one case sharing a file name would
            # overwrite each other here -- confirm names are unique per case.
            shutil.copy(
                os.path.join(source_dir, file_name),
                os.path.join(target_dir, file_name),
            )

# ITERATE THRU DICTIONARY TO COPY ALL FILES
# Iterating the dict directly yields its keys, i.e. the case IDs.
for case_id in tqdm(case_to_sample_id):
    copy_all_files(case_id, case_to_sample_id)


100%|██████████| 18/18 [00:00<00:00, 64.08it/s]
