# Format raw and processed files for GEO uploads

Note: This is written to be run on a Mac with the research drive mounted. Adjust file paths as necessary if this is not the case.

## Setup

In [1]:
import hashlib
import os
import subprocess
from collections import namedtuple

import pandas as pd

# One record per sequencing run. Extra per-run details (further directory
# paths, etc.) can be added as additional fields if they are needed later.
RunInfo = namedtuple("RunInfo", "name fastq_folder")

# Each pair is (run name, folder that holds the fastq files of that run)
runs = [
    RunInfo(name=run_name, fastq_folder=run_folder)
    for run_name, run_folder in (
        (
            "Run230316",
            "/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_VL00320_43_AACJYMYM5/Analysis/1/Data/fastq",
        ),
        (
            "Run230322",
            "/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230322_VL00320_45_AACJYNCM5/Analysis/1/Data/fastq",
        ),
        (
            "Run230323",
            "/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230323_run/fastq",
        ),
    )
]

# Tag of the merged experiment; it is embedded in every path defined below
experiment_tag = "230324-3way-merge"

# Folder containing the processed data files, and the files of interest in it
processed_data_folder = "/Volumes/kueck/Pipeline/CellLines/RNAseq/{tag}/star_salmon".format(tag=experiment_tag)
processed_data_file_names = ["salmon.merged.gene_tpm.tsv", "salmon.merged.gene_counts.tsv"]

# Destination folder for the renamed fastq files (the notebook symlinks them here).
# It lives on the research drive so that no fastq data ends up on the local device.
final_fastq_folder = (
    "/Volumes/kueck/ovarian_cancer_cisplatin_response_manuscript/geo_metadata/data/{tag}/fastq".format(
        tag=experiment_tag
    )
)

# Output files for the sample / processed data info, all inside ../data/<experiment_tag>/
sample_info_path, processed_data_info_path, formatted_sample_files_path = [
    f"../data/{experiment_tag}/{output_name}"
    for output_name in ("raw_sample_info.csv", "processed_data_info.csv", "raw_sample_files.csv")
]

def run_command(cmd):
    """Echo a shell command, run it, and fail loudly if it does not succeed.

    Args:
        cmd: Command line as a single string; it is interpreted by the shell.

    Returns:
        The exit status of the command (always 0, because failures raise).

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero status.
    """
    print(f"cmd: {cmd}")
    # check=True turns a non-zero exit status into an exception, so a failed
    # command (e.g. an incomplete copy) cannot go unnoticed.
    completed = subprocess.run(cmd, shell=True, check=True)
    return completed.returncode

# Empty table, keyed by sample_id, that the following cells fill in with one
# row of information per fastq file
sample_info = pd.DataFrame(
    columns=[
        "sample_id",
        "original_fastq_path",
        "symlink_fastq_path",
        "final_file_name",
        "md5_checksum",
    ]
).set_index("sample_id")

In [2]:
def compute_md5_checksum(file_path, chunk_size=1024 * 1024):
    """Return the MD5 digest of a file as a hexadecimal string.

    The file is read in fixed-size chunks so that arbitrarily large files can
    be hashed without loading them into memory.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes read per iteration. The digest does not
            depend on this value; it only controls how many reads are issued.

    Returns:
        The hexadecimal MD5 digest of the file contents.

    Raises:
        ValueError: If chunk_size is smaller than 1.
        OSError: If the file cannot be opened or read.
    """
    # read(0) returns b"" immediately, which would silently produce the digest
    # of an empty file; a negative size would read the whole file at once.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    md5_hash = hashlib.md5()
    with open(file_path, "rb") as file:
        # iter() with a sentinel stops as soon as read() reports end of file
        for chunk in iter(lambda: file.read(chunk_size), b""):
            md5_hash.update(chunk)
    return md5_hash.hexdigest()

## Collect information about each sample

In [3]:
# The same file layout is applied to every run: sample numbers 1..33, each
# with two read files (R1 and R2).
sample_numbers = range(1, 34)
read_numbers = range(1, 3)

# Collect all rows first and build the table in one go. This avoids enlarging
# the DataFrame row by row and makes the cell give the same result on re-run.
sample_records = [
    {
        # Unique sample_id that includes the run tag
        "sample_id": f"{sample_num}_S{sample_num}_R{read_num}_001_{run.name}",
        # Original fastq path, inferred from the run folder and the sample/read numbers
        "original_fastq_path": f"{run.fastq_folder}/{sample_num}_S{sample_num}_R{read_num}_001.fastq.gz",
        # The remaining columns are filled in by later cells
        "symlink_fastq_path": "",
        "final_file_name": "",
        "md5_checksum": "",
    }
    for run in runs
    for sample_num in sample_numbers
    for read_num in read_numbers
]

sample_info = pd.DataFrame(
    sample_records,
    columns=["sample_id", "original_fastq_path", "symlink_fastq_path", "final_file_name", "md5_checksum"],
).set_index("sample_id")

# sample_ids are used as file names later on, so they must not collide
assert sample_info.index.is_unique, "Duplicate sample_id found - check that run names are unique"

display(sample_info)

Unnamed: 0_level_0,original_fastq_path,symlink_fastq_path,final_file_name,md5_checksum
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1_S1_R1_001_Run230316,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_...,,,
1_S1_R2_001_Run230316,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_...,,,
2_S2_R1_001_Run230316,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_...,,,
2_S2_R2_001_Run230316,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_...,,,
3_S3_R1_001_Run230316,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_...,,,
...,...,...,...,...
31_S31_R2_001_Run230323,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230323_...,,,
32_S32_R1_001_Run230323,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230323_...,,,
32_S32_R2_001_Run230323,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230323_...,,,
33_S33_R1_001_Run230323,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230323_...,,,


## Compute MD5 checksums

In [4]:
# Hash every original fastq file and store the digest next to its sample_id
n_samples = len(sample_info)
original_paths = sample_info["original_fastq_path"]

for position, (sample_id, sample_original_path) in enumerate(original_paths.items(), start=1):
    display(f"Computing MD5 for {sample_id} ({position}/{n_samples})")
    sample_info.at[sample_id, "md5_checksum"] = compute_md5_checksum(sample_original_path)

display(sample_info)

'Computing MD5 for 1_S1_R1_001_Run230316 (1/198)'

'Computing MD5 for 1_S1_R2_001_Run230316 (2/198)'

'Computing MD5 for 2_S2_R1_001_Run230316 (3/198)'

'Computing MD5 for 2_S2_R2_001_Run230316 (4/198)'

'Computing MD5 for 3_S3_R1_001_Run230316 (5/198)'

'Computing MD5 for 3_S3_R2_001_Run230316 (6/198)'

'Computing MD5 for 4_S4_R1_001_Run230316 (7/198)'

'Computing MD5 for 4_S4_R2_001_Run230316 (8/198)'

'Computing MD5 for 5_S5_R1_001_Run230316 (9/198)'

'Computing MD5 for 5_S5_R2_001_Run230316 (10/198)'

'Computing MD5 for 6_S6_R1_001_Run230316 (11/198)'

'Computing MD5 for 6_S6_R2_001_Run230316 (12/198)'

'Computing MD5 for 7_S7_R1_001_Run230316 (13/198)'

'Computing MD5 for 7_S7_R2_001_Run230316 (14/198)'

'Computing MD5 for 8_S8_R1_001_Run230316 (15/198)'

'Computing MD5 for 8_S8_R2_001_Run230316 (16/198)'

'Computing MD5 for 9_S9_R1_001_Run230316 (17/198)'

'Computing MD5 for 9_S9_R2_001_Run230316 (18/198)'

'Computing MD5 for 10_S10_R1_001_Run230316 (19/198)'

'Computing MD5 for 10_S10_R2_001_Run230316 (20/198)'

'Computing MD5 for 11_S11_R1_001_Run230316 (21/198)'

'Computing MD5 for 11_S11_R2_001_Run230316 (22/198)'

'Computing MD5 for 12_S12_R1_001_Run230316 (23/198)'

'Computing MD5 for 12_S12_R2_001_Run230316 (24/198)'

'Computing MD5 for 13_S13_R1_001_Run230316 (25/198)'

'Computing MD5 for 13_S13_R2_001_Run230316 (26/198)'

'Computing MD5 for 14_S14_R1_001_Run230316 (27/198)'

'Computing MD5 for 14_S14_R2_001_Run230316 (28/198)'

'Computing MD5 for 15_S15_R1_001_Run230316 (29/198)'

'Computing MD5 for 15_S15_R2_001_Run230316 (30/198)'

'Computing MD5 for 16_S16_R1_001_Run230316 (31/198)'

'Computing MD5 for 16_S16_R2_001_Run230316 (32/198)'

'Computing MD5 for 17_S17_R1_001_Run230316 (33/198)'

'Computing MD5 for 17_S17_R2_001_Run230316 (34/198)'

'Computing MD5 for 18_S18_R1_001_Run230316 (35/198)'

'Computing MD5 for 18_S18_R2_001_Run230316 (36/198)'

'Computing MD5 for 19_S19_R1_001_Run230316 (37/198)'

'Computing MD5 for 19_S19_R2_001_Run230316 (38/198)'

'Computing MD5 for 20_S20_R1_001_Run230316 (39/198)'

'Computing MD5 for 20_S20_R2_001_Run230316 (40/198)'

'Computing MD5 for 21_S21_R1_001_Run230316 (41/198)'

'Computing MD5 for 21_S21_R2_001_Run230316 (42/198)'

'Computing MD5 for 22_S22_R1_001_Run230316 (43/198)'

'Computing MD5 for 22_S22_R2_001_Run230316 (44/198)'

'Computing MD5 for 23_S23_R1_001_Run230316 (45/198)'

'Computing MD5 for 23_S23_R2_001_Run230316 (46/198)'

'Computing MD5 for 24_S24_R1_001_Run230316 (47/198)'

'Computing MD5 for 24_S24_R2_001_Run230316 (48/198)'

'Computing MD5 for 25_S25_R1_001_Run230316 (49/198)'

'Computing MD5 for 25_S25_R2_001_Run230316 (50/198)'

'Computing MD5 for 26_S26_R1_001_Run230316 (51/198)'

'Computing MD5 for 26_S26_R2_001_Run230316 (52/198)'

'Computing MD5 for 27_S27_R1_001_Run230316 (53/198)'

'Computing MD5 for 27_S27_R2_001_Run230316 (54/198)'

'Computing MD5 for 28_S28_R1_001_Run230316 (55/198)'

'Computing MD5 for 28_S28_R2_001_Run230316 (56/198)'

'Computing MD5 for 29_S29_R1_001_Run230316 (57/198)'

'Computing MD5 for 29_S29_R2_001_Run230316 (58/198)'

'Computing MD5 for 30_S30_R1_001_Run230316 (59/198)'

'Computing MD5 for 30_S30_R2_001_Run230316 (60/198)'

'Computing MD5 for 31_S31_R1_001_Run230316 (61/198)'

'Computing MD5 for 31_S31_R2_001_Run230316 (62/198)'

'Computing MD5 for 32_S32_R1_001_Run230316 (63/198)'

'Computing MD5 for 32_S32_R2_001_Run230316 (64/198)'

'Computing MD5 for 33_S33_R1_001_Run230316 (65/198)'

'Computing MD5 for 33_S33_R2_001_Run230316 (66/198)'

'Computing MD5 for 1_S1_R1_001_Run230322 (67/198)'

'Computing MD5 for 1_S1_R2_001_Run230322 (68/198)'

'Computing MD5 for 2_S2_R1_001_Run230322 (69/198)'

'Computing MD5 for 2_S2_R2_001_Run230322 (70/198)'

'Computing MD5 for 3_S3_R1_001_Run230322 (71/198)'

'Computing MD5 for 3_S3_R2_001_Run230322 (72/198)'

'Computing MD5 for 4_S4_R1_001_Run230322 (73/198)'

'Computing MD5 for 4_S4_R2_001_Run230322 (74/198)'

'Computing MD5 for 5_S5_R1_001_Run230322 (75/198)'

'Computing MD5 for 5_S5_R2_001_Run230322 (76/198)'

'Computing MD5 for 6_S6_R1_001_Run230322 (77/198)'

'Computing MD5 for 6_S6_R2_001_Run230322 (78/198)'

'Computing MD5 for 7_S7_R1_001_Run230322 (79/198)'

'Computing MD5 for 7_S7_R2_001_Run230322 (80/198)'

'Computing MD5 for 8_S8_R1_001_Run230322 (81/198)'

'Computing MD5 for 8_S8_R2_001_Run230322 (82/198)'

'Computing MD5 for 9_S9_R1_001_Run230322 (83/198)'

'Computing MD5 for 9_S9_R2_001_Run230322 (84/198)'

'Computing MD5 for 10_S10_R1_001_Run230322 (85/198)'

'Computing MD5 for 10_S10_R2_001_Run230322 (86/198)'

'Computing MD5 for 11_S11_R1_001_Run230322 (87/198)'

'Computing MD5 for 11_S11_R2_001_Run230322 (88/198)'

'Computing MD5 for 12_S12_R1_001_Run230322 (89/198)'

'Computing MD5 for 12_S12_R2_001_Run230322 (90/198)'

'Computing MD5 for 13_S13_R1_001_Run230322 (91/198)'

'Computing MD5 for 13_S13_R2_001_Run230322 (92/198)'

'Computing MD5 for 14_S14_R1_001_Run230322 (93/198)'

'Computing MD5 for 14_S14_R2_001_Run230322 (94/198)'

'Computing MD5 for 15_S15_R1_001_Run230322 (95/198)'

'Computing MD5 for 15_S15_R2_001_Run230322 (96/198)'

'Computing MD5 for 16_S16_R1_001_Run230322 (97/198)'

'Computing MD5 for 16_S16_R2_001_Run230322 (98/198)'

'Computing MD5 for 17_S17_R1_001_Run230322 (99/198)'

'Computing MD5 for 17_S17_R2_001_Run230322 (100/198)'

'Computing MD5 for 18_S18_R1_001_Run230322 (101/198)'

'Computing MD5 for 18_S18_R2_001_Run230322 (102/198)'

'Computing MD5 for 19_S19_R1_001_Run230322 (103/198)'

'Computing MD5 for 19_S19_R2_001_Run230322 (104/198)'

'Computing MD5 for 20_S20_R1_001_Run230322 (105/198)'

'Computing MD5 for 20_S20_R2_001_Run230322 (106/198)'

'Computing MD5 for 21_S21_R1_001_Run230322 (107/198)'

'Computing MD5 for 21_S21_R2_001_Run230322 (108/198)'

'Computing MD5 for 22_S22_R1_001_Run230322 (109/198)'

'Computing MD5 for 22_S22_R2_001_Run230322 (110/198)'

'Computing MD5 for 23_S23_R1_001_Run230322 (111/198)'

'Computing MD5 for 23_S23_R2_001_Run230322 (112/198)'

'Computing MD5 for 24_S24_R1_001_Run230322 (113/198)'

'Computing MD5 for 24_S24_R2_001_Run230322 (114/198)'

'Computing MD5 for 25_S25_R1_001_Run230322 (115/198)'

'Computing MD5 for 25_S25_R2_001_Run230322 (116/198)'

'Computing MD5 for 26_S26_R1_001_Run230322 (117/198)'

'Computing MD5 for 26_S26_R2_001_Run230322 (118/198)'

'Computing MD5 for 27_S27_R1_001_Run230322 (119/198)'

'Computing MD5 for 27_S27_R2_001_Run230322 (120/198)'

'Computing MD5 for 28_S28_R1_001_Run230322 (121/198)'

'Computing MD5 for 28_S28_R2_001_Run230322 (122/198)'

'Computing MD5 for 29_S29_R1_001_Run230322 (123/198)'

'Computing MD5 for 29_S29_R2_001_Run230322 (124/198)'

'Computing MD5 for 30_S30_R1_001_Run230322 (125/198)'

'Computing MD5 for 30_S30_R2_001_Run230322 (126/198)'

'Computing MD5 for 31_S31_R1_001_Run230322 (127/198)'

'Computing MD5 for 31_S31_R2_001_Run230322 (128/198)'

'Computing MD5 for 32_S32_R1_001_Run230322 (129/198)'

'Computing MD5 for 32_S32_R2_001_Run230322 (130/198)'

'Computing MD5 for 33_S33_R1_001_Run230322 (131/198)'

'Computing MD5 for 33_S33_R2_001_Run230322 (132/198)'

'Computing MD5 for 1_S1_R1_001_Run230323 (133/198)'

'Computing MD5 for 1_S1_R2_001_Run230323 (134/198)'

'Computing MD5 for 2_S2_R1_001_Run230323 (135/198)'

'Computing MD5 for 2_S2_R2_001_Run230323 (136/198)'

'Computing MD5 for 3_S3_R1_001_Run230323 (137/198)'

'Computing MD5 for 3_S3_R2_001_Run230323 (138/198)'

'Computing MD5 for 4_S4_R1_001_Run230323 (139/198)'

'Computing MD5 for 4_S4_R2_001_Run230323 (140/198)'

'Computing MD5 for 5_S5_R1_001_Run230323 (141/198)'

'Computing MD5 for 5_S5_R2_001_Run230323 (142/198)'

'Computing MD5 for 6_S6_R1_001_Run230323 (143/198)'

'Computing MD5 for 6_S6_R2_001_Run230323 (144/198)'

'Computing MD5 for 7_S7_R1_001_Run230323 (145/198)'

'Computing MD5 for 7_S7_R2_001_Run230323 (146/198)'

'Computing MD5 for 8_S8_R1_001_Run230323 (147/198)'

'Computing MD5 for 8_S8_R2_001_Run230323 (148/198)'

'Computing MD5 for 9_S9_R1_001_Run230323 (149/198)'

'Computing MD5 for 9_S9_R2_001_Run230323 (150/198)'

'Computing MD5 for 10_S10_R1_001_Run230323 (151/198)'

'Computing MD5 for 10_S10_R2_001_Run230323 (152/198)'

'Computing MD5 for 11_S11_R1_001_Run230323 (153/198)'

'Computing MD5 for 11_S11_R2_001_Run230323 (154/198)'

'Computing MD5 for 12_S12_R1_001_Run230323 (155/198)'

'Computing MD5 for 12_S12_R2_001_Run230323 (156/198)'

'Computing MD5 for 13_S13_R1_001_Run230323 (157/198)'

'Computing MD5 for 13_S13_R2_001_Run230323 (158/198)'

'Computing MD5 for 14_S14_R1_001_Run230323 (159/198)'

'Computing MD5 for 14_S14_R2_001_Run230323 (160/198)'

'Computing MD5 for 15_S15_R1_001_Run230323 (161/198)'

'Computing MD5 for 15_S15_R2_001_Run230323 (162/198)'

'Computing MD5 for 16_S16_R1_001_Run230323 (163/198)'

'Computing MD5 for 16_S16_R2_001_Run230323 (164/198)'

'Computing MD5 for 17_S17_R1_001_Run230323 (165/198)'

'Computing MD5 for 17_S17_R2_001_Run230323 (166/198)'

'Computing MD5 for 18_S18_R1_001_Run230323 (167/198)'

'Computing MD5 for 18_S18_R2_001_Run230323 (168/198)'

'Computing MD5 for 19_S19_R1_001_Run230323 (169/198)'

'Computing MD5 for 19_S19_R2_001_Run230323 (170/198)'

'Computing MD5 for 20_S20_R1_001_Run230323 (171/198)'

'Computing MD5 for 20_S20_R2_001_Run230323 (172/198)'

'Computing MD5 for 21_S21_R1_001_Run230323 (173/198)'

'Computing MD5 for 21_S21_R2_001_Run230323 (174/198)'

'Computing MD5 for 22_S22_R1_001_Run230323 (175/198)'

'Computing MD5 for 22_S22_R2_001_Run230323 (176/198)'

'Computing MD5 for 23_S23_R1_001_Run230323 (177/198)'

'Computing MD5 for 23_S23_R2_001_Run230323 (178/198)'

'Computing MD5 for 24_S24_R1_001_Run230323 (179/198)'

'Computing MD5 for 24_S24_R2_001_Run230323 (180/198)'

'Computing MD5 for 25_S25_R1_001_Run230323 (181/198)'

'Computing MD5 for 25_S25_R2_001_Run230323 (182/198)'

'Computing MD5 for 26_S26_R1_001_Run230323 (183/198)'

'Computing MD5 for 26_S26_R2_001_Run230323 (184/198)'

'Computing MD5 for 27_S27_R1_001_Run230323 (185/198)'

'Computing MD5 for 27_S27_R2_001_Run230323 (186/198)'

'Computing MD5 for 28_S28_R1_001_Run230323 (187/198)'

'Computing MD5 for 28_S28_R2_001_Run230323 (188/198)'

'Computing MD5 for 29_S29_R1_001_Run230323 (189/198)'

'Computing MD5 for 29_S29_R2_001_Run230323 (190/198)'

'Computing MD5 for 30_S30_R1_001_Run230323 (191/198)'

'Computing MD5 for 30_S30_R2_001_Run230323 (192/198)'

'Computing MD5 for 31_S31_R1_001_Run230323 (193/198)'

'Computing MD5 for 31_S31_R2_001_Run230323 (194/198)'

'Computing MD5 for 32_S32_R1_001_Run230323 (195/198)'

'Computing MD5 for 32_S32_R2_001_Run230323 (196/198)'

'Computing MD5 for 33_S33_R1_001_Run230323 (197/198)'

'Computing MD5 for 33_S33_R2_001_Run230323 (198/198)'

Unnamed: 0_level_0,original_fastq_path,symlink_fastq_path,final_file_name,md5_checksum
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1_S1_R1_001_Run230316,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_...,,,19cee84a6a3598930e2e6befdf203e0f
1_S1_R2_001_Run230316,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_...,,,a41a73120d75d1a5c4696488dc56af78
2_S2_R1_001_Run230316,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_...,,,0613d63ebd5346ef57124d458a5e9f7b
2_S2_R2_001_Run230316,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_...,,,46e40cbfdf13d025863609aab40e63cd
3_S3_R1_001_Run230316,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_...,,,e28b40d675a63b1837587ff28dc2a618
...,...,...,...,...
31_S31_R2_001_Run230323,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230323_...,,,267e934f226bb6bc9926fdcd69a8b063
32_S32_R1_001_Run230323,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230323_...,,,c8634cdab2082b4198d58ccc0057f77b
32_S32_R2_001_Run230323,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230323_...,,,b332e910a17b49a3f1599ee04b4d2743
33_S33_R1_001_Run230323,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230323_...,,,4214a83565ed2095db0cf9a49fec95af


## Make symlinks to the fastq files

In [8]:
# os.symlink() does not create missing parent folders, so make sure the
# destination folder exists before linking into it
os.makedirs(final_fastq_folder, exist_ok=True)

n_samples = len(sample_info)
for position, (sample_id, row) in enumerate(sample_info.iterrows(), start=1):
    display(f"Making symlink for fastq file {sample_id} ({position}/{n_samples})")

    sample_original_path = row["original_fastq_path"]
    final_file_name = f"{sample_id}.fastq.gz"
    symlink_fastq_path = f"{final_fastq_folder}/{final_file_name}"

    # Keep the cell re-runnable: a link left behind by an earlier run is
    # replaced, but anything that is not a symlink is never deleted.
    if os.path.islink(symlink_fastq_path):
        os.remove(symlink_fastq_path)
    elif os.path.exists(symlink_fastq_path):
        raise FileExistsError(f"{symlink_fastq_path} exists and is not a symlink; refusing to overwrite it")

    # Create symbolic link
    os.symlink(sample_original_path, symlink_fastq_path)

    # Update sample info
    sample_info.at[sample_id, "symlink_fastq_path"] = symlink_fastq_path
    sample_info.at[sample_id, "final_file_name"] = final_file_name

display(sample_info)

'Making symlink for fastq file 1_S1_R1_001_Run230316 (1/198)'

'Making symlink for fastq file 1_S1_R2_001_Run230316 (2/198)'

'Making symlink for fastq file 2_S2_R1_001_Run230316 (3/198)'

'Making symlink for fastq file 2_S2_R2_001_Run230316 (4/198)'

'Making symlink for fastq file 3_S3_R1_001_Run230316 (5/198)'

'Making symlink for fastq file 3_S3_R2_001_Run230316 (6/198)'

'Making symlink for fastq file 4_S4_R1_001_Run230316 (7/198)'

'Making symlink for fastq file 4_S4_R2_001_Run230316 (8/198)'

'Making symlink for fastq file 5_S5_R1_001_Run230316 (9/198)'

'Making symlink for fastq file 5_S5_R2_001_Run230316 (10/198)'

'Making symlink for fastq file 6_S6_R1_001_Run230316 (11/198)'

'Making symlink for fastq file 6_S6_R2_001_Run230316 (12/198)'

'Making symlink for fastq file 7_S7_R1_001_Run230316 (13/198)'

'Making symlink for fastq file 7_S7_R2_001_Run230316 (14/198)'

'Making symlink for fastq file 8_S8_R1_001_Run230316 (15/198)'

'Making symlink for fastq file 8_S8_R2_001_Run230316 (16/198)'

'Making symlink for fastq file 9_S9_R1_001_Run230316 (17/198)'

'Making symlink for fastq file 9_S9_R2_001_Run230316 (18/198)'

'Making symlink for fastq file 10_S10_R1_001_Run230316 (19/198)'

'Making symlink for fastq file 10_S10_R2_001_Run230316 (20/198)'

'Making symlink for fastq file 11_S11_R1_001_Run230316 (21/198)'

'Making symlink for fastq file 11_S11_R2_001_Run230316 (22/198)'

'Making symlink for fastq file 12_S12_R1_001_Run230316 (23/198)'

'Making symlink for fastq file 12_S12_R2_001_Run230316 (24/198)'

'Making symlink for fastq file 13_S13_R1_001_Run230316 (25/198)'

'Making symlink for fastq file 13_S13_R2_001_Run230316 (26/198)'

'Making symlink for fastq file 14_S14_R1_001_Run230316 (27/198)'

'Making symlink for fastq file 14_S14_R2_001_Run230316 (28/198)'

'Making symlink for fastq file 15_S15_R1_001_Run230316 (29/198)'

'Making symlink for fastq file 15_S15_R2_001_Run230316 (30/198)'

'Making symlink for fastq file 16_S16_R1_001_Run230316 (31/198)'

'Making symlink for fastq file 16_S16_R2_001_Run230316 (32/198)'

'Making symlink for fastq file 17_S17_R1_001_Run230316 (33/198)'

'Making symlink for fastq file 17_S17_R2_001_Run230316 (34/198)'

'Making symlink for fastq file 18_S18_R1_001_Run230316 (35/198)'

'Making symlink for fastq file 18_S18_R2_001_Run230316 (36/198)'

'Making symlink for fastq file 19_S19_R1_001_Run230316 (37/198)'

'Making symlink for fastq file 19_S19_R2_001_Run230316 (38/198)'

'Making symlink for fastq file 20_S20_R1_001_Run230316 (39/198)'

'Making symlink for fastq file 20_S20_R2_001_Run230316 (40/198)'

'Making symlink for fastq file 21_S21_R1_001_Run230316 (41/198)'

'Making symlink for fastq file 21_S21_R2_001_Run230316 (42/198)'

'Making symlink for fastq file 22_S22_R1_001_Run230316 (43/198)'

'Making symlink for fastq file 22_S22_R2_001_Run230316 (44/198)'

'Making symlink for fastq file 23_S23_R1_001_Run230316 (45/198)'

'Making symlink for fastq file 23_S23_R2_001_Run230316 (46/198)'

'Making symlink for fastq file 24_S24_R1_001_Run230316 (47/198)'

'Making symlink for fastq file 24_S24_R2_001_Run230316 (48/198)'

'Making symlink for fastq file 25_S25_R1_001_Run230316 (49/198)'

'Making symlink for fastq file 25_S25_R2_001_Run230316 (50/198)'

'Making symlink for fastq file 26_S26_R1_001_Run230316 (51/198)'

'Making symlink for fastq file 26_S26_R2_001_Run230316 (52/198)'

'Making symlink for fastq file 27_S27_R1_001_Run230316 (53/198)'

'Making symlink for fastq file 27_S27_R2_001_Run230316 (54/198)'

'Making symlink for fastq file 28_S28_R1_001_Run230316 (55/198)'

'Making symlink for fastq file 28_S28_R2_001_Run230316 (56/198)'

'Making symlink for fastq file 29_S29_R1_001_Run230316 (57/198)'

'Making symlink for fastq file 29_S29_R2_001_Run230316 (58/198)'

'Making symlink for fastq file 30_S30_R1_001_Run230316 (59/198)'

'Making symlink for fastq file 30_S30_R2_001_Run230316 (60/198)'

'Making symlink for fastq file 31_S31_R1_001_Run230316 (61/198)'

'Making symlink for fastq file 31_S31_R2_001_Run230316 (62/198)'

'Making symlink for fastq file 32_S32_R1_001_Run230316 (63/198)'

'Making symlink for fastq file 32_S32_R2_001_Run230316 (64/198)'

'Making symlink for fastq file 33_S33_R1_001_Run230316 (65/198)'

'Making symlink for fastq file 33_S33_R2_001_Run230316 (66/198)'

'Making symlink for fastq file 1_S1_R1_001_Run230322 (67/198)'

'Making symlink for fastq file 1_S1_R2_001_Run230322 (68/198)'

'Making symlink for fastq file 2_S2_R1_001_Run230322 (69/198)'

'Making symlink for fastq file 2_S2_R2_001_Run230322 (70/198)'

'Making symlink for fastq file 3_S3_R1_001_Run230322 (71/198)'

'Making symlink for fastq file 3_S3_R2_001_Run230322 (72/198)'

'Making symlink for fastq file 4_S4_R1_001_Run230322 (73/198)'

'Making symlink for fastq file 4_S4_R2_001_Run230322 (74/198)'

'Making symlink for fastq file 5_S5_R1_001_Run230322 (75/198)'

'Making symlink for fastq file 5_S5_R2_001_Run230322 (76/198)'

'Making symlink for fastq file 6_S6_R1_001_Run230322 (77/198)'

'Making symlink for fastq file 6_S6_R2_001_Run230322 (78/198)'

'Making symlink for fastq file 7_S7_R1_001_Run230322 (79/198)'

'Making symlink for fastq file 7_S7_R2_001_Run230322 (80/198)'

'Making symlink for fastq file 8_S8_R1_001_Run230322 (81/198)'

'Making symlink for fastq file 8_S8_R2_001_Run230322 (82/198)'

'Making symlink for fastq file 9_S9_R1_001_Run230322 (83/198)'

'Making symlink for fastq file 9_S9_R2_001_Run230322 (84/198)'

'Making symlink for fastq file 10_S10_R1_001_Run230322 (85/198)'

'Making symlink for fastq file 10_S10_R2_001_Run230322 (86/198)'

'Making symlink for fastq file 11_S11_R1_001_Run230322 (87/198)'

'Making symlink for fastq file 11_S11_R2_001_Run230322 (88/198)'

'Making symlink for fastq file 12_S12_R1_001_Run230322 (89/198)'

'Making symlink for fastq file 12_S12_R2_001_Run230322 (90/198)'

'Making symlink for fastq file 13_S13_R1_001_Run230322 (91/198)'

'Making symlink for fastq file 13_S13_R2_001_Run230322 (92/198)'

'Making symlink for fastq file 14_S14_R1_001_Run230322 (93/198)'

'Making symlink for fastq file 14_S14_R2_001_Run230322 (94/198)'

'Making symlink for fastq file 15_S15_R1_001_Run230322 (95/198)'

'Making symlink for fastq file 15_S15_R2_001_Run230322 (96/198)'

'Making symlink for fastq file 16_S16_R1_001_Run230322 (97/198)'

'Making symlink for fastq file 16_S16_R2_001_Run230322 (98/198)'

'Making symlink for fastq file 17_S17_R1_001_Run230322 (99/198)'

'Making symlink for fastq file 17_S17_R2_001_Run230322 (100/198)'

'Making symlink for fastq file 18_S18_R1_001_Run230322 (101/198)'

'Making symlink for fastq file 18_S18_R2_001_Run230322 (102/198)'

'Making symlink for fastq file 19_S19_R1_001_Run230322 (103/198)'

'Making symlink for fastq file 19_S19_R2_001_Run230322 (104/198)'

'Making symlink for fastq file 20_S20_R1_001_Run230322 (105/198)'

'Making symlink for fastq file 20_S20_R2_001_Run230322 (106/198)'

'Making symlink for fastq file 21_S21_R1_001_Run230322 (107/198)'

'Making symlink for fastq file 21_S21_R2_001_Run230322 (108/198)'

'Making symlink for fastq file 22_S22_R1_001_Run230322 (109/198)'

'Making symlink for fastq file 22_S22_R2_001_Run230322 (110/198)'

'Making symlink for fastq file 23_S23_R1_001_Run230322 (111/198)'

'Making symlink for fastq file 23_S23_R2_001_Run230322 (112/198)'

'Making symlink for fastq file 24_S24_R1_001_Run230322 (113/198)'

'Making symlink for fastq file 24_S24_R2_001_Run230322 (114/198)'

'Making symlink for fastq file 25_S25_R1_001_Run230322 (115/198)'

'Making symlink for fastq file 25_S25_R2_001_Run230322 (116/198)'

'Making symlink for fastq file 26_S26_R1_001_Run230322 (117/198)'

'Making symlink for fastq file 26_S26_R2_001_Run230322 (118/198)'

'Making symlink for fastq file 27_S27_R1_001_Run230322 (119/198)'

'Making symlink for fastq file 27_S27_R2_001_Run230322 (120/198)'

'Making symlink for fastq file 28_S28_R1_001_Run230322 (121/198)'

'Making symlink for fastq file 28_S28_R2_001_Run230322 (122/198)'

'Making symlink for fastq file 29_S29_R1_001_Run230322 (123/198)'

'Making symlink for fastq file 29_S29_R2_001_Run230322 (124/198)'

'Making symlink for fastq file 30_S30_R1_001_Run230322 (125/198)'

'Making symlink for fastq file 30_S30_R2_001_Run230322 (126/198)'

'Making symlink for fastq file 31_S31_R1_001_Run230322 (127/198)'

'Making symlink for fastq file 31_S31_R2_001_Run230322 (128/198)'

'Making symlink for fastq file 32_S32_R1_001_Run230322 (129/198)'

'Making symlink for fastq file 32_S32_R2_001_Run230322 (130/198)'

'Making symlink for fastq file 33_S33_R1_001_Run230322 (131/198)'

'Making symlink for fastq file 33_S33_R2_001_Run230322 (132/198)'

'Making symlink for fastq file 1_S1_R1_001_Run230323 (133/198)'

'Making symlink for fastq file 1_S1_R2_001_Run230323 (134/198)'

'Making symlink for fastq file 2_S2_R1_001_Run230323 (135/198)'

'Making symlink for fastq file 2_S2_R2_001_Run230323 (136/198)'

'Making symlink for fastq file 3_S3_R1_001_Run230323 (137/198)'

'Making symlink for fastq file 3_S3_R2_001_Run230323 (138/198)'

'Making symlink for fastq file 4_S4_R1_001_Run230323 (139/198)'

'Making symlink for fastq file 4_S4_R2_001_Run230323 (140/198)'

'Making symlink for fastq file 5_S5_R1_001_Run230323 (141/198)'

'Making symlink for fastq file 5_S5_R2_001_Run230323 (142/198)'

'Making symlink for fastq file 6_S6_R1_001_Run230323 (143/198)'

'Making symlink for fastq file 6_S6_R2_001_Run230323 (144/198)'

'Making symlink for fastq file 7_S7_R1_001_Run230323 (145/198)'

'Making symlink for fastq file 7_S7_R2_001_Run230323 (146/198)'

'Making symlink for fastq file 8_S8_R1_001_Run230323 (147/198)'

'Making symlink for fastq file 8_S8_R2_001_Run230323 (148/198)'

'Making symlink for fastq file 9_S9_R1_001_Run230323 (149/198)'

'Making symlink for fastq file 9_S9_R2_001_Run230323 (150/198)'

'Making symlink for fastq file 10_S10_R1_001_Run230323 (151/198)'

'Making symlink for fastq file 10_S10_R2_001_Run230323 (152/198)'

'Making symlink for fastq file 11_S11_R1_001_Run230323 (153/198)'

'Making symlink for fastq file 11_S11_R2_001_Run230323 (154/198)'

'Making symlink for fastq file 12_S12_R1_001_Run230323 (155/198)'

'Making symlink for fastq file 12_S12_R2_001_Run230323 (156/198)'

'Making symlink for fastq file 13_S13_R1_001_Run230323 (157/198)'

'Making symlink for fastq file 13_S13_R2_001_Run230323 (158/198)'

'Making symlink for fastq file 14_S14_R1_001_Run230323 (159/198)'

'Making symlink for fastq file 14_S14_R2_001_Run230323 (160/198)'

'Making symlink for fastq file 15_S15_R1_001_Run230323 (161/198)'

'Making symlink for fastq file 15_S15_R2_001_Run230323 (162/198)'

'Making symlink for fastq file 16_S16_R1_001_Run230323 (163/198)'

'Making symlink for fastq file 16_S16_R2_001_Run230323 (164/198)'

'Making symlink for fastq file 17_S17_R1_001_Run230323 (165/198)'

'Making symlink for fastq file 17_S17_R2_001_Run230323 (166/198)'

'Making symlink for fastq file 18_S18_R1_001_Run230323 (167/198)'

'Making symlink for fastq file 18_S18_R2_001_Run230323 (168/198)'

'Making symlink for fastq file 19_S19_R1_001_Run230323 (169/198)'

'Making symlink for fastq file 19_S19_R2_001_Run230323 (170/198)'

'Making symlink for fastq file 20_S20_R1_001_Run230323 (171/198)'

'Making symlink for fastq file 20_S20_R2_001_Run230323 (172/198)'

'Making symlink for fastq file 21_S21_R1_001_Run230323 (173/198)'

'Making symlink for fastq file 21_S21_R2_001_Run230323 (174/198)'

'Making symlink for fastq file 22_S22_R1_001_Run230323 (175/198)'

'Making symlink for fastq file 22_S22_R2_001_Run230323 (176/198)'

'Making symlink for fastq file 23_S23_R1_001_Run230323 (177/198)'

'Making symlink for fastq file 23_S23_R2_001_Run230323 (178/198)'

'Making symlink for fastq file 24_S24_R1_001_Run230323 (179/198)'

'Making symlink for fastq file 24_S24_R2_001_Run230323 (180/198)'

'Making symlink for fastq file 25_S25_R1_001_Run230323 (181/198)'

'Making symlink for fastq file 25_S25_R2_001_Run230323 (182/198)'

'Making symlink for fastq file 26_S26_R1_001_Run230323 (183/198)'

'Making symlink for fastq file 26_S26_R2_001_Run230323 (184/198)'

'Making symlink for fastq file 27_S27_R1_001_Run230323 (185/198)'

'Making symlink for fastq file 27_S27_R2_001_Run230323 (186/198)'

'Making symlink for fastq file 28_S28_R1_001_Run230323 (187/198)'

'Making symlink for fastq file 28_S28_R2_001_Run230323 (188/198)'

'Making symlink for fastq file 29_S29_R1_001_Run230323 (189/198)'

'Making symlink for fastq file 29_S29_R2_001_Run230323 (190/198)'

'Making symlink for fastq file 30_S30_R1_001_Run230323 (191/198)'

'Making symlink for fastq file 30_S30_R2_001_Run230323 (192/198)'

'Making symlink for fastq file 31_S31_R1_001_Run230323 (193/198)'

'Making symlink for fastq file 31_S31_R2_001_Run230323 (194/198)'

'Making symlink for fastq file 32_S32_R1_001_Run230323 (195/198)'

'Making symlink for fastq file 32_S32_R2_001_Run230323 (196/198)'

'Making symlink for fastq file 33_S33_R1_001_Run230323 (197/198)'

'Making symlink for fastq file 33_S33_R2_001_Run230323 (198/198)'

Unnamed: 0_level_0,original_fastq_path,symlink_fastq_path,final_file_name,md5_checksum
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1_S1_R1_001_Run230316,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_...,/Volumes/kueck/ovarian_cancer_cisplatin_respon...,1_S1_R1_001_Run230316.fastq.gz,19cee84a6a3598930e2e6befdf203e0f
1_S1_R2_001_Run230316,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_...,/Volumes/kueck/ovarian_cancer_cisplatin_respon...,1_S1_R2_001_Run230316.fastq.gz,a41a73120d75d1a5c4696488dc56af78
2_S2_R1_001_Run230316,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_...,/Volumes/kueck/ovarian_cancer_cisplatin_respon...,2_S2_R1_001_Run230316.fastq.gz,0613d63ebd5346ef57124d458a5e9f7b
2_S2_R2_001_Run230316,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_...,/Volumes/kueck/ovarian_cancer_cisplatin_respon...,2_S2_R2_001_Run230316.fastq.gz,46e40cbfdf13d025863609aab40e63cd
3_S3_R1_001_Run230316,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_...,/Volumes/kueck/ovarian_cancer_cisplatin_respon...,3_S3_R1_001_Run230316.fastq.gz,e28b40d675a63b1837587ff28dc2a618
...,...,...,...,...
31_S31_R2_001_Run230323,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230323_...,/Volumes/kueck/ovarian_cancer_cisplatin_respon...,31_S31_R2_001_Run230323.fastq.gz,267e934f226bb6bc9926fdcd69a8b063
32_S32_R1_001_Run230323,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230323_...,/Volumes/kueck/ovarian_cancer_cisplatin_respon...,32_S32_R1_001_Run230323.fastq.gz,c8634cdab2082b4198d58ccc0057f77b
32_S32_R2_001_Run230323,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230323_...,/Volumes/kueck/ovarian_cancer_cisplatin_respon...,32_S32_R2_001_Run230323.fastq.gz,b332e910a17b49a3f1599ee04b4d2743
33_S33_R1_001_Run230323,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230323_...,/Volumes/kueck/ovarian_cancer_cisplatin_respon...,33_S33_R1_001_Run230323.fastq.gz,4214a83565ed2095db0cf9a49fec95af


In [None]:
# Code for deep copy (disabled): copies each fastq file into final_fastq_folder
# and verifies the copy against the MD5 checksum stored in sample_info.

# i = 1
# for sample_id, row in sample_info.iterrows():
#     display(f"Copying fastq file {sample_id} ({i}/{len(sample_info)})")
#     sample_original_path = row["original_fastq_path"]
#     copied_fastq_path = f"{final_fastq_folder}/{sample_id}.fastq.gz"

#     # Copy the fastq file
#     run_command(f"cp {sample_original_path} {copied_fastq_path}")

#     # Compute md5 checksum on copied file
#     display(f"Computing MD5 for file {sample_id} ({i}/{len(sample_info)})")
#     copied_md5_checksum = compute_md5_checksum(copied_fastq_path)

#     # Check md5 checksum matches (no parentheses: asserting a tuple is always true)
#     assert copied_md5_checksum == row["md5_checksum"], f"MD5 checksums do not match for {sample_id}"

#     # Update sample info (NOTE: this adds a new "copied_fastq_path" column to sample_info)
#     sample_info.at[sample_id, "copied_fastq_path"] = copied_fastq_path

#     i += 1
#     break  # NOTE: stops after the first file; remove this line to copy every file

# display(sample_info)

## Save sample info

The sample info table is saved as a tab-separated file so that it can then be copied into the GEO metadata spreadsheet.

In [10]:
# Write the per-file sample table to disk as tab-separated text, keeping the
# row labels as the first column, then render the same table in the notebook.
sample_info.to_csv(
    path_or_buf=sample_info_path,
    sep="\t",
    index=True,
)
display(sample_info)

Unnamed: 0_level_0,original_fastq_path,symlink_fastq_path,final_file_name,md5_checksum
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1_S1_R1_001_Run230316,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_...,/Volumes/kueck/ovarian_cancer_cisplatin_respon...,1_S1_R1_001_Run230316.fastq.gz,19cee84a6a3598930e2e6befdf203e0f
1_S1_R2_001_Run230316,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_...,/Volumes/kueck/ovarian_cancer_cisplatin_respon...,1_S1_R2_001_Run230316.fastq.gz,a41a73120d75d1a5c4696488dc56af78
2_S2_R1_001_Run230316,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_...,/Volumes/kueck/ovarian_cancer_cisplatin_respon...,2_S2_R1_001_Run230316.fastq.gz,0613d63ebd5346ef57124d458a5e9f7b
2_S2_R2_001_Run230316,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_...,/Volumes/kueck/ovarian_cancer_cisplatin_respon...,2_S2_R2_001_Run230316.fastq.gz,46e40cbfdf13d025863609aab40e63cd
3_S3_R1_001_Run230316,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230316_...,/Volumes/kueck/ovarian_cancer_cisplatin_respon...,3_S3_R1_001_Run230316.fastq.gz,e28b40d675a63b1837587ff28dc2a618
...,...,...,...,...
31_S31_R2_001_Run230323,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230323_...,/Volumes/kueck/ovarian_cancer_cisplatin_respon...,31_S31_R2_001_Run230323.fastq.gz,267e934f226bb6bc9926fdcd69a8b063
32_S32_R1_001_Run230323,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230323_...,/Volumes/kueck/ovarian_cancer_cisplatin_respon...,32_S32_R1_001_Run230323.fastq.gz,c8634cdab2082b4198d58ccc0057f77b
32_S32_R2_001_Run230323,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230323_...,/Volumes/kueck/ovarian_cancer_cisplatin_respon...,32_S32_R2_001_Run230323.fastq.gz,b332e910a17b49a3f1599ee04b4d2743
33_S33_R1_001_Run230323,/Volumes/kueck/FASTQ/CellLines/RNA-Seq/230323_...,/Volumes/kueck/ovarian_cancer_cisplatin_respon...,33_S33_R1_001_Run230323.fastq.gz,4214a83565ed2095db0cf9a49fec95af


## Create formatted table with files per experimental sample

In [17]:
# Build a table with one row per sample number and one column per
# (run, read number) pair, where each cell holds the renamed FASTQ file name.

# Samples are numbered 1..num_samples in every run; a file name that is not
# present in sample_info trips the assertion below.
num_samples = 33
# Read numbers used for the "R1"/"R2" part of the file and column names.
read_numbers = (1, 2)

columns = ["sample_number"] + [
    f"{run.name}_R{read_num}" for run in runs for read_num in read_numbers
]

# Hoisted out of the loops: a set gives constant-time membership checks instead
# of rescanning the whole column for every expected file name.
known_file_names = set(sample_info["final_file_name"])

records = []
for sample_num in range(1, num_samples + 1):
    record = {"sample_number": sample_num}
    for run in runs:
        for read_num in read_numbers:
            file_name = f"{sample_num}_S{sample_num}_R{read_num}_001_{run.name}.fastq.gz"

            # Check that this file name appears in the sample_info data frame
            assert file_name in known_file_names, (
                f"{file_name} is missing from sample_info['final_file_name']"
            )

            record[f"{run.name}_R{read_num}"] = file_name
    records.append(record)

# Construct the frame once from the collected records rather than enlarging an
# empty frame one cell at a time; `columns` fixes the column order.
formatted_sample_info = pd.DataFrame(records, columns=columns).set_index("sample_number")

display(formatted_sample_info)

formatted_sample_info.to_csv(formatted_sample_files_path, sep="\t", index=True)

Unnamed: 0_level_0,Run230316_R1,Run230316_R2,Run230322_R1,Run230322_R2,Run230323_R1,Run230323_R2
sample_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1_S1_R1_001_Run230316.fastq.gz,1_S1_R2_001_Run230316.fastq.gz,1_S1_R1_001_Run230322.fastq.gz,1_S1_R2_001_Run230322.fastq.gz,1_S1_R1_001_Run230323.fastq.gz,1_S1_R2_001_Run230323.fastq.gz
2,2_S2_R1_001_Run230316.fastq.gz,2_S2_R2_001_Run230316.fastq.gz,2_S2_R1_001_Run230322.fastq.gz,2_S2_R2_001_Run230322.fastq.gz,2_S2_R1_001_Run230323.fastq.gz,2_S2_R2_001_Run230323.fastq.gz
3,3_S3_R1_001_Run230316.fastq.gz,3_S3_R2_001_Run230316.fastq.gz,3_S3_R1_001_Run230322.fastq.gz,3_S3_R2_001_Run230322.fastq.gz,3_S3_R1_001_Run230323.fastq.gz,3_S3_R2_001_Run230323.fastq.gz
4,4_S4_R1_001_Run230316.fastq.gz,4_S4_R2_001_Run230316.fastq.gz,4_S4_R1_001_Run230322.fastq.gz,4_S4_R2_001_Run230322.fastq.gz,4_S4_R1_001_Run230323.fastq.gz,4_S4_R2_001_Run230323.fastq.gz
5,5_S5_R1_001_Run230316.fastq.gz,5_S5_R2_001_Run230316.fastq.gz,5_S5_R1_001_Run230322.fastq.gz,5_S5_R2_001_Run230322.fastq.gz,5_S5_R1_001_Run230323.fastq.gz,5_S5_R2_001_Run230323.fastq.gz
6,6_S6_R1_001_Run230316.fastq.gz,6_S6_R2_001_Run230316.fastq.gz,6_S6_R1_001_Run230322.fastq.gz,6_S6_R2_001_Run230322.fastq.gz,6_S6_R1_001_Run230323.fastq.gz,6_S6_R2_001_Run230323.fastq.gz
7,7_S7_R1_001_Run230316.fastq.gz,7_S7_R2_001_Run230316.fastq.gz,7_S7_R1_001_Run230322.fastq.gz,7_S7_R2_001_Run230322.fastq.gz,7_S7_R1_001_Run230323.fastq.gz,7_S7_R2_001_Run230323.fastq.gz
8,8_S8_R1_001_Run230316.fastq.gz,8_S8_R2_001_Run230316.fastq.gz,8_S8_R1_001_Run230322.fastq.gz,8_S8_R2_001_Run230322.fastq.gz,8_S8_R1_001_Run230323.fastq.gz,8_S8_R2_001_Run230323.fastq.gz
9,9_S9_R1_001_Run230316.fastq.gz,9_S9_R2_001_Run230316.fastq.gz,9_S9_R1_001_Run230322.fastq.gz,9_S9_R2_001_Run230322.fastq.gz,9_S9_R1_001_Run230323.fastq.gz,9_S9_R2_001_Run230323.fastq.gz
10,10_S10_R1_001_Run230316.fastq.gz,10_S10_R2_001_Run230316.fastq.gz,10_S10_R1_001_Run230322.fastq.gz,10_S10_R2_001_Run230322.fastq.gz,10_S10_R1_001_Run230323.fastq.gz,10_S10_R2_001_Run230323.fastq.gz


## Compute MD5 Checksums for processed data files

In [19]:
# Hash every processed data file, keyed by file name. A repeated name keeps a
# single entry, just as assigning to an existing row label would.
checksum_by_file = {
    name: compute_md5_checksum(f"{processed_data_folder}/{name}")
    for name in processed_data_file_names
}

# Assemble the table in one go; rows are labelled by file name and the file
# name is also kept as a regular column.
processed_file_info = pd.DataFrame(
    {
        "file_name": list(checksum_by_file.keys()),
        "md5_checksum": list(checksum_by_file.values()),
    },
    index=list(checksum_by_file.keys()),
    columns=["file_name", "md5_checksum"],
)

display(processed_file_info)

# Save processed data info (row labels are omitted; file_name is already a column)
processed_file_info.to_csv(processed_data_info_path, sep="\t", index=False)

Unnamed: 0,file_name,md5_checksum
salmon.merged.gene_tpm.tsv,salmon.merged.gene_tpm.tsv,69df151d0b83454cc8c36b6972e62629
salmon.merged.gene_counts.tsv,salmon.merged.gene_counts.tsv,8955cd3fcad46e7cb9b11e12b97bd9e8
