# Format raw and processed files for GEO uploads

## Setup

In [7]:
import hashlib
import os
import subprocess
from collections import namedtuple

import pandas as pd

# Setting up this named tuple to store any further information we need about the samples
# (such as directory paths, etc.)
SampleInfo = namedtuple("SampleInfo", ["name", "treatment", "replicate", "fastq_id"])

samples = [
    SampleInfo(
        "Plk1as_DMSO_rep1",
        "DMSO",
        "1",
        "p3_S1",
    ),
    SampleInfo(
        "Plk1as_DMSO_rep2",
        "DMSO",
        "2",
        "p5_S5",
    ),
    SampleInfo(
        "Plk1as_DMSO_rep3",
        "DMSO",
        "3",
        "p14_S3",
    ),
    SampleInfo(
        "Plk1as_DMSO_rep4",
        "DMSO",
        "4",
        "p16_S7",
    ),
    SampleInfo(
        "Plk1as_3MB_rep1",
        "3MB",
        "1",
        "p3_S2",
    ),
    SampleInfo(
        "Plk1as_3MB_rep2",
        "3MB",
        "2",
        "p5_S6",
    ),
    SampleInfo(
        "Plk1as_3MB_rep3",
        "3MB",
        "3",
        "p14_S4",
    ),
    SampleInfo(
        "Plk1as_3MB_rep4",
        "3MB",
        "4",
        "p16_S8",
    ),
]

# Folder holding the raw and processed data files. This is a relative path, so
# it is resolved against the current working directory when files are opened.
data_folder = "data_to_upload"

# Print command and run command
def run_command(cmd):
    """Echo a shell command, execute it, and report its exit status.

    Args:
        cmd: Command line to execute, given as a single string that is
            interpreted by the system shell.

    Returns:
        int: Exit code of the command (0 indicates success).
    """
    print(f"cmd: {cmd}")
    # The string is handed to the shell verbatim, so only pass trusted input.
    completed = subprocess.run(cmd, shell=True)
    if completed.returncode != 0:
        # Make failures visible instead of letting them pass unnoticed.
        print(f"cmd failed with exit code {completed.returncode}: {cmd}")
    return completed.returncode

In [2]:
def compute_md5_checksum(file_path, chunk_size=1024 * 1024):
    """Return the MD5 digest of a file as a hexadecimal string.

    The file is streamed in fixed-size chunks so that large files can be
    hashed without loading them fully into memory. The digest does not depend
    on the chunk size; it only affects how many reads are performed.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration. Must be positive.

    Returns:
        str: The hexadecimal MD5 digest of the file contents.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        # read(0) returns b"" immediately, which would silently yield the
        # digest of an empty input for any file.
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    md5_hash = hashlib.md5()
    with open(file_path, "rb") as file:
        # iter() with a sentinel stops at the first empty read (end of file)
        for chunk in iter(lambda: file.read(chunk_size), b""):
            md5_hash.update(chunk)
    return md5_hash.hexdigest()

## Compute MD5 checksums for peaks files

In [None]:
# Hash each sample's peaks file, collecting one record per file. Records are
# gathered in a list and turned into a DataFrame once, rather than growing the
# DataFrame inside the loop.
processed_file_records = []
for sample in samples:
    display(f"Computing MD5 for {sample.name}")
    sample_peaks_path = (
        f"{data_folder}/{sample.treatment}_REP{sample.replicate}.mLb.clN_peaks.bed"
    )

    # Compute md5 checksum
    md5_checksum = compute_md5_checksum(sample_peaks_path)

    # Record the bare file name (no directory) next to its checksum
    processed_file_records.append(
        {
            "file_name": os.path.basename(sample_peaks_path),
            "md5_checksum": md5_checksum,
        }
    )

# Explicit columns keep the header in place even if there are no records
processed_files_checksum_df = pd.DataFrame(
    processed_file_records, columns=["file_name", "md5_checksum"]
)

display(processed_files_checksum_df)
processed_files_checksum_df.to_csv("processed_files_checksum.csv", index=False)

'Computing MD5 for Plk1as_DMSO_rep1'

'Computing MD5 for Plk1as_DMSO_rep2'

'Computing MD5 for Plk1as_DMSO_rep3'

'Computing MD5 for Plk1as_DMSO_rep4'

'Computing MD5 for Plk1as_3MB_rep1'

'Computing MD5 for Plk1as_3MB_rep2'

'Computing MD5 for Plk1as_3MB_rep3'

'Computing MD5 for Plk1as_3MB_rep4'

Unnamed: 0,file_name,md5_checksum
0,DMSO_REP1.mLb.clN_peaks.bed,981fc75b4a5dbceeed82832ab7569bd1
1,DMSO_REP2.mLb.clN_peaks.bed,a48284b461ada359483ae9363c4439ed
2,DMSO_REP3.mLb.clN_peaks.bed,cd28385873dceca19cf8acfdfdfdd843
3,DMSO_REP4.mLb.clN_peaks.bed,3aff8a96fa0bc139269613849048de9a
4,3MB_REP1.mLb.clN_peaks.bed,2f2df4adae834c7659cfd4a8e5686a2a
5,3MB_REP2.mLb.clN_peaks.bed,602208ad0a00e6197b48511b644c630e
6,3MB_REP3.mLb.clN_peaks.bed,d561ab8e35cbc65b39f6299012fd2ca8
7,3MB_REP4.mLb.clN_peaks.bed,eba47012d9766f020f3dda3047f01c54


## Compute MD5 checksums for fastq files

In [9]:
# Hash both paired-end read files (R1 and R2) of each sample, collecting one
# record per file. Records are gathered in a list and turned into a DataFrame
# once, rather than growing the DataFrame inside the loop.
raw_file_records = []
for sample in samples:
    for r in [1, 2]:
        display(f"Computing MD5 for {sample.name}")
        sample_fastq_path = f"{data_folder}/{sample.treatment}_{sample.fastq_id}_R{r}_001.fastq.gz"

        # Compute md5 checksum
        md5_checksum = compute_md5_checksum(sample_fastq_path)

        # Record the bare file name (no directory) next to its checksum
        raw_file_records.append(
            {
                "file_name": os.path.basename(sample_fastq_path),
                "md5_checksum": md5_checksum,
            }
        )

# Explicit columns keep the header in place even if there are no records
raw_files_checksum_df = pd.DataFrame(
    raw_file_records, columns=["file_name", "md5_checksum"]
)

display(raw_files_checksum_df)
raw_files_checksum_df.to_csv("raw_files_checksum.csv", index=False)

'Computing MD5 for Plk1as_DMSO_rep1'

'Computing MD5 for Plk1as_DMSO_rep1'

'Computing MD5 for Plk1as_DMSO_rep2'

'Computing MD5 for Plk1as_DMSO_rep2'

'Computing MD5 for Plk1as_DMSO_rep3'

'Computing MD5 for Plk1as_DMSO_rep3'

'Computing MD5 for Plk1as_DMSO_rep4'

'Computing MD5 for Plk1as_DMSO_rep4'

'Computing MD5 for Plk1as_3MB_rep1'

'Computing MD5 for Plk1as_3MB_rep1'

'Computing MD5 for Plk1as_3MB_rep2'

'Computing MD5 for Plk1as_3MB_rep2'

'Computing MD5 for Plk1as_3MB_rep3'

'Computing MD5 for Plk1as_3MB_rep3'

'Computing MD5 for Plk1as_3MB_rep4'

'Computing MD5 for Plk1as_3MB_rep4'

Unnamed: 0,file_name,md5_checksum
0,DMSO_p3_S1_R1_001.fastq.gz,3ab20ea07414c97c2b9c50f08af5026a
1,DMSO_p3_S1_R2_001.fastq.gz,da8043a32c263cfa9d9bdd8711ac4d7a
2,DMSO_p5_S5_R1_001.fastq.gz,a59325f94f75086c4e62b55db2ee3d14
3,DMSO_p5_S5_R2_001.fastq.gz,87f9b141747398e828948b5e77b3b5ff
4,DMSO_p14_S3_R1_001.fastq.gz,45dcfbf5ed807e9efb1bc552f85e8388
5,DMSO_p14_S3_R2_001.fastq.gz,79bd330c29562941c480aba1d0436ca9
6,DMSO_p16_S7_R1_001.fastq.gz,38bef9d5ee649a06073d1fabe68a335e
7,DMSO_p16_S7_R2_001.fastq.gz,cb016b1f91fe1889346068847fd76a92
8,3MB_p3_S2_R1_001.fastq.gz,5afd854e59f824a5b19628842bceb26a
9,3MB_p3_S2_R2_001.fastq.gz,3ab22496eafcc661da0eb14fcc751fe3
