# Check the SHA-512 checksums of downloaded FASTQs against those from Crits-Christoph et al.

In [None]:
import ast
import os

import pandas as pd

Read the checksum data from Crits-Christoph et al.:

In [None]:
# Columns of the Crits-Christoph table that are dropped before the comparison.
unneeded_cols = ["File Size (bytes)", "Number of Reads", "Download Time (Pacific)"]

cc_table = pd.read_csv(snakemake.input.cc_checksums).drop(columns=unneeded_cols)
cc_table = cc_table.rename(columns={"File Name": "fastq"})

# Lower-case the SHA512 values; the placeholder text below is matched in lower case.
cc_table = cc_table.assign(SHA512=cc_table["SHA512"].str.lower())

# Discard rows whose SHA512 field holds the "missing from gisaid" placeholder.
cc_checksums = cc_table[cc_table["SHA512"] != "missing from gisaid"]

cc_checksums

Get checksums for downloaded FASTQs from NGDC:

First get metadata for all downloaded FASTQs:

In [None]:
# Metadata columns retained for the checksum comparison.
keep_cols = ["Run accession", "Sample name", "Sample title", "Collection date", "fastqs"]

# The "fastqs" column is parsed with `ast.literal_eval` on read
# (presumably it stores a list literal of FASTQ names per run -- confirm against the input).
ngdc_meta = pd.read_csv(
    snakemake.input.metadata,
    converters={"fastqs": ast.literal_eval},
)

# One row per entry of "fastqs", so each FASTQ carries its run/sample metadata.
per_fastq = ngdc_meta[keep_cols].explode("fastqs")

# Label the NGDC-derived identifiers explicitly and renumber the exploded rows.
ngdc_names = {
    "fastqs": "fastq",
    "Sample name": "Sample name NGDC",
    "Run accession": "Run accession NGDC",
}
metadata = per_fastq.rename(columns=ngdc_names).reset_index(drop=True)

metadata

Now merge these metadata with the computed checksums:

In [None]:
# Collect one (SHA512, FASTQ base name) record per checksum file. Each file is
# expected to contain a single "<checksum> <path>" entry; anything else makes the
# two-value unpacking below raise `ValueError`.
records = []

for checksum_files in [snakemake.input.checksums, snakemake.input.checksums_nogz]:
    for checksum_file in checksum_files:
        with open(checksum_file) as f:
            # Split only at the first whitespace run so that a path containing
            # spaces is kept intact instead of breaking the unpacking.
            checksum, fullpath = f.read().strip().split(maxsplit=1)
        records.append((checksum, os.path.basename(fullpath)))

downloaded = (
    pd.DataFrame(records, columns=["SHA512", "fastq"])
    .assign(
        # Merge key: keep names that already end in ".gz" and append ".gz" to the
        # others, so both kinds of checksum file are matched against the same
        # metadata FASTQ name.
        fastq_gz=lambda x: x["fastq"].where(
            x["fastq"].str.endswith(".gz"), x["fastq"] + ".gz"
        ),
    )
    .merge(
        metadata.rename(columns={"fastq": "merge_fastq"}),
        left_on="fastq_gz",
        right_on="merge_fastq",
        # Outer merge: a row unmatched on either side yields nulls, which the
        # assertion below turns into a failure.
        how="outer",
        # Several checksum rows may share one metadata FASTQ, but each FASTQ
        # name must be unique in the metadata.
        validate="many_to_one",
    )
    .drop(columns=["fastq_gz", "merge_fastq"])
)

# Every checksum must have metadata and every metadata FASTQ must have a checksum.
assert downloaded.notnull().all().all()

downloaded

Now merge the Crits-Christoph and downloaded checksums and see if they match for the shared files:

In [None]:
# Attach the downloaded-FASTQ information to each Crits-Christoph row by checksum.
# The left join keeps every Crits-Christoph row; `one_to_one` requires each SHA512
# to be unique in both tables. Columns present in both get a source suffix.
merged = cc_checksums.merge(
    downloaded,
    on="SHA512",
    how="left",
    suffixes=(" Crits-Christoph", " NGDC"),
    validate="one_to_one",
)

# Every Crits-Christoph checksum must be among the downloaded checksums.
assert set(cc_checksums["SHA512"]) <= set(downloaded["SHA512"])

# No missing values anywhere in the merged table.
assert not merged.isnull().any().any()

# Columns reported by both sources must agree row by row.
for shared_col in ("Sample title", "Collection date"):
    cc_values = merged[f"{shared_col} Crits-Christoph"]
    ngdc_values = merged[f"{shared_col} NGDC"]
    assert cc_values.eq(ngdc_values).all()

merged.to_csv(snakemake.output.csv, index=False)

merged