In [1]:
%run "../00_project_config.ipynb"

import pathlib as pl
import re as re
import collections as col
import pandas as pd
import datetime as dt

# Location of the data production status table.
# NOTE(review): PROJECT_BASE is not defined in this cell - presumably it is
# provided by the project config notebook executed via %run; confirm.
data_status_file = PROJECT_BASE.joinpath(
    "annotations", "projectmng", "data_production_status.tsv"
)

# Scan the leading '#' comment lines of the status table for a line that
# carries a year (e.g. "# 20240124T1628") and remember it as the timestamp
# of the data status. The last matching comment line wins; scanning stops
# at the first line that is not a comment.
# Any year of the form 20xx is accepted, so that status files stamped
# after 2024 do not silently lose their timestamp.
data_status_ts = None
with open(data_status_file, "r") as data_status_content:
    for line in data_status_content:
        if not line.startswith("#"):
            break
        if re.search(r"20\d{2}", line) is not None:
            print(f"Processing data status: {line.strip()}")
            data_status_ts = line.strip("#").strip()

# Load the table itself; comment lines are skipped and the second column
# becomes the index (rows are looked up by sample name further below).
data_status = pd.read_csv(
    data_status_file, sep="\t", header=0,
    comment="#", index_col=1
)

# by construction, the fofn table contains only complete samples
fofn_table_file = PROJECT_BASE.joinpath("samples", "fofn_table.tsv")
fofn_table = pd.read_csv(
    fofn_table_file,
    sep="\t",
    header=0,
    comment="#",
)

# 2023-09-01: drop epi2me ONT for HG002
# (the filter only inspects the fofn path: every row whose path
# contains "epi2me" is removed)
is_epi2me = fofn_table["fofn_path"].str.contains("epi2me")
fofn_table = fofn_table.loc[~is_epi2me, :].copy()

### 2023-11-20
# skip all batch 3 samples that belong to the YRI and CHS trios as well
# as sample HG00096 that needs a fix in GraphAligner (release v1.0.18
# not yet on bioconda).

# SKIP_B3_SAMPLES = [
#   "HG00512", "HG00513", "HG00514",
#   "NA19238", "NA19239", "NA19240",
#   "HG00096" ]

### 2023-11-29
# Update: data production complete, no more samples to skip
# NB: top up for HG00732 as well, so sample moved into batch 3


# 2023-11-29
# the following tag is used for now to separate
# SQII/HiFi from Revio/HiFi data;
# the Revio HiFi fofn file has this suffix
REVIO_FOFN_TAG = "f1f4e3c8"

# Extra records are stored under "<sample>R" and always assigned to batch 3.
EXTRA_RECORD_SUFFIX = "R"
EXTRA_RECORD_BATCH = 3


def _set_sample_record(records, name, read_type, fofns, batch):
    """Fill one read type of a sample record.

    Stores ``fofns`` under the ``read_type`` key of ``records[name]`` and
    (re)sets the ``target`` and ``batch`` fields of that record.

    Args:
        records: mapping of record name to record dict; a missing record
            must be created on access (e.g. ``defaultdict(dict)``).
        name: name of the record to update.
        read_type: key under which the fofn entry is stored.
        fofns: comma-separated fofn path(s) for this read type.
        batch: batch number of the record.
    """
    records[name][read_type] = fofns
    records[name]["target"] = "unphased"
    records[name]["batch"] = batch


# One record per sample, filled read type by read type.
sample_records = col.defaultdict(dict)
for (sample, read_type), fofn_files in fofn_table.groupby(["sample", "read_type"]):

    # The status table lists a sample either under its own name or under
    # the same name with the first two characters replaced by "GM".
    # If neither exists, the KeyError of the second lookup propagates.
    try:
        sample_batch = int(data_status.at[sample, "sample_batch"])
    except KeyError:
        lu_name = "GM" + sample[2:]
        sample_batch = int(data_status.at[lu_name, "sample_batch"])
    norm_name = sample

    # see comment above; 2023-11-29
    #if sample_batch == 3 and sample in SKIP_B3_SAMPLES:
    #    print(f"DEBUG - skipping sample {sample}")
    #    continue
    #
    #assert sample not in SKIP_B3_SAMPLES

    # Keep the sorted path list and the joined string as two separate
    # objects, so that the joined string is still a string when it is
    # reused for the HG00733 ONT record at the end of the iteration.
    fofn_paths = sorted(fofn_files["fofn_path"].values)
    merged_fofns = ",".join(fofn_paths)

    revio_hifi = [fofn for fofn in fofn_paths if REVIO_FOFN_TAG in fofn]
    if revio_hifi:
        # Mixed group: split the non-Revio files from the Revio file and
        # register the Revio file as a separate "<sample>R" record.
        sq2_hifi = [fofn for fofn in fofn_paths if REVIO_FOFN_TAG not in fofn]
        assert len(sq2_hifi) == 2, (
            f"{sample}/{read_type}: expected 2 non-Revio fofn files, "
            f"found {len(sq2_hifi)}"
        )
        assert len(revio_hifi) == 1, (
            f"{sample}/{read_type}: expected 1 Revio fofn file, "
            f"found {len(revio_hifi)}"
        )

        _set_sample_record(
            sample_records, norm_name, read_type,
            ",".join(sq2_hifi), sample_batch
        )
        _set_sample_record(
            sample_records, norm_name + EXTRA_RECORD_SUFFIX, read_type,
            revio_hifi[0], EXTRA_RECORD_BATCH
        )
    else:
        _set_sample_record(
            sample_records, norm_name, read_type,
            merged_fofns, sample_batch
        )

    # The ONT data of HG00733 is additionally attached to the
    # HG00733R record.
    if read_type == "ont" and norm_name == "HG00733":
        _set_sample_record(
            sample_records, norm_name + EXTRA_RECORD_SUFFIX, read_type,
            merged_fofns, EXTRA_RECORD_BATCH
        )
    
# Turn the per-sample record dicts into a table with one row per sample:
# the records are loaded column-wise, so the frame is transposed, the
# record names are moved from the index into a "sample" column, the rows
# are sorted and the columns are put into their final order.
sample_records = (
    pd.DataFrame.from_records(sample_records)
    .transpose()
    .reset_index(drop=False)
    .rename(columns={"index": "sample"})
    .sort_values(["sample", "batch", "hifi", "ont"])
    .loc[:, ["sample", "batch", "target", "hifi", "ont"]]
)

# Number of samples in each batch. The special sample HG00733R is
# left out of these counts.
special_samples = ["HG00733R"]

is_regular_sample = ~sample_records["sample"].isin(special_samples)
samples_per_batch = sample_records.loc[is_regular_sample, "batch"].value_counts()

sample_sheet_all_file = PROJECT_BASE.joinpath("samples", "vrk_unps_all_samples.tsv")
sample_sheet_hgsvc3_file = PROJECT_BASE.joinpath("samples", "vrk_unps_hgsvc3_samples.tsv")


def _dump_sample_sheet(sheet_path, records):
    """Write ``records`` as a tab-separated sample sheet to ``sheet_path``.

    The table is preceded by '#' comment lines: the data status timestamp
    (only if one was found), the value of TODAY, the number of distinct
    samples in ``records`` and one line per batch taken from
    ``samples_per_batch``.
    """
    with open(sheet_path, "w") as sample_sheet:
        header_lines = []
        if data_status_ts is not None:
            header_lines.append(f"# Relative to data status: {data_status_ts}\n")
        header_lines.append(f"# {TODAY}\n")
        header_lines.append(f"# Samples: {records['sample'].nunique()}\n")
        # the per-batch counts are shared by both sheets and
        # never include the special samples
        header_lines.extend(
            f"# Batch {batch}: {n_samples} samples\n"
            for batch, n_samples in samples_per_batch.items()
        )
        sample_sheet.writelines(header_lines)
        records.to_csv(sample_sheet, sep="\t", header=True, index=False)


# Sheet 1: every record, special samples included.
_dump_sample_sheet(sample_sheet_all_file, sample_records)

# Sheet 2: same layout, but without the special samples; note that
# sample_records itself is reduced to the filtered table from here on.
is_special_sample = sample_records["sample"].isin(special_samples)
sample_records = sample_records.loc[~is_special_sample, :].copy()
_dump_sample_sheet(sample_sheet_hgsvc3_file, sample_records)

    
#sample_records[["sample", "target", "hifi", "ont"]].to_csv(row_store, sep="\t", header=True, index=False)

# df = pd.DataFrame.from_records(sorted(records), columns=["sample", "key", "value"])
# df.to_csv(eav_store, header=True, index=False, sep="\t")

# def make_csv(values):
#     return ",".join(sorted(map(str, values)))

# df = df.pivot_table(
#     index="sample",
#     columns="key",
#     values="value",
#     aggfunc=make_csv
# )
# df = df[["hgsvc_sample_num", "target", "hifi", "ont", "batch"]]
# df.to_csv(row_store, header=True, index=True, sep="\t")


Processing data status: # 20240124T1628
