In [2]:
import pandas as pd
import pathlib

# Switch to the project's notebook folder so that the relative notebook path
# below can be resolved; -q suppresses printing of the new working directory.
# NOTE(review): absolute, user-specific path - needs adapting on other machines.
%cd -q "/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies/notebooks"

# Absolute path to the project configuration notebook; strict=True makes
# resolve() raise FileNotFoundError if the notebook does not exist.
_PROJECT_CONFIG_NB = str(pathlib.Path("00_project_config.ipynb").resolve(strict=True))

# Execute the configuration notebook ($name expands the Python variable).
# NOTE(review): PROJECT_BASE and TODAY are used further down but not defined
# in this cell - presumably they are set by the configuration notebook; confirm.
%run $_PROJECT_CONFIG_NB

# Draft data freeze table: tab-separated, '#' lines are comments, first
# non-comment line is the header. Keeps the default positional row index.
data_freeze = pd.read_csv(
    PROJECT_BASE.joinpath(
        "annotations",
        "data_freezes",
        "hgsvc3_assembly_data_sources.draft.tsv",
    ),
    sep="\t",
    comment="#",
    header=0,
)

# Accession table in the same format. The 'datatype' and 'sample' columns are
# removed here and 'df_idx' becomes the row index, which is what the later
# index-on-index merge with the data freeze table joins on.
acc_files = (
    pd.read_csv(
        PROJECT_BASE.joinpath(
            "annotations",
            "data_freezes",
            "hgsvc3_assembly_data_sources.accessions.tsv",
        ),
        sep="\t",
        comment="#",
        header=0,
    )
    .drop(columns=["datatype", "sample"])
    .set_index("df_idx")
)

# 2024-12-03 update w/ new set of Verkko assembly accessions delivered by Feyza
# file used for submission:
# 20240716_ena-upload_assembly-acc.csv
acc_assm = pd.read_csv(
    PROJECT_BASE.joinpath(
        "annotations",
        "external",
        "accessions",
        "20241120_verkko_assembly_acc.txt",
    ),
    sep="\t",
    header=0,
)

# The sample ID is the part of the description in front of the first dot.
acc_assm["sample"] = acc_assm["description"].map(
    lambda description: description.partition(".")[0]
)

# Lookup table: sample ID -> assembly accession
# (if a sample occurs more than once, the last row wins).
acc_assm_lut = dict(zip(acc_assm["sample"], acc_assm["accession"]))

# Strip the columns that are not carried over into the final table and rename
# 'filename' to 'file_ext', so that it does not collide with the 'filename'
# column that the accession table contributes in the merge below.
data_freeze = data_freeze.drop(
    columns=[
        "remote_path",
        "accession",
        "igsr_folder_exists",
        "igsr_file_exists",
        "hgsvc_sample_number",
        "phase",
        "verkko_assembly_batch",
        "family",
        "member",
        "is_child",
        "SIN",
    ]
).rename(columns={"filename": "file_ext"})

# Join both tables on their row index; the outer join keeps rows that are
# present in only one of the two tables.
merged = data_freeze.merge(
    acc_files,
    how="outer",
    left_index=True,
    right_index=True,
)

# fix Strand-seq summary
# Strand-seq rows carry no per-file information: fill the file-level columns
# with placeholder values and assign the default project accession.
select_strandseq = merged["datatype"] == "strandseq"

strandseq_fill_values = {
    "filename": "all",
    "matched_name": "all",
    "match_quality": "(summary)",
    "file_accession": "(summary)",
    "reuse_data": "yes",
    "project_accession": "PRJEB39750",
}
for fill_column, fill_value in strandseq_fill_values.items():
    merged.loc[select_strandseq, fill_column] = fill_value

# All statistics columns ('*_grt_*') are set to -1 for Strand-seq rows.
stats_columns = [name for name in merged.columns if "_grt_" in name]
merged.loc[select_strandseq, stats_columns] = -1

# Samples whose Strand-seq data is registered under a different project
# accession than the default assigned above.
select_trios = [
    "HG00731", "HG00732", "HG00733",
    "HG00512", "HG00513", "HG00514",
    "NA19238", "NA19239", "NA19240",
]

custom_select = select_strandseq & merged["sample"].isin(select_trios)
merged.loc[custom_select, "project_accession"] = "PRJEB12849"

# For NA24385, a storage location is recorded in place of a project accession.
custom_select = select_strandseq & merged["sample"].isin(["NA24385"])
merged.loc[custom_select, "project_accession"] = (
    "AWS:S3:human-pangenomics:NHGRI_UCSC_panel/HG002/"
    "hpp_HG002_NA24385_son_v1/Strand_seq/2019-12-16-HWVTJAFXY"
)

# fix 'all' summary rows
# Rows with file_ext == "all" summarize several files, so all file-level
# columns are filled with placeholder values.
# NOTE(review): this runs after the Strand-seq fix; a Strand-seq row with
# file_ext == "all" would get its project accession replaced by "(summary)"
# here - confirm that no such row exists.
select_summary_rows = merged["file_ext"] == "all"
merged.loc[select_summary_rows, "filename"] = "all"
merged.loc[select_summary_rows, "matched_name"] = "all"
merged.loc[select_summary_rows, "match_quality"] = "(summary)"
merged.loc[select_summary_rows, "file_accession"] = "(summary)"
merged.loc[select_summary_rows, "project_accession"] = "(summary)"
merged.loc[select_summary_rows, "reuse_data"] = "(summary)"

# Completeness check: after the fixes above, no cell may be missing.
# Raised explicitly (instead of a bare assert) so that the check cannot be
# stripped with -O and so that the failure names the offending rows/columns.
nan_rows = merged.loc[pd.isnull(merged).any(axis=1), :]
if not nan_rows.empty:
    nan_columns = nan_rows.columns[pd.isnull(nan_rows).any(axis=0)].tolist()
    raise ValueError(
        f"{nan_rows.shape[0]} row(s) with missing values after merging data "
        f"freeze and accession tables; affected columns: {nan_columns}; "
        f"first row labels: {nan_rows.index[:10].tolist()}"
    )

# 'file_ext' was only needed to identify the summary rows.
merged.drop(["file_ext"], axis=1, inplace=True)

# Look up the Verkko assembly accession of each sample.
# Series.map() yields NaN for samples that are missing from the lookup table,
# whereas Series.replace() would silently leave the sample ID itself in the
# accession column. Fail loudly so that no bogus accession reaches the output.
merged["verkko_assembly_accession"] = merged["sample"].map(acc_assm_lut)
missing_assm_acc = merged["verkko_assembly_accession"].isnull()
if missing_assm_acc.any():
    raise ValueError(
        "No Verkko assembly accession found for sample(s): "
        f"{merged.loc[missing_assm_acc, 'sample'].unique().tolist()}"
    )

print(merged.columns)

# Column order of the final table.
reorder_columns = [
    # sample metadata
    "sample", "sex", "population", "supergroup",
    # accessions and file identity
    "verkko_assembly_accession", "datatype",
    "filename", "file_accession", "project_accession",
    # statistics, all entries
    "total_num_grt_0bp", "total_length_grt_0bp",
    "cov_xfold_grt_0bp_at_3Gbp", "length_N50_grt_0bp", "length_auN_grt_0bp",
    # statistics, 100 kbp threshold
    "total_num_grt_100kbp", "cov_xfold_grt_100kbp_at_3Gbp",
    "length_N50_grt_100kbp", "length_auN_grt_100kbp",
    # statistics, 1 Mbp threshold
    "total_num_grt_1Mbp",
    # file matching information
    "matched_name", "match_quality", "reuse_data",
]
assert set(reorder_columns).issubset(merged.columns)

# Apply the column order, collapse identical rows and sort for stable output.
merged = (
    merged[reorder_columns]
    .drop_duplicates()
    .sort_values(["sample", "datatype", "filename"])
)

output_table = PROJECT_BASE.joinpath(
    "annotations", "data_freezes",
    "hgsvc3_assembly_data_sources.final-raw-header.tsv"
)

# Write a '#'-prefixed first line holding TODAY, followed by the table as TSV
# with header and without the row index.
# newline="" follows the pandas recommendation for file objects handed to
# to_csv() (avoids doubled carriage returns on platforms that translate
# newlines); the explicit encoding makes the output independent of the locale.
# NOTE(review): TODAY is not defined in this cell - presumably it is set by
# the project configuration notebook; confirm.
with open(output_table, "w", encoding="utf-8", newline="") as dump:
    _ = dump.write(f"# {TODAY}\n")
    merged.to_csv(dump, sep="\t", header=True, index=False)

Index(['sample', 'population', 'supergroup', 'sex', 'datatype',
       'cov_xfold_grt_0bp_at_3Gbp', 'total_length_grt_0bp',
       'total_num_grt_0bp', 'length_N50_grt_0bp', 'length_auN_grt_0bp',
       'cov_xfold_grt_100kbp_at_3Gbp', 'total_num_grt_100kbp',
       'length_N50_grt_100kbp', 'length_auN_grt_100kbp', 'total_num_grt_1Mbp',
       'filename', 'matched_name', 'match_quality', 'file_accession',
       'project_accession', 'reuse_data', 'verkko_assembly_accession'],
      dtype='object')
