In [1]:
import tqdm
from pathlib import Path
import numpy as np
import pandas as pd


In [None]:
# Directory holding the input and output metadata tables.
base_dir = Path("../data")

# The input is a copy of the original metadata file
# (run_metadata.v05.tsv, 514a111c8f2f02c2db36a3e4e48baf58)
# in which three columns carry different names:
#   strain         (here) = Run           (original)
#   date_submitted (here) = First_created (original)
#   date           (here) = Date_tree     (original)
metadata_file = base_dir.joinpath("run_metadata.v05.renamed.tsv")

# Tab-separated with a header row; a lone "." marks a missing value.
md = pd.read_csv(metadata_file, sep="\t", header=0, na_values=["."])


In [3]:
# Sanity check: the number of distinct runs ('strain') differs from the
# number of distinct samples ('Sample'), i.e. there is something to dedup.
# dropna=False counts a missing value as one distinct entry, like unique().
assert md['strain'].nunique(dropna=False) != md['Sample'].nunique(dropna=False)


In [None]:
# Address https://github.com/jeromekelleher/sc2ts/issues/257
# ENA sample accessions that occur on more than one row of the metadata,
# i.e. samples with replicate sequences.
dup_samples = md[md.duplicated(subset='Sample')]['Sample'].unique()

# Prefilter metadata to only the samples above.
filt_md = md[md['Sample'].isin(dup_samples)]

# Group the replicates once up front instead of rescanning filt_md with a
# boolean mask for every duplicated sample.
replicates_by_sample = filt_md.groupby('Sample', sort=False)

# Selection criteria in priority order: (column, True if smaller is better).
tiebreak_criteria = [
    ('Viridian_cons_het', True),   # Try to pick seq with fewest non-ACGTN bases
    ('Viridian_N', True),          # Try to pick seq with fewest Ns
    ('Viridian_cons_len', False),  # Try to pick longest seq
]

# Keep a replicate sequence for each sample that has multiple sequences.
best_seqs = []           # Chosen because the criteria left exactly one candidate
no_tiebreaker_seqs = []  # Chosen arbitrarily among candidates still tied

num_rep_seqs_total = 0
num_rep_seqs_removed = 0

for tmp_sample in tqdm.tqdm(dup_samples):
    # Rows come back in their original order within the sample.
    candidates = replicates_by_sample.get_group(tmp_sample)
    num_replicates = len(candidates)
    assert num_replicates > 1

    num_rep_seqs_total += num_replicates

    for column, smaller_is_better in tiebreak_criteria:
        values = candidates[column]
        # A criterion with no data for any remaining candidate cannot
        # discriminate between them, so skip it. Checking explicitly also
        # avoids the all-NaN RuntimeWarning from np.nanmin/np.nanmax.
        if values.isna().all():
            continue
        # Series.min()/max() skip missing values by default.
        best_value = values.min() if smaller_is_better else values.max()
        # Narrow the candidates to those that are best on this criterion, so
        # that later criteria only break ties among them. Evaluating every
        # criterion over all replicates could select a replicate that had
        # already lost on a higher-priority criterion.
        candidates = candidates[values == best_value]
        if len(candidates) == 1:
            best_seqs.append(candidates.iloc[0]['strain'])
            break
    else:
        # Arbitrarily choose the first replicate sequence still tied.
        no_tiebreaker_seqs.append(candidates.iloc[0]['strain'])

    # Because one replicate sequence is added either way.
    num_rep_seqs_removed += num_replicates - 1


assert len(dup_samples) == len(best_seqs) + len(no_tiebreaker_seqs)
print(f"Duplicate samples: {len(dup_samples)}")
print(f"Replicate sequences selected based on above criteria: {len(best_seqs)}")
print(f"Replicate sequences when there are no tie breakers: {len(no_tiebreaker_seqs)}")


In [5]:
# Runs to retain for the duplicated samples: one per sample, whether it was
# picked by the criteria or by the arbitrary fallback.
keep_seqs = [*best_seqs, *no_tiebreaker_seqs]

# Every other run belonging to a duplicated sample is a replicate to drop.
is_kept = filt_md['strain'].isin(keep_seqs)
rep_seqs_to_exclude = filt_md.loc[~is_kept, 'strain']
assert len(keep_seqs) + len(rep_seqs_to_exclude) == len(filt_md)

# Remove the excluded runs from the full metadata table.
is_excluded = md['strain'].isin(rep_seqs_to_exclude)
md_dedup = md.loc[~is_excluded].reset_index()

# Exactly the excluded runs were removed, and no sample was lost entirely.
assert len(md) - len(md_dedup) == len(rep_seqs_to_exclude)
assert md['Sample'].nunique(dropna=False) == md_dedup['Sample'].nunique(dropna=False)


In [6]:
# Address https://github.com/jeromekelleher/sc2ts/issues/273
# Restrict the deduplicated metadata to the columns listed below, in this order.
md_dedup_trimmed = md_dedup.loc[
    :,
    [
        'Sample',
        'strain',
        'Platform',
        'Country',
        'date',
        'Viridian_result',
        'In_Viridian_tree',
        'Viridian_pangolin',
        'Viridian_scorpio',
        'Viridian_pangolin_1.29',
        'Viridian_scorpio_1.29',
        'Viridian_N',
        'Viridian_cons_het',
        'Viridian_cons_len',
    ],
]


In [8]:
# Write the trimmed, deduplicated table next to the input as a tab-separated
# file, without the DataFrame index.
out_metadata_file = base_dir.joinpath("run_metadata.v05.renamed.dedup.trimmed.tsv")
md_dedup_trimmed.to_csv(path_or_buf=out_metadata_file, index=False, sep="\t")


In [9]:
# Read the written file back and confirm that the number of distinct runs
# now equals the number of distinct samples.
test_df = pd.read_csv(out_metadata_file, sep="\t")
assert test_df['strain'].nunique(dropna=False) == test_df['Sample'].nunique(dropna=False)
