# Annotate set of GISAID mutations by comparator genomes

Get variables from `snakemake`:

In [None]:
# Output location and rule parameters, taken from the `snakemake` object.
annotated_muts_file = snakemake.output.annotated_muts
add_mutations = snakemake.params.add_mutations

# Input files, taken from the `snakemake` object.
gisaid_muts_file = snakemake.input.gisaid_muts
gisaid_metadata_file = snakemake.input.gisaid_metadata
genome_fasta_file = snakemake.input.genome_fasta
comparator_map_file = snakemake.input.comparator_map

Import Python modules:

In [None]:
import Bio.SeqIO

import pandas as pd

Read comparator map:

In [None]:
comparator_map = pd.read_csv(comparator_map_file)

# Every column after the first two is treated as a comparator genome
# (the code below uses `site` and `reference` columns alongside them).
comparators = list(comparator_map.columns[2:])
print(f"Read comparator identities for: {comparators}")

comparator_map

Get the set of mutations for each comparator, keeping **only** mismatches where both the reference and the comparator have a valid nucleotide (A, C, G, or T):

In [None]:
# Map each comparator name to the set of its mutations relative to the
# reference, written as "<reference nt><site><comparator nt>".
comparator_muts = {}
valid_nts = ['A', 'C', 'G', 'T']
for comparator in comparators:
    # Select rows with boolean masks rather than `DataFrame.query`: a query
    # string built from the column name breaks for comparator names that are
    # not valid Python identifiers (e.g. names containing '-' or '.').
    comparator_muts[comparator] = set(
        comparator_map
        .assign(mutation=lambda x: x['reference'] + x['site'].astype(str) + x[comparator])
        .loc[lambda x: (x['reference'] != x[comparator])
                       & x['reference'].isin(valid_nts)
                       & x[comparator].isin(valid_nts),
             'mutation']
        )
    print(f"{comparator} has {len(comparator_muts[comparator])} mutations")

Read GISAID mutations, re-call them relative to a reference that carries the mutations listed in `add_mutations` (**this is equivalent to changing the reference!!!**), and annotate which ones are in each comparator:

In [None]:
# Key: integer site parsed from the middle of each mutation string.
# Value: (first character, last character) of that mutation string.
add_mutations_dict = dict(
    (int(m[1: -1]), (m[0], m[-1]))
    for m in add_mutations
    )

def add_mutations_func(mut_list, site_changes=None):
    """Re-call mutations relative to a reference carrying the added mutations.

    Each entry of `site_changes` describes a site whose reference nucleotide
    is switched from ``old`` to ``new``. Mutations at other sites are kept
    unchanged. A mutation at a changed site is dropped if it equals ``new``
    (it now matches the reference) and is otherwise rewritten with ``new`` as
    its wildtype. For every changed site with no mutation in `mut_list`, the
    reversion ``<new><site><old>`` is appended, in ascending site order.

    Parameters
    ----------
    mut_list : iterable of str
        Mutations written like ``"C241T"``. Empty strings are skipped.
    site_changes : dict or None
        Maps integer site to ``(old, new)`` reference nucleotides. If `None`,
        the module-level `add_mutations_dict` is used.

    Returns
    -------
    list of str
        The mutations re-called relative to the changed reference.

    Raises
    ------
    ValueError
        If a mutation at a changed site has a wildtype that differs from the
        ``old`` nucleotide recorded for that site.

    """
    if site_changes is None:
        site_changes = add_mutations_dict
    muts = []
    unseen_sites = set(site_changes)
    for m in mut_list:
        if not m:
            continue
        site = int(m[1: -1])
        if site not in site_changes:
            muts.append(m)
            continue
        old_ref, new_ref = site_changes[site]
        # `discard` rather than `remove`: a site listed twice for the same
        # strain must not fail with an uninformative KeyError.
        unseen_sites.discard(site)
        # Explicit raise instead of `assert`, which is stripped under `-O`.
        if m[0] != old_ref:
            raise ValueError(
                f"wildtype of {m} does not match added mutation "
                f"{old_ref}{site}{new_ref}"
            )
        mut = m[-1]
        if mut != new_ref:
            muts.append(f"{new_ref}{site}{mut}")
    for site in sorted(unseen_sites):
        old_ref, new_ref = site_changes[site]
        muts.append(f"{new_ref}{site}{old_ref}")
    return muts

# Read the per-strain mutation table, drop the "Reference" row, and re-call
# each strain's comma-separated mutations with `add_mutations_func`.
# `na_filter=False` keeps empty mutation strings as '' rather than NaN.
gisaid_muts = (
    pd.read_csv(gisaid_muts_file, sep='\t', na_filter=False)
    .rename(columns={'Unnamed: 0': 'strain',
                     'nucleotide': 'mutations'})
    [['strain', 'mutations']]
    .loc[lambda x: x['strain'] != "Reference"]
    .assign(mutations=lambda x: x['mutations'].str.split(',').map(add_mutations_func),
            n_mutations=lambda x: x['mutations'].map(len))
    )

# For each comparator, record which of a strain's mutations it shares and how
# many. The string column is assigned first to keep the original column order.
for comparator, muts in comparator_muts.items():
    shared = gisaid_muts['mutations'].map(
        lambda mut_list: [mut for mut in mut_list if mut in muts]
        )
    gisaid_muts[f"{comparator}_mutations"] = shared.map(','.join)
    gisaid_muts[f"{comparator}_n_mutations"] = shared.map(len)

# Collapse each strain's mutation list back to a comma-separated string.
gisaid_muts['mutations'] = gisaid_muts['mutations'].map(','.join)

gisaid_muts

Read GISAID metadata:

In [None]:
# Tab-separated metadata table.
gisaid_metadata = pd.read_csv(gisaid_metadata_file, low_memory=False, sep='\t')

gisaid_metadata

Add metadata to mutations:

In [None]:
# Inner join on strain; `validate` makes pandas raise if `strain` is
# duplicated in either table.
gisaid_merged = pd.merge(
    left=gisaid_muts,
    right=gisaid_metadata,
    how='inner',
    on='strain',
    validate='one_to_one',
    )

gisaid_merged

Do some filtering to get only human sequences:

In [None]:
# Keep only rows whose `host` column is exactly "Human".
gisaid_merged = gisaid_merged.loc[lambda x: x['host'] == "Human"]

gisaid_merged

Do a final sanity check to make sure the mutations all match the reference wildtype:

In [None]:
# Reference sequence as a plain string.
ref_genome = str(Bio.SeqIO.read(genome_fasta_file, 'fasta').seq)

# Look up the reference nucleotide by 1-based site number.
site_to_ref = dict(enumerate(ref_genome, start=1))

def check_func(mut_list):
    """Check that each mutation's wildtype matches the reference at its site.

    Empty entries in `mut_list` are ignored. The reference nucleotide is
    looked up in the module-level `site_to_ref` mapping.

    Returns `True` if every mutation passes; raises `ValueError` naming the
    first mutation whose leading character differs from the reference.
    """
    for mutation in filter(None, mut_list):
        ref_nt, position = mutation[0], int(mutation[1: -1])
        if ref_nt != site_to_ref[position]:
            raise ValueError(f"mismatch for {mutation}")
    return True

# Run the wildtype check on every strain's comma-separated mutation list;
# `check_func` raises on the first mismatch, so the result is discarded.
_ = (
    gisaid_merged['mutations']
    .str.split(',')
    .map(check_func)
    )

Write to file:

In [None]:
print(f"Writing to {annotated_muts_file}")

# Write the merged table as CSV without the DataFrame index.
gisaid_merged.to_csv(annotated_muts_file, index=False)