# Annotate set of GISAID mutations by comparator genomes

Get variables from `snakemake`:

In [None]:
# File paths handed to this notebook through the `snakemake` object.
smk_inputs = snakemake.input

# Inputs
comparator_map_file = smk_inputs.comparator_map
genome_fasta_file = smk_inputs.genome_fasta  # NOTE(review): not used in the cells visible here
gisaid_metadata_file = smk_inputs.gisaid_metadata
gisaid_muts_file = smk_inputs.gisaid_muts

# Output
annotated_muts_file = snakemake.output.annotated_muts

Import Python modules:

In [None]:
import pandas as pd

Read comparator map:

In [None]:
# Load the comparator map from CSV.
comparator_map = pd.read_csv(comparator_map_file)

# Every column after the first two is treated as a comparator.
comparators = list(comparator_map.columns[2:])
print(f"Read comparator identities for: {comparators}")

comparator_map

Get the set of mutations for each comparator, keeping **only** mismatches where both the reference and the comparator have a valid nucleotide:

In [None]:
# Build, for each comparator, the set of substitutions that distinguish it
# from the reference. Each mutation is written as
# <reference nt><site><comparator nt>. Only sites where the two differ and
# where both are one of A, C, G, T are kept.
comparator_muts = {}
valid_nts = ['A', 'C', 'G', 'T']
for comparator in comparators:
    ref_nt = comparator_map['reference']
    comparator_nt = comparator_map[comparator]
    # Boolean masks are used instead of `DataFrame.query` with an interpolated
    # column name, so comparator names that are not valid Python identifiers
    # (e.g. containing "-" or ".") are handled correctly.
    is_mismatch = (
        (ref_nt != comparator_nt)
        & ref_nt.isin(valid_nts)
        & comparator_nt.isin(valid_nts)
        )
    mismatches = comparator_map[is_mismatch]
    comparator_muts[comparator] = set(
        mismatches['reference']
        + mismatches['site'].astype(str)
        + mismatches[comparator]
        )
    print(f"{comparator} has {len(comparator_muts[comparator])} mutations")

Read GISAID mutations and annotate which ones are in each comparator:

In [None]:
def _split_mutations(mutation_str):
    """Split a comma-delimited mutation string, dropping empty entries.

    An empty string yields an empty list; a plain `''.split(',')` would
    yield `['']`, which counts as one mutation.
    """
    return [mut for mut in mutation_str.split(',') if mut]


# Read the tab-delimited GISAID mutations table. `na_filter=False` keeps empty
# mutation fields as '' rather than NaN so they can be split as strings.
gisaid_muts = (
    pd.read_csv(gisaid_muts_file,
                sep='\t',
                na_filter=False,
                )
    .rename(columns={'Unnamed: 0': 'strain',
                     'nucleotide': 'mutations'})
    [['strain', 'mutations']]
    .query('strain != "Reference"')
    # Count only non-empty entries so strains with no mutations get 0, not 1.
    .assign(n_mutations=lambda x: x['mutations'].map(_split_mutations).map(len))
    )

# For each comparator, record which of a strain's mutations are also found in
# that comparator (comma-joined) and how many there are.
for comparator, muts in comparator_muts.items():
    shared_muts = gisaid_muts['mutations'].map(
        lambda x: [xi for xi in _split_mutations(x) if xi in muts]
        )
    gisaid_muts[f"{comparator}_mutations"] = shared_muts.map(','.join)
    # Taken from the list itself: re-splitting the joined string would report
    # 1 for strains that share no mutations with the comparator.
    gisaid_muts[f"{comparator}_n_mutations"] = shared_muts.map(len)

gisaid_muts

Read GISAID metadata:

In [None]:
# Load the tab-delimited GISAID metadata table.
metadata_read_opts = {'sep': '\t', 'low_memory': False}
gisaid_metadata = pd.read_csv(gisaid_metadata_file, **metadata_read_opts)

gisaid_metadata

Add metadata to mutations:

In [None]:
# Inner-join mutations with metadata on `strain`; `validate` makes the merge
# raise if `strain` is duplicated on either side.
gisaid_merged = pd.merge(
    left=gisaid_muts,
    right=gisaid_metadata,
    how='inner',
    on='strain',
    validate='one_to_one',
)

gisaid_merged

Filter to keep only sequences whose host is human:

In [None]:
# Keep only the rows whose `host` value is exactly "Human".
is_human_host = gisaid_merged['host'] == "Human"
gisaid_merged = gisaid_merged[is_human_host]

gisaid_merged

Write to file:

In [None]:
# Save the annotated table as CSV, leaving out the DataFrame index.
print(f"Writing to {annotated_muts_file}")

csv_write_opts = {'index': False}
gisaid_merged.to_csv(annotated_muts_file, **csv_write_opts)