# Make plots

Import Python modules:

In [1]:
import itertools
import os

import altair as alt

import pandas as pd

# Remove Altair's cap on the number of rows a chart's dataset may contain;
# the result is assigned to `_` so the cell displays no output.
_ = alt.data_transformers.disable_max_rows()

Get data and variables from `snakemake`:

In [None]:
# All input paths come from the `snakemake` object; alias it to keep the reads short.
smk_input = snakemake.input

# SARS-CoV-2 alignment tables
sars2_aligned_by_run = pd.read_csv(smk_input.sars2_aligned_by_run)
sars2_aligned_by_metagenomic_sample = pd.read_csv(smk_input.sars2_aligned_by_metagenomic_sample)

# mitochondrial composition tables
mito_composition_by_metagenomic_run = pd.read_csv(smk_input.mito_composition_by_metagenomic_run)
mito_composition_by_metagenomic_sample = pd.read_csv(smk_input.mito_composition_by_metagenomic_sample)

# Crits-Christoph et al read counts and the table linking them to NGDC accessions
crits_christoph_read_counts = pd.read_csv(smk_input.crits_christoph_read_counts)
ngdc_to_crits_christoph = pd.read_csv(smk_input.ngdc_to_crits_christoph)

# output location, also taken from the `snakemake` object
plotsdir = snakemake.output.plotsdir

## Compare mitochondrial DNA composition to Crits-Christoph et al
First get Crits-Christoph read counts in tidy format, assigning NGDC run accessions and summing counts for runs:

In [3]:
# Lookup from Crits-Christoph FASTQ file (name up to the first ".") to NGDC run accession.
filename_to_run = (
    ngdc_to_crits_christoph
    .assign(Filename=lambda df: df["fastq Crits-Christoph"].map(lambda fastq: fastq.split(".")[0]))
    .rename(columns={"Run accession NGDC": "Run accession"})
    [["Filename", "Run accession"]]
    .drop_duplicates()
)

# Lookup from run accession to sample name.
run_to_sample = sars2_aligned_by_run[["Run accession", "sample"]].drop_duplicates()

crits_christoph_read_counts_tidy = (
    crits_christoph_read_counts
    .drop(columns=["Location", "Sample_name"])
    # attach the NGDC run accession to each file (exactly one match per file)
    .merge(filename_to_run, validate="one_to_one")
    # attach the sample name; runs without a match are kept with a missing sample
    .merge(run_to_sample, how="left", validate="many_to_one")
    # all columns other than the id columns are melted into species / count pairs
    .melt(
        id_vars=["sample", "Run accession", "Filename"],
        var_name="species",
        value_name="aligned reads Crits-Christoph",
    )
    # total the counts per sample, run and species
    .groupby(["sample", "Run accession", "species"], as_index=False)
    .agg({"aligned reads Crits-Christoph": "sum"})
)

crits_christoph_read_counts_tidy

Unnamed: 0,sample,Run accession,species,aligned reads Crits-Christoph
0,1-29-8,CRR710770,Arctonyx collaris,0
1,1-29-8,CRR710770,Bos taurus,0
2,1-29-8,CRR710770,Canis lupus,0
3,1-29-8,CRR710770,Capra aegagrus,0
4,1-29-8,CRR710770,Erinaceus amurensis,0
...,...,...,...,...
2923,wws-1#-0,CRR711161,Rattus tiomanicus,0
2924,wws-1#-0,CRR711161,Rhizomys pruinosus,0
2925,wws-1#-0,CRR711161,Sus scrofa,32
2926,wws-1#-0,CRR711161,Urocyon cinereoargenteus,0


Now merge current read counts with those from Crits-Christoph for all species:

In [4]:
crits_christoph_species = crits_christoph_read_counts_tidy["species"].unique().tolist()

# get current study data in a form to compare: keep only the columns needed and
# rename the read-count column so it stays distinguishable after the merge
mito_composition_to_compare = (
    mito_composition_by_metagenomic_run
    [["Run accession", "species", "common_name", "alignment_count_type", "aligned_reads"]]
    .rename(columns={"aligned_reads": "aligned reads current study"})
)

# every Crits-Christoph species must appear in the current study's data, otherwise
# the inner merge below would drop that species entirely without any warning
missing_species = set(crits_christoph_species) - set(mito_composition_to_compare["species"])
assert not missing_species, f"species missing from current study data: {missing_species}"

# one Crits-Christoph row per (run, species) matches one current-study row per
# alignment count type, hence `one_to_many`
crits_christoph_vs_current = (
    crits_christoph_read_counts_tidy
    .merge(
        mito_composition_to_compare,
        on=["Run accession", "species"],
        validate="one_to_many",
    )
)

Now get the correlations by species and plot them:

In [5]:
# Pearson correlation between the Crits-Christoph and current-study read counts,
# computed separately for each species and alignment count type, together with
# the total reads from each source for that species and count type.
crits_christoph_vs_current_species_corr = (
    crits_christoph_vs_current
    .groupby(["species", "common_name", "alignment_count_type"])
    [["aligned reads Crits-Christoph", "aligned reads current study"]]
    .corr(method="pearson")
    # the unnamed inner index level of each 2x2 correlation matrix becomes `level_3`
    .reset_index()
    # keep one row per group: the Crits-Christoph row, whose "current study" entry
    # is the cross-correlation of the two counts
    .query("level_3 == 'aligned reads Crits-Christoph'")
    .rename(columns={"aligned reads current study": "Pearson correlation"})
    .drop(columns=["level_3", "aligned reads Crits-Christoph"])
    # discard groups for which the correlation is undefined (NaN)
    .dropna(subset=["Pearson correlation"])
    .merge(
        crits_christoph_vs_current
        .groupby(["species", "alignment_count_type"], as_index=False)
        .aggregate(
            aligned_reads_Crits_Christoph=pd.NamedAgg("aligned reads Crits-Christoph", "sum"),
            aligned_reads_current_study=pd.NamedAgg("aligned reads current study", "sum"),
        ),
        # explicit keys and validation, matching the other merges in this notebook
        on=["species", "alignment_count_type"],
        validate="many_to_one",
    )
)

crits_christoph_vs_current_species_corr.head()

Unnamed: 0,species,common_name,alignment_count_type,Pearson correlation,aligned_reads_Crits_Christoph,aligned_reads_current_study
0,Arctonyx collaris,hog badger,n_primary,0.9244,188,330
1,Arctonyx collaris,hog badger,n_primary_and_unique_best,0.955924,188,221
2,Bos taurus,cow,n_primary,0.999974,61278,32632
3,Bos taurus,cow,n_primary_and_unique_best,0.999613,61278,22702
4,Canis lupus,dog,n_primary,0.998951,7941,6386


In [6]:
# Dropdown that selects which alignment count type is shown; starts at "n_primary".
alignment_count_type_selection = alt.selection_point(
    fields=["alignment_count_type"],
    bind=alt.binding_select(
        # pass a plain list rather than a numpy array
        options=sars2_aligned_by_run["alignment_count_type"].unique().tolist(),
        name="alignment count type",
    ),
    value="n_primary",
)

# Dot plot of the per-species correlation, one row per common name, ordered from
# highest to lowest correlation and filtered by the dropdown selection.
crits_christoph_vs_current_species_corr_chart = (
    alt.Chart(crits_christoph_vs_current_species_corr)
    .encode(
        x="Pearson correlation",
        y=alt.Y(
            "common_name",
            sort=alt.SortField("Pearson correlation", order="descending"),
            title=None,
        ),
        # show every column in the tooltip, formatting floating-point columns of
        # any precision to three significant digits
        tooltip=[
            alt.Tooltip(c, format=".3g")
            if pd.api.types.is_float_dtype(crits_christoph_vs_current_species_corr[c])
            else c
            for c in crits_christoph_vs_current_species_corr.columns.tolist()
        ],
    )
    .mark_circle(size=50, opacity=1)
    .properties(
        height=alt.Step(13),
        width=185,
        title=alt.TitleParams(
            ["correlation across runs of reads aligning to each", "mitochondrial genome in this study vs Crits-Christoph et al"],
            align="left",
            anchor="start",
            fontSize=12,
        ),
    )
    .add_params(alignment_count_type_selection)
    .transform_filter(alignment_count_type_selection)
)

crits_christoph_vs_current_species_corr_chart

In [7]:
# Show the per-run counts for gray fox, ordered by the current study's read count.
(
    crits_christoph_vs_current
    .loc[lambda df: df["common_name"] == "gray fox"]
    .sort_values(by="aligned reads current study")
)

Unnamed: 0,sample,Run accession,species,aligned reads Crits-Christoph,common_name,alignment_count_type,aligned reads current study
2925,CSSQ-1-3,CRR710782,Urocyon cinereoargenteus,0,gray fox,n_primary_and_unique_best,0
1677,A63,CRR710850,Urocyon cinereoargenteus,0,gray fox,n_primary_and_unique_best,0
1629,A63,CRR710849,Urocyon cinereoargenteus,0,gray fox,n_primary_and_unique_best,0
573,A15,CRR710821,Urocyon cinereoargenteus,0,gray fox,n_primary_and_unique_best,0
3789,EWS-3#-2,CRR711158,Urocyon cinereoargenteus,0,gray fox,n_primary_and_unique_best,0
...,...,...,...,...,...,...,...
4412,Q61,CRR710762,Urocyon cinereoargenteus,552,gray fox,n_primary,3288
2780,B5,CRR710873,Urocyon cinereoargenteus,0,gray fox,n_primary,3598
2732,B5,CRR710872,Urocyon cinereoargenteus,0,gray fox,n_primary,3619
2876,B5,CRR710875,Urocyon cinereoargenteus,0,gray fox,n_primary,3649


Now get the correlations by run:

In [8]:
# Pearson correlation between the Crits-Christoph and current-study read counts,
# computed separately for each run and alignment count type, together with the
# total reads from each source for that run and count type.
crits_christoph_vs_current_corr = (
    crits_christoph_vs_current
    .groupby(["Run accession", "sample", "alignment_count_type"])
    [["aligned reads Crits-Christoph", "aligned reads current study"]]
    .corr(method="pearson")
    # the unnamed inner index level of each 2x2 correlation matrix becomes `level_3`
    .reset_index()
    # keep one row per group: the Crits-Christoph row, whose "current study" entry
    # is the cross-correlation of the two counts
    .query("level_3 == 'aligned reads Crits-Christoph'")
    .rename(columns={"aligned reads current study": "correlation"})
    .drop(columns=["level_3", "aligned reads Crits-Christoph"])
    # discard groups for which the correlation is undefined (NaN)
    .dropna(subset=["correlation"])
    .merge(
        crits_christoph_vs_current
        .groupby(["Run accession", "alignment_count_type"], as_index=False)
        .aggregate(
            aligned_reads_Crits_Christoph=pd.NamedAgg("aligned reads Crits-Christoph", "sum"),
            aligned_reads_current_study=pd.NamedAgg("aligned reads current study", "sum"),
        ),
        # explicit keys and validation, matching the other merges in this notebook
        on=["Run accession", "alignment_count_type"],
        validate="many_to_one",
    )
)

crits_christoph_vs_current_corr.sort_values("correlation")

Unnamed: 0,Run accession,sample,alignment_count_type,correlation,aligned_reads_Crits_Christoph,aligned_reads_current_study
96,CRR710854,A87,n_primary,0.046917,787,1547
92,CRR710852,A87,n_primary,0.048627,742,1462
94,CRR710853,A87,n_primary,0.050413,734,1487
98,CRR710855,A87,n_primary,0.076438,854,1498
34,CRR710813,A101,n_primary,0.141779,1187,2221
...,...,...,...,...,...,...
177,CRR710900,F46,n_primary_and_unique_best,0.999984,1138,1489
181,CRR710902,F46,n_primary_and_unique_best,0.999991,1175,1455
183,CRR710903,F46,n_primary_and_unique_best,0.999995,1172,1448
179,CRR710901,F46,n_primary_and_unique_best,0.999995,1130,1355


The correlations by species: