# Analysis of integrated data

Import Python modules:

In [1]:
import itertools
import re

import altair as alt

import numpy

import pandas as pd

## Read integrated data
The general structure of the dataframe is that each cell barcode is listed on a row, and features of that cell are listed in columns.
Cells that have more than one valid viral barcode identified may have multiple rows--one for each valid viral barcode.

In [2]:
integrated_data_csv = 'results/viral_fastq10x/scProgenyProduction_trial3_integrate_data.csv.gz'

integrated_data = pd.read_csv(integrated_data_csv)

integrated_data

Unnamed: 0,cell_barcode,infected,infecting_viral_tag,total_UMIs,viral_UMIs,frac_viral_UMIs,n_viral_genes,frac_UMIs_fluHA,frac_UMIs_fluM,frac_UMIs_fluNA,...,barcoded_gene,viral_barcode,viral_bc_UMIs,frac_viral_bc_UMIs,freq_second_infection,freq_supernatant,max_freq_second_infection,max_freq_supernatant,contributes_progeny_second_infection,contributes_progeny_supernatant
0,AAACCCAGTAACAAGT,uninfected,none,47873,6,0.000125,0,0.000042,0.000063,0.000000,...,,,,,,,,,,
1,AAACCCATCATTGCTT,uninfected,none,90114,10,0.000111,1,0.000000,0.000055,0.000000,...,,,,,,,,,,
2,AAACGAAAGATGTTGA,uninfected,none,111630,18,0.000161,0,0.000027,0.000090,0.000000,...,,,,,,,,,,
3,AAACGAAGTACTTCCC,infected,both,56828,24082,0.423770,7,0.000035,0.229816,0.007391,...,,,,,,,,,,
4,AAACGAAGTAGACGTG,infected,wt,124341,4654,0.037429,8,0.002566,0.012723,0.000249,...,fluHA,AAGTAAGCGACATGAG,251.0,0.002019,0.000010,0.000076,0.000010,0.000076,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3677,TTTGATCTCCCGTTCA,uninfected,none,63150,3,0.000048,0,0.000000,0.000016,0.000000,...,,,,,,,,,,
3678,TTTGGAGAGTTGCCTA,uninfected,none,65941,12,0.000182,1,0.000000,0.000091,0.000015,...,,,,,,,,,,
3679,TTTGGAGGTATCGTTG,infected,wt,150130,3526,0.023486,8,0.000619,0.011110,0.000300,...,fluHA,AGTAAACTTCCTCGCG,65.0,0.000433,0.000010,0.000695,0.000028,0.001790,True,True
3680,TTTGGAGGTATCGTTG,infected,wt,150130,3526,0.023486,8,0.000619,0.011110,0.000300,...,fluNA,ACATCTTATTTACACG,39.0,0.000260,0.000028,0.001790,0.000028,0.001790,True,True


## Look at UMIs per cell
Just inspect total UMIs per cell distribution:

In [3]:
umis_per_cell_chart = (
    alt.Chart(integrated_data[["cell_barcode", "total_UMIs"]].drop_duplicates())
    .encode(
        x=alt.X("total_UMIs", bin=alt.Bin(step=10000), scale=alt.Scale(zero=True)),
        y=alt.Y("count()", title="number of cells"),
    )
    .mark_bar()
    .properties(width=300, height=200)
)

umis_per_cell_chart

## Subset to just infected non-doublet cells with at least one barcoded viral gene

To analyze progeny production and its relation to viral transcription, we subset on the following cells of interest:
 - infected
 - not a known doublet (not a mix of viral tags)
 - expresses at least one of the barcoded viral genes in the supernatant (HA or NA)

In [4]:
min_viral_barcode_freq = 1e-5  # value assigned to unobserved viral barcodes

print(f"Starting with {integrated_data['cell_barcode'].nunique()} cells.")

infected_cells = integrated_data.query("infected == 'infected'")
print(f"Retaining the {infected_cells['cell_barcode'].nunique()} infected cells.")

infected_cells = infected_cells.query("infecting_viral_tag != 'both'")
print(f"Retaining the {infected_cells['cell_barcode'].nunique()} cells with just one viral tag.")

infected_cells = infected_cells.query("barcoded_gene.notnull()", engine="python")
print(f"Retaining the {infected_cells['cell_barcode'].nunique()} expressing at least one barcoded viral gene.")

# aggregate observed viral barcodes for each gene and cell
infected_cells = (
    infected_cells
    .groupby(["cell_barcode", "barcoded_gene"], as_index=False)
    .aggregate(
        n_viral_barcodes=pd.NamedAgg("viral_barcode", lambda v: v.astype(bool).astype(int).sum()),
        viral_barcodes=pd.NamedAgg("viral_barcode", "; ".join),
        freq_supernatant=pd.NamedAgg("freq_supernatant", "sum"),
        freq_second_infection=pd.NamedAgg("freq_second_infection", "sum"),
    )
)
assert infected_cells.notnull().all().all()

# get barcoded viral genes
barcoded_genes = infected_cells["barcoded_gene"].unique().tolist()

# now pivot so just one row per cell
pivoted_dfs = []
for col_to_pivot, fill_value in [
    ("n_viral_barcodes", 0),
    ("viral_barcodes", ""),
    ("freq_supernatant", pd.NA),
    ("freq_second_infection", pd.NA),
]:
    pivoted_dfs.append(
        infected_cells
        .pivot_table(
            index="cell_barcode",
            columns="barcoded_gene",
            values=col_to_pivot,
            fill_value=fill_value,
            aggfunc='sum',
        )
        .rename_axis(None, axis=1)
        .rename(columns={gene: f"{col_to_pivot}_{gene}" for gene in barcoded_genes})
    )
assert all((pivoted_dfs[0].index == df.index).all() for df in pivoted_dfs)
infected_cells = pd.concat(pivoted_dfs, axis=1).reset_index()

# now add back in cell-specific information
cell_info = (
    integrated_data
    [['cell_barcode', 'infected', 'infecting_viral_tag', 'total_UMIs',
       'viral_UMIs', 'frac_viral_UMIs', 'n_viral_genes'] +
     [c for c in integrated_data.columns
      if re.match("frac_UMIs_flu|present_flu|pacbio_UMIs|mutations_", c)]
    ]
    .drop_duplicates()
)
assert set(infected_cells["cell_barcode"]).issubset(cell_info["cell_barcode"])
infected_cells = (
    infected_cells
    .merge(cell_info, on="cell_barcode", how="left", validate="one_to_one")
)

# add columns explaining missing genes
present_columns = {c: c.replace("present_flu", "") for c in infected_cells.columns
                   if re.match("present_flu", c)}

infected_cells = (
    infected_cells
    .assign(
        all_viral_genes=lambda x: x['n_viral_genes'] == len(present_columns),
        missing_viral_genes=lambda x: x.apply(lambda row:
            "; ".join(val for (key, val) in present_columns.items() if not row[key]),
            axis=1)
    )
)

Starting with 3129 cells.
Retaining the 410 infected cells.
Retaining the 359 cells with just one viral tag.
Retaining the 347 expressing at least one barcoded viral gene.


## Determine which cells have all present viral genes sequenced
Among the infected cells, see how many have all of the viral genes that are present in that cell sequenced:

In [5]:
def virus_fully_sequenced(row):
    row_dict = row.to_dict()
    for key, val in row_dict.items():
        if re.fullmatch("present_flu\w+", key) and val:
            gene = key.split("_")[1]
            if row_dict[f"mutations_{gene}"] == "Not Detected":
                return False
    else:
        return True

infected_cells = (
    infected_cells
    .assign(virus_fully_sequenced=lambda x: x.apply(virus_fully_sequenced, axis=1))
)

display(
    infected_cells
    .groupby("virus_fully_sequenced")
    .aggregate(
        n_cells=pd.NamedAgg("cell_barcode", "nunique"),
        mean_n_viral_genes=pd.NamedAgg("n_viral_genes", "mean"),
        mean_total_UMIs=pd.NamedAgg("total_UMIs", "mean"),
        mean_frac_viral_UMIs=pd.NamedAgg("frac_viral_UMIs", "mean"),
    )
    .round(2)
)

virus_fully_sequenced_chart = (
    alt.Chart(infected_cells)
    .encode(
        x=alt.X(
            "jitter:Q",
            title=None,
            axis=alt.Axis(values=[0], ticks=True, grid=False, labels=False),
        ),
        y=alt.Y("frac_viral_UMIs"),
        color=alt.Color("virus_fully_sequenced", legend=None),
        column=alt.Column(
            "virus_fully_sequenced",
            header=alt.Header(
                labelAngle=-90,
                labelOrient="bottom",
                labelAlign="right",
                titleOrient="bottom",
            ),
        ),
        tooltip=["cell_barcode", alt.Tooltip("frac_viral_UMIs", format='.2g'),
                 "total_UMIs", "missing_viral_genes", "infecting_viral_tag",],
    )
    .transform_calculate(jitter="sqrt(-2*log(random()))*cos(2*PI*random())")
    .mark_point(filled=True, size=40, opacity=0.5)
    .configure_facet(spacing=0)
    .configure_view(stroke=None)
    .properties(width=50, height=250)
)

virus_fully_sequenced_chart

Unnamed: 0_level_0,n_cells,mean_n_viral_genes,mean_total_UMIs,mean_frac_viral_UMIs
virus_fully_sequenced,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,106,7.57,100255.92,0.05
True,241,7.44,116197.83,0.07


Going forward, just keep the fully sequenced infected cells.
Also, annotate these by any mutations they have:

In [6]:
def viral_mutations(row):
    row_dict = row.to_dict()
    mutations = []
    for key, val in row_dict.items():
        if re.fullmatch("mutations_flu\w+", key) and val:
            gene = key.replace("mutations_flu", "")
            if val == "Not Detected":
                assert not row_dict[f"present_flu{gene}"]
            elif val != "WT":
                mutations.append(f"{gene}: {val}")
    return ", ".join(mutations)

def mutated_genes(row):
    row_dict = row.to_dict()
    mutated_genes = []
    for key, val in row_dict.items():
        if re.fullmatch("mutations_flu\w+", key) and val:
            gene = key.replace("mutations_flu", "")
            if val == "Not Detected":
                assert not row_dict[f"present_flu{gene}"]
            elif val != "WT":
                mutated_genes.append(gene)
    return ", ".join(mutated_genes)

infected_sequenced_cells = (
    infected_cells
    .query("virus_fully_sequenced")
    .assign(
        viral_mutations=lambda x: x.apply(viral_mutations, axis=1),
        mutated_genes=lambda x: x.apply(mutated_genes, axis=1),
        has_mutations=lambda x: x["mutated_genes"].astype(bool),
    )
)

## Get just cells expressing both barcoded viral genes
To utiize both HA and NA barcodes, we keep just cells just expressing both genes:

In [7]:
infected_sequenced_cells["all_barcoded_genes"] = numpy.logical_and.reduce(
    [infected_sequenced_cells[f"present_{gene}"] for gene in barcoded_genes] +
    [infected_sequenced_cells[f"freq_supernatant_{gene}"].notnull() for gene in barcoded_genes] +
    [infected_sequenced_cells[f"freq_second_infection_{gene}"].notnull() for gene in barcoded_genes]
)

display(infected_sequenced_cells
        .groupby("all_barcoded_genes")
        .aggregate(n_cells=pd.NamedAgg("cell_barcode", "count"))
)

barcoded_infected_cells = infected_sequenced_cells.query("all_barcoded_genes")

Unnamed: 0_level_0,n_cells
all_barcoded_genes,Unnamed: 1_level_1
False,55
True,186


Normalize the frequencies of the viral barcodes for each viral tag among these cells:

In [8]:
for condition, gene in itertools.product(["supernatant", "second_infection"],
                                         barcoded_genes,
                                        ):
    col = f"freq_{condition}_{gene}"
    barcoded_infected_cells[col] = (
        barcoded_infected_cells[col] /
        barcoded_infected_cells.groupby("infecting_viral_tag")[col].transform("sum")
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  barcoded_infected_cells[col] = (


Look at correlation between barcode frequencies of the different barcoded viral genes, and take the geometric mean and normalize:

In [9]:
for condition in ["supernatant", "second_infection"]:
    assert barcoded_genes == ["fluHA", "fluNA"]  # code below assumes this
    print(f"\nViral barcode frequencies for {condition}:")
    display(
        alt.Chart(barcoded_infected_cells)
        .encode(
            x=f"freq_{condition}_fluHA",
            y=f"freq_{condition}_fluNA",
            facet=alt.Facet("infecting_viral_tag"),
        )
        .mark_point(filled=True)
        .resolve_scale(x="independent", y="independent")
        .properties(width=200, height=200)
    )
    
    # get normalized geometric mean
    mean_col = f"freq_{condition}"
    barcoded_infected_cells[mean_col] = (
        barcoded_infected_cells[f"{mean_col}_fluHA"]
        * barcoded_infected_cells[f"{mean_col}_fluNA"]
    )**(1 / 2)
    barcoded_infected_cells[mean_col] = (
        barcoded_infected_cells[mean_col] /
        barcoded_infected_cells.groupby("infecting_viral_tag")[mean_col].transform("sum")
    )


Viral barcode frequencies for supernatant:



Viral barcode frequencies for second_infection:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  barcoded_infected_cells[mean_col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  barcoded_infected_cells[mean_col] = (


## Plot correlation of viral progeny and transcription

In [10]:
freq_cols = [f"freq_{condition}" for condition in ["supernatant", "second_infection"]]

cell_selector = alt.selection_single(on='mouseover', empty='none')

all_viral_genes_selector = alt.selection_single(
    fields=["all_viral_genes"],
    bind=alt.binding_select(
        name="has all viral genes",
        options=[None, True, False],
        labels=["all", "true", "false"],
    ),
)

mutated_selector = alt.selection_single(
    fields=["has_mutations"],
    bind=alt.binding_select(
        name="has mutations",
        options=[None, True, False],
        labels=["all", "true", "false"],
    ),
)

viral_tag_selector = alt.selection_single(
    fields=["infecting_viral_tag"],
    bind=alt.binding_select(
        name="viral tag",
        options=[None, *barcoded_infected_cells["infecting_viral_tag"].unique().tolist()],
        labels=["all", *barcoded_infected_cells["infecting_viral_tag"].unique().tolist()]),
)

charts = []
for freq_col in freq_cols:
    charts.append(
        alt.Chart(barcoded_infected_cells)
        .encode(
            x=alt.X("frac_viral_UMIs"),
            y=alt.Y(freq_col),
            tooltip=["cell_barcode", alt.Tooltip("frac_viral_UMIs", format='.2g'),
                     "total_UMIs", "missing_viral_genes", "mutated_genes", "infecting_viral_tag",
                     *[alt.Tooltip(c, format='.2g') for c in freq_cols],
                     "viral_mutations",
               #      *[alt.Tooltip(f"{c}_{gene}", format=".2g") for c in freq_cols for gene in barcoded_genes],
               #      *[alt.Tooltip(c, format=".2g") for c in barcoded_infected_cells.columns
               #        if c.startswith("frac_UMIs_flu")]
                    ],
            color=alt.Color("all_viral_genes"),
            shape=alt.Shape("has_mutations"),
            strokeWidth=alt.condition(cell_selector, alt.value(2), alt.value(0)),
            opacity=alt.condition(
                cell_selector,
                alt.value(1),
                alt.value(0.5),
            ),
            size=alt.condition(cell_selector, alt.value(60), alt.value(35)),
        )
        .mark_point(filled=True, stroke="black")
        .properties(width=300, height=300)
    )

chart = (
    alt.hconcat(*charts)
    .add_selection(
        cell_selector,
        all_viral_genes_selector,
        viral_tag_selector,
        mutated_selector,
    )
    .transform_filter(all_viral_genes_selector)
    .transform_filter(viral_tag_selector)
    .transform_filter(mutated_selector)
)

display(chart)

chart.save("interactive_chart.html")