# Analysis of integrated data

Import Python modules:

In [1]:
import re

import altair as alt

import pandas as pd

## Read integrated data
The general structure of the dataframe is that each cell barcode is listed on a row, and features of that cell are listed in columns.
Cells that have more than one valid viral barcode identified may have multiple rows--one for each valid viral barcode.

In [2]:
integrated_data_csv = 'results/viral_fastq10x/scProgenyProduction_trial3_integrate_data.csv.gz'

integrated_data = pd.read_csv(integrated_data_csv)

integrated_data

Unnamed: 0,cell_barcode,infected,infecting_viral_tag,total_UMIs,viral_UMIs,frac_viral_UMIs,n_viral_genes,frac_UMIs_fluHA,frac_UMIs_fluM,frac_UMIs_fluNA,...,barcoded_gene,viral_barcode,viral_bc_UMIs,frac_viral_bc_UMIs,freq_second_infection,freq_supernatant,max_freq_second_infection,max_freq_supernatant,contributes_progeny_second_infection,contributes_progeny_supernatant
0,AAACCCAGTAACAAGT,uninfected,none,47873,6,0.000125,0,0.000042,0.000063,0.000000,...,,,,,,,,,,
1,AAACCCATCATTGCTT,uninfected,none,90114,10,0.000111,1,0.000000,0.000055,0.000000,...,,,,,,,,,,
2,AAACGAAAGATGTTGA,uninfected,none,111630,18,0.000161,0,0.000027,0.000090,0.000000,...,,,,,,,,,,
3,AAACGAAGTACTTCCC,infected,both,56828,24082,0.423770,7,0.000035,0.229816,0.007391,...,,,,,,,,,,
4,AAACGAAGTAGACGTG,infected,wt,124341,4654,0.037429,8,0.002566,0.012723,0.000249,...,fluHA,AAGTAAGCGACATGAG,251.0,0.002019,0.000010,0.000076,0.000010,0.000076,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3974,TTTGATCTCGCATTGA,infected,wt,170914,10415,0.060937,8,0.003604,0.028324,0.007864,...,fluHA,GGTCACTTGGGTACTG,363.0,0.002124,0.000010,0.000010,0.000010,0.000010,False,False
3975,TTTGGAGAGTTGCCTA,uninfected,none,65941,12,0.000182,1,0.000000,0.000091,0.000015,...,,,,,,,,,,
3976,TTTGGAGGTATCGTTG,infected,wt,150130,3526,0.023486,8,0.000619,0.011110,0.000300,...,fluHA,AGTAAACTTCCTCGCG,65.0,0.000433,0.000010,0.000695,0.000028,0.001790,True,True
3977,TTTGGAGGTATCGTTG,infected,wt,150130,3526,0.023486,8,0.000619,0.011110,0.000300,...,fluNA,ACATCTTATTTACACG,39.0,0.000260,0.000028,0.001790,0.000028,0.001790,True,True


## Subset to just infected non-doublet cells with at least one barcoded viral gene

To analyze progeny production and its relation to viral transcription, we subset on the following cells of interest:
 - infected
 - not a known doublet (not a mix of viral tags)
 - expresses at least one of the barcoded viral genes in the supernatant (HA or NA)

In [13]:
min_viral_barcode_freq = 1e-5  # value assigned to unobserved viral barcodes

print(f"Starting with {integrated_data['cell_barcode'].nunique()} cells.")

infected_cells = integrated_data.query("infected == 'infected'")
print(f"Retaining the {infected_cells['cell_barcode'].nunique()} infected cells.")

infected_cells = infected_cells.query("infecting_viral_tag != 'both'")
print(f"Retaining the {infected_cells['cell_barcode'].nunique()} cells with just one viral tag.")

infected_cells = infected_cells.query("barcoded_gene.notnull()", engine="python")
print(f"Retaining the {infected_cells['cell_barcode'].nunique()} expressing at least one barcoded viral gene.")

# aggregate observed viral barcodes for each gene and cell
infected_cells = (
    infected_cells
    .groupby(["cell_barcode", "barcoded_gene"], as_index=False)
    .aggregate(
        n_viral_barcodes=pd.NamedAgg("viral_barcode", lambda v: v.astype(bool).astype(int).sum()),
        viral_barcodes=pd.NamedAgg("viral_barcode", "; ".join),
        freq_supernatant=pd.NamedAgg("freq_supernatant", "sum"),
        freq_second_infection=pd.NamedAgg("freq_second_infection", "sum"),
    )
)
assert infected_cells.notnull().all().all()

# get barcoded viral genes
barcoded_genes = infected_cells["barcoded_gene"].unique().tolist()

# now pivot so just one row per cell
pivoted_dfs = []
for col_to_pivot, fill_value in [
    ("n_viral_barcodes", 0),
    ("viral_barcodes", ""),
    ("freq_supernatant", pd.NA),
    ("freq_second_infection", pd.NA),
]:
    pivoted_dfs.append(
        infected_cells
        .pivot_table(
            index="cell_barcode",
            columns="barcoded_gene",
            values=col_to_pivot,
            fill_value=fill_value,
            aggfunc='sum',
        )
        .rename_axis(None, axis=1)
        .rename(columns={gene: f"{col_to_pivot}_{gene}" for gene in barcoded_genes})
    )
assert all((pivoted_dfs[0].index == df.index).all() for df in pivoted_dfs)
infected_cells = pd.concat(pivoted_dfs, axis=1).reset_index()

# now add back in cell-specific information
cell_info = (
    integrated_data
    [['cell_barcode', 'infected', 'infecting_viral_tag', 'total_UMIs',
       'viral_UMIs', 'frac_viral_UMIs', 'n_viral_genes'] +
     [c for c in integrated_data.columns if re.match("frac_UMIs_flu|present_flu", c)]
    ]
    .drop_duplicates()
)
assert set(infected_cells["cell_barcode"]).issubset(cell_info["cell_barcode"])
infected_cells = (
    infected_cells
    .merge(cell_info, on="cell_barcode", how="left", validate="one_to_one")
)

# add columns explaining missing genes
present_columns = {c: c.replace("present_flu", "") for c in infected_cells.columns
                   if re.match("present_flu", c)}

infected_cells = (
    infected_cells
    .assign(
        all_viral_genes=lambda x: x['n_viral_genes'] == len(present_columns),
        missing_viral_genes=lambda x: x.apply(lambda row:
            "; ".join(val for (key, val) in present_columns.items() if not row[key]),
            axis=1)
    )
)

infected_cells

Starting with 3372 cells.
Retaining the 479 infected cells.
Retaining the 410 cells with just one viral tag.
Retaining the 396 expressing at least one barcoded viral gene.


Unnamed: 0,cell_barcode,n_viral_barcodes_fluHA,n_viral_barcodes_fluNA,viral_barcodes_fluHA,viral_barcodes_fluNA,freq_supernatant_fluHA,freq_supernatant_fluNA,freq_second_infection_fluHA,freq_second_infection_fluNA,infected,...,present_fluHA,present_fluM,present_fluNA,present_fluNP,present_fluNS,present_fluPA,present_fluPB1,present_fluPB2,all_viral_genes,missing_viral_genes
0,AAACGAAGTAGACGTG,1,1,AAGTAAGCGACATGAG,GTAGGCGATAAGTGGA,7.57407e-05,7.62399e-05,1e-05,1e-05,infected,...,True,True,True,True,True,True,True,True,True,
1,AAAGGATTCTGATGGT,1,1,GTGGAGTCGCCAGTTC,TGTTATATTTGTATTG,1e-05,0.0131834,1e-05,0.077933,infected,...,True,True,True,True,True,True,True,True,True,
2,AAAGGGCCAGGCTACC,3,1,AAAGTGATCCCCATAC; CATTTAACGCTGTGAG; CGTAGGATGT...,TTTCGACTGTTGATTG,0.00113885,1e-05,3e-05,1e-05,infected,...,True,True,True,True,True,True,True,True,True,
3,AAAGGGCTCCGCACTT,3,2,AATCTACGAGGGAAAC; ATGGATCAGATTTCCT; GCAAAAATAC...,CTCGTTTTCGGAGTAC; GACTATCTAATTGAGG,0.00353158,0.000134919,0.000164641,2e-05,infected,...,True,True,True,True,True,False,True,True,False,PA
4,AAAGTCCAGTAGAGTT,1,1,CGCAGGTAACAAGAAG,TTTACTTTAAGTGCTT,1e-05,1e-05,1e-05,1e-05,infected,...,True,True,True,True,True,True,True,True,True,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391,TTTAGTCCATCATCCC,1,2,AGAACGATATAAATGT,AGAAACCTCGACATAT; TTGGACGCATTGCAAA,1e-05,2e-05,1e-05,2e-05,infected,...,True,True,True,True,True,False,True,True,False,PA
392,TTTAGTCGTGCTCCGA,1,0,AGTACACGAGGTCGGT,,1e-05,,1e-05,,infected,...,True,True,True,True,True,True,True,True,True,
393,TTTCACAAGCCAAGCA,1,1,GAACGCACTCATTATC,GGTATCAGTTATTGTT,1e-05,1e-05,1e-05,1e-05,infected,...,True,True,True,True,False,True,False,True,False,NS; PB1
394,TTTGATCTCGCATTGA,1,0,GGTCACTTGGGTACTG,,1e-05,,1e-05,,infected,...,True,True,True,True,True,True,True,True,True,


In [14]:
infected_cells.columns

Index(['cell_barcode', 'n_viral_barcodes_fluHA', 'n_viral_barcodes_fluNA',
       'viral_barcodes_fluHA', 'viral_barcodes_fluNA',
       'freq_supernatant_fluHA', 'freq_supernatant_fluNA',
       'freq_second_infection_fluHA', 'freq_second_infection_fluNA',
       'infected', 'infecting_viral_tag', 'total_UMIs', 'viral_UMIs',
       'frac_viral_UMIs', 'n_viral_genes', 'frac_UMIs_fluHA', 'frac_UMIs_fluM',
       'frac_UMIs_fluNA', 'frac_UMIs_fluNP', 'frac_UMIs_fluNS',
       'frac_UMIs_fluPA', 'frac_UMIs_fluPB1', 'frac_UMIs_fluPB2',
       'present_fluHA', 'present_fluM', 'present_fluNA', 'present_fluNP',
       'present_fluNS', 'present_fluPA', 'present_fluPB1', 'present_fluPB2',
       'all_viral_genes', 'missing_viral_genes'],
      dtype='object')

## Plot correlation of viral progeny and transcription

In [5]:
progeny_vs_transcription_chart = (
    alt.Chart(infected_cells)
    .encode(
        x=alt.Alt.X("frac_viral_UMIs"),
        y=alt.Y"freq_supernatant_fluNA"),
        tooltip=["cell_barcode", "frac_viral_UMIs"]
    )
)

progeny_vs_transcription_chart

SchemaValidationError: Invalid specification

        altair.vegalite.v4.api.Chart, validating 'required'

        'mark' is a required property
        

alt.Chart(...)

In [15]:
alt.Chart(infected_cells).encode(x=alt.X("total_UMIs", bin=True), y="count()").mark_bar()

In [17]:
infected_cells["total_UMIs"].min()

15997