# Integrate data
This notebook collects the `integrated_data_csv` file for each sample and concatenates them into a single CSV containing data from all samples.

The general structure of the CSV is that each cell barcode is listed on a row, and features of that cell are listed in columns. Cells that have more than one valid viral barcode identified may have multiple rows--one for each valid viral barcode.

Import python modules:

In [1]:
from IPython.display import display

import pandas as pd

Variables:

In [2]:
# Input sources
trial1_integrated_data_csv = 'results/viral_fastq10x/scProgenyProduction_trial1_integrate_data.csv'
trial3_integrated_data_csv = 'results/viral_fastq10x/scProgenyProduction_trial3_integrate_data.csv'

# Output
all_samples_csv = 'results/viral_fastq10x/all_samples.csv'

Load data for each sample:

In [3]:
trial1_data = pd.read_csv(trial1_integrated_data_csv)
trial1_data['sample'] = 'highMOI'
trial3_data = pd.read_csv(trial3_integrated_data_csv)
trial3_data['sample'] = 'lowMOI'

display(trial1_data)
display(trial3_data)

Unnamed: 0,cell_barcode,infected,infecting_viral_tag,total_UMIs,viral_UMIs,frac_viral_UMIs,n_viral_genes,transcriptome_UMIs_fluHA,transcriptome_UMIs_fluM,transcriptome_UMIs_fluNA,...,present_fluPB1,present_fluPB2,barcoded_gene,viral_barcode,viral_bc_UMIs,frac_viral_bc_UMIs,freq_supernatant,max_freq_supernatant,contributes_progeny_supernatant,sample
0,AAACGAAAGCCACCGT,ambiguous,wt,41743,451,0.010804,6,101,54,26,...,True,False,,,,,,,,highMOI
1,AAACGCTAGTTCACTG,infected,syn,11572,4702,0.406326,7,650,2075,101,...,True,True,fluHA,GGGAAGCAAATATAAT,35.0,0.003025,0.000027,0.001033,True,highMOI
2,AAACGCTAGTTCACTG,infected,syn,11572,4702,0.406326,7,650,2075,101,...,True,True,fluHA,AGGAAGGTAAATTCAG,17.0,0.001469,0.000992,0.001033,True,highMOI
3,AAACGCTAGTTCACTG,infected,syn,11572,4702,0.406326,7,650,2075,101,...,True,True,fluHA,GATTTGAAAGGATACC,25.0,0.002160,0.000069,0.001033,True,highMOI
4,AAACGCTAGTTCACTG,infected,syn,11572,4702,0.406326,7,650,2075,101,...,True,True,fluHA,GATGTTTTGAAGAGTG,51.0,0.004407,0.000010,0.001033,True,highMOI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2663,TTTGTTGTCGGAATGG,infected,wt,13433,5142,0.382789,8,460,2784,59,...,True,True,fluHA,TCGACACAATACTTGC,41.0,0.003052,0.000397,0.000397,True,highMOI
2664,TTTGTTGTCGGAATGG,infected,wt,13433,5142,0.382789,8,460,2784,59,...,True,True,fluNA,AAATTTTAGGGAAATA,12.0,0.000893,0.000010,0.000397,True,highMOI
2665,TTTGTTGTCGGAATGG,infected,wt,13433,5142,0.382789,8,460,2784,59,...,True,True,fluNA,ATGAGTTCACGGGGGT,9.0,0.000670,0.000051,0.000397,True,highMOI
2666,TTTGTTGTCGGAATGG,infected,wt,13433,5142,0.382789,8,460,2784,59,...,True,True,fluNA,CAACAGGGAGTAAGGT,16.0,0.001191,0.000079,0.000397,True,highMOI


Unnamed: 0,cell_barcode,infected,infecting_viral_tag,total_UMIs,viral_UMIs,frac_viral_UMIs,n_viral_genes,transcriptome_UMIs_fluHA,transcriptome_UMIs_fluM,transcriptome_UMIs_fluNA,...,viral_barcode,viral_bc_UMIs,frac_viral_bc_UMIs,freq_second_infection,freq_supernatant,max_freq_second_infection,max_freq_supernatant,contributes_progeny_second_infection,contributes_progeny_supernatant,sample
0,AAACCCAGTAACAAGT,uninfected,none,47873,6,0.000125,0,2,3,0,...,,,,,,,,,,lowMOI
1,AAACCCATCATTGCTT,uninfected,none,90114,10,0.000111,1,0,5,0,...,,,,,,,,,,lowMOI
2,AAACGAAAGATGTTGA,uninfected,none,111630,18,0.000161,0,3,10,0,...,,,,,,,,,,lowMOI
3,AAACGAAGTACTTCCC,infected,both,56828,24082,0.423770,7,2,13060,420,...,,,,,,,,,,lowMOI
4,AAACGAAGTAGACGTG,infected,wt,124341,4654,0.037429,8,319,1582,31,...,AAGTAAGCGACATGAG,251.0,0.002019,0.000010,0.000076,0.000010,0.000076,False,True,lowMOI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3671,TTTGATCTCCCGTTCA,uninfected,none,63150,3,0.000048,0,0,1,0,...,,,,,,,,,,lowMOI
3672,TTTGGAGAGTTGCCTA,uninfected,none,65941,12,0.000182,0,0,6,1,...,,,,,,,,,,lowMOI
3673,TTTGGAGGTATCGTTG,infected,wt,150130,3526,0.023486,8,93,1668,45,...,AGTAAACTTCCTCGCG,65.0,0.000433,0.000010,0.000695,0.000028,0.001790,True,True,lowMOI
3674,TTTGGAGGTATCGTTG,infected,wt,150130,3526,0.023486,8,93,1668,45,...,ACATCTTATTTACACG,39.0,0.000260,0.000028,0.001790,0.000028,0.001790,True,True,lowMOI


Concatenate samples into single dataframe:

In [4]:
all_samples = pd.concat([trial1_data,
                         trial3_data],
                        axis=0)
display(all_samples)

Unnamed: 0,cell_barcode,infected,infecting_viral_tag,total_UMIs,viral_UMIs,frac_viral_UMIs,n_viral_genes,transcriptome_UMIs_fluHA,transcriptome_UMIs_fluM,transcriptome_UMIs_fluNA,...,mutation_support_fluM,mutation_support_fluNA,mutation_support_fluNP,mutation_support_fluNS,mutation_support_fluPA,mutation_support_fluPB1,mutation_support_fluPB2,freq_second_infection,max_freq_second_infection,contributes_progeny_second_infection
0,AAACGAAAGCCACCGT,ambiguous,wt,41743,451,0.010804,6,101,54,26,...,,,,,,,,,,
1,AAACGCTAGTTCACTG,infected,syn,11572,4702,0.406326,7,650,2075,101,...,,,,,,,,,,
2,AAACGCTAGTTCACTG,infected,syn,11572,4702,0.406326,7,650,2075,101,...,,,,,,,,,,
3,AAACGCTAGTTCACTG,infected,syn,11572,4702,0.406326,7,650,2075,101,...,,,,,,,,,,
4,AAACGCTAGTTCACTG,infected,syn,11572,4702,0.406326,7,650,2075,101,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3671,TTTGATCTCCCGTTCA,uninfected,none,63150,3,0.000048,0,0,1,0,...,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,,,
3672,TTTGGAGAGTTGCCTA,uninfected,none,65941,12,0.000182,0,0,6,1,...,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,,,
3673,TTTGGAGGTATCGTTG,infected,wt,150130,3526,0.023486,8,93,1668,45,...,Not Applicable,31/31,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,0.000010,0.000028,True
3674,TTTGGAGGTATCGTTG,infected,wt,150130,3526,0.023486,8,93,1668,45,...,Not Applicable,31/31,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,0.000028,0.000028,True


Export CSV:

In [5]:
all_samples.to_csv(all_samples_csv, index=False)