# Create a table with SRA IDs, readstats QC data, and submitter metadata

**Below are the steps taken in this notebook:**
1. Installs, import Statements & Global Variable Definitions
2. Read in tables as dataframes
3. Merge dataframes
4. Write merged dataframe to data table and file

**Important Note**  
This may not be the best way to aggregate the data, because SRA submissions are done after QC, which means the workspace needs to be kept and run with newly uploaded info. It might be easier to just output the Readstats table and merge outside Terra/Anvil


## Installs

In [4]:
%%capture
%pip install gcsfs
## capture CANNOT have comments above it
## For reading CSVs stored in Google Cloud (without downloading them first)
## May need to restart kernel after install 

In [5]:
%%capture
%pip install --upgrade --no-cache-dir --force-reinstall terra-pandas
%pip install --upgrade --no-cache-dir  --force-reinstall git+https://github.com/DataBiosphere/terra-notebook-utils
## For reading/writing data tables into pandas data frames
## May need to restart kernel after install 

## Import Statements

In [6]:
from firecloud import fiss
import pandas as pd      
import os                 
import subprocess       
import re                 
import io
import gcsfs

from typing import Any, Callable, List, Optional
from terra_notebook_utils import table, WORKSPACE_NAME, WORKSPACE_GOOGLE_PROJECT
from terra_pandas import dataframe_to_table, table_to_dataframe

## Global Variable Declarations

In [7]:
# AnVIL_HPRC WorkspaceBucket
anvil_hprc_bucket       = "gs://fc-285575a2-c2d5-4f17-a6a1-d1811d4afab4/"

# table filenames (expected in the workspace bucket, see below)
submitter_metadata = 'HPRC_DEEPCONSENSUS_v1pt2_2023_08_q20_submitter_metadata.tsv'
sra_metadata = 'HPRC_DEEPCONSENSUS_v1pt2_2023_08_q20-13931699-processed-ok.tsv'

# submission id
submission_id = 'HPRC_DEEPCONSENSUS_v1pt2_2023_08_q20'

# Get the Google billing project name and workspace name for current workspace
PROJECT = os.environ['WORKSPACE_NAMESPACE']
WORKSPACE =os.path.basename(os.path.dirname(os.getcwd()))
bucket = os.environ['WORKSPACE_BUCKET'] + "/"

# Verify that we've captured the environment variables
print("Billing project: " + PROJECT)
print("Workspace: " + WORKSPACE)
print("Workspace storage bucket: " + bucket)

Billing project: human-pangenome-ucsc
Workspace: HPRC_DEEPCONSENSUS_v1pt2_2023_08_q20
Workspace storage bucket: gs://fc-684a1d20-49ec-4a0a-90ab-57b0f5c0aab5/


# Read In tables as dataframes

## Readstats table
Find the individual readstats output files in the table and concatenate their contents.
Add the hifi filename and sample name.

In [20]:
# readstats
readstats_df = table_to_dataframe("readstats", workspace=WORKSPACE, workspace_namespace=PROJECT)
qc_list = []

for index, row in readstats_df.iterrows():

        sample_readstats_fp = row['ReadStatsReport']
        sample_readstats_fn = os.path.basename(sample_readstats_fp)
        df = pd.read_csv(sample_readstats_fp, header=None, sep='\t')
        df = df[df[0]=='sample.fastq']
        # add sample and file name
        #df = df.append({0:'undef', 1:'sample', 2:row['sample']}, ignore_index=True)
        #df = df.append({0:'undef', 1:'filename', 2:os.path.basename(row['hifi'])}, ignore_index=True)
        new_row = pd.DataFrame({0:'undef', 1:'filename', 2:os.path.basename(row['hifi'])}, index=[0])
        df = pd.concat([df, new_row], ignore_index=True)
        # remove all but the variables (total_bp, quartile_25 etc)
        df2 = df.drop(df.columns[[0, 1]], axis=1)
        # make rownames ('total_bp', 'quartile_25' etc)
        df2.index = df.iloc[:,1]
        # remove the now meaningless 'sample.fastq' filename
        df2 = df2.drop(index=['file'])
        qc_list.append(df2)
        

# merge
readstats_df = pd.concat(qc_list, axis=1).transpose()
readstats_df.head()

1,total_reads,total_bp,total_Gbp,min,max,mean,quartile_25,quartile_50,quartile_75,N25,N50,N75,filename
2,2687625,56102007798,56.1,107,64413,20874,17762,20158,23398,18348,21041,24496,HG00099.m54329U_220825_174247.dc.q20.fastq.gz
2,2643186,52918724515,52.92,345,62341,20020,17277,19348,22242,17731,20061,23169,HG00099.m54329U_220827_143814.dc.q20.fastq.gz
2,352327,7524316582,7.52,1028,57489,21356,19009,20965,23289,19422,21456,23918,HG00323.m64043_220728_173215.dc.q20.fastq.gz
2,811437,17181739490,17.18,398,75864,21174,16931,20457,24597,18227,22011,26361,HG03017.m64076_221001_041132.dc.q20.fastq.gz
2,2319689,44900731287,44.9,351,70885,19356,15860,18511,22051,16646,19670,23521,HG03041.m54329U_211214_012740.dc.q20.fastq.gz


## Submitter metadata table

In [21]:
meta_fp = os.path.join(bucket, submitter_metadata)
metadata_df = pd.read_csv(meta_fp, sep='\t')
metadata_df = metadata_df[['filename', 'sample_ID', 'instrument_model', 'shear_method', 'size_selection', 'DeepConsensus_version', 
                           'polymerase_version', 'seq_plate_chemistry_version', 'generator_facility', 'generator_contact']]
metadata_df.head()

Unnamed: 0,filename,sample_ID,instrument_model,shear_method,size_selection,DeepConsensus_version,polymerase_version,seq_plate_chemistry_version,generator_facility,generator_contact
0,HG00099.m54329U_220825_174247.dc.q20.fastq.gz,HG00099,PacBio Sequel II,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu
1,HG00099.m54329U_220827_143814.dc.q20.fastq.gz,HG00099,PacBio Sequel II,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu
2,HG00099.m54329U_220829_095708.dc.q20.fastq.gz,HG00099,PacBio Sequel II,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu
3,HG00140.m64043_220728_173215.dc.q20.fastq.gz,HG00140,PacBio Sequel II,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu
4,HG00140.m64136_220715_182717.dc.q20.fastq.gz,HG00140,PacBio Sequel II,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu


## SRA table
Must download after finishing submission.
The downloaded table needs to be split in one row per file (instead of filename, filename2 <...> filenameN)

In [22]:
## Read in SRA file and split by filename

sra_file = os.path.join(bucket + sra_metadata)
sample_df = pd.read_csv(sra_file, sep='\t')

fnames = [x for x in sample_df.columns if x.startswith('filename')]
sra_df = pd.melt(sample_df, id_vars =['library_ID','accession'], value_vars = fnames, value_name='file')
sra_df= sra_df.dropna()
sra_df = sra_df[['file', 'accession', 'library_ID']]
sra_df.columns = ['filename', 'accession', 'library_ID']
sra_df.head()


Unnamed: 0,filename,accession,library_ID
0,HG00099.m54329U_220825_174247.dc.q20.fastq.gz,SRR26545347,PG00099_1.HFSS_dc
1,HG00140.m64043_220728_173215.dc.q20.fastq.gz,SRR26545346,HG00140_lib1_dc
2,HG00280.m54329U_220901_221341.dc.q20.fastq.gz,SRR26545335,PG00280.HFSS_dc
3,HG00323.m64043_220728_173215.dc.q20.fastq.gz,SRR26545324,HG00323_lib1_dc
4,HG00408.m64136_211111_194404.dc.q20.fastq.gz,SRR26545313,HG00408_lib1_dc


# Merge all dataframes

In [23]:
print(sra_df.shape, readstats_df.shape, metadata_df.shape)
# two samples of HG02572 were not uploaded to SRA with the rest of this batch

(168, 3) (168, 13) (168, 10)


In [24]:
set(readstats_df['filename']) - set(sra_df['filename'])
#m54329U_201103_231616.ccs.bam (HG002 sample) is not present in the submitter metadata or on SRA, but it is in the submissions 

set()

In [25]:
merged_df = pd.merge(
    sra_df,
    readstats_df,
    on='filename')
merged_df = pd.merge(
    merged_df,
    metadata_df,
    on='filename')
merged_df.head()

Unnamed: 0,filename,accession,library_ID,total_reads,total_bp,total_Gbp,min,max,mean,quartile_25,...,N75,sample_ID,instrument_model,shear_method,size_selection,DeepConsensus_version,polymerase_version,seq_plate_chemistry_version,generator_facility,generator_contact
0,HG00099.m54329U_220825_174247.dc.q20.fastq.gz,SRR26545347,PG00099_1.HFSS_dc,2687625,56102007798,56.1,107,64413,20874,17762,...,24496,HG00099,PacBio Sequel II,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu
1,HG00140.m64043_220728_173215.dc.q20.fastq.gz,SRR26545346,HG00140_lib1_dc,572695,11345594738,11.35,117,55287,19810,17863,...,21783,HG00140,PacBio Sequel II,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu
2,HG00280.m54329U_220901_221341.dc.q20.fastq.gz,SRR26545335,PG00280.HFSS_dc,2766007,55104620054,55.1,91,66666,19922,16987,...,23253,HG00280,PacBio Sequel II,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu
3,HG00323.m64043_220728_173215.dc.q20.fastq.gz,SRR26545324,HG00323_lib1_dc,352327,7524316582,7.52,1028,57489,21356,19009,...,23918,HG00323,PacBio Sequel II,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu
4,HG00408.m64136_211111_194404.dc.q20.fastq.gz,SRR26545313,HG00408_lib1_dc,2333870,44812028504,44.81,83,61297,19200,16485,...,22353,HG00408,PacBio Sequel II,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu


# Write output files

In [26]:
## Create table
#dataframe_to_table("post_sra_metadata", merged_df, WORKSPACE, PROJECT)
outf = os.path.join(bucket, submission_id + '_post_sra_metadata.tsv')
merged_df.to_csv(outf, sep="\t", index=False)

In [25]:
merged_df.shape

(132, 24)