# Extract ONT QC Data<a class="tocSkip">

**This notebook reads in data from the NTSM and Coverage WDLs (stored in data tables). This is part of the ONT QC process.**

**Below are the steps taken in this notebook:**
1. Import Statements & Global Variable Definitions
2. Define Functions
3. Read In Sample Names
4. Create Dataframe Of Files
5. Examine results

**Note**: These results are not written back to the data tables; the final cells write them as TSV files to the workspace bucket.

# Import Statements & Global Variable Definitions

## Installs

In [1]:
## May need to restart kernel after the following installs 

In [2]:
%%capture
%pip install gcsfs
## NOTE: %%capture must be the first line of the cell, so comments cannot go above it
## For reading CSVs stored in Google Cloud (without downloading them first)
## May need to restart kernel after install 

In [3]:
%%capture
%pip install --upgrade --no-cache-dir --force-reinstall terra-pandas
%pip install --upgrade --no-cache-dir  --force-reinstall git+https://github.com/DataBiosphere/terra-notebook-utils
## For reading/writing data tables into pandas data frames
## May need to restart kernel after install 

## Import Statements

In [1]:
from firecloud import fiss
import pandas as pd 
import numpy as np
import terra_pandas as tp
import os                 
import subprocess       
import re                 
import io
import gcsfs
import glob

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from typing import Any, Callable, List, Optional
from terra_notebook_utils import table, WORKSPACE_NAME, WORKSPACE_GOOGLE_PROJECT


## Global Variable Declarations

In [2]:
# Resolve the billing project, workspace name, and storage bucket for this workspace.
# The billing project and bucket are read from environment variables; the workspace
# name is taken from the name of the parent of the current working directory.
PROJECT = os.environ['WORKSPACE_NAMESPACE']
WORKSPACE = os.path.split(os.path.dirname(os.getcwd()))[1]
bucket = f"{os.environ['WORKSPACE_BUCKET']}/"

# Echo the resolved values so a misconfigured environment is easy to spot
for label, value in (
    ("Billing project", PROJECT),
    ("Workspace", WORKSPACE),
    ("Workspace storage bucket", bucket),
):
    print(f"{label}: {value}")

Billing project: human-pangenome-ucsc
Workspace: HPRC_YEAR2_ONT_Guppy6
Workspace storage bucket: gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/


# Extract NTSM Data

## Read in NTSM Data Table

In [3]:
# Load the "ntsm" workspace data table into a DataFrame
ntsm_df = tp.table_to_dataframe(
    "ntsm",
    workspace_namespace=PROJECT,
    workspace=WORKSPACE,
)

# Preview the first rows
ntsm_df.head()

Unnamed: 0_level_0,ntsv_count_2,fastq_list,ntsm_eval_out,read_2_fastq,read_1_fastq,sample,ONT_pass_bam,ntsv_count_1
ntsm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,[gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,HG00423,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...
1,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,[gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,HG00423,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...
10,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,[gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,HG00642,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...
100,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,[gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,HG02683,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...
101,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,[gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,HG02698,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...


## Read NTSM Output & Write To DataFrame

In [4]:
# Read every NTSM evaluation file and record its score and verdict on ntsm_df.
#
# Each file is parsed as a headerless, tab-separated table. From its first row,
# the third column is stored as 'ntsm_score' and the fourth column (as a string)
# as 'result'.
#
# Values are collected in lists and assigned as whole columns afterwards. The
# previous approach pre-created 'result' as a float NaN column and then wrote
# strings into it row by row, which relies on pandas' deprecated implicit
# upcasting of an incompatible dtype.
ntsm_scores = []
ntsm_results = []

for ntsm_eval_fp in ntsm_df['ntsm_eval_out']:
    if not isinstance(ntsm_eval_fp, str) or not ntsm_eval_fp:
        # No evaluation file recorded for this row: leave both values missing
        # (as the original NaN initialisation did) so the row is still counted
        # by any "result != 'Similar'" check instead of aborting the whole loop.
        ntsm_scores.append(np.nan)
        ntsm_results.append(np.nan)
        continue

    ntsm_eval_df = pd.read_csv(ntsm_eval_fp, header=None, sep='\t')

    # Explicit positional (row, column) lookups instead of chained indexing
    ntsm_scores.append(ntsm_eval_df.iloc[0, 2])
    ntsm_results.append(str(ntsm_eval_df.iloc[0, 3]))

# The lists are in row order, so plain column assignment lines up with ntsm_df
ntsm_df['ntsm_score'] = ntsm_scores
ntsm_df['result'] = ntsm_results



In [5]:
## Count the rows whose NTSM result is anything other than 'Similar' (expected: 0)
int(ntsm_df['result'].ne('Similar').sum())

0

# Extract ReadStats Data

## Read in ReadStats Data Table

In [6]:
# Load the "covstats" workspace data table into a DataFrame
covstats_df = tp.table_to_dataframe(
    "covstats",
    workspace_namespace=PROJECT,
    workspace=WORKSPACE,
)

# Preview the first rows
covstats_df.head()

Unnamed: 0_level_0,ONT_seq_summ,fail_summary_stats,pass_summary_stats,sample
covstats_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,HG00423
1,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,HG00423
10,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,HG00642
100,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,HG02683
101,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,HG02698


## Read ReadStats Output & Write To DataFrame

In [14]:
# Collect every summary-stats file listed in covstats_df: pass files first, then fail files.
# NOTE(review): pass and fail rows end up in the same frame, so any per-sample
# sum over summary_df includes both -- confirm that is intended.
summary_files = list(covstats_df['pass_summary_stats']) + list(covstats_df['fail_summary_stats'])

# Read each file into its own frame and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies it every iteration.
summary_frames = []
for summary in summary_files:
    df = pd.read_csv(summary, sep='\t')

    # The sample and flowcell are the 5th and 5th-6th underscore-separated
    # fields of the file name (e.g. 12_08_21_R941_HG00423_1_... -> HG00423, HG00423_1).
    name_fields = df['File'].str.split(pat="_")
    df.insert(1, 'sample', name_fields.str[4])
    df.insert(2, 'flowcell', name_fields.str[4:6].str.join('_'))

    # Strip the list-style wrapper (['...']) from the file name and swap txt for bam.
    # regex=False makes these literal replacements on every pandas version; the
    # old backslash-escaped patterns only matched when str.replace defaulted to
    # regex mode and were invalid escape sequences in a normal string literal.
    df['File'] = (
        df['File']
        .str.replace("['", "", regex=False)
        .str.replace("']", "", regex=False)
        .str.replace('txt', 'bam', regex=False)
    )
    summary_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame
summary_df = pd.concat(summary_frames) if summary_frames else pd.DataFrame()

summary_df.head()

Unnamed: 0,File,sample,flowcell,read_N50,Gb,coverage,100kb+,200kb+,300kb+,400kb+,500kb+,1Mb+,whales
0,12_08_21_R941_HG00423_1_Guppy_6.4.6_450bps_mod...,HG00423,HG00423_1,92363,69.39,21.03,9.67,2.55,0.52,0.11,0.04,0.0,6
0,12_08_21_R941_HG00423_2_Guppy_6.4.6_450bps_mod...,HG00423,HG00423_2,89888,61.42,18.61,8.32,2.2,0.46,0.11,0.04,0.0,2
0,08_10_21_R941_HG00642_2_Guppy_6.5.7_450bps_mod...,HG00642,HG00642_2,101008,68.63,20.8,10.52,2.52,0.47,0.13,0.05,0.0,7
0,08_25_21_R941_HG02683_3_Guppy_6.5.7_450bps_mod...,HG02683,HG02683_3,95821,51.42,15.58,7.42,1.67,0.27,0.06,0.02,0.0,0
0,07_20_21_R941_HG02698_1_Guppy_6.5.7_450bps_mod...,HG02698,HG02698_1,70225,114.21,34.61,9.89,0.7,0.08,0.02,0.0,0.0,0


In [8]:
# Only the last bare expression of a cell is displayed, so print the shape
# explicitly instead of evaluating it and discarding the result.
print("summary_df shape:", summary_df.shape)

# Number of distinct samples in the covstats table
len(covstats_df['sample'].unique())

55

In [9]:
# Add up 'coverage' over all rows of each sample and report the samples whose
# total is below 60 (nothing is printed when every sample reaches the threshold)
sample_labels = summary_df['sample']
for sample in sample_labels.unique():
    rows_for_sample = sample_labels == sample
    total_coverage = summary_df.loc[rows_for_sample, 'coverage'].sum()
    if total_coverage < 60:
        print(sample, round(total_coverage, 2))

HG02683 59.81
HG04187 59.07


In [10]:
# Add up the '100kb+' column over all rows of each sample and report the samples
# whose total is below 30 (nothing is printed when every sample reaches the threshold)
sample_labels = summary_df['sample']
for sample in sample_labels.unique():
    rows_for_sample = sample_labels == sample
    total_100kb_coverage = summary_df.loc[rows_for_sample, '100kb+'].sum()
    if total_100kb_coverage < 30:
        print(sample, round(total_100kb_coverage, 2))

HG02683 28.13
HG02698 27.58
HG02735 25.18
HG02738 28.79
HG03669 26.78
HG00706 26.6
HG03710 26.33
HG03816 28.37
HG03927 28.52
HG03942 18.95
HG04115 27.21
HG04160 22.42
HG04184 27.36
HG04187 20.13
HG04199 27.11
HG04228 29.72
HG01346 28.82
HG01433 24.58
HG00544 19.68
HG01884 24.14
HG01943 23.6
HG01981 23.2
HG02027 22.8
HG02071 23.47
HG02074 23.26
HG02132 26.81
HG02451 24.66
HG02602 24.02
HG02615 25.25
HG02647 20.84
HG02668 17.98


# TODO: put these in a .csv

In [11]:
# Preview the first rows of the combined summary-stats table
summary_df.head()

Unnamed: 0,File,sample,flowcell,read_N50,Gb,coverage,100kb+,200kb+,300kb+,400kb+,500kb+,1Mb+,whales
0,12_08_21_R941_HG00423_1_Guppy_6.4.6_450bps_mod...,HG00423,HG00423_1,92363,69.39,21.03,9.67,2.55,0.52,0.11,0.04,0.0,6
0,12_08_21_R941_HG00423_2_Guppy_6.4.6_450bps_mod...,HG00423,HG00423_2,89888,61.42,18.61,8.32,2.2,0.46,0.11,0.04,0.0,2
0,08_10_21_R941_HG00642_2_Guppy_6.5.7_450bps_mod...,HG00642,HG00642_2,101008,68.63,20.8,10.52,2.52,0.47,0.13,0.05,0.0,7
0,08_25_21_R941_HG02683_3_Guppy_6.5.7_450bps_mod...,HG02683,HG02683_3,95821,51.42,15.58,7.42,1.67,0.27,0.06,0.02,0.0,0
0,07_20_21_R941_HG02698_1_Guppy_6.5.7_450bps_mod...,HG02698,HG02698_1,70225,114.21,34.61,9.89,0.7,0.08,0.02,0.0,0.0,0


In [12]:
# Preview the first rows of the NTSM table, now including ntsm_score and result
ntsm_df.head()

Unnamed: 0_level_0,ntsv_count_2,fastq_list,ntsm_eval_out,read_2_fastq,read_1_fastq,sample,ONT_pass_bam,ntsv_count_1,ntsm_score,result
ntsm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,[gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,HG00423,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,0.307526,Similar
1,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,[gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,HG00423,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,0.298268,Similar
10,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,[gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,HG00642,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,0.34542,Similar
100,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,[gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,HG02683,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,0.344844,Similar
101,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,[gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,HG02698,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-72c79fce-944a-4bf1-a8d1-717ecd7a29a3/s...,0.271365,Similar


In [15]:
# Write the combined summary stats to the workspace bucket as a tab-separated file
summary_tsv_name = f"{WORKSPACE}_summary.tsv"
outSumm = os.path.join(bucket, summary_tsv_name)
summary_df.to_csv(outSumm, index=False, sep="\t")

In [66]:
# Write the NTSM table (with ntsm_score and result) to the workspace bucket as a tab-separated file
# NOTE(review): index=False leaves the ntsm_id index out of the file -- confirm that is intended.
ntsm_tsv_name = f"{WORKSPACE}_NTSM.tsv"
outNTSM = os.path.join(bucket, ntsm_tsv_name)
ntsm_df.to_csv(outNTSM, index=False, sep="\t")