# Extract ONT QC Data<a class="tocSkip"></a>

**This notebook reads in the outputs of the NTSM and coverage WDLs (stored in data tables). It is part of the ONT QC process.**

**Below are the steps taken in this notebook:**
1. Import Statements & Global Variable Definitions
2. Extract NTSM Data
3. Extract ReadStats Data
4. Check tables, then export to TSV

**Note**: These results are not written back to the data tables; the final section exports them as TSV files to the workspace bucket.

# Import Statements & Global Variable Definitions

## Installs

In [None]:
## May need to restart kernel after the following installs 

In [None]:
%%capture
%pip install gcsfs
## NOTE: %%capture CANNOT have comments above it, so the notes for this cell sit below the commands
## gcsfs: for reading CSVs stored in Google Cloud (without downloading them first)
## May need to restart kernel after install

In [None]:
%%capture
%pip install --upgrade --no-cache-dir --force-reinstall terra-pandas
%pip install --upgrade --no-cache-dir  --force-reinstall git+https://github.com/DataBiosphere/terra-notebook-utils
## terra-pandas / terra-notebook-utils: for reading/writing data tables into pandas data frames
## Both installs are forced reinstalls (--force-reinstall) that bypass the pip cache (--no-cache-dir)
## May need to restart kernel after install

## Import Statements

In [None]:
from firecloud import fiss
import pandas as pd 
import numpy as np
import terra_pandas as tp
import os                 
import subprocess       
import re                 
import io
import gcsfs
import glob

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from typing import Any, Callable, List, Optional
from terra_notebook_utils import table, WORKSPACE_NAME, WORKSPACE_GOOGLE_PROJECT


## Global Variable Declarations

In [None]:
# Workspace identifiers used by every later cell.
# PROJECT comes from the WORKSPACE_NAMESPACE environment variable.
PROJECT = os.environ['WORKSPACE_NAMESPACE']
# WORKSPACE is the name of the directory that contains the current working directory.
notebook_parent_dir = os.path.dirname(os.getcwd())
WORKSPACE = os.path.basename(notebook_parent_dir)
# Bucket URL with a trailing slash appended.
bucket = f"{os.environ['WORKSPACE_BUCKET']}/"

# Echo the values so a wrong environment is obvious before any data is read
print(f"Billing project: {PROJECT}")
print(f"Workspace: {WORKSPACE}")
print(f"Workspace storage bucket: {bucket}")

# Extract NTSM Data

## Read in NTSM Data Table

In [None]:
# Load the workspace's "ntsm" data table into a DataFrame
ntsm_df = tp.table_to_dataframe(
    "ntsm",
    workspace=WORKSPACE,
    workspace_namespace=PROJECT,
)

# Preview the first rows
ntsm_df.head()

## Read NTSM Output & Write To DataFrame

In [None]:
# Add two columns to ntsm_df, filled from each row's NTSM evaluation file:
#   ntsm_score - value in column 2 of the file's first row
#   result     - value in column 3 of the file's first row, as a string
ntsm_df['ntsm_score'] = np.nan
# 'result' holds strings, so create it with object dtype. Initialising it with a
# bare np.nan makes a float64 column, and writing strings into that is a
# deprecated silent upcast in recent pandas (the warning is hidden here because
# FutureWarnings are filtered in the import cell).
ntsm_df['result'] = pd.Series(np.nan, index=ntsm_df.index, dtype=object)

for index, sample_ntsm_fp in ntsm_df['ntsm_eval_out'].items():
    # The file is read as headerless TSV, so columns are addressed by position
    sample_ntsm_df = pd.read_csv(sample_ntsm_fp, header=None, sep='\t')

    ntsm_df.loc[index, 'ntsm_score'] = sample_ntsm_df.iloc[0, 2]
    ntsm_df.loc[index, 'result'] = sample_ntsm_df[3].astype('str')[0]



In [None]:
## Number of rows whose NTSM result is anything other than 'Similar' (should be 0)
is_mismatch = ntsm_df['result'] != 'Similar'
int(is_mismatch.sum())

# Extract ReadStats Data

## Read in ReadStats Data Table

In [None]:
# Load the workspace's "covstats" data table into a DataFrame
covstats_df = tp.table_to_dataframe(
    "covstats",
    workspace=WORKSPACE,
    workspace_namespace=PROJECT,
)

# Preview the first rows
covstats_df.head()

## Read ReadStats Output & Write To DataFrame

In [None]:
# Every summary-stats file (pass and fail) referenced by the covstats table
summary_files = list(covstats_df['pass_summary_stats']) + list(covstats_df['fail_summary_stats'])

# Read each summary file, annotate it, and collect the pieces; they are
# concatenated once at the end instead of growing a DataFrame inside the loop.
summary_frames = []
for summary in summary_files:
    df = pd.read_csv(summary, sep='\t')

    # 'File' is split on "_"; token 4 becomes 'sample' and tokens 4-5 joined
    # with "_" become 'flowcell'.
    # NOTE(review): assumes names like 12_08_21_R941_HG00423_1_Guppy_... — confirm
    file_tokens = df['File'].str.split(pat="_")
    df.insert(1, 'sample', file_tokens.str[4])
    df.insert(2, 'flowcell', file_tokens.str[4:6].str.join('_'))

    # Strip the list-style wrapper (['...']) and swap 'txt' for 'bam'.
    # regex=False makes these literal replacements on every pandas version
    # (the default for `regex` changed from True to False in pandas 2.0).
    df['File'] = (
        df['File']
        .str.replace("['", '', regex=False)
        .str.replace("']", '', regex=False)
        .str.replace('txt', 'bam', regex=False)
    )
    summary_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame
summary_df = pd.concat(summary_frames) if summary_frames else pd.DataFrame()

summary_df.head()

In [None]:
# A cell only displays its last expression, so show both values as one tuple:
# (shape of the combined summary table, number of distinct samples in covstats)
summary_df.shape, len(covstats_df['sample'].unique())

In [None]:
# Report every sample whose 'coverage', summed over all of its rows, is below 60.
# Nothing is printed when every sample meets the threshold.
sample_column = summary_df['sample']
for sample in sample_column.unique():
    rows_for_sample = sample_column == sample
    total_coverage = summary_df.loc[rows_for_sample, 'coverage'].sum()
    if not total_coverage < 60:
        continue
    print(sample, round(total_coverage, 2))

In [None]:
# Report every sample whose '100kb+' value, summed over all of its rows, is below 30.
# Nothing is printed when every sample meets the threshold.
sample_column = summary_df['sample']
for sample in sample_column.unique():
    rows_for_sample = sample_column == sample
    total_100kb_coverage = summary_df.loc[rows_for_sample, '100kb+'].sum()
    if not total_100kb_coverage < 30:
        continue
    print(sample, round(total_100kb_coverage, 2))

# Check tables, then export to tsv

In [None]:
# Final look at the combined read-stats table before it is exported
summary_df.head(n=5)

In [None]:
# Final look at the NTSM table (with ntsm_score / result columns) before it is exported
ntsm_df.head(n=5)

In [None]:
# Write the combined summary table to the workspace bucket as <WORKSPACE>_summary.tsv
outSumm = os.path.join(bucket, f"{WORKSPACE}_summary.tsv")
summary_df.to_csv(outSumm, index=False, sep="\t")

In [None]:
# Write the NTSM table to the workspace bucket as <WORKSPACE>_NTSM.tsv
outNTSM = os.path.join(bucket, f"{WORKSPACE}_NTSM.tsv")
ntsm_df.to_csv(outNTSM, index=False, sep="\t")