# Extract HiFi QC Data<a class="tocSkip"></a>

**This notebook reads in the outputs of the NTSM and ReadStats WDLs (whose results are stored in data tables). It is part of the HiFi QC process.**

**Below are the steps taken in this notebook:**
1. Import Statements & Global Variable Definitions
2. Read In The NTSM Data Table & Extract NTSM Results
3. Read In The ReadStats Data Table & Extract ReadStats Results
4. Examine results

**Note**: These results are not written back to the data tables or to files.

# Import Statements & Global Variable Definitions

## Installs

In [None]:
%%capture
%pip install gcsfs
## NOTE: %%capture must stay on the first line of the cell (it CANNOT have
## comments above it), so these notes are kept below the commands.
## gcsfs is installed for reading CSVs stored in Google Cloud (without downloading them first).
## The kernel may need to be restarted after the install.

In [None]:
%%capture
%pip install --upgrade --no-cache-dir --force-reinstall terra-pandas
%pip install --upgrade --no-cache-dir  --force-reinstall git+https://github.com/DataBiosphere/terra-notebook-utils
## These packages are installed for reading/writing data tables into pandas data frames.
## The kernel may need to be restarted after the install.

## Import Statements

In [None]:
from firecloud import fiss
import pandas as pd 
import numpy as np
import terra_pandas as tp
import os                 
import subprocess       
import re                 
import io
import gcsfs

from typing import Any, Callable, List, Optional
from terra_notebook_utils import table, WORKSPACE_NAME, WORKSPACE_GOOGLE_PROJECT


## Global Variable Declarations

In [None]:
# Get the Google billing project name and workspace name for current workspace
PROJECT = os.environ['WORKSPACE_NAMESPACE']

# Prefer the workspace name exported in the notebook environment. Fall back to
# the name of the working directory's parent (the previous behaviour) only when
# WORKSPACE_NAME is not set: that directory-based guess is correct only if the
# kernel happens to be started from a folder nested directly under a directory
# named after the workspace.
WORKSPACE = os.environ.get('WORKSPACE_NAME') or os.path.basename(os.path.dirname(os.getcwd()))

# Workspace bucket URL with a trailing slash so object paths can be appended
bucket = os.environ['WORKSPACE_BUCKET'] + "/"

# Verify that we've captured the environment variables
print("Billing project: " + PROJECT)
print("Workspace: " + WORKSPACE)
print("Workspace storage bucket: " + bucket)

# Extract NTSM Data

## Read in NTSM Data Table

In [None]:
## Load the "ntsm" data table of this workspace into a DataFrame
ntsm_df = tp.table_to_dataframe(
    "ntsm",
    workspace=WORKSPACE,
    workspace_namespace=PROJECT,
)

## Preview the first rows
ntsm_df.head()

## Read NTSM Output & Write To DataFrame

In [None]:
## Collect one score and one result per data-table row, then attach them as
## columns in a single assignment. Pre-filling 'result' with NaN creates a
## float column, and writing strings into it through .loc is a deprecated
## incompatible-dtype assignment in recent pandas; building the columns from
## lists avoids that and makes the cell safe to re-run.
ntsm_scores  = []
ntsm_results = []

for sample_ntsm_fp in ntsm_df['ntsm_eval_out']:

    ## Each output file is read as headerless, tab-separated text
    sample_ntsm_df = pd.read_csv(sample_ntsm_fp, header=None, sep='\t')

    ## First row of the file: column 2 is stored as the score and
    ## column 3 (as a string) as the result
    ntsm_scores.append(sample_ntsm_df.iloc[0, 2])
    ntsm_results.append(str(sample_ntsm_df.iloc[0, 3]))

ntsm_df['ntsm_score'] = ntsm_scores
ntsm_df['result']     = ntsm_results



In [None]:
## Count the rows whose result is anything other than 'Similar' (should be 0)
is_mismatch = ntsm_df['result'] != 'Similar'
int(is_mismatch.sum())

# Extract ReadStats Data

## Read in ReadStats Data Table

In [None]:
## Load the "readstats" data table of this workspace into a DataFrame
readstats_df = tp.table_to_dataframe(
    "readstats",
    workspace=WORKSPACE,
    workspace_namespace=PROJECT,
)

## Preview the first rows
readstats_df.head()

## Read ReadStats Output & Write To DataFrame

In [None]:
## 'output' will hold the total_Gbp value pulled from each row's ReadStats report
readstats_df['output'] = np.nan

for row_label, report_fp in readstats_df['ReadStatsReport'].items():

    ## Each report is read as headerless, tab-separated text
    report_df = pd.read_csv(report_fp, header=None, sep='\t')

    ## Keep only the sample-level rows (first column == 'sample.fastq'),
    ## then drop the first of them (treated as an extra row)
    sample_rows = report_df.loc[report_df[0] == 'sample.fastq'].iloc[1:]

    ## The value in column 2 of the 'total_Gbp' metric row
    total_gbp = sample_rows.loc[sample_rows[1] == 'total_Gbp', 2]
    readstats_df.loc[row_label, 'output'] = float(total_gbp.values[0])


## NOTE(review): 3.1 is presumably the genome size in Gbp, which would make
## 'coverage' a fold-coverage value -- confirm
readstats_df['coverage'] = readstats_df['output'] / 3.1

In [None]:
## Print the table dimensions explicitly: a bare expression is only displayed
## when it is the last line of a cell, so the shape was previously discarded.
print(readstats_df.shape)

## Number of distinct values in the 'sample' column (shown as the cell output)
len(readstats_df['sample'].unique())

In [None]:
# Sum coverage by sample and print every sample whose total is below the minimum.
# One groupby pass replaces re-filtering the whole table once per sample.
# sort=False keeps samples in order of first appearance; dropna=False keeps rows
# with a missing sample name in the report instead of silently dropping them.
MIN_TOTAL_COVERAGE = 35

coverage_by_sample = readstats_df.groupby('sample', sort=False, dropna=False)['coverage'].sum()

for sample, total_coverage in coverage_by_sample.items():
    if total_coverage < MIN_TOTAL_COVERAGE:
        print(sample, round(total_coverage, 2))
# this should output nothing

# TODO: put these in a .csv