# Notebook 14: Create QC Table For Genbank Assemblies<a class="tocSkip">

**Build a QC sample table (parental/child Illumina reads, sample sex, and GenBank assembly fastas) and upload it to the workspace**
    
    
**The steps that we will take are:**
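1. Import Statements & Global Variable Definitions
2. Read In Data Tables (Final Assemblies, Samples, 1000G Pedigree)
3. Combine Data Frames (Add Sex Info + Illumina Reads To Assemblies)
4. Upload QC Sample Data Table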

# Import Statements & Global Variable Definitions

In [1]:
import terra_notebook_utils as tnu
import terra_pandas as tp
import pandas as pd
import os
from ssds import deployment

## Global Variable Definitions

## Set Environment Variables

In [2]:
# Billing project and bucket come from the environment; the workspace name is
# the name of the directory that contains the current working directory.
PROJECT = os.environ['WORKSPACE_NAMESPACE']
WORKSPACE = os.path.basename(
    os.path.dirname(os.getcwd())
)
bucket = f"{os.environ['WORKSPACE_BUCKET']}/"

# Echo the captured values so a misconfigured environment is obvious
print(f"Billing project: {PROJECT}")
print(f"Workspace: {WORKSPACE}")
print(f"Workspace storage bucket: {bucket}")

Billing project: human-pangenome-ucsc
Workspace: HPRC_Reassembly
Workspace storage bucket: gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/


# Read In Data Tables

## Read In Data Table w/ Final Assemblies

In [5]:
# Load the `assembly_sample` table (final assemblies) from the AnVIL_HPRC
# workspace into a DataFrame.
genbank_assemblies_df = tp.table_to_dataframe(
    "assembly_sample",
    workspace="AnVIL_HPRC",
    workspace_namespace="anvil-datastorage",
)

# Preview the first few rows
genbank_assemblies_df.head()

Unnamed: 0_level_0,mat_fasta,pat_fasta
assembly_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1
HG002,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...
HG00438,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...
HG005,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...
HG00621,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...
HG00673,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...


## Read In Sample Data Table

In [6]:
# Load the `sample` table from the AnVIL_HPRC workspace into a DataFrame.
sample_df = tp.table_to_dataframe(
    "sample",
    workspace="AnVIL_HPRC",
    workspace_namespace="anvil-datastorage",
)

# Preview the first few rows
sample_df.head()

Unnamed: 0_level_0,mat_ilmn,cohort,hifi,paternal_id,hic,maternal_id,child_ilmn,bionano_cmap,bionano_bnx,nanopore,pat_ilmn,strandseq,notes
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
HG002,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,HPRC_PLUS,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,HG003,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,HG004,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,
HG00438,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,HPRC,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,HG00436,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,HG00437,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,,
HG00480,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,HPRC,,HG00478,,HG00479,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,,,,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,,Abnormal Karyotype
HG005,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,HPRC_PLUS,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,HG006,,HG007,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,,,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,,
HG00621,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,HPRC,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,HG00619,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,HG00620,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,,


In [7]:
## Keep only the Illumina read columns (maternal, paternal, child)
ilmn_columns = ["mat_ilmn", "pat_ilmn", "child_ilmn"]
sample_df = sample_df[ilmn_columns]

## Read in 1000G Pedigree Table

In [8]:
# Load the `pedigree` table from the 1000G high-coverage workspace.
pedigree_df = tp.table_to_dataframe(
    "pedigree",
    workspace="1000G-high-coverage-2019",
    workspace_namespace="anvil-datastorage",
)

# Show the first rows to confirm the table looks as expected
pedigree_df.head()

Unnamed: 0_level_0,FamilyID,Population,MotherID,FatherID,Superpopulation,Sex
pedigree_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HG00096,HG00096,GBR,0,0,EUR,1
HG00097,HG00097,GBR,0,0,EUR,2
HG00099,HG00099,GBR,0,0,EUR,2
HG00100,HG00100,GBR,0,0,EUR,2
HG00101,HG00101,GBR,0,0,EUR,1


In [9]:
## Keep only the `Sex` column.
## `.copy()` makes this an independent frame, so adding columns to it later
## does not trigger pandas' SettingWithCopyWarning.
pedigree_df = pedigree_df[['Sex']].copy()

In [10]:
# Replace the pedigree `Sex` code with a boolean `is_male` column.
# A sample is flagged female only when its code is '2'; every other value
# (including a missing one) is treated as male, as before.
# The code is compared as text so an integer-typed `Sex` column still matches.
is_female = pedigree_df['Sex'].astype(str) == '2'

# `assign` builds a new frame instead of writing into a column slice, and
# `drop(columns=...)` avoids the positional `axis` argument that pandas 2.0
# no longer accepts.
pedigree_df = (
    pedigree_df
    .assign(is_male=~is_female)
    .drop(columns='Sex')
)

# Combine Data Frames
## Add Sex Info To Sample Table

In [11]:
# Start the launch table from an independent (deep) copy of the sample table
launch_df = sample_df.copy(deep=True)

In [12]:
# Left join on the index: every sample row is kept, and `is_male` is filled in
# only for samples that also appear in the pedigree table.
launch_df = launch_df.merge(
    pedigree_df,
    how='left',
    left_index=True,
    right_index=True,
)

## Add Sex + Ilmn Reads To Assemblies

In [15]:
# Right join on the index: the result has one row per assembly, carrying the
# read/sex columns when the sample is present in launch_df.
launch_df = launch_df.merge(
    genbank_assemblies_df,
    how='right',
    left_index=True,
    right_index=True,
)

In [19]:
# Name the index `sample_name`.
# The previous `rename(index={'1': 'sample_name'})` remapped the row label '1'
# (which does not exist), so it left the frame unchanged; `rename_axis` sets
# the name of the index itself.
launch_df = launch_df.rename_axis('sample_name')

## Fix Sex For Samples Not In 1000G Pedigree

In [23]:
## Manually write `is_male` for samples whose sex is not available from the
## 1000G pedigree table.
# NOTE(review): HG02559 was listed twice in the original assignments; it is
# kept once here. Confirm the second entry was not meant to be another sample.
manual_is_male = {
    "HG002":   True,
    "HG005":   True,
    "HG01123": False,
    "HG02109": False,
    "HG02486": True,
    "HG02559": False,
    "NA21309": False,
    "HG02055": True,
    "HG03098": True,
}

# A boolean mask (rather than a label lookup) is used so that a sample id
# missing from launch_df is skipped instead of being appended as a new row.
for sample_id, is_male in manual_is_male.items():
    launch_df.loc[launch_df.index == sample_id, 'is_male'] = is_male


launch_df

Unnamed: 0_level_0,mat_ilmn,pat_ilmn,child_ilmn,is_male,mat_fasta,pat_fasta
assembly_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HG002,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,True,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...
HG00438,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,False,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...
HG005,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,True,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...
HG00621,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,True,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...
HG00673,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,True,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...
HG00733,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,False,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...
HG00735,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,False,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...
HG00741,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,False,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...
HG01071,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,False,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...
HG01106,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,True,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...


In [None]:
# Upload QC Sample Data Table

In [24]:
# Name the index `genbank_qc_sample_id` before upload.
# The previous `rename(index={'1': ...})` remapped the row label '1' (which
# does not exist), so the index name was never changed; `rename_axis` sets
# the name of the index itself.
launch_df = launch_df.rename_axis('genbank_qc_sample_id')

In [25]:
# Upload launch_df to the workspace data table given by the first argument.
# NOTE(review): the table is named "genbank_qc_sample_id", whereas the tables
# read above ("sample", "assembly_sample", "pedigree") get a "<table>_id"
# index name. Presumably the intended table name is "genbank_qc_sample"; confirm.
tp.dataframe_to_table("genbank_qc_sample_id", launch_df)