# Notebook 8: Upload HG002 Assemblies For Kerstin<a class="tocSkip">

**Upload additional HG002 assembly versions (maternal and paternal FASTA files) to the submission staging bucket so that Kerstin can run them.**
    
    
**The steps that we will take are:**
1. Import Statements & Global Variable Definitions
2. Read In Data Tables
3. Upload Assemblies

# Import Statements & Global Variable Definitions

In [46]:
%%capture
# Install/upgrade the ssds client directly from its GitHub repository.
# %%capture (above) suppresses the pip output so it does not flood the notebook.
# NOTE(review): the install is not pinned to a tag or commit, so a re-run may pick up
# a different ssds version than the one used for the recorded upload -- consider pinning.
%pip install --upgrade --no-cache-dir git+https://github.com/DataBiosphere/ssds

In [1]:
import terra_notebook_utils as tnu
import terra_pandas as tp
import pandas as pd
import os
from ssds import deployment
from pprint import pprint
import gcsfs
import seaborn as sns
%matplotlib inline

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


## Global Variable Definitions

In [2]:
## submission_id is generated by uuid (gives unique ID to each submission)
submission_id       = "9464E2BC-3F4C-4226-9813-0E91F9AD612E"

## submission_name lends some recognizability to the submission
submission_name     = "HG002_UPDATED_ASSEMBLIES"

## SSDS client used for the copies below; requests are billed to "firecloud-cgl".
## NOTE(review): _GSStaging looks like the *staging* deployment (vs. a release one) -- confirm.
ds = deployment._GSStaging(google_billing_project="firecloud-cgl")

## Bucket that receives the submission (it must match the bucket behind `ds`)
destination_bucket  = "gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf"

## Original (misspelled) name kept as an alias so existing cells keep working
destintation_bucket = destination_bucket

## Where the submission files will actually land (we may add one directory level later)
dest_full_path      = os.path.join(destination_bucket, "submissions", f"{submission_id}--{submission_name}")

## Set Environment Variables

In [3]:
# Billing project comes straight from the notebook environment
PROJECT = os.environ['WORKSPACE_NAMESPACE']

# Workspace name is taken from the name of the working directory's parent folder
notebook_dir = os.getcwd()
WORKSPACE = os.path.basename(os.path.dirname(notebook_dir))

# Workspace bucket URL, with a trailing slash so paths can be appended directly
bucket = f"{os.environ['WORKSPACE_BUCKET']}/"

# Echo the captured values so a mis-set environment is obvious at a glance
print(f"Billing project: {PROJECT}")
print(f"Workspace: {WORKSPACE}")
print(f"Workspace storage bucket: {bucket}")

Billing project: human-pangenome-ucsc
Workspace: HPRC_Reassembly
Workspace storage bucket: gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/


# Read In Data Tables

## Read In Original QC Samples

In [5]:
# Load the "sample" data table into a DataFrame (rows are indexed by sample_id)
sample_df = tp.table_to_dataframe("sample")

# Preview the first rows to confirm the table was read as expected
sample_df.head()

Unnamed: 0_level_0,QCstats,mat_ilmn,hifi,paternal_id,matYak,maternal_id,patYak,nanopore,pat_ilmn,maternalFastaGz,...,adapterBlockListTarGz,countReadsTarGz,binFilesTarGz,hic,bionano_cmap,bionano_bnx,strandseq,notes,patAssemblyChm13WinnowmapBam,matAssemblyChm13WinnowmapBam
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HG002_chem_v2,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,HG003,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/y...,HG004,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/y...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/6...,...,,,,,,,,,,
HG002_downsampled,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,HG003,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/y...,HG004,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/y...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/2...,...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/2...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/2...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/2...,,,,,,,
HG002_full_v0.14,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/f...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,HG003,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/y...,HG004,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/y...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/f...,...,,,,[],[],[],[],,,
HG002_full_v0.14.1,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/e...,,,HG003,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/y...,HG004,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/y...,,,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/H...,...,,,,,,,,,,
HG00438,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,HG00436,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/y...,HG00437,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/y...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/c...,...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/c...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/c...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/c...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,[],,,


In [9]:
## Only these HG002 assembly versions are part of this submission
samples_to_keep = [
    "HG002_full_v0.14.1",
    "HG002_chem_v2",
]

## Label-based row selection: all columns are kept, rows follow the order of samples_to_keep
HG002_sample_df = sample_df.loc[samples_to_keep, :]

## Display the selection for a visual check
HG002_sample_df

Unnamed: 0_level_0,QCstats,mat_ilmn,hifi,paternal_id,matYak,maternal_id,patYak,nanopore,pat_ilmn,maternalFastaGz,...,adapterBlockListTarGz,countReadsTarGz,binFilesTarGz,hic,bionano_cmap,bionano_bnx,strandseq,notes,patAssemblyChm13WinnowmapBam,matAssemblyChm13WinnowmapBam
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HG002_full_v0.14.1,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/e...,,,HG003,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/y...,HG004,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/y...,,,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/H...,...,,,,,,,,,,
HG002_chem_v2,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,[gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/...,HG003,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/y...,HG004,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/y...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/6...,...,,,,,,,,,,


# Upload Assemblies
## Upload Directly From GCP w/ SSDS

In [12]:
## For every selected sample, copy the maternal and then the paternal assembly FASTA
for sample_id, sample_row in HG002_sample_df.iterrows():

    ## Resolve both source paths up front so a missing value fails before anything is copied
    fasta_paths = [sample_row[column] for column in ("maternalFastaGz", "paternalFastaGz")]

    ## Each file keeps its original name, nested under a per-sample directory
    uploads = [(fasta_fp, f"{sample_id}/{os.path.basename(fasta_fp)}") for fasta_fp in fasta_paths]

    for fasta_fp, subm_path in uploads:
        ds.copy(fasta_fp, submission_id, submission_name, subm_path)

2021-03-26 03:15:52::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/HG002_FULL_V0.14.1/HG002-full-0.14.1.mat.fa.gz to gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/submissions/9464E2BC-3F4C-4226-9813-0E91F9AD612E--HG002_UPDATED_ASSEMBLIES/HG002_full_v0.14.1/HG002-full-0.14.1.mat.fa.gz
2021-03-26 03:16:04::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/HG002_FULL_V0.14.1/HG002-full-0.14.1.pat.fa.gz to gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/submissions/9464E2BC-3F4C-4226-9813-0E91F9AD612E--HG002_UPDATED_ASSEMBLIES/HG002_full_v0.14.1/HG002-full-0.14.1.pat.fa.gz
2021-03-26 03:16:17::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/63b5a366-7aab-48e4-92b5-2c9b86553086/trioHifiasmAssembly/aceda867-2cb3-4ef8-a2d3-818c06f009fd/call-gfa2fasta/HG002_chem_v2.mat.fa.gz to gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/submissions/9464E2BC-3F4C-4226-9813-0E91F9AD612E--HG002_UPDATED_ASSEMBLIES/HG002_chem_v2/HG002_chem_v2.mat.fa.gz
2021-03-26 03:16:29::INFO  Copied g