# Notebook 15: Upload Genbank Assemblies<a class="tocSkip">

**Upload the final reassembled FASTA files (from GenBank) as a submission**
    
    
**The steps that we will take are:**
1. Import Statements & Global Variable Definitions
2. Upload Final Assemblies
3. Calculate SHA256

# Import Statements & Global Variable Definitions

In [2]:
%%capture
# Install ssds from GitHub into the kernel's environment (its `deployment` module is imported below);
# %%capture keeps pip's output out of the notebook.
# NOTE(review): the install is unpinned, so a re-run may pick up a different ssds revision -- consider pinning a tag or commit.
%pip install --upgrade --no-cache-dir git+https://github.com/DataBiosphere/ssds

In [1]:
import terra_notebook_utils as tnu
import terra_pandas as tp
import pandas as pd
import os
from ssds import deployment

## Global Variable Definitions

In [2]:
## submission_id is a UUID that gives each submission a unique ID
submission_id       = "728E4476-8D84-4E8E-BA6D-AC9BF482ECCD"

## submission_name is a human-readable label that makes the submission recognizable
submission_name     = "YEAR_1_GENBANK_ASSEM"

## ssds staging deployment used for the copies below, billed to the given Google project
ds = deployment._GSStaging(google_billing_project="firecloud-cgl")

## Bucket the submission is expected to land in
## NOTE(review): presumably this has to be the bucket that _GSStaging writes to -- confirm against ssds
destintation_bucket = "gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf"

## Where the submission files will actually land (we may add one directory level later).
## This is a GCS URI, not a local path, so it is joined with "/" explicitly instead of
## os.path.join, whose separator depends on the operating system.
dest_full_path      = f"{destintation_bucket}/submissions/{submission_id}--{submission_name}"

## Set Environment Variables

In [3]:
# Resolve the Terra context: the billing project and bucket come from the
# environment, the workspace name from the parent of the working directory
PROJECT = os.environ['WORKSPACE_NAMESPACE']
WORKSPACE = os.path.split(os.path.dirname(os.getcwd()))[1]
bucket = os.environ['WORKSPACE_BUCKET'] + "/"

# Echo what was captured so a wrong environment is obvious right away
print("\n".join([
    "Billing project: " + PROJECT,
    "Workspace: " + WORKSPACE,
    "Workspace storage bucket: " + bucket,
]))

Billing project: human-pangenome-ucsc
Workspace: HPRC_Reassembly
Workspace storage bucket: gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/


# Upload Final Assemblies

## Read In Data Table w/ Final Assemblies

In [4]:
# Load the "raw_genbank_sample" data table into a DataFrame. Per the preview below it is
# indexed by raw_genbank_sample_id and holds gs:// paths to each sample's paternal and
# maternal assemblies.
final_assemblies_df = tp.table_to_dataframe("raw_genbank_sample")

# Preview the first rows as a sanity check
final_assemblies_df.head()

Unnamed: 0_level_0,pat_renamed_unmasked_assembly,original_fasta_gcp_loc_pat,mat_renamed_unmasked_assembly,original_fasta_gcp_loc_mat
raw_genbank_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HG002,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/5...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/g...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/g...
HG00438,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/5...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/g...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/g...
HG005,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/5...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/g...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/g...
HG00621,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/5...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/g...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/g...
HG00673,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/5...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/g...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/g...


## Upload Directly From GCP w/ SSDS

In [7]:
## Stage both haplotype assemblies of every sample into the submission via SSDS
for sample_name, row in final_assemblies_df.iterrows():

    # The row label is the sample id; read maternal first, then paternal
    source_fastas = [
        row[assembly_col]
        for assembly_col in ('mat_renamed_unmasked_assembly', 'pat_renamed_unmasked_assembly')
    ]

    # Each file keeps its own name, filed under the sample's assembly directory
    transfers = [
        (fasta_fp, f"{sample_name}/assemblies/year1_f1_assembly_v2_genbank/{os.path.basename(fasta_fp)}")
        for fasta_fp in source_fastas
    ]

    # Resolve every path first, then copy -- nothing is copied if a path is unusable
    for fasta_fp, subm_path in transfers:
        ds.copy(fasta_fp, submission_id, submission_name, subm_path)

2021-06-22 01:58:31::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b17ae072-b018-467e-a8d4-0147ca275489/finalizeGenbankAssembly/9d5504ed-dcf7-46ce-8ffb-3503a6daf9a4/call-renameAndUnmask/HG002.maternal.f1_assembly_v2_genbank.fa.gz to gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/submissions/728E4476-8D84-4E8E-BA6D-AC9BF482ECCD--YEAR_1_GENBANK_ASSEM/HG002/assemblies/year1_f1_assembly_v2_genbank/HG002.maternal.f1_assembly_v2_genbank.fa.gz
2021-06-22 01:58:42::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/52e70f55-5ac3-4996-a6fd-a304a1ddb24b/finalizeGenbankAssembly/0626eef0-226d-460b-bd01-712864eac438/call-renameAndUnmask/HG002.paternal.f1_assembly_v2_genbank.fa.gz to gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/submissions/728E4476-8D84-4E8E-BA6D-AC9BF482ECCD--YEAR_1_GENBANK_ASSEM/HG002/assemblies/year1_f1_assembly_v2_genbank/HG002.paternal.f1_assembly_v2_genbank.fa.gz
2021-06-22 01:58:56::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b17ae072-b018-467e-

2021-06-22 02:02:05::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b17ae072-b018-467e-a8d4-0147ca275489/finalizeGenbankAssembly/0f4b19d8-2be3-4118-920a-0a3f0599761e/call-renameAndUnmask/HG01106.maternal.f1_assembly_v2_genbank.fa.gz to gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/submissions/728E4476-8D84-4E8E-BA6D-AC9BF482ECCD--YEAR_1_GENBANK_ASSEM/HG01106/assemblies/year1_f1_assembly_v2_genbank/HG01106.maternal.f1_assembly_v2_genbank.fa.gz
2021-06-22 02:02:16::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/52e70f55-5ac3-4996-a6fd-a304a1ddb24b/finalizeGenbankAssembly/b8414cfc-0f69-4886-9c3a-6b41712a4879/call-renameAndUnmask/HG01106.paternal.f1_assembly_v2_genbank.fa.gz to gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/submissions/728E4476-8D84-4E8E-BA6D-AC9BF482ECCD--YEAR_1_GENBANK_ASSEM/HG01106/assemblies/year1_f1_assembly_v2_genbank/HG01106.paternal.f1_assembly_v2_genbank.fa.gz
2021-06-22 02:02:27::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b17ae07

2021-06-22 02:05:26::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b17ae072-b018-467e-a8d4-0147ca275489/finalizeGenbankAssembly/f94ebe91-e182-4035-8339-72ae28c98dc9/call-renameAndUnmask/HG01928.maternal.f1_assembly_v2_genbank.fa.gz to gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/submissions/728E4476-8D84-4E8E-BA6D-AC9BF482ECCD--YEAR_1_GENBANK_ASSEM/HG01928/assemblies/year1_f1_assembly_v2_genbank/HG01928.maternal.f1_assembly_v2_genbank.fa.gz
2021-06-22 02:05:37::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/52e70f55-5ac3-4996-a6fd-a304a1ddb24b/finalizeGenbankAssembly/3604be23-1784-4732-9342-1de1676e1bb8/call-renameAndUnmask/HG01928.paternal.f1_assembly_v2_genbank.fa.gz to gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/submissions/728E4476-8D84-4E8E-BA6D-AC9BF482ECCD--YEAR_1_GENBANK_ASSEM/HG01928/assemblies/year1_f1_assembly_v2_genbank/HG01928.paternal.f1_assembly_v2_genbank.fa.gz
2021-06-22 02:05:49::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b17ae07

2021-06-22 02:08:52::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b17ae072-b018-467e-a8d4-0147ca275489/finalizeGenbankAssembly/c2b97378-71e4-4dbd-bc10-088239baa7d1/call-renameAndUnmask/HG02486.maternal.f1_assembly_v2_genbank.fa.gz to gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/submissions/728E4476-8D84-4E8E-BA6D-AC9BF482ECCD--YEAR_1_GENBANK_ASSEM/HG02486/assemblies/year1_f1_assembly_v2_genbank/HG02486.maternal.f1_assembly_v2_genbank.fa.gz
2021-06-22 02:09:03::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/52e70f55-5ac3-4996-a6fd-a304a1ddb24b/finalizeGenbankAssembly/bbbca26e-712e-44c8-9374-7830f7b8ddfd/call-renameAndUnmask/HG02486.paternal.f1_assembly_v2_genbank.fa.gz to gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/submissions/728E4476-8D84-4E8E-BA6D-AC9BF482ECCD--YEAR_1_GENBANK_ASSEM/HG02486/assemblies/year1_f1_assembly_v2_genbank/HG02486.paternal.f1_assembly_v2_genbank.fa.gz
2021-06-22 02:09:15::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b17ae07

2021-06-22 02:12:18::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b17ae072-b018-467e-a8d4-0147ca275489/finalizeGenbankAssembly/b4348379-6f29-4976-8e11-7b5388915542/call-renameAndUnmask/HG03098.maternal.f1_assembly_v2_genbank.fa.gz to gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/submissions/728E4476-8D84-4E8E-BA6D-AC9BF482ECCD--YEAR_1_GENBANK_ASSEM/HG03098/assemblies/year1_f1_assembly_v2_genbank/HG03098.maternal.f1_assembly_v2_genbank.fa.gz
2021-06-22 02:12:28::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/52e70f55-5ac3-4996-a6fd-a304a1ddb24b/finalizeGenbankAssembly/ea4748aa-0c3d-42c7-998f-0ac43f18569f/call-renameAndUnmask/HG03098.paternal.f1_assembly_v2_genbank.fa.gz to gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/submissions/728E4476-8D84-4E8E-BA6D-AC9BF482ECCD--YEAR_1_GENBANK_ASSEM/HG03098/assemblies/year1_f1_assembly_v2_genbank/HG03098.paternal.f1_assembly_v2_genbank.fa.gz
2021-06-22 02:12:39::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b17ae07

2021-06-22 02:15:25::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b17ae072-b018-467e-a8d4-0147ca275489/finalizeGenbankAssembly/a0671a38-71be-43a5-8a54-47fb56b81439/call-renameAndUnmask/NA20129.maternal.f1_assembly_v2_genbank.fa.gz to gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/submissions/728E4476-8D84-4E8E-BA6D-AC9BF482ECCD--YEAR_1_GENBANK_ASSEM/NA20129/assemblies/year1_f1_assembly_v2_genbank/NA20129.maternal.f1_assembly_v2_genbank.fa.gz
2021-06-22 02:15:35::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/52e70f55-5ac3-4996-a6fd-a304a1ddb24b/finalizeGenbankAssembly/7c59d8e7-fe04-400d-bce7-6d332dfbff33/call-renameAndUnmask/NA20129.paternal.f1_assembly_v2_genbank.fa.gz to gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/submissions/728E4476-8D84-4E8E-BA6D-AC9BF482ECCD--YEAR_1_GENBANK_ASSEM/NA20129/assemblies/year1_f1_assembly_v2_genbank/NA20129.paternal.f1_assembly_v2_genbank.fa.gz
2021-06-22 02:15:46::INFO  Copied gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b17ae07

# Calculate SHA256

In [5]:
! mkdir sha256
%cd sha256

mkdir: cannot create directory ‘sha256’: File exists
/home/jupyter-user/notebooks/HPRC_Reassembly/edit/sha256


In [None]:
## Pull assemblies to local VM, then calculate SHA256
for index, row in final_assemblies_df.iterrows():
    
    sample_id   = row.name
    sample_name = sample_id
    

    mat_fa_fp = row['mat_renamed_unmasked_assembly']
    pat_fa_fp = row['pat_renamed_unmasked_assembly']

    mat_fa_fn = os.path.basename(mat_fa_fp)
    pat_fa_fn = os.path.basename(pat_fa_fp)
        
    ! gsutil cp {mat_fa_fp} .
    ! gsutil cp {pat_fa_fp} .
    
    !sha256sum {mat_fa_fn} >> sha_sums.txt
    !sha256sum {pat_fa_fn} >> sha_sums.txt

Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b17ae072-b018-467e-a8d4-0147ca275489/finalizeGenbankAssembly/9d5504ed-dcf7-46ce-8ffb-3503a6daf9a4/call-renameAndUnmask/HG002.maternal.f1_assembly_v2_genbank.fa.gz...
- [1 files][838.0 MiB/838.0 MiB]   48.2 MiB/s                                   
Operation completed over 1 objects/838.0 MiB.                                    
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/52e70f55-5ac3-4996-a6fd-a304a1ddb24b/finalizeGenbankAssembly/0626eef0-226d-460b-bd01-712864eac438/call-renameAndUnmask/HG002.paternal.f1_assembly_v2_genbank.fa.gz...
/ [1 files][803.8 MiB/803.8 MiB]   47.4 MiB/s                                   
Operation completed over 1 objects/803.8 MiB.                                    
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b17ae072-b018-467e-a8d4-0147ca275489/finalizeGenbankAssembly/acb21e49-c22e-4157-9d12-e2606c9d54f3/call-renameAndUnmask/HG00438.maternal.f1_assembly_v2_genbank.fa.gz...
- [1 files][834

Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b17ae072-b018-467e-a8d4-0147ca275489/finalizeGenbankAssembly/c2b97378-71e4-4dbd-bc10-088239baa7d1/call-renameAndUnmask/HG02486.maternal.f1_assembly_v2_genbank.fa.gz...
\ [1 files][833.7 MiB/833.7 MiB]   45.5 MiB/s                                   
Operation completed over 1 objects/833.7 MiB.                                    
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/52e70f55-5ac3-4996-a6fd-a304a1ddb24b/finalizeGenbankAssembly/bbbca26e-712e-44c8-9374-7830f7b8ddfd/call-renameAndUnmask/HG02486.paternal.f1_assembly_v2_genbank.fa.gz...
- [1 files][801.2 MiB/801.2 MiB]   45.1 MiB/s                                   
Operation completed over 1 objects/801.2 MiB.                                    
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/b17ae072-b018-467e-a8d4-0147ca275489/finalizeGenbankAssembly/7cca2008-923a-4257-b398-c7e5abefced8/call-renameAndUnmask/HG02559.maternal.f1_assembly_v2_genbank.fa.gz...
| [1 files]

In [6]:
# Show the collected checksums (one "<sha256 digest>  <file name>" line per assembly)
! cat sha_sums.txt

9e0b7ae6b0727a8828cef61c7b216eabdb2f098fcd2940fd1ffb8cca05185831  HG002.maternal.f1_assembly_v2_genbank.fa.gz
ec46feb51b343f030aae3e7c4781463a36f059a031f4aacef864a9863d4fc986  HG002.paternal.f1_assembly_v2_genbank.fa.gz
ca307e58dfba48336dd3a05d0ea59b87e3383a5a8c839e5b7c6e7ab6178d58ea  HG00438.maternal.f1_assembly_v2_genbank.fa.gz
a5840157a6995c5fbc08698e52b614fbdf6f57a7245db944edb328a0b3e23ff8  HG00438.paternal.f1_assembly_v2_genbank.fa.gz
8a9b715d91685b7db161e878ab205fb7a149794ecf79b3d6602d9610382ad71a  HG005.maternal.f1_assembly_v2_genbank.fa.gz
399e1bb4bf40f6b70152acd6f4cc8502fd50d0edef774d9af7ea1add1982c5e6  HG005.paternal.f1_assembly_v2_genbank.fa.gz
1180dda57e968c6f9cf1902981038141f0905b395d44ea9a068131891d22e353  HG00621.maternal.f1_assembly_v2_genbank.fa.gz
bb7d30232f423cabf8d2ebf7443d15aa42bdbfe826424e89ac5f0ee33b0bc780  HG00621.paternal.f1_assembly_v2_genbank.fa.gz
0eef54ceca964b5a65c046529e5e433f0cb1a19d6d8a19485a292ae763e6a96c  HG00673.maternal.f1_assembly_v2_genban

In [18]:
# List the current working directory (the sha256 scratch directory entered above)
! ls