# Notebook 1: Find Adapter Contamination<a class="tocSkip">

**Create bed files with adapter hits to mask our HPRC Year 1 reassembled samples.**
    
**In this notebook we will create bed files to mask adapter sequences using hits from Kerstin Howe's decontamination pipeline and minimap2 alignments of the PacBio SMRTbell adapter (dimer) to the assemblies.**


**The steps that we will take are:**
1. Import Statements & Global Variable Definitions
2. Extract VecScreen Results
3. Extract Adapter Hits From minimap2 PAFs
4. Combine Adapter Hits From VecScreen + minimap2
5. Look At How Many Adapter Hits We Are Finding
6. Create Masking Data Table

# Import Statements & Global Variable Definitions

## Load Python packages
----

In [1]:
%%capture 
import terra_notebook_utils as tnu
import terra_pandas as tp
import os
import io
import gzip
import pandas as pd
import numpy as np
from Bio import SeqIO

## Set Environment Variables

In [2]:
# Resolve the billing project, workspace name, and workspace bucket for this session.
# The billing project and bucket come from environment variables; the workspace name
# is the name of the parent of the current working directory.
PROJECT = os.environ['WORKSPACE_NAMESPACE']
notebook_parent_dir = os.path.dirname(os.getcwd())
WORKSPACE = os.path.basename(notebook_parent_dir)
# Trailing slash so object paths can be appended directly to the bucket URL
bucket = f"{os.environ['WORKSPACE_BUCKET']}/"

# Echo the values so they can be checked by eye
print(f"Billing project: {PROJECT}")
print(f"Workspace: {WORKSPACE}")
print(f"Workspace storage bucket: {bucket}")

Billing project: human-pangenome-ucsc
Workspace: HPRC_Reassembly
Workspace storage bucket: gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/


## Function Definitions

In [3]:
def write_vecscreen_bed(contam_fp: str, output_bed: str) -> bool:
    """Write a BED file of adapter hits found in a decontamination report.

    The report is scanned for the "EUKARYOTE ADAPTOR SCREEN" section, which
    runs from its header line to the next blank line. Within that section,
    tab-separated rows whose first field is ``VecScreen_Strong`` or
    ``VecScreen_Moderate`` are written as ``contig<TAB>start<TAB>stop``,
    with the start (third field) decremented by one and the stop (fourth
    field) copied unchanged.

    Args:
        contam_fp: Path to the decontamination report to read.
        output_bed: Path of the BED file to write. It is always created and
            is left empty when no qualifying rows are found.

    Returns:
        True if at least one BED record was written, otherwise False.
    """
    section_header = "========== EUKARYOTE ADAPTOR SCREEN =========="
    section_end    = ""
    wanted_labels  = ('VecScreen_Strong', 'VecScreen_Moderate')

    in_section  = False
    any_written = False

    with open(contam_fp) as infile, open(output_bed, 'w') as outfile:
        for raw_line in infile:
            text = raw_line.strip()

            ## Section boundaries toggle whether rows are considered at all
            if text == section_header:
                in_section = True
                continue
            if text == section_end:
                in_section = False
                continue
            if not in_section:
                continue

            ## Only strong/moderate VecScreen rows become BED records
            fields = text.split("\t")
            if fields[0] not in wanted_labels:
                continue

            contig, bed_start, bed_stop = fields[1], int(fields[2]) - 1, fields[3]
            outfile.write(f"{contig}\t{bed_start}\t{bed_stop}\n")
            any_written = True

    return any_written

In [4]:
def gz_size(fname):
    """Return the uncompressed size, in bytes, of a gzipped file.

    The value is the stream offset reached by seeking to the end of the
    decompressed data, so a result of 0 means the archive holds no content.
    """
    with gzip.open(fname, 'rb') as gz_handle:
        end_offset = gz_handle.seek(0, io.SEEK_END)
    return end_offset

# Extract VecScreen Results

## Read In Data Table

**Read in data table that has assemblies that have been corrected for MT problems**

In [5]:
# Load the workspace data table "initial_qc_sample" into a DataFrame via terra_pandas.
# The preview below shows one row per initial_qc_sample_id, holding per-haplotype paths
# to the contamination reports, adapter PAFs, and hifiasm assemblies.
decont_results_df = tp.table_to_dataframe("initial_qc_sample")

# Preview the first rows to confirm the expected columns are present
decont_results_df.head()

Unnamed: 0_level_0,pat_contam_results,pat_adapter_paf,hifiasm_mat_fa,hifiasm_pat_fa,mat_adapter_paf,sample_name,mat_contam_results
initial_qc_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
HG002_downsampled,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/k...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/9...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/2...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/2...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/d...,HG002,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/k...
HG002_full_v0.14,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/k...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/9...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/f...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/f...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/d...,HG002,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/k...
HG00438,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/k...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/9...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/c...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/c...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/d...,HG00438,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/k...
HG005,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/k...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/9...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/c...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/c...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/d...,HG005,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/k...
HG00621,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/k...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/9...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/c...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/c...,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/d...,HG00621,gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/k...


In [6]:
! mkdir vecscreen_results
%cd vecscreen_results

mkdir: cannot create directory ‘vecscreen_results’: File exists
/home/jupyter-user/notebooks/HPRC_Reassembly/edit/vecscreen_results


## Read In Decontamination Results; Extract VecScreen Hits & Write As BED

In [7]:
for index, row in decont_results_df.iterrows():
    
    sample_id = row.name
    
    ## Get path to decontamination results
    mat_decont_results_fp = row['mat_contam_results']
    pat_decont_results_fp = row['pat_contam_results']
    
    ## Extract file names
    mat_decont_results_fn = os.path.basename(mat_decont_results_fp)
    pat_decont_results_fn = os.path.basename(pat_decont_results_fp)
    
    ## Copy files to VM
    ! gsutil cp {mat_decont_results_fp} .
    ! gsutil cp {pat_decont_results_fp} .
    
    ## output bed file names
    mat_decont_bed_fn = sample_id + ".mat_decontam_results.bed"
    pat_decont_bed_fn = sample_id + ".pat_decontam_results.bed"
    
    ## Extract results and write bed file
    write_vecscreen_bed(mat_decont_results_fn, mat_decont_bed_fn)
    write_vecscreen_bed(pat_decont_results_fn, pat_decont_bed_fn)

Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/kerstin_decontam_results/HG002_downsampled.mat.contamination.short...
/ [1 files][  864.0 B/  864.0 B]                                                
Operation completed over 1 objects/864.0 B.                                      
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/kerstin_decontam_results/HG002_downsampled.pat.contamination.short...
/ [1 files][499.0 KiB/499.0 KiB]                                                
Operation completed over 1 objects/499.0 KiB.                                    
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/kerstin_decontam_results/HG002-full-0.14.mat.contamination.short...
/ [1 files][364.3 KiB/364.3 KiB]                                                
Operation completed over 1 objects/364.3 KiB.                                    
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/kerstin_decontam_results/HG002-full-0.14.pat.contamination.short...
/ [1 files][  3.9 MiB/ 

/ [1 files][  223.0 B/  223.0 B]                                                
Operation completed over 1 objects/223.0 B.                                      
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/kerstin_decontam_results/HG01258.mat.contamination.short...
/ [1 files][  1.0 KiB/  1.0 KiB]                                                
Operation completed over 1 objects/1.0 KiB.                                      
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/kerstin_decontam_results/HG01258.pat.contamination.short...
/ [1 files][  222.0 B/  222.0 B]                                                
Operation completed over 1 objects/222.0 B.                                      
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/kerstin_decontam_results/HG01358.mat.contamination.short...
/ [1 files][  1.3 KiB/  1.3 KiB]                                                
Operation completed over 1 objects/1.3 KiB.                                      
Copying g

/ [1 files][  224.0 B/  224.0 B]                                                
Operation completed over 1 objects/224.0 B.                                      
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/kerstin_decontam_results/HG02572.mat.contamination.short...
/ [1 files][  903.0 B/  903.0 B]                                                
Operation completed over 1 objects/903.0 B.                                      
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/kerstin_decontam_results/HG02572.pat.contamination.short...
/ [1 files][  222.0 B/  222.0 B]                                                
Operation completed over 1 objects/222.0 B.                                      
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/kerstin_decontam_results/HG02622.mat.contamination.short...
/ [1 files][  127.0 B/  127.0 B]                                                
Operation completed over 1 objects/127.0 B.                                      
Copying g

/ [1 files][  128.0 B/  128.0 B]                                                
Operation completed over 1 objects/128.0 B.                                      
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/kerstin_decontam_results/NA19240.mat.contamination.short...
/ [1 files][  1.6 KiB/  1.6 KiB]                                                
Operation completed over 1 objects/1.6 KiB.                                      
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/kerstin_decontam_results/NA19240.pat.contamination.short...
/ [1 files][  280.0 B/  280.0 B]                                                
Operation completed over 1 objects/280.0 B.                                      
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/kerstin_decontam_results/NA20129.mat.contamination.short...
/ [1 files][  615.0 B/  615.0 B]                                                
Operation completed over 1 objects/615.0 B.                                      
Copying g

In [8]:
## Check how many files we have: expect 4 * 48 = 192
## (per sample the loop above leaves two downloaded contamination reports
## and the two BED files written from them in this directory)
! ls | wc -l

192


# Extract Adapter Hits From minimap2 PAFs

In [9]:
%cd ..
! mkdir minimap2_results
%cd minimap2_results

/home/jupyter-user/notebooks/HPRC_Reassembly/edit
mkdir: cannot create directory ‘minimap2_results’: File exists
/home/jupyter-user/notebooks/HPRC_Reassembly/edit/minimap2_results


## Install BedTools

In [10]:
%%capture
# Download the bedtools v2.29.2 static binary release asset into the current directory.
# %%capture (which must stay on the first line of the cell) hides wget's progress output.
! wget https://github.com/arq5x/bedtools2/releases/download/v2.29.2/bedtools.static.binary

In [11]:
# Rename the downloaded binary to "bedtools" and make it executable for all users,
# so later cells can call it as ./bedtools (or ../minimap2_results/bedtools).
! mv bedtools.static.binary bedtools
! chmod a+x bedtools

## Extract PAFs & Write BED Files For Maternal Haplotypes

In [12]:
for index, row in decont_results_df.iterrows():
    
    sample_id = row.name
    
    ## Copy down paf file
    mat_paf_gz_fp = row['mat_adapter_paf']
    ! gsutil cp {mat_paf_gz_fp} ./

    ## get paf file name and set bed file name
    mat_paf_fn          = os.path.basename(mat_paf_gz_fp)
    mat_minimap2_bed_fn = sample_id + ".mat.minimap2_results.bed"
    
    ## If file is empty write blank bed file
    if gz_size(mat_paf_fn) == 0:
        ! touch {mat_minimap2_bed_fn}
       
    else:
        ## create bed file from paf hits. Only pull hits with 0, 1, or 2 mismatches
        ## hits must be over 42nt long. Merge hits.
        ! zcat {mat_paf_fn} | grep -P "NM:i:0\t|NM:i:1\t|NM:i:2\t" | awk '$4-$3>=42' | cut -f6,8,9 | sort -k1,1 -k2,2n | ./bedtools merge > {mat_minimap2_bed_fn}
        
        ## Since wea are converting from PAF to BED stop needs to add 1 nt; start is same
        ! awk -v OFS='\t' '$3+=1' {mat_minimap2_bed_fn} > tmp && mv tmp {mat_minimap2_bed_fn}

Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/deb35c1a-b583-44aa-9db2-24c66c47a0a7/minimap2/3941d23e-3dcc-4816-99ba-06dcae427199/call-alignAndGzip/HG002_downsampled.paf.gz...
/ [1 files][   20.0 B/   20.0 B]                                                
Operation completed over 1 objects/20.0 B.                                       
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/deb35c1a-b583-44aa-9db2-24c66c47a0a7/minimap2/ff9d6990-2fff-4a8d-9fbe-ce8485f04b87/call-alignAndGzip/HG002_full_v0.14.paf.gz...
/ [1 files][  364.0 B/  364.0 B]                                                
Operation completed over 1 objects/364.0 B.                                      
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/deb35c1a-b583-44aa-9db2-24c66c47a0a7/minimap2/23b2e0ab-b2d5-4a1a-af57-a6159f3c68b7/call-alignAndGzip/HG00438.paf.gz...
/ [1 files][   20.0 B/   20.0 B]                                                
Operation completed over 1 objects/20.0 B.                

/ [1 files][   20.0 B/   20.0 B]                                                
Operation completed over 1 objects/20.0 B.                                       
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/deb35c1a-b583-44aa-9db2-24c66c47a0a7/minimap2/55f260d2-67a3-41bf-8564-ee0d66e0505f/call-alignAndGzip/HG02145.paf.gz...
/ [1 files][   20.0 B/   20.0 B]                                                
Operation completed over 1 objects/20.0 B.                                       
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/deb35c1a-b583-44aa-9db2-24c66c47a0a7/minimap2/9cb317e8-23bf-4128-97bb-49c47c2068ea/call-alignAndGzip/HG02148.paf.gz...
/ [1 files][   20.0 B/   20.0 B]                                                
Operation completed over 1 objects/20.0 B.                                       
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/deb35c1a-b583-44aa-9db2-24c66c47a0a7/minimap2/3dc5cd36-ca1b-4845-9e59-b4a70ef16d28/call-alignAndGzip/HG02257.paf.g

## Extract PAFs & Write BED Files For Paternal Haplotypes

In [13]:
for index, row in decont_results_df.iterrows():
    
    sample_id = row.name
    
    ## Copy down paf file
    pat_paf_gz_fp = row['pat_adapter_paf']
    ! gsutil cp {pat_paf_gz_fp} ./

    ## get paf file name and set bed file name
    pat_paf_fn          = os.path.basename(pat_paf_gz_fp)
    pat_minimap2_bed_fn = sample_id + ".pat.minimap2_results.bed"
    
    ## If file is empty write blank bed file
    if gz_size(pat_paf_fn) == 0:
        ! touch {pat_minimap2_bed_fn}
       
    else:   
        ## create bed file from paf hits. Only pull hits with 0, 1, or 2 mismatches
        ## hits must be over 42nt long. Merge hits.
        ! zcat {pat_paf_fn} | grep -P "NM:i:0\t|NM:i:1\t|NM:i:2\t" | awk '$4-$3>=42' | cut -f6,8,9 | sort -k1,1 -k2,2n | ./bedtools merge > {pat_minimap2_bed_fn}
        
        ## Converting from PAF to BED (stop needs to add 1nt; start is same)
        ! awk -v OFS='\t' '$3+=1' {pat_minimap2_bed_fn} > tmp && mv tmp {pat_minimap2_bed_fn}

Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/9435229a-fdae-4132-82b1-7588e04550e9/minimap2/ad00620a-6680-4a86-bb4b-c11efdbbf738/call-alignAndGzip/HG002_downsampled.paf.gz...
/ [1 files][   20.0 B/   20.0 B]                                                
Operation completed over 1 objects/20.0 B.                                       
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/9435229a-fdae-4132-82b1-7588e04550e9/minimap2/d2685f92-1f2e-49b4-bef4-e25768089b49/call-alignAndGzip/HG002_full_v0.14.paf.gz...
/ [1 files][  194.0 B/  194.0 B]                                                
Operation completed over 1 objects/194.0 B.                                      
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/9435229a-fdae-4132-82b1-7588e04550e9/minimap2/53efbea7-4fd1-4d9d-a838-cc9778d96db4/call-alignAndGzip/HG00438.paf.gz...
/ [1 files][   20.0 B/   20.0 B]                                                
Operation completed over 1 objects/20.0 B.                

/ [1 files][   20.0 B/   20.0 B]                                                
Operation completed over 1 objects/20.0 B.                                       
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/9435229a-fdae-4132-82b1-7588e04550e9/minimap2/9e3e50d3-8e7f-48f3-ae68-72b424b4726b/call-alignAndGzip/HG02145.paf.gz...
/ [1 files][   20.0 B/   20.0 B]                                                
Operation completed over 1 objects/20.0 B.                                       
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/9435229a-fdae-4132-82b1-7588e04550e9/minimap2/9e5b9025-460c-4067-af4e-e9591d8819d3/call-alignAndGzip/HG02148.paf.gz...
/ [1 files][   20.0 B/   20.0 B]                                                
Operation completed over 1 objects/20.0 B.                                       
Copying gs://fc-0c2122a8-6725-4199-b90e-828ab006078f/9435229a-fdae-4132-82b1-7588e04550e9/minimap2/121e41b6-5e68-4899-9133-52c3e295416f/call-alignAndGzip/HG02257.paf.g

# Combine Adapter Hits From VecScreen + minimap2

In [14]:
%cd ..
! mkdir combined_beds
%cd combined_beds

/home/jupyter-user/notebooks/HPRC_Reassembly/edit
mkdir: cannot create directory ‘combined_beds’: File exists
/home/jupyter-user/notebooks/HPRC_Reassembly/edit/combined_beds


In [15]:
for index, row in decont_results_df.iterrows():
    
    sample_id = row.name

    
    ## Get file names for the files we created above
    mat_decont_bed_fn   = "../vecscreen_results/" + sample_id + ".mat_decontam_results.bed"
    pat_decont_bed_fn   = "../vecscreen_results/" + sample_id + ".pat_decontam_results.bed"

    pat_minimap2_bed_fn = "../minimap2_results/" + sample_id + ".pat.minimap2_results.bed"
    mat_minimap2_bed_fn = "../minimap2_results/" + sample_id + ".mat.minimap2_results.bed"
    
    
    ## Create file names for combined bed files
    mat_tmp_bed_fn   = sample_id + ".mat_comb_unmerged.bed"
    pat_tmp_bed_fn   = sample_id + ".pat_comb_unmerged.bed"   
    
    mat_comb_bed_fn   = sample_id + ".mat_adapter.bed"
    pat_comb_bed_fn   = sample_id + ".pat_adapter.bed"
    
    
    ## combine bed files
    ! cat {mat_decont_bed_fn} {mat_minimap2_bed_fn} | sort -k1,1 -k2,2n > {mat_tmp_bed_fn}
    ! cat {pat_decont_bed_fn} {pat_minimap2_bed_fn} | sort -k1,1 -k2,2n > {pat_tmp_bed_fn}
    
    ## Merge 
    ! ../minimap2_results/bedtools merge -i {mat_tmp_bed_fn} -d 5 > {mat_comb_bed_fn}
    ! ../minimap2_results/bedtools merge -i {pat_tmp_bed_fn} -d 5 > {pat_comb_bed_fn}    
    
    ## upload to bucket
    ! gsutil cp {mat_comb_bed_fn} {bucket}adapter_work/sample_beds/{mat_comb_bed_fn}
    ! gsutil cp {pat_comb_bed_fn} {bucket}adapter_work/sample_beds/{pat_comb_bed_fn}    

Copying file://HG002_downsampled.mat_adapter.bed [Content-Type=application/octet-stream]...
/ [1 files][   28.0 B/   28.0 B]                                                
Operation completed over 1 objects/28.0 B.                                       
Copying file://HG002_downsampled.pat_adapter.bed [Content-Type=application/octet-stream]...
/ [1 files][   30.0 B/   30.0 B]                                                
Operation completed over 1 objects/30.0 B.                                       
Copying file://HG002_full_v0.14.mat_adapter.bed [Content-Type=application/octet-stream]...
/ [1 files][  129.0 B/  129.0 B]                                                
Operation completed over 1 objects/129.0 B.                                      
Copying file://HG002_full_v0.14.pat_adapter.bed [Content-Type=application/octet-stream]...
/ [1 files][   95.0 B/   95.0 B]                                                
Operation completed over 1 objects/95.0 B.                      

/ [1 files][   30.0 B/   30.0 B]                                                
Operation completed over 1 objects/30.0 B.                                       
Copying file://HG01361.mat_adapter.bed [Content-Type=application/octet-stream]...
/ [1 files][    0.0 B/    0.0 B]                                                
Operation completed over 1 objects.                                              
Copying file://HG01361.pat_adapter.bed [Content-Type=application/octet-stream]...
/ [1 files][   30.0 B/   30.0 B]                                                
Operation completed over 1 objects/30.0 B.                                       
Copying file://HG01891.mat_adapter.bed [Content-Type=application/octet-stream]...
/ [1 files][   30.0 B/   30.0 B]                                                
Operation completed over 1 objects/30.0 B.                                       
Copying file://HG01891.pat_adapter.bed [Content-Type=application/octet-stream]...
/ [1 files][   30.0 

Copying file://HG02717.pat_adapter.bed [Content-Type=application/octet-stream]...
/ [1 files][   28.0 B/   28.0 B]                                                
Operation completed over 1 objects/28.0 B.                                       
Copying file://HG02723.mat_adapter.bed [Content-Type=application/octet-stream]...
/ [1 files][   56.0 B/   56.0 B]                                                
Operation completed over 1 objects/56.0 B.                                       
Copying file://HG02723.pat_adapter.bed [Content-Type=application/octet-stream]...
/ [1 files][   58.0 B/   58.0 B]                                                
Operation completed over 1 objects/58.0 B.                                       
Copying file://HG02818.mat_adapter.bed [Content-Type=application/octet-stream]...
/ [1 files][   28.0 B/   28.0 B]                                                
Operation completed over 1 objects/28.0 B.                                       
Copying file://HG028

**Add columns for the adapter beds**

In [16]:
# Bucket prefix under which the combined per-sample adapter BEDs were uploaded
sample_bed_prefix = f"{bucket}adapter_work/sample_beds/"

# Record each sample's maternal / paternal BED location, built from the sample id in the index
decont_results_df['mat_adapter_bed'] = sample_bed_prefix + decont_results_df.index + ".mat_adapter.bed"
decont_results_df['pat_adapter_bed'] = sample_bed_prefix + decont_results_df.index + ".pat_adapter.bed"

# Look At How Many Adapter Hits We Are Finding

In [17]:
## Set bed column names
## (the BED files written above have no header row; these names are passed to
## pd.read_csv via names= when the files are read back in)
bed_col_names = ["contig", "start", "stop"]

In [18]:
## File-path columns that are not needed in the hit-count summary
path_columns = ["pat_contam_results", "pat_adapter_paf", "hifiasm_mat_fa", "mat_adapter_bed",
                "pat_adapter_bed", "hifiasm_pat_fa", "mat_adapter_paf", "mat_contam_results"]

## Counters to fill in per sample: hits, contigs with hits, and bp of adapter sequence,
## for each haplotype (mat/pat) and each hit source (decon/minimap)
new_column_ls = ['mat_decon_num_adapter_hit', 'mat_decon_num_contigs_w_ad', 'mat_decon_bp_adapter_seq',
                 'pat_decon_num_adapter_hit', 'pat_decon_num_contigs_w_ad', 'pat_decon_bp_adapter_seq',
                 'mat_minimap_num_adapter_hit', 'mat_minimap_num_contigs_w_ad', 'mat_minimap_bp_adapter_seq',
                 'pat_minimap_num_adapter_hit', 'pat_minimap_num_contigs_w_ad', 'pat_minimap_bp_adapter_seq']

## Start from the results table minus the path columns, with every counter set to 0
track_results_df = (
    decont_results_df
    .drop(columns=path_columns)
    .assign(**dict.fromkeys(new_column_ls, 0))
)

In [19]:
def _summarize_bed(bed_fp):
    """Return (interval count, distinct contig count, total bp) for a header-less 3-column BED."""
    hits_bed_df = pd.read_csv(bed_fp, sep='\t', names=bed_col_names)

    num_hits    = len(hits_bed_df.index)
    num_contigs = hits_bed_df['contig'].nunique()
    bp_covered  = sum(hits_bed_df['stop'] - hits_bed_df['start'])

    return num_hits, num_contigs, bp_covered


## One entry per (haplotype, hit source): the column prefix in track_results_df,
## the directory holding the BEDs, and the file-name suffix that follows the sample id.
## This replaces four copy-pasted stanzas that differed only in these three values.
bed_sources = [
    ("mat_decon",   "../vecscreen_results/", ".mat_decontam_results.bed"),
    ("pat_decon",   "../vecscreen_results/", ".pat_decontam_results.bed"),
    ("mat_minimap", "../minimap2_results/",  ".mat.minimap2_results.bed"),
    ("pat_minimap", "../minimap2_results/",  ".pat.minimap2_results.bed"),
]

for sample_id in track_results_df.index:

    for col_prefix, bed_dir, bed_suffix in bed_sources:

        ## Read the BED written earlier for this sample / haplotype / source
        num_hits, num_contigs, bp_covered = _summarize_bed(bed_dir + sample_id + bed_suffix)

        ## Record the three summary values in the matching columns
        track_results_df.loc[sample_id, col_prefix + '_num_adapter_hit']  = num_hits
        track_results_df.loc[sample_id, col_prefix + '_num_contigs_w_ad'] = num_contigs
        track_results_df.loc[sample_id, col_prefix + '_bp_adapter_seq']   = bp_covered

In [20]:
## Display the per-sample adapter-hit summary filled in by the loop above
track_results_df

Unnamed: 0_level_0,sample_name,mat_decon_num_adapter_hit,mat_decon_num_contigs_w_ad,mat_decon_bp_adapter_seq,pat_decon_num_adapter_hit,pat_decon_num_contigs_w_ad,pat_decon_bp_adapter_seq,mat_minimap_num_adapter_hit,mat_minimap_num_contigs_w_ad,mat_minimap_bp_adapter_seq,pat_minimap_num_adapter_hit,pat_minimap_num_contigs_w_ad,pat_minimap_bp_adapter_seq
initial_qc_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
HG002_downsampled,HG002,1,1,26,1,1,26,0,0,0,0,0,0
HG002_full_v0.14,HG002,4,3,148,3,3,95,2,2,134,1,1,87
HG00438,HG00438,0,0,0,1,1,26,0,0,0,0,0,0
HG005,HG005,7,5,280,6,4,256,1,1,44,2,2,106
HG00621,HG00621,0,0,0,0,0,0,0,0,0,0,0,0
HG00673,HG00673,0,0,0,2,2,61,0,0,0,0,0,0
HG00733,HG00733,1,1,26,1,1,25,0,0,0,0,0,0
HG00735,HG00735,1,1,26,1,1,27,0,0,0,0,0,0
HG00741,HG00741,0,0,0,1,1,26,0,0,0,0,0,0
HG01071,HG01071,0,0,0,1,1,26,0,0,0,0,0,0


# Create Masking Data Table

In [21]:
## Build the table to upload from the results table (now including the adapter BED columns).
upload_df = decont_results_df.copy()

## Name the index after the target table. The previous rename(index={'1': ...}) relabelled
## a ROW called '1', which does not exist, so it changed nothing and the index kept the
## name of the source table (initial_qc_sample_id).
## NOTE(review): presumably the upload expects the index to be named "<table>_id" -- confirm
## against terra_pandas.
upload_df = upload_df.rename_axis(index='mask_adapter_sample_id')

In [22]:
# Upload upload_df as the "mask_adapter_sample" data table via terra_pandas
# (the exact write semantics are defined by tp.dataframe_to_table)
tp.dataframe_to_table("mask_adapter_sample", upload_df)