In this notebook, if given a vcf file (for all chromosomes) for a set of individuals, I want to be able to quickly extract the sequences for a region with all the variants for each individual applied to the reference sequence

In [1]:
import os, re, sys
import cyvcf2
from cyvcf2 import VCF
import numpy as np
import kipoiseq
from Bio.Seq import MutableSeq
import subprocess
import warnings

In [2]:
usage_codes = f'./enformer-usage-codes.py'

# import the enformer-usage_codes.py file
exec(open(usage_codes).read(), globals(), globals())

2022-10-25 15:25:18.995775: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /soft/perftools/darshan/darshan-3.3.0/lib:/opt/cray/pe/papi/6.0.0.1/lib64:/opt/cray/job/2.2.4-7.0.2.1_2.91__g36b56f4.ari/lib64:/opt/intel/compilers_and_libraries_2020.0.166/linux/compiler/lib/intel64:/opt/intel/compilers_and_libraries_2020.0.166/linux/compiler/lib/intel64_lin:/opt/intel/compilers_and_libraries_2020.0.166/linux/mpi/intel64/lib:/opt/intel/compilers_and_libraries_2020.0.166/linux/mpi/mic/lib:/opt/intel/compilers_and_libraries_2020.0.166/linux/ipp/lib/intel64:/opt/intel/compilers_and_libraries_2020.0.166/linux/compiler/lib/intel64:/opt/intel/compilers_and_libraries_2020.0.166/linux/mkl/lib/intel64:/opt/intel/compilers_and_libraries_2020.0.166/linux/tbb/lib/intel64/gcc4.4:/opt/intel/debugger_2020/libipt/intel64/lib:/opt/intel/comp

In [3]:
os.getcwd()

'/lus/theta-fs0/projects/covid-ct/imlab/users/temi/projects/TFXcan/scripts'

In [4]:
vcf_file = '../prj6_genotypes/merged_phased_SNPs.vcf.gz'
path_to_bcftools = "/home/temi/miniconda3/envs/compbio-tools/bin/bcftools"
path_to_vcftools = "/home/temi/miniconda3/envs/compbio-tools/bin/vcftools"
fasta_seq = '/lus-projects/covid-ct/imlab/data/hg_sequences/hg38/Homo_sapiens_assembly38.fasta'
path_to_tabix = '/home/temi/miniconda3/envs/compbio-tools/bin/tabix'


# this stores temporary files based on regions defined
temporary_vcfs = '/lus-projects/covid-ct/imlab/users/temi/projects/TFXcan/bcftools_region'

if not os.path.isdir(temporary_vcfs):
    os.mkdir(temporary_vcfs)

#path_to_bcftools = "/home/temi/miniconda3/envs/compbio-tools/bin/bcftools"

Assuming these parameters

In [21]:
SEQUENCE_LENGTH = 393216
samples = ['LuCaP_141', 'LuCaP_145', 'LuCaP_167']
reg_chr = 'chr1'
reg_start = 133118092
reg_end = 133511307

# arguments
region = [reg_chr, reg_start, reg_end]

vcf = VCF(vcf_file, samples=samples) # select for the individuals
#region = f'{reg_chr}:{reg_start}-{reg_end}'

region

['chr1', 133118092, 133511307]

In [102]:

#fasta_extractor = FastaStringExtractor(fasta_seq) # the function is defined in the usage codes

#interval = kipoiseq.Interval(reg_chr, reg_start, reg_end).resize(SEQUENCE_LENGTH) # create an interval object for the regions
#seq_extractor = kipoiseq.extractors.VariantSeqExtractor(reference_sequence=fasta_extractor)
#reference = seq_extractor.extract(interval, [], anchor=center)

In [22]:

interval = kipoiseq.Interval('chr1', 133118092, 133511307).resize(SEQUENCE_LENGTH)

In [23]:
interval.center() - interval.start, interval.center(), interval.width(), interval.start, interval.end

(196608, 133314700, 393216, 133118092, 133511308)

In [53]:
def create_region_file(vcf_file, region, output_dir, individual, software_paths=[path_to_bcftools, path_to_tabix]):
    '''
    Creates a subsetted vcf file per region
    
    
    '''

    path_to_bcftools = software_paths[0]
    path_to_tabix = software_paths[1]

    # Center the interval at the region
    interval = kipoiseq.Interval(region[0], region[1], region[2]).resize(SEQUENCE_LENGTH) # resizing will change the regions

    path = f'{output_dir}/{individual}_{interval.chr}_{interval.start}_{interval.end}_subset_genotypes.vcf.gz'

    region = f'{interval.chr}:{interval.start}-{interval.end}'

    view_cmd = f"{path_to_bcftools} filter {vcf_file} -r {region} --output-type z --output {path} && {path_to_tabix} -p vcf {path}"

    out = subprocess.run(view_cmd, shell=True)

    return {'subset_path':path, 'interval':interval}

def extract_individual_sequence(region_details, individuals, fasta_file_path, delete_region=False):

    '''
    Extracts a sequence from a reference for a region with the variants of an individual applied
    '''

    kseq_extractor = kipoiseq.extractors.SingleSeqVCFSeqExtractor(fasta_file=fasta_file_path, vcf_file=region_details['subset_path'])

    center = region_details['interval'].center() - region_details['interval'].start

    individuals_sequences = {}
    for ind in individuals:

        warnings.filterwarnings('error')
        
        try:
            individuals_sequences[ind] = kseq_extractor.extract(interval=region_details['interval'], anchor=center, sample_id=ind)
            seq_source = 'var'
        except Warning:
            warnings.simplefilter("always", category=UserWarning)
            
            print('No variants for this region. Using reference genome.\n')
            individuals_sequences[ind] = fasta_extractor.extract(interval=region_details['interval'], anchor=[])
            seq_source = 'ref'

    if delete_region == True:
        os.remove(region_details['subset_path'])
        os.remove(f"{region_details['subset_path']}.tbi")

    return {'sequence':individuals_sequences, 'sequence_source':seq_source}

In [57]:
type(None)

NoneType

In [59]:
j = None
isinstance(j, type(None))

True

In [55]:
a = create_region_file(vcf_file, region, output_dir=temporary_vcfs, individual=samples[0])
b = extract_individual_sequence(a, [samples[0]], fasta_file_path=fasta_seq)

No variants for this region. Using reference genome.



In [56]:
for reg in [['chr1', 133118092, 133511307], ['chr3', 30879959, 30879961]]:
    a = create_region_file(vcf_file, reg, output_dir=temporary_vcfs, individual=samples[0])
    b = extract_individual_sequence(a, [samples[0]], fasta_file_path=fasta_seq)

No variants for this region. Using reference genome.



In [108]:
vcf.close()

# Learning parsl

In [2]:
exec(open('./parsl-configuration.py').read(), globals(), globals())

1.3.0-dev


In [5]:
import random
from parsl.configs.local_threads import config
parsl.load(config)

<parsl.dataflow.dflow.DataFlowKernel at 0x7fe3b7e97f10>

In [6]:
# write a short code that generates random numbers between a and b and returns the sum

@python_app
def doSomething(a=0, b=500):
    random_numbers = random.sample(range(a, b), 200)
    return(sum(random_numbers))


In [7]:
doSomething(23, 45)

<AppFuture at 0x7fe3e0121720 state=finished raised ValueError>

In [10]:
res_list = []
for i in range(0, 11):
    attempt = doSomething()
    #res_list.append(attempt)
    if (attempt.done() == True) and (attempt.result() > 50000):
        print(f'Finished {i} | result {attempt.result()}\n')
    if (attempt.done() == False):
        print(f'Not Finished {i} | result {attempt.result()}\n')

Finished 0 | result 49102

Finished 1 | result 52076

Finished 2 | result 50790

Finished 3 | result 46982

Finished 4 | result 49596

Finished 5 | result 50456

Finished 6 | result 47748

Finished 7 | result 46151

Finished 8 | result 50701

Finished 9 | result 49732

Finished 10 | result 49939



In [44]:
attempt.result()

50140

In [32]:
[r.done() for r in res_list]

[True, True, True, True, True, True, True, True, True, True, True]

In [33]:
[r.result() for r in res_list]

[{'out': 47919},
 {'out': 49629},
 {'out': 49406},
 {'out': 49589},
 {'out': 51462},
 {'out': 50509},
 {'out': 48292},
 {'out': 49680},
 {'out': 49387},
 {'out': 47572},
 {'out': 48228}]

In [4]:
parsl.clear()

In [26]:
res_list[0].output

AttributeError: 'AppFuture' object has no attribute 'output'