In this notebook, if given a vcf file (for all chromosomes) for a set of individuals, I want to be able to quickly extract the sequences for a region with all the variants for each individual applied to the reference sequence

In [1]:
import os, re, sys
import numpy as np
import kipoiseq
import subprocess
import warnings

In [2]:
usage_codes = f'./enformer-usage-codes.py'

# import the enformer-usage_codes.py file
exec(open(usage_codes).read(), globals(), globals())

2022-11-10 22:43:24.699638: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /soft/perftools/darshan/darshan-3.3.0/lib:/opt/cray/pe/papi/6.0.0.1/lib64:/opt/cray/job/2.2.4-7.0.2.1_2.91__g36b56f4.ari/lib64:/opt/intel/compilers_and_libraries_2020.0.166/linux/compiler/lib/intel64:/opt/intel/compilers_and_libraries_2020.0.166/linux/compiler/lib/intel64_lin:/opt/intel/compilers_and_libraries_2020.0.166/linux/mpi/intel64/lib:/opt/intel/compilers_and_libraries_2020.0.166/linux/mpi/mic/lib:/opt/intel/compilers_and_libraries_2020.0.166/linux/ipp/lib/intel64:/opt/intel/compilers_and_libraries_2020.0.166/linux/compiler/lib/intel64:/opt/intel/compilers_and_libraries_2020.0.166/linux/mkl/lib/intel64:/opt/intel/compilers_and_libraries_2020.0.166/linux/tbb/lib/intel64/gcc4.4:/opt/intel/debugger_2020/libipt/intel64/lib:/opt/intel/comp

In [3]:
os.getcwd()

'/lus/theta-fs0/projects/covid-ct/imlab/users/temi/projects/TFXcan/scripts'

In [4]:
vcf_file = '../prj6_genotypes/merged_phased_SNPs.vcf.gz'
path_to_bcftools = "/home/temi/miniconda3/envs/compbio-tools/bin/bcftools"
path_to_vcftools = "/home/temi/miniconda3/envs/compbio-tools/bin/vcftools"
fasta_seq = '/lus-projects/covid-ct/imlab/data/hg_sequences/hg38/Homo_sapiens_assembly38.fasta'
path_to_tabix = '/home/temi/miniconda3/envs/compbio-tools/bin/tabix'

# this stores temporary files based on regions defined
temporary_vcfs = '/lus-projects/covid-ct/imlab/users/temi/projects/TFXcan/bcftools_region'

if not os.path.isdir(temporary_vcfs):
    os.mkdirs(temporary_vcfs)

#path_to_bcftools = "/home/temi/miniconda3/envs/compbio-tools/bin/bcftools"

Assuming these parameters

In [18]:
SEQUENCE_LENGTH = 393216
samples = ['LuCaP_141', 'LuCaP_145', 'LuCaP_167']
reg_chr = 'chr1'
reg_start = 133118092
reg_end = 133511307

# arguments
region = [reg_chr, reg_start, reg_end, '_'.join([reg_chr, str(reg_start), str(reg_end)])]

#vcf = VCF(vcf_file, samples=[samples[0]]) # select for the individuals
#region = f'{reg_chr}:{reg_start}-{reg_end}'

region

['chr1', 133118092, 133511307, 'chr1_133118092_133511307']

In [7]:
fasta_extractor = FastaStringExtractor(fasta_seq) # the function is defined in the usage codes

#interval = kipoiseq.Interval(reg_chr, reg_start, reg_end).resize(SEQUENCE_LENGTH) # create an interval object for the regions
#seq_extractor = kipoiseq.extractors.VariantSeqExtractor(reference_sequence=fasta_extractor)
#reference = seq_extractor.extract(interval, [], anchor=center)

In [30]:
def create_region_file(open_vcf, region, subset_vcf_dir, individual, software_paths=[]):
    '''
    Creates a subsetted vcf file per region
    Arguments:
        open_vcf: A vcf that is already opened
        region: region to query, a list [chr, start, end]
        subset_vcf_dir: the directory to save the subsetted file
        individual: the individual to subset the file for
        software_paths: a list of paths to software to use [bcftools, tabix]
    '''

    import kipoiseq
    import subprocess

    path_to_bcftools = software_paths[0]
    path_to_tabix = software_paths[1]

    SEQUENCE_LENGTH = 313216
    
    # Center the interval at the region
    interval = kipoiseq.Interval(region[0], region[1], region[2]).resize(SEQUENCE_LENGTH) # resizing will change the regions
    path = f'{subset_vcf_dir}/{individual}_{interval.chr}_{interval.start}_{interval.end}_subset_genotypes.vcf.gz'
    region_interval = f'{interval.chr}:{interval.start}-{interval.end}'
    view_cmd = f"{path_to_bcftools} view {open_vcf} -r {region_interval} -s {individual} --output-type z --output-file {path} && {path_to_tabix} -p vcf {path}"
    out = subprocess.run(view_cmd, shell=True)

    return {'subset_path':path, 'interval':interval, 'individual':individual, 'region':region}

def extract_individual_sequence(subset_dict, fasta_file_path, fasta_extractor, delete_region=False):

    '''
    Extracts a sequence from a reference for a region with the variants of an individual applied
    Arguments:
        subset_dict: dict - the result of `create_region_file()`
        fasta_file_path: string/path - the path to the fasta file
        fasta_extractor: extractor object - in case there are not variants to apply to that region, this helps extract the reference sequence instead
        delete_region: bool - after the sequence has been applied, should the temporary vcf file be deleted?
    '''
    import kipoiseq
    import warnings
    import os

    kseq_extractor = kipoiseq.extractors.SingleSeqVCFSeqExtractor(fasta_file=fasta_file_path, vcf_file=subset_dict['subset_path'])
    center = subset_dict['interval'].center() - subset_dict['interval'].start

    individual = subset_dict['individual']
    individual_sequences = {}
    for ind in [individual]:

        warnings.filterwarnings('error')
        
        try:
            individual_sequences[ind] = kseq_extractor.extract(interval=subset_dict['interval'], anchor=center, sample_id=ind)
            seq_source = 'var'
        except Warning:
            warnings.simplefilter("always", category=UserWarning)
            print('No variants for this region. Using reference genome.\n')
            individual_sequences[ind] = fasta_extractor.extract(interval=subset_dict['interval'], anchor=[])
            seq_source = 'ref'

    if delete_region == True:
        os.remove(subset_dict['subset_path'])
        os.remove(f"{subset_dict['subset_path']}.tbi")

    return {'sequence':individual_sequences, 'sequence_source':seq_source, 'region':subset_dict['region'][3]}

In [31]:
a = create_region_file(open_vcf=vcf_file, region=region, subset_vcf_dir=temporary_vcfs, individual=samples[0], software_paths=[path_to_bcftools, path_to_tabix])
b = extract_individual_sequence(subset_dict=a, fasta_file_path=fasta_seq, fasta_extractor=fasta_extractor, delete_region=False)

No variants for this region. Using reference genome.



In [53]:
interval_regions = pd.read_csv('.././enformer-minimal/intervals/LuCaP_145_FOXA1.txt', header=None)[0].tolist()
interval_regions = [ir.split('_') + [ir] for ir in interval_regions]
interval_regions = [[ir[0], int(ir[1]), int(ir[2]), ir[3]] for ir in interval_regions]
interval_regions

[['chr2', 186155022, 186155031, 'chr2_186155022_186155031'],
 ['chr6', 137209238, 137209247, 'chr6_137209238_137209247'],
 ['chr5', 125570522, 125570531, 'chr5_125570522_125570531'],
 ['chr4', 112338145, 112338154, 'chr4_112338145_112338154'],
 ['chr3', 161349247, 161349256, 'chr3_161349247_161349256'],
 ['chr3', 185586974, 185586983, 'chr3_185586974_185586983'],
 ['chr6', 152585586, 152585595, 'chr6_152585586_152585595'],
 ['chrX', 111853630, 111853639, 'chrX_111853630_111853639'],
 ['chr1', 103268936, 103268945, 'chr1_103268936_103268945'],
 ['chr1', 150333985, 150333994, 'chr1_150333985_150333994']]

In [56]:
output = []
for reg in interval_regions:
    a = create_region_file(open_vcf=vcf_file, region=reg, subset_vcf_dir=temporary_vcfs, individual=samples[1], software_paths=[path_to_bcftools, path_to_tabix])
    b = extract_individual_sequence(subset_dict=a, fasta_file_path=fasta_seq, fasta_extractor=fasta_extractor, delete_region=False)
    output.append(b)
#output

In [57]:
output[2]

{'sequence': {'LuCaP_145': 'TATATCTATGCAGCCCTCATTACTAGGCATTTGCAGACTTCACGCTTTTGGCTCCATTTGGTGCTGTCATTAATTAGTCAGTATTAACCTTAGAATACTTTAGAGCTTGCCTATTGAAATAACTTTGGGCACACTCCTTTTTCTCCCTCCCTCCCTTTCACTTGCAAGTGCTTACAGCTTGTTAATAAAAAACTGACAGTATTTCTTTGACCAGCCAGCAGCAGACATAAATCGTACTTCTAGTTGTAGCTGCAACCCATGAAAAGAAATGCCTCCAAACAGGAATTGTCCTCATTACAGGCTGCCCAGCATGGAATTACTGCAGAGCCTCACTGGCTGGCCACATCAGATAGAGCCTTCGACTGGCATCTTAGACAGGATAGAGCTTTTAGGAATGAGGTGGCAATGTGCTACACATTAAAAAGTAGATGAGCTAGTTTTCAGTTAAAATCAGGAAAATGAAGGCTATGGCTTCATGGTATCCAAATTTGTTTAAAAATGGAAAATATTTTCTATCCTGTTGAAAGCAATTCTTAAATATAAAGTCACAACACACACATATATGCATATACAAACACACAATGTGAATTGAATTGTAAAATGTGTATTCTACTACTGATATTTAAAGTGTTCCCTGTAATTATACTTCTCACCCTTCTGCATTTTCATACGGTTCCAAGGAGAGAAAGAAGTAGAATTACAGAGGTAATTACTTGTGAACAACCTCTCCTCCCCCCAGTCATATGCTGCCATCACTATGAAGGTTTTCCATGGGTTTCAGGGAAGGAGGCTGTCTTTTTCTACACCAGAGATGGTCCAGCTAAATTTGGAAGAATAAGAACTAGTTTATATAATAGCTAAAAATGAGCTAGATCTTTAGATCAGGTGCTGTAAATACCAATGGTGCCACGAGAATTACATAAATGTGTGAGGTCGGTTGGATGTGAGAAAATGGAAGATGGTGGGCTTGTC

In [108]:
vcf.close()

# Learning parsl

In [2]:
exec(open('./parsl-configuration.py').read(), globals(), globals())

1.3.0-dev


In [5]:
import random
from parsl.configs.local_threads import config
parsl.load(config)

<parsl.dataflow.dflow.DataFlowKernel at 0x7fe3b7e97f10>

In [6]:
# write a short code that generates random numbers between a and b and returns the sum

@python_app
def doSomething(a=0, b=500):
    random_numbers = random.sample(range(a, b), 200)
    return(sum(random_numbers))


In [7]:
doSomething(23, 45)

<AppFuture at 0x7fe3e0121720 state=finished raised ValueError>

In [10]:
res_list = []
for i in range(0, 11):
    attempt = doSomething()
    #res_list.append(attempt)
    if (attempt.done() == True) and (attempt.result() > 50000):
        print(f'Finished {i} | result {attempt.result()}\n')
    if (attempt.done() == False):
        print(f'Not Finished {i} | result {attempt.result()}\n')

Finished 0 | result 49102

Finished 1 | result 52076

Finished 2 | result 50790

Finished 3 | result 46982

Finished 4 | result 49596

Finished 5 | result 50456

Finished 6 | result 47748

Finished 7 | result 46151

Finished 8 | result 50701

Finished 9 | result 49732

Finished 10 | result 49939



In [44]:
attempt.result()

50140

In [32]:
[r.done() for r in res_list]

[True, True, True, True, True, True, True, True, True, True, True]

In [33]:
[r.result() for r in res_list]

[{'out': 47919},
 {'out': 49629},
 {'out': 49406},
 {'out': 49589},
 {'out': 51462},
 {'out': 50509},
 {'out': 48292},
 {'out': 49680},
 {'out': 49387},
 {'out': 47572},
 {'out': 48228}]

In [4]:
parsl.clear()

In [26]:
res_list[0].output

AttributeError: 'AppFuture' object has no attribute 'output'