In [191]:
import os
import time 
import numpy as np
import copy
# from Bio import SeqIO

In [179]:
# Project root: the parent of the current working directory, as an absolute path.
project_dir = os.path.abspath(os.path.join(os.getcwd(), '../'))

In [180]:
# hg19_fpath = os.path.join(project_dir, 'source/hg19_20151104/hg19_sorted.fa')
# hg19 = SeqIO.parse(hg19_fpath, format='fasta')
# # def load_chromosome_21
# with open(hg19_fpath, 'r') as f:
#     chromosome_21 = []
#     chromosome_21_done = False
#     chromosome_21_processing = False
#     count = 0 
#     while not chromosome_21_done:
#         line = f.readline()
        
#         line = line.strip()
#         if line == '>ch17':
#             print('Found chromosome 17')
#             chromosome_21_processing = True
            
#         if chromosome_21_processing:
#             chromosome_21.append(line)
        
#         if chromosome_21_processing and '>chr' in line:
#             chromosome_21_processing = False
#             chromosome_21_done = True
        
#         count += 1
#         if count % 1000 == 9999:
#             print('line: {}'.format(line))

In [182]:
def generate_mutation(base):
    """
    Pick a random substitution for a single nucleotide.

    Params
    ------

    - base, the current nucleotide; expected to be one of 'A', 'C', 'G' or 'T'.

    Returns
    -------
    A nucleotide drawn with np.random.choice from the three bases that differ from base.

    Raises
    ------
    Exception, if base is not one of the four DNA nucleotides.
    """

    nucleotides = ('A', 'C', 'G', 'T')

    # Guard clause: reject anything outside the four canonical bases up front.
    if base not in nucleotides:
        raise Exception('base is not a proper DNA nucleotide (ACGT).')

    # Candidates stay in alphabetical order so seeded draws are reproducible.
    alternatives = [candidate for candidate in nucleotides if candidate != base]
    return np.random.choice(alternatives)

In [183]:
def introduce_random_mutations(vntr, m):
    """
    Produce a copy of a VNTR unit carrying m randomly placed SNP mutations.

    Params
    ------

    - vntr, the DNA copy sequence to mutate.
    - m, the number of distinct sites that receive a SNP mutation.

    Returns
    -------
    A single copy of the VNTR sequence with m mutations.
    """

    # Sample m distinct positions (without replacement) along the sequence.
    chosen = np.random.choice(range(len(vntr)), m, replace=False)

    mutated = list(vntr)
    # Visit the chosen sites from left to right so the substitutions are drawn
    # in sequence order.
    for position in sorted(chosen):
        mutated[position] = generate_mutation(mutated[position])

    return ''.join(mutated)

In [184]:
def introduce_specific_mutations(vntr, sites, mutations):
    """
    Generate a VNTR sequence with the specified mutations at the specified sites.

    Params
    ------

    - vntr, the DNA copy sequence which is copied.
    - sites, the 0-based positions where SNP mutations will be introduced. Any
      sized iterable is accepted (list, tuple, range, NumPy array). Positions
      that do not occur in vntr are ignored.
    - mutations, the replacement nucleotides; mutations[i] is placed at sites[i].
      If a site is listed more than once, the first listed mutation is used.

    Returns
    -------
    A single copy of the VNTR sequence with mutations at the specified sites.

    Raises
    ------
    Exception, if sites and mutations differ in length, or if a requested
    mutation equals the nucleotide already present at that site.
    """

    if len(sites) != len(mutations):
        raise Exception('The number of sites and mutations do not correspond.')

    # Build the site -> mutation lookup once. This avoids relying on a list-only
    # .index() method (NumPy arrays have none) and removes the per-position
    # linear scans. setdefault keeps the first mutation given for a repeated site.
    replacement_at = {}
    for site, mutation in zip(sites, mutations):
        replacement_at.setdefault(site, mutation)

    m_vntr = list(vntr)
    # Walk the sequence left to right so the first offending site (in sequence
    # order) is the one reported.
    for site, nucleotide in enumerate(m_vntr):
        if site not in replacement_at:
            continue

        mutation = replacement_at[site]
        if nucleotide == mutation:
            raise Exception('Not a mutation. The current site is {}. The current '.format(site) + \
                            'nucleotide is {}. Please use a different nucleotide '.format(nucleotide) + \
                            'for this site.')
        m_vntr[site] = mutation

    return ''.join(m_vntr)

In [181]:
def generate_sequence_with_vntr(sequence, loc, vntr):
    """
    Insert a VNTR into a sequence at a given position.

    Params
    ------

    - sequence, the background sequence.
    - loc, the index in sequence before which vntr is inserted.
    - vntr, the sequence to insert.

    Returns
    -------
    sequence[:loc], followed by vntr, followed by sequence[loc:].
    """
    upstream, downstream = sequence[:loc], sequence[loc:]
    upstream += vntr
    upstream += downstream
    return upstream

In [175]:
# Build a random background DNA sequence of `length` bases drawn from A, C, G, T.
length = 10
sequence = ''.join(np.random.choice(list('ACGT'), size=length))
sequence

In [188]:
# Fall back to the default VNTR unit when none has been provided. On a fresh
# kernel `vntr` is not defined yet, so treat "undefined" the same as "empty"
# instead of failing with a NameError.
try:
    vntr
except NameError:
    vntr = ''

if vntr == '':
    vntr = 'GCACGCTGCTGTGTAGTGGAGAAAGGGCAGGCAGCGAGCAAGCGTGTACAAGGTATATACGTGCC'

In [189]:
# Optionally pad the VNTR with blank spacers on both sides.
spacer = True
if spacer:
    spacers = ' ' * 10
    # Strip padding left by a previous run of this cell first, so re-executing
    # the cell does not keep stacking additional spacers onto vntr.
    vntr = spacers + vntr.strip(' ') + spacers

In [170]:
# Example: replace the character at position 0 of vntr with 'C'.
m_vntr = introduce_specific_mutations(vntr, sites=[0], mutations=['C'])
m_vntr

'CCACGCTGCTGTGTAGTGGAGAAAGGGCAGGCAGCGAGCAAGCGTGTACAAGGTATATACGTGCC'

In [173]:
# Insert the VNTR at index 10 of the background sequence. The result is assigned
# back to `sequence`, so every re-run of this cell inserts another copy.
sequence = generate_sequence_with_vntr(sequence, loc=10, vntr=vntr)

In [174]:
# Display the current value of `sequence`.
sequence

'CTCCG     GCACGCTGCTGTGTAGTGGAGAAAGGGCAGGCAGCGAGCAAGCGTGTACAAGGTATATACGTGCC          GCACGCTGCTGTGTAGTGGAGAAAGGGCAGGCAGCGAGCAAGCGTGTACAAGGTATATACGTGCC     GCGCG'

In [190]:
# # simulation of 150 bp single-end reads
# /frazer01/home/joreyna/software/art_src_MountRainier_Linux/examples/run_test_examples_illumina.sh

# To run ART, either invoke the binary by its full path:
#     /frazer01/home/joreyna/software/art_src_MountRainier_Linux/art_illumina
# or cd into that directory and run ./art_illumina

In [1]:
import math

In [33]:
def critical_copy_number(rlen, clen):
    """
    Determines the minimum number of VNTR copies needed
    so a read can be completely mapped inside of a VNTR.

    Params
    ------

    - rlen, the read length.
    - clen, the length of a single VNTR copy.

    Returns
    -------
    An int equal to floor(rlen / clen) + 1.

    Raises
    ------
    Exception, if rlen is smaller than clen.
    """

    if rlen < clen:
        raise Exception('clen is larger than rlen.')

    # Both original cases reduce to floor(rlen / clen) + 1: with a remainder,
    # ceil(rlen / clen) is floor + 1; without one, the result is the exact
    # quotient + 1. Floor division keeps the result an int on Python 3, where
    # `rlen / clen` would otherwise produce a float.
    return int(rlen // clen) + 1
        