In [2]:
def circular_permuted1(seq):
    all_possible = []
    for i in range(len(seq)):
        permuted_seq = (seq + seq)[i:i+len(seq)]
        all_possible.append(permuted_seq)
    return(all_possible)

In [3]:
from collections import deque

def circular_permuted2(seq):
    all_possible = []
    d = deque(a)
    for i in range(len(a)):
        d.rotate()
        all_possible.append(''.join(list(d)))
    return(all_possible)

In [4]:
import toolz

def circular_permuted3(seq):
    all_possible = []
    for permuted_seq in toolz.sliding_window(len(seq), seq+seq[:-1]):
        all_possible.append(''.join(permuted_seq))
    return(all_possible)

In [5]:
def circular_permuted4(seq):
    n = len(seq)
    return([''.join([seq[i - j] for i in range(n)]) for j in range(n)])

In [6]:
def circular_permuted5(seq):
    n = len(seq)
    return([(seq + seq)[i:i+len(seq)] for i in range(n)])

In [20]:
def circular_permuted6(x):
    return([x[i:]+x[:i] for i in range(len(x))])

In [7]:
a = "ATCG"

In [8]:
%%timeit -r10
circular_permuted1(a)

100000 loops, best of 10: 2.03 µs per loop


In [9]:
%%timeit -r10
circular_permuted2(a)

100000 loops, best of 10: 3.57 µs per loop


In [10]:
%%timeit -r10
circular_permuted3(a)

The slowest run took 4.08 times longer than the fastest. This could mean that an intermediate result is being cached 
100000 loops, best of 10: 3.63 µs per loop


In [11]:
%%timeit -r10
circular_permuted4(a)

100000 loops, best of 10: 5.84 µs per loop


In [12]:
%%timeit -r10
circular_permuted5(a)

The slowest run took 4.91 times longer than the fastest. This could mean that an intermediate result is being cached 
100000 loops, best of 10: 2 µs per loop


In [24]:
%%timeit -r10
circular_permuted6(a) # Winner!

100000 loops, best of 10: 1.77 µs per loop


In [25]:
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

def normalise_str(in_dna):
    """Find all possible eqivalent STR sequences. 
    And return the first alphabetically.
    
    For example, TA = AT. But would return AT.
    """
    all_possible = [in_dna]
    
    # Get reverse complement
    dna = Seq(in_dna, generic_dna)
    rev_complement = str(dna.reverse_complement())
    all_possible.append(rev_complement)
    
    # Permute
    for seq in [in_dna, rev_complement]:
        for permuted_seq in circular_permuted1(seq): # Switch to faster permutation (6)
            all_possible.append(permuted_seq)

    # Sort and take the first
    all_possible.sort()
    return(all_possible[0])

In [26]:
from skbio.sequence import DNASequence

def normalise_str2(in_dna):
    """Find all possible eqivalent STR sequences. 
    And return the first alphabetically.
    
    For example, TA = AT. But would return AT.
    """
    all_possible = [in_dna]
    
    # Get reverse complement
    dna = DNASequence(in_dna)
    rev_complement = str(dna.reverse_complement())
    all_possible.append(rev_complement)
    
    # Permute
    for seq in [in_dna, rev_complement]:
        for permuted_seq in circular_permuted1(seq): # Switch to faster permutation (6)
            all_possible.append(permuted_seq)

    # Sort and take the first
    all_possible.sort()
    return(all_possible[0])

In [27]:
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

def self_and_rev_complement1(in_dna):
    all_possible = [in_dna]
    
    # Get reverse complement
    dna = Seq(in_dna, generic_dna)
    rev_complement = str(dna.reverse_complement())
    all_possible.append(rev_complement)
    return(all_possible)

from skbio.sequence import DNASequence

def self_and_rev_complement2(in_dna):
    all_possible = [in_dna]
    
    # Get reverse complement
    dna = DNASequence(in_dna)
    rev_complement = str(dna.reverse_complement())
    all_possible.append(rev_complement)
    return(all_possible)

In [30]:
%%timeit -r10
self_and_rev_complement1(a)

The slowest run took 4.18 times longer than the fastest. This could mean that an intermediate result is being cached 
100000 loops, best of 10: 5.91 µs per loop


In [31]:
%%timeit -r10
self_and_rev_complement2(a)

100000 loops, best of 10: 11 µs per loop


In [32]:
self_and_rev_complement2(a)

['ATCG', 'CGAT']

In [17]:
# parse VCF files of STR genotype calls (from LobSTR and RepeatSeq)
# Compare the results

import vcf
import pandas as pd
import copy
import csv

def parse_vcf(fname, dirname, outname):
    fieldnames=['chr', 'pos', 'refallelelen', 'repeatunit', 'normrepeatunit', 'repeatunitlen', 'genotype', 'depth']
    with open(dirname + fname, 'r') as f:
        with open(outname, 'w') as csvfile:
            csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames, 
                                   delimiter=',', extrasaction='ignore')
            csvwriter.writeheader()
    
            vcf_reader = vcf.Reader(f)
            sample1 = vcf_reader.samples[0] # Get list of samples in the VCF. save the first

            df = pd.DataFrame()
            for record in vcf_reader:
    
                info = {}
        
                # Locus data
                info['chr'] = record.CHROM
                info['pos'] = record.POS
                info['refallelelen'] = record.INFO['RL']
                info['repeatunit'] = record.INFO['RU']
                # Calculated
                info['normrepeatunit'] = normalise_str(info['repeatunit'])
                info['repeatunitlen'] = len(info['repeatunit']) 
                
                # Genotype data
                info['genotype'] = record.genotype(sample1)['GT']

                # Not sure if this is for the individual sample, or over all samples.
                try:
                    info['depth'] = record.INFO['DP']
                except KeyError:
                    try:
                        info['depth'] = record.genotype(sample1)['DP']
                    except KeyError:
                        info['depth'] = None

                csvwriter.writerow(info)        

In [18]:
dirname = '/Users/hd_vlsci/Documents/git/STR-pipelines/data/intersections_LobSTR-RepeatSeq/'
lobstr = 'intersection0_10.vcf' # Called by LobSTR only
both = 'intersection0_10_1.vcf' # Called by LobSTR and RepeatSeq
repeatseq = 'intersection0_11.vcf' # Called by RepeatSeq only

parse_vcf(both, dirname, outname='LobSTR_RepeatSEQ_intersect.csv')
parse_vcf(lobstr, dirname, outname='LobSTR_only.csv')
parse_vcf(repeatseq, dirname, outname='RepeatSEQ_only.csv')

# Data to extract:
# Chrom
# Pos
# Genotype
#GT
# Allele Length Offset(s)" - need to figure out what this is!
# Depth
#DP
# Repeat unit
#RU
# Reference length of repeat
#RL
# Things to calculate: