Input: sample_hs.mapQ25.ex.fq, the output of mapping.ipynb.

In [None]:
%%bash
# Run fastaptamer_count on the mapped FASTQ and write its FASTA output.
# NOTE(review): the header layout is presumably ">rank-reads-RPM" — confirm against the FASTAptamer docs.
fastaptamer_count -i sample_hs.mapQ25.ex.fq -o sample_hs.mapQ25.ex.fastaptamer_count.fa
# For every FASTA header, take the first whitespace-free token after ">", split it on "-",
# and write fields 2 and 3 as a tab-separated row (one row per sequence record).
perl -ne 'if(/^>(\S+)/){print "$1\n"}' sample_hs.mapQ25.ex.fastaptamer_count.fa | awk -F '-' '{print $2"\t"$3}' - > sample_hs.mapQ25.ex.fastaptamer_count

In [None]:
%load_ext rpy2.ipython

In [None]:
%%R
# Install poweRlaw only when it is missing, so that re-running this cell does
# not download and reinstall the package every time.
if (!requireNamespace("poweRlaw", quietly = TRUE)) {
  install.packages("poweRlaw")
}
library(poweRlaw)
library(ggplot2)
library(plyr)
library(scales)
# Build the points of the fitted power-law line for plotting.
#
# Args:
#   pl: object exposing getXmin(), getPars(), dat, internal$values and
#       internal$freq (plot_power_law passes a poweRlaw displ object).
#       Assumes internal$values / internal$freq are the distinct data values
#       and their counts - TODO confirm against the poweRlaw documentation.
#
# Returns:
#   data.frame with columns x_axis (integer-valued copy numbers between xmin
#   and max(dat)) and y (fitted frequency), truncated after the first point
#   whose fitted frequency drops below 1.
power_law_line <- function(pl){
  # Lower cut-off and scaling exponent currently stored on the fit object
  xmin <- pl$getXmin()
  alpha <- pl$getPars()

  # Raw data used for the fit
  x <- pl$dat

  # Log-spaced x grid from xmin to the largest observation, rounded to
  # distinct integers
  x_axis <- exp(seq(log(xmin), log(max(x)), length.out = 100))
  x_axis <- unique(round(x_axis))

  # Anchor the line at the tabulated point for xmin. Look the position up by
  # value: using xmin itself as an index is only right when the tabulated
  # values are exactly 1, 2, 3, ... If xmin was not observed, fall back to the
  # first tabulated value above it.
  values <- pl$internal$values
  anchor <- match(xmin, values)
  if (is.na(anchor)) {
    anchor <- which(values >= xmin)[1]
  }
  y <- 10^(-alpha * (log10(x_axis) - log10(values[anchor])) +
             log10(pl$internal$freq[anchor]))
  line_fit <- data.frame(x_axis, y)

  # Truncate for y < 1, keeping 1 extra. Clamp to the number of rows so that
  # no all-NA row is appended when every point has y >= 1.
  keep <- min(sum(line_fit$y >= 1, na.rm = TRUE) + 1, nrow(line_fit))
  line_fit <- line_fit[seq_len(keep), ]
  line_fit
}

# Fit a discrete power law to a copy-number table and export the plot.
#
# Args:
#   filename: path to a whitespace-delimited table readable by read.table();
#             its first column (V1) is tabulated as the copy number.
#
# Side effects:
#   Writes <filename>.line_fit.csv (points of the fitted line) and
#   <filename>.tiff (log-log plot of frequency against copy number).
#
# Returns:
#   The value of dev.off(), invisibly.
plot_power_law <- function(filename){
  freq_table <- read.table(filename)
  # plyr::count tabulates how many sequences share each copy number
  counts <- plyr::count(freq_table$V1)
  colnames(counts) <- c('Copy', 'Frequency')
  
  ####for power law line
  # Only copy numbers seen more than once enter the fit
  counts2 <- counts[counts$Frequency > 1, ]
  vals <- rep(counts2$Copy, counts2$Frequency)
  pl <- displ$new(vals)
  
  #set Xmin
  xmin <- 5
  pl$setXmin(xmin)
  
  #estimate_pars estimates the distribution's parameters using their maximum likelihood estimator.
  #This estimate is conditional on the current xmin value.
  pl$setPars(estimate_pars(pl))
  
  line_fit <- power_law_line(pl)
  write.csv(line_fit, paste0(filename, ".line_fit.csv"))
  
  #plot
  img <- ggplot(counts, aes(Copy, Frequency)) +
    geom_point(col='blue', size=2) +
    geom_line(data=line_fit, aes(x_axis, y), col='red', size=1.5) + theme_bw() +
    scale_x_log10(breaks=trans_breaks('log10', function(x) 10^x),
                  labels=trans_format('log10', math_format(10^.x)),limits = c(0.8,1e+7)) +
    scale_y_log10(breaks=trans_breaks('log10', function(x) 10^x),
                  labels=trans_format('log10', math_format(10^.x)),limits = c(0.8,1e+7)) +
    xlab("Copy Number") + ylab("Frequency") +
    annotate('text', x=70, y=70, label=paste(expression(alpha), '== ',
                                             format(round(pl$pars, 2), nsmall=2)), col='red', size=3.0, parse=TRUE)
  
  # Open the device only once everything that can fail has succeeded, and make
  # sure it is closed even if rendering raises an error.
  bitmap(paste0(filename, ".tiff"), res = 600)
  plot_device <- dev.cur()
  on.exit(if (plot_device %in% dev.list()) dev.off(plot_device), add = TRUE)
  print(img)
  invisible(dev.off(plot_device))
}

# Export the plot (<filename>.tiff) and the fitted-line points
# (<filename>.line_fit.csv) for the fastaptamer_count table.
plot_power_law("sample_hs.mapQ25.ex.fastaptamer_count")

Extract enriched sequences (i.e. remove background sequences) for clustering.
Input file 1: sample_hs.mapQ25.ex.fastaptamer_count.fa
Input file 2: the line_fit CSV file written by plot_power_law (sample_hs.mapQ25.ex.fastaptamer_count.line_fit.csv)

In [None]:
#!/usr/bin/python
import sys, getopt
def _print_usage():
    """Print the command-line help text."""
    print("file_name.py <input1> <input2> <output>\n")
    print("--input1: sample_hs.mapQ25.ex.fastaptamer_count.fa\n")
    print("--input2: sample_hs.mapQ25.ex.fastaptamer_count.line_fit.csv\n")
    print("--output: cut-off_sequences.fa")


def open_file(arg):
    """Parse the command line and return the three file paths.

    Accepted forms (they may be mixed):
      * short options:  -f <input1> -c <input2> -o <output>
      * long options:   --input1 <input1> --input2 <input2> --output <output>
      * positional:     <input1> <input2> <output>

    Args:
        arg: list of command-line arguments without the program name
            (typically ``sys.argv[1:]``).

    Returns:
        Tuple ``(input_file1, input_file2, output_file)``; a path that was not
        supplied is returned as an empty string.

    Exits with status 2 after printing the usage text when an unknown option is
    given, and with status 0 after ``-h``/``--help``.
    """
    input_file1 = ""
    input_file2 = ""
    output_file = ""
    # getopt short options are single characters; the long names are kept from
    # the original interface.
    try:
        options, positional = getopt.getopt(
            arg, "hf:c:o:", ["help", "input1=", "input2=", "output="])
    except getopt.GetoptError:
        _print_usage()
        sys.exit(2)
    for option, value in options:
        if option in ("-h", "--help"):
            _print_usage()
            sys.exit()
        elif option in ("-f", "--input1"):
            input_file1 = value
        elif option in ("-c", "--input2"):
            input_file2 = value
        elif option in ("-o", "--output"):
            output_file = value
    # Paths not given through options are taken, in order, from the remaining
    # positional arguments (the original script only worked positionally).
    positional = list(positional)
    if not input_file1 and positional:
        input_file1 = positional.pop(0)
    if not input_file2 and positional:
        input_file2 = positional.pop(0)
    if not output_file and positional:
        output_file = positional.pop(0)
    return input_file1, input_file2, output_file


def _read_copy_number_cutoff(line_fit_path):
    """Return the copy-number cut-off: column 2 of the last numeric CSV row."""
    import math

    cutoff = None
    with open(line_fit_path, "r") as line_fit:
        for line in line_fit:
            fields = line.rstrip("\r\n").split(",")
            if len(fields) < 2:
                continue
            try:
                # float() also accepts scientific notation such as 1e+05
                value = float(fields[1].strip().strip('"'))
            except ValueError:
                # header row or an NA entry
                continue
            if math.isfinite(value):
                cutoff = value
    if cutoff is None:
        raise ValueError("no numeric copy number found in " + line_fit_path)
    return int(cutoff)


def _extract_enriched(fasta_path, line_fit_path, output_path):
    """Write every FASTA record whose read count exceeds the cut-off."""
    from Bio import SeqIO

    copy_number = _read_copy_number_cutoff(line_fit_path)
    with open(fasta_path, "r") as fasta, open(output_path, "w") as out:
        for sequence in SeqIO.parse(fasta, "fasta"):
            # the second "-"-separated field of the record id is compared
            # against the cut-off
            reads = int(sequence.id.split("-")[1])
            if reads > copy_number:
                out.write(">" + sequence.id + "\n" + str(sequence.seq) + "\n")


if __name__ == "__main__":
    fasta_path, line_fit_path, output_path = open_file(sys.argv[1:])
    if not (fasta_path and line_fit_path and output_path):
        _print_usage()
        sys.exit(2)
    _extract_enriched(fasta_path, line_fit_path, output_path)

In [None]:
%%bash
# Run the extraction script on the counted FASTA (-f) and the line_fit CSV (-c),
# writing the retained sequences to cut_off_sequences.fa (-o).
# NOTE(review): extract_enrich.py is presumably the Python cell above saved to disk;
# the option letters used here must match what that script's getopt string accepts — verify.
./extract_enrich.py -f sample_hs.mapQ25.ex.fastaptamer_count.fa -c sample_hs.mapQ25.ex.fastaptamer_count.line_fit.csv -o cut_off_sequences.fa

In [None]:
%%bash
# Cluster the retained sequences and write the result to cut_off_cluster.fa.
# NOTE(review): -d 40 is presumably the maximum edit distance for cluster membership — confirm in the FASTAptamer docs.
fastaptamer_cluster -i cut_off_sequences.fa -o cut_off_cluster.fa -d 40

In [None]:
%%bash
# Convert the clustered FASTA to FASTQ with the local fasta_to_fastq.pl script,
# then run velveth followed by velvetg on the "assembly" directory.
# NOTE(review): 25 is presumably the k-mer (hash) length — confirm against the Velvet manual.
perl fasta_to_fastq.pl cut_off_cluster.fa > cut_off_cluster.fq
velveth assembly 25 -short -fastq cut_off_cluster.fq
velvetg assembly/