### Dereplicate and detect circular contigs
Put all contig FASTA files in a single folder and set `DATA_DIR` in the cell below to that folder.  
You must also set `OUT_DIR` to the folder where the output files will be written.

In [1]:
%%bash
# Pool all contigs, self-BLAST them, and write the dereplicated set.
# Abort on the first failing command, unset variable, or failed pipeline stage,
# so a broken step cannot silently feed truncated files into the next one.
set -euo pipefail

# Folder holding the input contig FASTA files, and folder receiving every output.
DATA_DIR=/ebio/abt3_projects/TwinsUK_viromes_Shao_Pei/tmp2/00_contigsVirome
OUT_DIR=/ebio/abt3_projects/TwinsUK_viromes_Shao_Pei/tmp2/01_Derep

# The redirections below fail if the output folder does not exist yet.
mkdir -p "$OUT_DIR"

# Concatenate every file found in the input folder into one multi-FASTA.
cat "$DATA_DIR"/* > "$OUT_DIR/allContigs.fa"

# NOTE(review): presumably writes <prefix>_sort.fna and <prefix>_len.txt, which the
# following steps read -- confirm against GetSortSequenceLengths.pl.
perl ../GetSortSequenceLengths.pl "$OUT_DIR/allContigs.fa" "$OUT_DIR/allContigs"

# All-vs-all nucleotide BLAST of the contigs against themselves (tabular output).
makeblastdb -in "$OUT_DIR/allContigs_sort.fna" -dbtype nucl
blastn -query "$OUT_DIR/allContigs_sort.fna" -db "$OUT_DIR/allContigs_sort.fna" -outfmt 6 -evalue 1e-5 > "$OUT_DIR/allContigs.OutSelfBlast.txt"

# Takes the self-BLAST hits, the length table and the sorted FASTA; its stdout is
# saved as the dereplicated contig set.
perl ../DeRepCircContigs_Blastn.pl "$OUT_DIR/allContigs.OutSelfBlast.txt" "$OUT_DIR/allContigs_len.txt" "$OUT_DIR/allContigs_sort.fna" > "$OUT_DIR/allContigs.DeRep.fna"



Building a new DB, current time: 07/24/2018 16:57:31
New DB name:   /ebio/abt3_projects/TwinsUK_viromes_Shao_Pei/tmp2/01_Derep/allContigs_sort.fna
New DB title:  /ebio/abt3_projects/TwinsUK_viromes_Shao_Pei/tmp2/01_Derep/allContigs_sort.fna
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /ebio/abt3_projects/TwinsUK_viromes_Shao_Pei/tmp2/01_Derep/allContigs_sort.fna
Keep Linkouts: T
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 107307 sequences in 4.61077 seconds.


### Contig length distribution before dereplication

In [37]:
from os import listdir
from Bio import SeqIO
import pandas as pd

In [8]:
# Build a {contig id: sequence length} mapping over every file in the input folder.
# Each file is parsed as FASTA; a contig id seen more than once keeps the last length.
inputFolder = "/ebio/abt3_projects/TwinsUK_viromes_Shao_Pei/tmp2/00_contigsVirome"
contigsLength = {}
for fastaName in listdir(inputFolder):
    fastaPath = inputFolder+"/"+fastaName
    contigsLength.update(
        (record.id, len(record.seq)) for record in SeqIO.parse(fastaPath,"fasta")
    )

In [11]:
# Two-column table of the contigs: column 0 = contig id, column 1 = length.
# Built from (id, length) pairs rather than by transposing [keys, values], so the
# length column gets a numeric dtype instead of object and the construction does
# not depend on dict.keys()/.values() being lists (true only in Python 2).
# The explicit integer column labels keep the 0/1 layout even for an empty dict.
lenTable = pd.DataFrame(list(contigsLength.items()), columns=[0, 1])
lenTable.head()

Unnamed: 0,0,1
0,919_Q1_8Lpp_out,1185
1,9244_O1_8Lpp_out,596
2,1558_J2_8Lpp_out,572
3,2566_O1_8Lpp_out,2406
4,7182_K1_8Lpp_out,716


In [36]:
# Summary statistics of the contig lengths before dereplication.
# Column 0 of lenTable holds the contig ids, column 1 the lengths.
# Each line is a print(...) call with one pre-formatted string, which runs
# unchanged under both Python 2 and Python 3 (the bare print statement is a
# SyntaxError on Python 3).
lengths = lenTable[1]
longest = lengths.max()
# Every contig whose length equals the maximum (there may be ties).
longestIds = lenTable[lengths == longest][0].values
print("There are %d contigs" % lenTable.shape[0])
print("The largest contig is %d nt lenght. Contig %s" % (longest, longestIds))
print("Mean length: %f" % lengths.mean())
print("Standard deviation: %f" % lengths.std())
print("Median length: %f" % lengths.median())
print("Inter quartile range: %d" % (lengths.quantile(0.75) - lengths.quantile(0.25)))
print("Min: %d" % lengths.min())
print("Max: %d" % longest)

There are 107307 contigs
The largest contig is 79863 nt lenght. Contig ['678_Y2_8Lpp_out']
Mean length: 1185.971987
Standard deviation: 1740.820737
Median length: 727.000000
Inter quartile range: 505
Min: 501
Max: 79863


##### Make a table of lengths for the dereplicated contigs

In [38]:
from Bio import SeqIO
import pandas as pd

In [39]:
# Map each dereplicated contig id to its sequence length.
# NOTE(review): the ids in this file appear to carry an extra "_L<length>" suffix
# compared with the input contig ids -- confirm before joining the two tables.
deRepFasta = "/ebio/abt3_projects/TwinsUK_viromes_Shao_Pei/tmp2/01_Derep/allContigs.DeRep.fna"
deRep = {record.id: len(record.seq) for record in SeqIO.parse(deRepFasta,"fasta")}

In [40]:
# Two-column table of the dereplicated contigs: column 0 = contig id,
# column 1 = length. Note that this rebinds lenTable.
# Built from (id, length) pairs rather than by transposing [keys, values], so the
# length column gets a numeric dtype instead of object and the construction does
# not depend on dict.keys()/.values() being lists (true only in Python 2).
# The explicit integer column labels keep the 0/1 layout even for an empty dict.
lenTable = pd.DataFrame(list(deRep.items()), columns=[0, 1])
lenTable.head()

Unnamed: 0,0,1
0,3943_H1_8Lpp_out_L964,964
1,4325_I1_8Lpp_out_L985,985
2,2512_J1_8Lpp_out_L538,538
3,6012_Da1_8Lpp_out_L628,628
4,4329_P2_8Lpp_out_L530,530


In [50]:
# Summary statistics of the contig lengths after dereplication.
# Column 0 of lenTable holds the contig ids, column 1 the lengths.
# Each line is a print(...) call with one pre-formatted string, which runs
# unchanged under both Python 2 and Python 3 (the bare print statement is a
# SyntaxError on Python 3).

# Number of contigs before dereplication, as reported by the pre-dereplication
# summary in this notebook. Update it whenever the input contigs change.
N_CONTIGS_BEFORE_DEREP = 107307

lengths = lenTable[1]
longest = lengths.max()
# Every contig whose length equals the maximum (there may be ties).
longestIds = lenTable[lengths == longest][0].values
print("There are %d contigs. Then %d contigs were dereplicated." % (lenTable.shape[0], N_CONTIGS_BEFORE_DEREP - lenTable.shape[0]))
print("The largest contig is %d nt lenght. Contig %s" % (longest, longestIds))
print("Mean length: %f" % lengths.mean())
print("Standard deviation: %f" % lengths.std())
print("Median length: %f" % lengths.median())
print("Inter quartile range: %d" % (lengths.quantile(0.75) - lengths.quantile(0.25)))
print("Min: %d" % lengths.min())
print("Max: %d" % longest)
# Vectorised count of the rows whose length is at least 10 kb.
print("Num contigs >= 10kb: %d" % (lengths >= 10000).sum())

There are 76634 contigs. Then 30673 contigs were dereplicated.
The largest contig is 79863 nt lenght. Contig ['678_Y2_8Lpp_out_L79863']
Mean length: 1251.570008
Standard deviation: 1925.364086
Median length: 739.000000
Inter quartile range: 541
Min: 501
Max: 79863
Num contigs >= 10kb: 720


In [11]:
# Write the current lenTable (contig id, length) as a tab-separated file with
# neither a header row nor the row index.
deRepLenPath = "/ebio/abt3_projects/TwinsUK_viromes_Shao_Pei/tmp2/01_Derep/allContigs.DeRep_len.txt"
lenTable.to_csv(deRepLenPath, sep='\t', header=False, index=False)