In [121]:
"""
Blasts genes from svevo and zavitan and finds putative orthologues
Outputs first, a table with genes corresponding to each reference
Adds interesting information to output, position in genome according to annotation,
position of blast against genome, GO:terms
"""
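# Shell commands kept for provenance; they are not executed by this notebook.
# Each writes a tab-separated table (outfmt 6) with the 13 fields listed after "6".
# The second command writes disk/counts/blast_sv_zv.csv (query = svevo, subject = zavitan).
# Note: "-evalue 10e-10" is the same threshold as 1e-9.
#blastn -perc_identity 85 -subject disk/counts/svevo.counts.fa -query disk/counts/zav_v2.counts.fa -outfmt "6 qseqid sseqid pident mismatch gapopen qstart qend sstart send evalue length qlen slen"  -evalue 10e-10 > disk/counts/blast_zv_sv.csv
#blastn -perc_identity 85 -query disk/counts/svevo.counts.fa -subject disk/counts/zav_v2.counts.fa -outfmt "6 qseqid sseqid pident mismatch gapopen qstart qend sstart send evalue length qlen slen"  -evalue 10e-10 > disk/counts/blast_sv_zv.csv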

'\nBlasts genes from svevo and zavitan and finds putative orthologues\nOutputs first, a table with genes corresponding to each reference\nAdds interesting information to output, position in genome according to annotation,\nposition of blast against genome, GO:terms\n'

In [122]:
import pandas as pd

In [123]:
# Input tables: one gene list per reference, plus the BLAST hit table.
file_zavitan = 'disk/counts/zavitan_v2.genes.csv'
file_svevo = 'disk/counts/svevo.genes.csv'
file_blast = 'disk/counts/blast_sv_zv.csv'

# Gene lists are comma-separated, which is the pandas default separator.
df_zavitan = pd.read_csv(file_zavitan)
df_svevo = pd.read_csv(file_svevo)

# The BLAST table is tab-separated and has no header row, so the 13 columns
# are labelled by hand. The first two labels name the query and subject
# references; the rest keep the BLAST field names.
# NOTE(review): assumes the file was written with query = svevo and
# subject = zavitan -- confirm against the command that produced it.
df_blast = pd.read_csv(file_blast, header=None, sep='\t')
df_blast.columns = [
    'svevo', 'zavitan',
    'pident', 'mismatch', 'gapopen',
    'qstart', 'qend', 'sstart', 'send',
    'evalue', 'length', 'qlen', 'slen',
]


In [124]:
# Keep only hits whose alignment length is between 70% and 130% of both the
# query length and the subject length (bounds inclusive).
MIN_LENGTH_RATIO = 0.7
MAX_LENGTH_RATIO = 1.3

query_ratio = df_blast['length'] / df_blast['qlen']
subject_ratio = df_blast['length'] / df_blast['slen']

df_blast = df_blast[
    query_ratio.between(MIN_LENGTH_RATIO, MAX_LENGTH_RATIO)
    & subject_ratio.between(MIN_LENGTH_RATIO, MAX_LENGTH_RATIO)
]

# Report how many hits survive the filter. This counts df_blast itself: the
# previous version referenced a name that is not defined in this notebook,
# which raises NameError on a fresh kernel.
print('blast:', len(df_blast.index))
df_blast.head(4)

blast: 195


Unnamed: 0,svevo,zavitan,pident,mismatch,gapopen,qstart,qend,sstart,send,evalue,length,qlen,slen
2,TRITD3Av1G027010,TRIDC3Av2G023650,99.859,4,0,1,2839,1,2839,0.0,2839,2839,2839
3,TRITD3Av1G027040,TRIDC3Av2G023690,100.0,0,0,1162,4852,39,3729,0.0,3691,4852,3729
10,TRITD3Av1G027200,TRIDC3Av2G023850,97.69,114,18,2078,10540,2294,10718,0.0,8485,10540,10718
13,TRITD3Av1G027230,TRIDC3Av2G023890,99.538,46,7,1,13633,1,13632,0.0,13641,13633,13632


In [125]:
# Only the gene identifier column is needed from each gene list.
df_zavitan = df_zavitan.loc[:, ['id']]
df_svevo = df_svevo.loc[:, ['id']]

In [126]:
# Left join on the zavitan id: every zavitan gene is kept, and genes with no
# matching row in df_blast get NaN in 'svevo'.
# NOTE(review): a gene with several rows in df_blast yields several rows here.
df_zavitan_merge = df_zavitan.merge(
    df_blast, how='left', left_on='id', right_on='zavitan'
).loc[:, ['id', 'svevo']]
df_zavitan_merge.head(3)

Unnamed: 0,id,svevo
0,TRIDC3Av2G023650,TRITD3Av1G027010
1,TRIDC3Av2G023690,TRITD3Av1G027040
2,TRIDC3Av2G023850,TRITD3Av1G027200


In [127]:
# Left join on the svevo id: every svevo gene is kept, and genes with no
# matching row in df_blast get NaN in 'zavitan'.
# NOTE(review): a gene with several rows in df_blast yields several rows here.
df_svevo_merge = df_svevo.merge(
    df_blast, how='left', left_on='id', right_on='svevo'
).loc[:, ['id', 'zavitan']]
df_svevo_merge.head(3)

Unnamed: 0,id,zavitan
0,TRITD3Av1G026840,
1,TRITD3Av1G026860,
2,TRITD3Av1G027010,TRIDC3Av2G023650


In [128]:
# Outer join of the two per-reference tables, matching each svevo gene id to
# the svevo hit recorded for a zavitan gene. Rows from either side that have
# no partner are kept with NaN on the other side.
# Both inputs carry an 'id' column, so pandas suffixes them: 'id_x' comes from
# df_svevo_merge and 'id_y' from df_zavitan_merge.
df_merge = (
    df_svevo_merge
    .merge(df_zavitan_merge, how='outer', left_on='id', right_on='svevo')
    .loc[:, ['id_x', 'id_y']]
    .rename(columns={'id_x': 'svevo', 'id_y': 'zavitan'})
)


In [129]:
# Write the svevo/zavitan correspondence table as CSV without the row index.
# `index` is documented as a bool, so pass False explicitly instead of None.
df_merge.to_csv('disk/counts/cross.csv', sep=',', index=False)

In [130]:
# Preview the first three rows of the correspondence table.
df_merge.head(n=3)

Unnamed: 0,svevo,zavitan
0,TRITD3Av1G026840,
1,TRITD3Av1G026860,
2,TRITD3Av1G027010,TRIDC3Av2G023650


In [131]:
# Load the annotation file of each reference so extra information can be
# added to the gene list.
file_ann_svevo = 'disk/svevo.gtf'
file_ann_zav_v2 = 'disk/zavitan_v2.gtf'

# Both files are read as headerless, tab-separated tables and receive the
# same nine column labels.
annotation_columns = [
    'seqname', 'source', 'feature',
    'start', 'end', 'score',
    'strand', 'frame', 'attribute',
]

# index_col=False stops pandas from using the first column as the row index.
df_ann_sv = pd.read_csv(file_ann_svevo, sep='\t', header=None, index_col=False)
df_ann_sv.columns = list(annotation_columns)

df_ann_zv = pd.read_csv(file_ann_zav_v2, sep='\t', header=None, index_col=False)
df_ann_zv.columns = list(annotation_columns)


In [132]:
# Preview only the first rows; displaying the whole annotation table bloats
# the notebook output without adding information.
df_ann_zv.head(10)

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,chr1A,PGSB_Mar2017,gene,40613,41835,.,-,.,ID=TRIDC1Av2G000010;primconf=LC
1,chr1A,PGSB_Mar2017,mRNA,40613,41835,.,-,.,ID=TRIDC1Av2G000010.1;Parent=TRIDC1Av2G000010;...
2,chr1A,PGSB_Mar2017,CDS,40613,40699,.,-,0,Parent=TRIDC1Av2G000010.1
3,chr1A,PGSB_Mar2017,CDS,40774,40980,.,-,0,Parent=TRIDC1Av2G000010.1
4,chr1A,PGSB_Mar2017,CDS,41299,41436,.,-,0,Parent=TRIDC1Av2G000010.1
5,chr1A,PGSB_Mar2017,CDS,41515,41835,.,-,0,Parent=TRIDC1Av2G000010.1
6,chr1A,PGSB_Mar2017,gene,41896,42282,.,-,.,ID=TRIDC1Av2G000020;primconf=LC
7,chr1A,PGSB_Mar2017,mRNA,41896,42282,.,-,.,ID=TRIDC1Av2G000020.1;Parent=TRIDC1Av2G000020;...
8,chr1A,PGSB_Mar2017,CDS,41896,42138,.,-,0,Parent=TRIDC1Av2G000020.1
9,chr1A,PGSB_Mar2017,CDS,42220,42282,.,-,0,Parent=TRIDC1Av2G000020.1
