In [1]:
"""
Blasts genes from svevo and zavitan and finds putative orthologues
Outputs first, a table with genes corresponding to each reference
Adds interesting information to output, position in genome according to annotation,
position of blast against genome, GO:terms
"""
#blastn -perc_identity 85 -subject disk/counts/svevo.counts.fa -query disk/counts/zav_v2.counts.fa -outfmt "6 qseqid sseqid pident mismatch gapopen qstart qend sstart send evalue length qlen slen"  -evalue 10e-10 > disk/counts/blast_zv_sv.csv
#blastn -perc_identity 85 -query disk/counts/svevo.counts.fa -subject disk/counts/zav_v2.counts.fa -outfmt "6 qseqid sseqid pident mismatch gapopen qstart qend sstart send evalue length qlen slen"  -evalue 10e-10 > disk/counts/blast_sv_zv.csv

'\nBlasts genes from svevo and zavitan and finds putative orthologues\nOutputs first, a table with genes corresponding to each reference\nAdds interesting information to output, position in genome according to annotation,\nposition of blast against genome, GO:terms\n'

In [2]:
import pandas as pd

In [5]:
# Inputs: one gene table per reference plus the svevo-vs-zavitan BLAST hits.
file_zavitan = 'disk/counts/zavitan_v2.genes.csv'
file_svevo = 'disk/counts/svevo.genes.csv'
file_blast = 'disk/counts/blast_sv_zv.csv'

# The gene tables are comma-separated (the pandas default) with a header row.
df_zavitan = pd.read_csv(file_zavitan)
df_svevo = pd.read_csv(file_svevo)

# The BLAST table is tab-separated and has no header row, so the 13 columns
# are labelled by hand. The first two columns are named after the references;
# presumably they are the query (svevo) and subject (zavitan) ids of the
# blastn command noted at the top of the notebook -- confirm against that run.
df_blast = pd.read_csv(file_blast, delimiter='\t', header=None)
df_blast.columns = [
    'svevo', 'zavitan',
    'pident', 'mismatch', 'gapopen',
    'qstart', 'qend', 'sstart', 'send',
    'evalue', 'length', 'qlen', 'slen',
]


In [6]:
# Keep only hits whose alignment length is within 70%-130% (inclusive) of
# BOTH the query length and the subject length.
df_blast = df_blast.loc[
    lambda hits: (hits['length'] / hits['qlen']).between(0.7, 1.3)
    & (hits['length'] / hits['slen']).between(0.7, 1.3)
]

print('blast:', df_blast.shape[0])
df_blast.head(4)

blast: 73


Unnamed: 0,svevo,zavitan,pident,mismatch,gapopen,qstart,qend,sstart,send,evalue,length,qlen,slen
0,TRITD3Av1G027040,TRIDC3Av2G023690,100.0,0,0,1162,4852,39,3729,0.0,3691,4852,3729
7,TRITD3Av1G027860,TRIDC3Av2G024160,99.601,20,0,1,5010,1,5010,0.0,5010,5010,5010
8,TRITD3Av1G027970,TRIDC3Av2G024260,99.635,4,1,1,1371,1,1370,0.0,1371,1371,1370
9,TRITD3Av1G027990,TRIDC3Av2G024270,99.797,6,0,1,2949,1,2949,0.0,2949,2949,2949


In [7]:
# Only the gene identifier column is needed from each gene table.
df_zavitan = df_zavitan.loc[:, ['id']]
df_svevo = df_svevo.loc[:, ['id']]

In [8]:
# For every zavitan gene, look up the svevo gene(s) among the filtered hits.
# The left join keeps zavitan genes without any hit (svevo is then missing).
df_zavitan_merge = (
    df_zavitan
    .merge(df_blast, how='left', left_on='id', right_on='zavitan')
    .loc[:, ['id', 'svevo']]
)
df_zavitan_merge.head(3)

Unnamed: 0,id,svevo
0,TRIDC3Av2G023690,TRITD3Av1G027040
1,TRIDC3Av2G024010,
2,TRIDC3Av2G024160,TRITD3Av1G027860


In [9]:
# For every svevo gene, look up the zavitan gene(s) among the filtered hits.
# The left join keeps svevo genes without any hit (zavitan is then missing).
df_svevo_merge = (
    df_svevo
    .merge(df_blast, how='left', left_on='id', right_on='svevo')
    .loc[:, ['id', 'zavitan']]
)
df_svevo_merge.head(3)

Unnamed: 0,id,zavitan
0,TRITD3Av1G026860,
1,TRITD3Av1G027040,TRIDC3Av2G023690
2,TRITD3Av1G027590,


In [10]:
# Combine both per-reference tables into one svevo <-> zavitan cross table.
# The outer join keeps genes from either reference that have no partner
# (the other column is then missing).
df_merge = pd.merge(df_svevo_merge, df_zavitan_merge, left_on='id', right_on='svevo', how='outer')

# Both inputs have an 'id' column, so the merge suffixes them:
# id_x is the svevo gene (left table), id_y the zavitan gene (right table).
df_merge = df_merge[['id_x','id_y']]
df_merge.columns = ['svevo','zavitan']

# The join key is the svevo id alone. A svevo gene with k hits has k rows in
# each input, so the join emits k*k rows in which every (svevo, zavitan) pair
# is repeated k times. Collapse those repeats so each pair is listed once.
df_merge = df_merge.drop_duplicates().reset_index(drop=True)


In [11]:
# Write the cross table as comma-separated text (the default separator),
# without the row index.
df_merge.to_csv('disk/counts/cross.csv', index=False)

In [12]:
# Number of rows in the cross table, followed by a preview of the first rows.
print(df_merge.shape[0])
df_merge.head(3)

111


Unnamed: 0,svevo,zavitan
0,TRITD3Av1G026860,
1,TRITD3Av1G027040,TRIDC3Av2G023690
2,TRITD3Av1G027590,


In [13]:
# Load the gene annotations used to add genome coordinates to the cross table.
file_ann_svevo = 'disk/PGSB_TRITD_Jan2017_all.gff3'
file_ann_zav_v2 = 'disk/SORTED_WEW_v2_HC_e_LC_GFF3_CATTATI_gff_PAO_26feb18.gff3'


def _load_gene_features(path, seqname='chr3A'):
    """Read a GFF3 table and keep the 'gene' rows of one sequence.

    The file is read as headerless, tab-separated text and its nine columns
    are labelled here.

    Parameters
    ----------
    path : str
        Path to the annotation file.
    seqname : str, default 'chr3A'
        Only rows whose first column equals this value are kept.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, so later cells can add
        columns to it without triggering SettingWithCopyWarning.
    """
    ann = pd.read_csv(path, index_col=False, sep='\t', header=None)
    ann.columns = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
    keep = (ann['feature'] == 'gene') & (ann['seqname'] == seqname)
    return ann[keep].copy()


df_ann_sv = _load_gene_features(file_ann_svevo)
df_ann_zv = _load_gene_features(file_ann_zav_v2)



In [14]:
def _first_attribute_value(attribute):
    """Return the value of the first 'key=value' pair of each attribute string.

    Each string is split on ';' and the first piece is kept; that piece is
    then split on '=' and its second part is returned. This assumes the gene
    identifier is the first attribute of every row -- confirm against the
    annotation files.
    """
    return attribute.str.split(';', expand=True)[0].str.split('=', expand=True)[1]


# Use .assign() rather than item assignment: the annotation frames are
# boolean-filtered slices, and writing a new column into such a slice in place
# raises SettingWithCopyWarning. Only the coordinates and gene id are kept.
df_ann_sv = df_ann_sv.assign(gene=_first_attribute_value(df_ann_sv['attribute']))[['seqname','start','end','gene']]
df_ann_zv = df_ann_zv.assign(gene=_first_attribute_value(df_ann_zv['attribute']))[['seqname','start','end','gene']]

In [17]:
# Attach svevo coordinates to the cross table. The left join keeps every row
# of the cross table; rows without a matching annotation get missing
# coordinates. Columns are then prefixed with the reference they describe.
df_merge_svevo = (
    df_merge
    .merge(df_ann_sv, how='left', left_on='svevo', right_on='gene')
    .loc[:, ['seqname', 'start', 'end', 'svevo', 'zavitan']]
    .rename(columns={
        'seqname': 'svevo.seqname',
        'start': 'svevo.start',
        'end': 'svevo.end',
        'svevo': 'svevo.gene',
        'zavitan': 'zavitan.gene',
    })
)

In [18]:
# Attach zavitan coordinates next to the svevo ones. The left join keeps
# every row; rows without a matching zavitan annotation get missing
# coordinates. The annotation columns are prefixed with 'zavitan.'.
df_merge_zavitan = (
    df_merge_svevo
    .merge(df_ann_zv, how='left', left_on='zavitan.gene', right_on='gene')
    .loc[:, ['svevo.seqname', 'svevo.start', 'svevo.end', 'svevo.gene',
             'zavitan.gene', 'seqname', 'start', 'end']]
    .rename(columns={
        'seqname': 'zavitan.seqname',
        'start': 'zavitan.start',
        'end': 'zavitan.end',
    })
)



In [19]:
# Write the annotated cross table as comma-separated text (the default
# separator), without the row index.
df_merge_zavitan.to_csv('disk/counts/cross_info.csv', index=False)