In [39]:
"""
Blasts genes from svevo and zavitan and finds putative orthologues
Outputs first, a table with genes corresponding to each reference
Adds interesting information to output, position in genome according to annotation,
position of blast against genome, GO:terms
"""
#blastn -perc_identity 85 -subject disk/counts/svevo.counts.fa -query disk/counts/zav_v2.counts.fa -outfmt "6 qseqid sseqid pident mismatch gapopen qstart qend sstart send evalue length qlen slen"  -evalue 10e-10 > disk/counts/blast_zv_sv.csv
#blastn -perc_identity 85 -query disk/counts/svevo.counts.fa -subject disk/counts/zav_v2.counts.fa -outfmt "6 qseqid sseqid pident mismatch gapopen qstart qend sstart send evalue length qlen slen"  -evalue 10e-10 > disk/counts/blast_sv_zv.csv



'\nBlasts genes from svevo and zavitan and finds putative orthologues\nOutputs first, a table with genes corresponding to each reference\nAdds interesting information to output, position in genome according to annotation,\nposition of blast against genome, GO:terms\n'

In [40]:
import pandas as pd

In [41]:
file_zavitan = 'disk/counts/zavitan_v2.genes.csv'
file_svevo = 'disk/counts/svevo.genes.csv'
file_blast_sv_zv = 'disk/counts/blast_sv_zv.csv' #we'll use this layout
file_blast_zv_sv = 'disk/counts/blast_zv_sv.csv' #we'll use this layout

df_zavitan_all = pd.read_csv(file_zavitan, sep=',', )
df_svevo_all = pd.read_csv(file_svevo, sep=',', )

df_blast_sv_zv = pd.read_csv(file_blast_sv_zv, sep='\t', header=None)
df_blast_sv_zv.columns = ['svevo','zavitan','pident','mismatch','gapopen','qstart','qend','sstart','send','evalue','length','qlen','slen']


df_blast_zv_sv = pd.read_csv(file_blast_zv_sv, sep='\t', header=None)
df_blast_zv_sv.columns = ['zavitan','svevo','pident','mismatch','gapopen','qstart','qend','sstart','send','evalue','length','qlen','slen']


In [42]:
df_blast = df_blast[((df_blast.length / df_blast.qlen) >= 0.7) &
       ((df_blast.length / df_blast.slen) >= 0.7) &
       ((df_blast.length / df_blast.qlen) <= 1.3) &
       ((df_blast.length / df_blast.slen) <= 1.3)]

print('blast:',len(df_blast.index))
df_blast.tail(4)

blast: 294


Unnamed: 0,evalue,gapopen,length,mismatch,pident,qend,qlen,qstart,send,slen,sstart,svevo,zavitan
459,0.0,2,1919,0,99.896,1919,1919,1,1917,1917,1,TRITD3Av1G036830,TRIDC3Av2G034510
461,0.0,1,2980,7,99.732,2979,2979,1,2980,2980,1,TRITD3Av1G036880,TRIDC3Av2G034560
465,0.0,1,1579,2,99.81,1578,1578,1,1579,1579,1,TRITD3Av1G036930,TRIDC3Av2G034640
468,0.0,1,21421,1,99.991,21420,21420,1,21421,21421,1,TRITD3Av1G036950,TRIDC3Av2G034660


In [43]:
df_zavitan = df_zavitan_all[['id']]
df_svevo = df_svevo_all[['id']]

In [44]:
df_zavitan_merge = pd.merge(df_zavitan, df_blast, left_on='id', right_on='zavitan', how='left')
df_zavitan_merge = df_zavitan_merge[['id','svevo']]
df_zavitan_merge.head(3)

Unnamed: 0,id,svevo
0,TRIDC3Av2G023650,TRITD3Av1G027010
1,TRIDC3Av2G023650,TRITD3Av1G027010
2,TRIDC3Av2G023690,TRITD3Av1G027040


In [45]:
df_svevo_merge = pd.merge(df_svevo, df_blast, left_on='id', right_on='svevo', how='left')
df_svevo_merge = df_svevo_merge[['id','zavitan']]
df_svevo_merge.head(3)

Unnamed: 0,id,zavitan
0,TRITD3Av1G026840,
1,TRITD3Av1G026860,
2,TRITD3Av1G027010,TRIDC3Av2G023650


In [46]:
df_merge = pd.merge(df_svevo_merge, df_zavitan_merge, left_on='id', right_on='svevo', how='outer')
df_merge = df_merge[['id_x','id_y']]
df_merge.columns = ['svevo','zavitan']


In [47]:
df_merge.to_csv('disk/counts/cross.csv', sep=',', index=None)

In [48]:
print(len(df_merge.index))
df_merge.head(3)

657


Unnamed: 0,svevo,zavitan
0,TRITD3Av1G026840,
1,TRITD3Av1G026860,
2,TRITD3Av1G027010,TRIDC3Av2G023650


In [49]:
#add info to list
file_ann_svevo = 'disk/PGSB_TRITD_Jan2017_all.gff3'
file_ann_zav_v2 = 'disk/SORTED_WEW_v2_HC_e_LC_GFF3_CATTATI_gff_PAO_26feb18.gff3'

df_ann_sv = pd.read_csv(file_ann_svevo, index_col=False, sep='\t', header=None)
df_ann_sv.columns = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']

df_ann_zv = pd.read_csv(file_ann_zav_v2, index_col=False, sep='\t', header=None)
df_ann_zv.columns = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']

df_ann_zv = df_ann_zv[df_ann_zv.feature=='gene']
df_ann_zv = df_ann_zv[df_ann_zv.seqname == 'chr3A']
df_ann_sv = df_ann_sv[df_ann_sv.feature =='gene']
df_ann_sv = df_ann_sv[df_ann_sv.seqname == 'chr3A']



In [50]:
df_ann_sv['gene'] = df_ann_sv.attribute.str.split(';', expand=True)[0].str.split('=', expand=True)[1]
df_ann_zv['gene'] = df_ann_zv.attribute.str.split(';', expand=True)[0].str.split('=', expand=True)[1]

df_ann_sv = df_ann_sv[['seqname','start','end','gene']]
df_ann_zv = df_ann_zv[['seqname','start','end','gene']]

In [51]:
#adds svevo coordinates
df_merge_svevo = pd.merge(df_merge, df_ann_sv, left_on='svevo', right_on='gene', how='left')
df_merge_svevo = df_merge_svevo[['seqname','start','end','svevo','zavitan']]
df_merge_svevo.columns = ['svevo.seqname','svevo.start','svevo.end','svevo.gene','zavitan.gene']

In [65]:
#adds zavitan coordinates
df_merge_zavitan = pd.merge(df_merge_svevo, df_ann_zv, left_on='zavitan.gene', right_on='gene', how='left')
df_merge_zavitan = df_merge_zavitan[['svevo.seqname','svevo.start','svevo.end','svevo.gene','zavitan.gene','seqname','start','end']]
df_merge_zavitan.columns = ['svevo.seqname','svevo.start','svevo.end','svevo.gene','zavitan.gene','zavitan.seqname','zavitan.start','zavitan.end']



In [66]:
print(len(df_merge_zavitan.index))
df_merge_zavitan.drop_duplicates( keep='first', inplace=True)
print("W/O duplicates:",len(df_merge_zavitan.index))

657
W/O duplicates: 208


In [67]:
df_merge_zavitan.head(4)


Unnamed: 0,svevo.seqname,svevo.start,svevo.end,svevo.gene,zavitan.gene,zavitan.seqname,zavitan.start,zavitan.end
0,chr3A,54132399.0,54132875.0,TRITD3Av1G026840,,,,
1,chr3A,54184665.0,54187934.0,TRITD3Av1G026860,,,,
2,chr3A,54339905.0,54342744.0,TRITD3Av1G027010,TRIDC3Av2G023650,chr3A,54834278.0,54837117.0
6,chr3A,54405290.0,54410142.0,TRITD3Av1G027040,TRIDC3Av2G023690,chr3A,54901515.0,54905244.0


In [68]:


df_merge_zavitan = pd.merge(df_merge_zavitan,df_svevo_all[['id','log2FoldChange','padj']], left_on='svevo.gene', right_on='id', how='left')
df_merge_zavitan.rename(columns= {'log2FoldChange':'svevo.l2fc', 'padj':'svevo.padj'}, inplace=True)
df_merge_zavitan.drop('id', axis=1, inplace=True)


df_merge_zavitan = pd.merge(df_merge_zavitan,df_zavitan_all[['id','log2FoldChange','padj']], left_on='zavitan.gene', right_on='id', how='left')
df_merge_zavitan.rename(columns={'log2FoldChange':'zavitan.l2fc', 'padj':'zavitan.padj'}, inplace=True)
df_merge_zavitan.drop('id', axis=1, inplace=True)


In [71]:
df_merge_zavitan = df_merge_zavitan[['svevo.seqname','svevo.start','svevo.end','svevo.padj','svevo.l2fc','svevo.gene','zavitan.gene','zavitan.seqname','zavitan.start','zavitan.end','zavitan.padj','zavitan.l2fc']]
df_merge_zavitan.to_csv('disk/counts/cross_info.csv', sep=',', index=None)

