In [3]:
import pandas as pd
import plotly.plotly as py
import cufflinks as cf
import matplotlib.pyplot as plt    
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import os
import re
import csv

# Set up plotly's offline notebook mode so figures can be drawn inline in this notebook.
# Per the plotly documentation, connected=True loads plotly.js from a CDN instead of
# embedding it in the notebook file.
# NOTE(review): the cufflinks .iplot() calls further down may additionally need
# cf.go_offline() to render without plotly online credentials -- confirm.
init_notebook_mode(connected=True)

In [4]:
# ---- Input files ----------------------------------------------------------
# Hit table: contigs vs F. vesca genome
e = "G06.contigs.fasta_vs_Fragaria_vesca_v1.1.a2_cds_removed.fasta_nucl.db"
# Hit table: contigs vs selected NBS-LRR genes
f = "G06.contigs.fasta_vs_vesca_v1.1_nblrrs_augustus_cds_nucl.db"
# Output from NBS Parser
nbs_pars = "G06.assembly_nlr.tsv"

# Sequence-length tables (comma separated)
i = "G06.contigs_lengths.txt"
g = "Fragaria_vesca_v1.1.a2_cds_lengths.txt"
h = "vesca_v1.1_nblrrs_augustus_cds_lengths.txt"

# Column names shared by both tab-separated hit tables.
BLAST_COLUMNS = [
    "qseqid", "sseqid", "pident", "aln_length", "mismatch",
    "gapopen", "qstart", "qend", "sstart", "send",
    "evalue", "bitscore", "qlen", "slen", "sstrand",
]

# Options applied to every read below.
# NOTE(review): header=0 together with names= makes pandas treat the first line of
# each file as a header and replace it; confirm every input really starts with a
# header row, otherwise the first record is silently dropped.
read_opts = dict(header=0, index_col=False)

# ---- Hit tables -----------------------------------------------------------
df = pd.read_table(e, sep="\t", names=BLAST_COLUMNS, **read_opts)
df2 = pd.read_table(f, sep="\t", names=BLAST_COLUMNS, **read_opts)

# ---- Length tables --------------------------------------------------------
all_sequences = pd.read_table(i, sep=",", names=["seqid", "length"], **read_opts)
all_vesca = pd.read_table(g, sep=",", names=["vescaid", "length"], **read_opts)
all_nblrrs = pd.read_table(h, sep=",", names=["nblrrid", "length"], **read_opts)

# ---- NBS Parser predictions -----------------------------------------------
all_nbs_parser = pd.read_table(
    nbs_pars,
    sep="\t",
    names=["contig", "pred_id", "status", "start", "end", "strand", "domains"],
    **read_opts
)

In [5]:
# How many vesca genes from the original bait design have a homolog in the Ren-seq output?
# Despite its name, union1 is the INTERSECTION of the subject ids hit in df2 with the
# ids listed in the bait-design table.
hit_nblrr_ids = set(df2['sseqid'])
bait_nblrr_ids = set(all_nblrrs['nblrrid'])
union1 = pd.Series(list(hit_nblrr_ids & bait_nblrr_ids))
len(union1)

239

In [45]:
# Same question as above, restricted to hits with at least 90% identity
# (pident >= 90, i.e. at most 10% divergence).
df2_max90 = df2.loc[df2['pident'].ge(90)]
hit_nblrr_ids_90 = set(df2_max90['sseqid'])
union90 = pd.Series(list(hit_nblrr_ids_90 & set(all_nblrrs['nblrrid'])))
len(union90)

200

In [46]:
# How many vesca genes from the original bait design are NOT present in the Ren-seq output?
# Bait-design ids that never occur as a subject id in df2.
bait_ids_without_hit = set(all_nblrrs['nblrrid']) - set(df2['sseqid'])
diff2 = pd.Series(list(bait_ids_without_hit))
len(diff2)

89

In [6]:
# How many input sequences have a hit to the vesca genes in the original bait design?
# Query ids seen in df2 that are also listed in the contig length table.
queries_hitting_baits = set(df2['qseqid']) & set(all_sequences['seqid'])
union2 = pd.Series(list(queries_hitting_baits))
len(union2)

143

In [48]:
### Are any additional vesca genes present?
# All hits against the full vesca gene table with at least 90% identity
# (pident >= 90, i.e. at most 10% divergence).
df_max90 = df.loc[df['pident'].ge(90)]
vesca_ids_hit_90 = set(df_max90['sseqid'])
union9 = pd.Series(list(vesca_ids_hit_90 & set(all_vesca['vescaid'])))
len(union9)

451

In [49]:
# How many input sequences have a hit on the vesca genome?
# Query ids seen in df that are also listed in the contig length table.
queries_with_vesca_hit = set(df['qseqid']) & set(all_sequences['seqid'])
union_v = pd.Series(list(queries_with_vesca_hit))
len(union_v)

145

In [50]:
# How many input sequences DO NOT have a hit on the vesca genome?
# Contig ids from the length table that never occur as a query id in df.
queries_without_vesca_hit = set(all_sequences['seqid']) - set(df['qseqid'])
diff3 = pd.Series(list(queries_without_vesca_hit))
len(diff3)

0

In [51]:
# How many contigs contain a predicted NBS-LRR gene (according to NBS Parser)?
# Keep the contig id and the prediction status, then drop repeated pairs.
# NOTE(review): this counts unique (contig, status) PAIRS, so a contig carrying e.g.
# both a complete and a partial prediction is counted twice. If the number of distinct
# contigs is what is wanted, all_nbs_parser['contig'].nunique() would give it -- confirm.
just_ids = all_nbs_parser[["contig", "status"]]
just_ids_dedup = just_ids.drop_duplicates()
len(just_ids_dedup)

1561

In [52]:
# How many of them complete?
# Counts rows of just_ids (not the de-duplicated table) whose status is "complete".
is_complete = just_ids['status'].eq("complete")
complete = just_ids.loc[is_complete]
len(complete)

469

In [53]:
# How many of them partial?
# Counts rows of just_ids (not the de-duplicated table) whose status is "partial".
is_partial = just_ids['status'].eq("partial")
partial = just_ids.loc[is_partial]
len(partial)

564

In [54]:
# How many of them pseudogenes?
# Counts rows of just_ids (not the de-duplicated table) whose status is "pseudogene".
is_pseudogene = just_ids['status'].eq("pseudogene")
pseudogene = just_ids.loc[is_pseudogene]
len(pseudogene)

550

In [55]:
###Investigate the distribution of coding, 5' upstream and 3' downstream regions among all the hits in the vesca genome.

In [56]:
# Prefilter to retain only the top hit for each query sequence. "Top" here means the
# longest alignment (maximum aln_length), not the highest percent identity.
# Rows that tie for the maximum within a query are all kept in df_top.
idx = df.groupby('qseqid')['aln_length'].transform('max') == df['aln_length']
df_top = df[idx]
select_matches = df_top[['qseqid', 'sseqid']]
# One subject per query: if several top rows name different subjects, to_dict()
# keeps the subject of the last such row.
select_dict = select_matches.set_index('qseqid')['sseqid'].to_dict()
# Index labels of the top rows whose subject is the one recorded in select_dict.
is_selected_pair = select_matches['qseqid'].map(select_dict) == select_matches['sseqid']
lst_index = list(select_matches.index[is_selected_pair])
# DataFrame.ix was removed in pandas 1.0. lst_index holds index labels, so .loc is
# the direct replacement.
df_prefilt = df.loc[lst_index]

all_alignment_lengths = list()
all_relative_alignment_lengths = list()
all_five_lengths = list()
all_three_lengths = list()
for key in select_dict.keys():
    df_slice = df_prefilt[df_prefilt['qseqid'] == key]
    if df_slice.empty:
        # Every key was built from a row that is in df_prefilt, so this should not
        # happen; guard anyway rather than reuse values from a previous query.
        continue

    # Total aligned length over all retained rows of this query.
    aln_length = df_slice['aln_length'].sum()

    # Rows whose alignment reaches subject position 1 (sstart or send equals 1).
    # NOTE(review): the flank is taken as qlen - qstart in both cases; for a hit that
    # starts at subject base 1 one might expect qstart - 1 instead -- confirm the
    # intended definition before relying on these numbers.
    reaches_subject_start = (df_slice['sstart'] == 1) | (df_slice['send'] == 1)
    five_flank = df_slice['qlen'] - df_slice['qstart']
    all_five_lengths.extend(five_flank[reaches_subject_start].tolist())

    # Rows whose alignment reaches the last 3 positions of the subject
    # (presumably to tolerate a missing stop codon -- TODO confirm).
    # NOTE(review): the flank is qlen - qend in both cases; see the note above.
    subject_end_cutoff = df_slice['slen'] - 3
    reaches_subject_end = (
        (df_slice['sstart'] >= subject_end_cutoff)
        | (df_slice['send'] >= subject_end_cutoff)
    )
    three_flank = df_slice['qlen'] - df_slice['qend']
    all_three_lengths.extend(three_flank[reaches_subject_end].tolist())

    all_alignment_lengths.append(aln_length)
    # All rows in the slice share one subject, so read its length from the slice
    # itself instead of from a variable left over by an inner loop.
    subject_length = df_slice['slen'].iloc[-1]
    relative_length = float(aln_length) / float(subject_length)
    all_relative_alignment_lengths.append(relative_length)

In [57]:
# Bar chart of CDS total aligned length: one bar per distinct length, bar height is
# the number of query sequences with that length.
cds = pd.Series(all_alignment_lengths).value_counts()
cds_d = pd.DataFrame({"Frequency": cds})
cds_d.iplot(kind="bar", color="black")





In [58]:
# Bar chart of CDS relative aligned length (aligned length as a fraction of the
# reference length): one bar per distinct value, bar height is its frequency.
cds_rel = pd.Series(all_relative_alignment_lengths).value_counts()
cds_rel_d = pd.DataFrame({"Frequency": cds_rel})
cds_rel_d.iplot(kind="bar", color="gray")





In [59]:
# Bar chart of the collected 5' flank lengths: one bar per distinct length, bar
# height is its frequency.
five_l = pd.Series(all_five_lengths).value_counts()
five_l_d = pd.DataFrame({"Frequency": five_l})
five_l_d.iplot(kind="bar", color="green")





In [60]:
# Bar chart of the collected 3' flank lengths: one bar per distinct length, bar
# height is its frequency.
three_l = pd.Series(all_three_lengths).value_counts()
three_l_d = pd.DataFrame({"Frequency": three_l})
three_l_d.iplot(kind="bar", color="purple")



