In [164]:
# library
import numpy as np
import gff2coverage
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [165]:
# --- configuration -------------------------------------------------------
# Input files: the TE annotation and the genome annotation (both GFF).
path_gff = 'data/results/all.gff'
path_genome = 'data/Stuberosum_genome.gff3'

# TE type labels; each is matched as a substring of the GFF `attribute` column.
te_types = [
    'MITE',
    'TRIM',
    'LARD',
    'TIR',
    'helitron',
    'LTR',
    'SINE',
    'LINE',
]

# Number of nucleotides per x-axis unit (1 Mb).
unit_value = 10 ** 6

In [166]:
# Load the chromosomal structure (genome annotation).
# GFF files carry no column-header row. With pandas' default header inference
# the first record would be consumed as the header and then silently discarded
# when the column names are assigned, so read with header=None and pass the
# nine standard GFF column names directly.
df_genome = pd.read_csv(
    path_genome,
    sep='\t',
    header=None,
    names=['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute'],
)

In [167]:
# Load the TE annotation.
# GFF files carry no column-header row. With pandas' default header inference
# the first annotation would be consumed as the header and then silently lost
# when the column names are assigned, so read with header=None and pass the
# nine standard GFF column names directly.
df = pd.read_csv(
    path_gff,
    sep='\t',
    header=None,
    names=['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute'],
)

In [168]:
# Split the annotation into one sub-frame per distinct `seqname` value
# (i.e. per chromosome), keyed by that value.
dfs = dict()
for seq in df['seqname'].unique():
    same_seq = df['seqname'] == seq
    dfs[seq] = df.loc[same_seq]

In [None]:
# Per-chromosome, per-TE-type coverage in fixed-width bins.
#   xs[chr_name]     -> bin start positions, in units of `unit_value` (Mb)
#   ys[chr_name]     -> one list of per-bin coverage values per entry of `te_types`
#   max_chrs[chr_num]-> chromosome end coordinate in nucleotides
ys = {}
xs = {}
max_chrs = {}
step = 1  # bin width, in units of `unit_value` (Mb)
window = unit_value * step  # bin width in nucleotides
for chr_num in range(1, 13):
    chr_name = 'chr' + str(chr_num).zfill(2)
    df_genome_chr = df_genome[df_genome.seqname == chr_name]
    # NOTE(review): takes the `end` of the first genome row for this chromosome;
    # this assumes that row is the chromosome-level record -- confirm.
    max_chr = df_genome_chr.iloc[0].end
    max_chrs[chr_num] = max_chr
    df_chr = dfs[chr_name]
    print('%s: elements count: %i max: %i' % (chr_name, len(df_chr.index), max_chr) )
    # Bin start positions; built once and reused for both the x axis and the binning loop.
    bins = (max_chr / unit_value) + 1
    x = np.arange(0, bins + step, step)
    # Each element is assigned to the bin that contains its midpoint.
    # The midpoint does not depend on the bin or the TE type, so compute it once.
    midpoints = (df_chr.start + df_chr.end) / 2
    y = []
    for te in te_types:
        # Rows whose attribute mentions this TE type; na=False keeps the mask
        # strictly boolean when an attribute is missing.
        is_te = df_chr.attribute.str.contains(te, na=False)
        current = []
        for i in x:
            nt_start = i * unit_value
            # Half-open interval [nt_start, nt_start + window): a fractional
            # midpoint such as 999999.5 would fall between a closed bin's end
            # (999999) and the next bin's start (1000000) and be dropped.
            in_bin = (midpoints >= nt_start) & (midpoints < nt_start + window)
            df_res = df_chr[in_bin & is_te]
            coverage = gff2coverage.calc_coverage_part(df_res, unit_value)
            current.append(coverage)
        y.append(current)
    ys[chr_name] = y
    xs[chr_name] = x

chr01: elements count: 39484 max: 88663952


In [None]:

# Stacked TE-coverage tracks: one panel per chromosome, all on a common x axis.
fig, axs = plt.subplots(nrows=12, sharex=True)
fig.set_size_inches(20, 25)
# The x data (xs) is expressed in units of `unit_value` (Mb), while max_chrs
# holds nucleotide coordinates, so the limit must be converted to the same unit.
# Because the panels share one x axis (sharex=True), a per-panel limit would be
# overridden by the last panel drawn; use the longest chromosome for all panels.
x_max = max(max_chrs.values()) / float(unit_value)
for chr_num in range(1, 13):
    chr_name = 'chr' + str(chr_num).zfill(2)
    ax = axs[chr_num - 1]
    ax.set_ylim([0, 50])
    ax.set_xlim([0, x_max])
    ax.stackplot(xs[chr_name], ys[chr_name], labels=te_types)
    ax.set_title(chr_name)
# plt.legend acts on the current axes (the last panel); the anchor puts the
# legend just below that panel.
plt.legend(loc=9, bbox_to_anchor=(0.5, -0.1), ncol=2)
plt.savefig('data/results/foo.png')

In [80]:
# Small hand-written toy data: six x positions and three series of six values
# each (presumably for trying out a stacked plot -- TODO confirm).
x_ = range(6)
y_ = [
    [1, 1, 4, 6, 8, 10],
    [1, 2, 2, 7, 10, 10],
    [1, 2, 8, 5, 10, 1],
]