In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
file_all_tes = '../../data/TEs/all.gff3'
file_all_mite = '../../data/TEs/all_filter_mite.gff3'
file_ann = '../../data/genome/annotations/introns.gff3'
file_ann_promoters = '../../data/genome/annotations/promoters.gff3'

In [3]:
df = pd.read_csv(file_all_tes, index_col=False, sep='\t', comment='#', header=None)
df.columns = ['seqname' , 'source' , 'feature' , 'start' , 'end' , 'score' , 'strand' , 'frame' , 'attribute']


  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
print(len(df.index))

1745371


In [5]:
df.head(4)

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,1A,TE,TE,77566,78177,.,+,.,MITE_T_122702|chr2B|476704370|476705072|AT|10|...
1,1A,TE,TE,86285,86766,.,+,.,MITE_T_126801|chr7B|78567635|78568204|CTTCCCT|...
2,1A,TE,TE,213867,214194,.,+,.,MITE_T_8262|chr5A|273309274|273309638|TA|373|F468
3,1A,TE,TE,219804,220469,.,+,.,MITE_T_82582|chr3D|312088827|312089549|AT|18|F...


In [6]:
df = df[(df.attribute.str.startswith('MITE')) | (df.attribute.str.startswith('DT'))]

In [7]:
print(len(df.index))

1222547


In [8]:
df.head(4)

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,1A,TE,TE,77566,78177,.,+,.,MITE_T_122702|chr2B|476704370|476705072|AT|10|...
1,1A,TE,TE,86285,86766,.,+,.,MITE_T_126801|chr7B|78567635|78568204|CTTCCCT|...
2,1A,TE,TE,213867,214194,.,+,.,MITE_T_8262|chr5A|273309274|273309638|TA|373|F468
3,1A,TE,TE,219804,220469,.,+,.,MITE_T_82582|chr3D|312088827|312089549|AT|18|F...


In [9]:
df.reset_index(inplace=True)

In [10]:
df.to_csv(file_all_mite, sep='\t', index=None, header=None)

In [11]:
df_ann = pd.read_csv(file_ann, index_col=False, sep='\t', comment='#', header=None)
df_ann.columns = ['seqname' , 'source' , 'feature' , 'start' , 'end' , 'score' , 'strand' , 'frame' , 'attribute']


In [12]:
print(len(df_ann.index))

2573233


In [13]:
df_ann = df_ann[
    (df_ann.feature == 'intron') | 
    (df_ann.feature == 'exon') | 
    (df_ann.feature == 'three_prime_UTR') | 
    (df_ann.feature == 'five_prime_UTR')]

In [14]:
df_ann.head(4)

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
3,1A,Ensembl_Plants,exon,200,1683,.,+,.,Parent=transcript:ENSRNA050013875-T1;Name=ENSR...
6,1A,Ensembl_Plants,exon,5023,6833,.,+,.,Parent=transcript:ENSRNA050013847-T1;Name=ENSR...
9,1A,Ensembl_Plants,exon,7060,7215,.,+,.,Parent=transcript:ENSRNA050013913-T1;Name=ENSR...
12,1A,Ensembl_Plants,exon,12317,13827,.,+,.,Parent=transcript:ENSRNA050013874-T1;Name=ENSR...


In [15]:
print(len(df_ann.index))

1593666


In [16]:
df_ann_promoters = pd.read_csv(file_ann_promoters, index_col=False, sep='\t', comment='#', header=None)
df_ann_promoters.columns = ['seqname' , 'source' , 'feature' , 'start' , 'end' , 'score' , 'strand' , 'frame' , 'attribute']
print(len(df_ann_promoters.index))

120744


In [17]:
df_ann_promoters.head(2)

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,1A,Ensembl_Plants,promoter,0,200,.,+,.,ID=gene:ENSRNA050013875;Name=LSU_rRNA_eukarya;...
1,1A,Ensembl_Plants,promoter,3023,5023,.,+,.,ID=gene:ENSRNA050013847;Name=SSU_rRNA_eukarya;...


In [18]:
df_ann_total = pd.concat([df_ann, df_ann_promoters])

In [19]:
len(df_ann_total.index)

1714410

In [20]:
df_anns = {}
for chromosome in df_ann_total.seqname.unique():
    df_anns[chromosome] = df_ann_total[df_ann_total.seqname == chromosome]

In [21]:
millon = 1000000
df_chr_pos = {}
for k,ann_chr in df_anns.items():
    prev = 0
    df_chr_pos[k] = {}
    for k_mill in range(1,90):
        position = k_mill * 10 * millon
        start = max(prev - 10000, 0)
        end = position + 10000
        df_chr_pos[k][k_mill] = df_anns[k][(df_anns[k].start >= start) & (df_anns[k].end <= end)]
        prev = position - millon
        print(k_mill, start, end, len(df_chr_pos[k][k_mill].index))
    

1 0 10010000 2667
2 8990000 20010000 2099
3 18990000 30010000 1592
4 28990000 40010000 1574
5 38990000 50010000 1642
6 48990000 60010000 1755
7 58990000 70010000 837
8 68990000 80010000 1156
9 78990000 90010000 871
10 88990000 100010000 1516
11 98990000 110010000 1463
12 108990000 120010000 890
13 118990000 130010000 548
14 128990000 140010000 758
15 138990000 150010000 616
16 148990000 160010000 457
17 158990000 170010000 405
18 168990000 180010000 334
19 178990000 190010000 748
20 188990000 200010000 221
21 198990000 210010000 447
22 208990000 220010000 382
23 218990000 230010000 447
24 228990000 240010000 781
25 238990000 250010000 1146
26 248990000 260010000 1166
27 258990000 270010000 1137
28 268990000 280010000 501
29 278990000 290010000 741
30 288990000 300010000 1339
31 298990000 310010000 1478
32 308990000 320010000 790
33 318990000 330010000 1182
34 328990000 340010000 1045
35 338990000 350010000 841
36 348990000 360010000 1417
37 358990000 370010000 1321
38 368990000 3800100

In [22]:
howfar = 200
df['feature'] = ''
df['gene'] = ''
rows = []
for k_mite, v_mite in df.iterrows():
    if k_mite % 100 == 0:
        print(k_mite)
    pos = int(int(v_mite.start) / 10 / millon) + 1
    cur = df_chr_pos[v_mite.seqname][pos]
    for k_feature,v_feature in cur.iterrows():
        start = max(int(v_feature.start) - howfar, 0)
        end = int(v_feature.end) + howfar
        #overlaps with gene
        if int(v_mite.end) >= start and int(v_mite.start) <= end:
            df.at[k_mite, 'feature'] = v_feature.feature
            df.at[k_mite, 'gene'] = v_feature.attribute
            print (v_mite.attribute, v_feature.feature)
            break

0
MITE_T_122702|chr2B|476704370|476705072|AT|10|F5743 intron
MITE_T_126801|chr7B|78567635|78568204|CTTCCCT|14|F5915 intron
MITE_T_126801|chr7B|78567635|78568204|CTTCCCT|14|F5915 exon
MITE_T_122702|chr2B|476704370|476705072|AT|10|F5743 exon
MITE_T_122702|chr2B|476704370|476705072|AT|10|F5743 exon
MITE_T_101006|chr4A|583911697|583911802|GC|112|F4445 promoter
MITE_T_80783|chr5A|439864642|439864784|GTAGT|143|F3462 intron
MITE_T_106414|chr7B|461476009|461476104|CA|96|F4797 intron
MITE_T_80367|chr2B|655006817|655006964|CA|150|F3428 three_prime_UTR
MITE_T_65522|chr4D|293550123|293550295|TATA|180|F2651 three_prime_UTR
100
MITE_T_95653|chr4B|466965714|466965828|ATT|18|F4186 three_prime_UTR
MITE_T_86668|chr3D|433565985|433566112|TT|129|F3732 exon
MITE_T_99104|chr6A|581744576|581744687|TA|123|F4312 exon
MITE_T_45812|chr5A|568401370|568401611|ATG|249|F1905 promoter
MITE_T_36769|chr7D|152542784|152543049|TT|272|F1524 promoter
MITE_T_122398|chr2D|334331512|334331576|TA|69|F5714 exon
MITE_T_26389|chr

KeyboardInterrupt: 

In [None]:
file_all_mite_ann = '../../data/TEs/all_filter_mite.ann.gff3'
df.to_csv(file_all_mite_ann, sep='\t', index=None, header=None)

In [None]:
df2 = df
df2.feature[df2.feature == ''] = 'intergenic'
df2 = df2[df2.feature != 'intergenic']

In [None]:
counts  = df2.feature.value_counts()
#city_count = city_count[:10,]
plt.figure(figsize=(10,5))
sns.barplot(counts.index, counts.values, alpha=0.8)
plt.title('Starbucks in top 10 cities in the World')
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('city', fontsize=12)
plt.show()

In [None]:
seq = '1A'
pos_s = 1792810
pos_end = 1793074
df_ann_total[(df_ann_total.seqname == seq) & 
             (df_ann_total.start >= pos_s - 1000)].head(10).sort_values('start')