In [18]:
import pandas as pd
# Chromosome whose BLAST hits this notebook filters.
chromosome = "1A"
# The raw BLAST table (input) and the filtered table (output) share one folder.
blast_dir = '../../../data/TEs/blast/'
path_blast = blast_dir + chromosome + '.fasta.csv'
path_blast_filtered = blast_dir + chromosome + '.filtered.csv'


In [19]:
# Filtering thresholds for the TE BLAST hits.
params = dict(
    min_len=50,        # keep hits with qlen above this; skipped when falsy
    max_len=False,     # keep hits with qlen below this; False disables the filter
    min_distance=5,    # start/end difference at or below which two hits count as overlapping
    max_q=1.3,         # upper bound on alignment length / query length
    min_q=0.7,         # lower bound on alignment length / query length
    min_pident=80,     # minimum value of the pident column
    min_qcov=50,       # minimum value of the qcovs column
)


In [20]:
# Load the tab-separated BLAST table (it has no header row) and name its columns.
blast_columns = [
    'qseqid', 'sseqid', 'qstart', 'qend', 'sstart', 'send', 'mismatch',
    'gaps', 'pident', 'evalue', 'length', 'qlen', 'slen', 'qcovs',
]
df = pd.read_csv(path_blast, header=None, sep='\t')
df.columns = blast_columns
# Row count before any filtering; the summary cell prints it again.
initial = len(df.index)
print('initial:', initial)

initial: 3346762


In [21]:
# Length filter, lower bound: keep hits whose query length exceeds min_len.
# A falsy min_len leaves the table untouched.
min_len_cutoff = params['min_len']
if min_len_cutoff:
    df = df.loc[df['qlen'] > min_len_cutoff]
# Stored as a string; the summary cell prints it again.
min_length = str(len(df.index))
print('Min len: ' + min_length)

Min len: 3346743


In [22]:
# Length filter, upper bound: keep hits whose query length is below max_len.
# A falsy max_len (the configured default is False) skips the filter.
max_len_cutoff = params['max_len']
if max_len_cutoff:
    df = df.loc[df['qlen'] < max_len_cutoff]
max_length = str(len(df.index))
print('Max len: ' + max_length)

Max len: 3346743


In [23]:
# Alignment-length / query-length threshold, lower bound:
# keep hits whose alignment covers at least min_q of the query length.
coverage_ratio = df['length'] / df['qlen']
df = df.loc[coverage_ratio >= params['min_q']]
min_treshold = str(len(df.index))
print('min treshold:', min_treshold)

min treshold: 789036


In [24]:
# Alignment-length / query-length threshold, upper bound:
# drop hits whose alignment is longer than max_q times the query length.
within_upper_bound = (df['length'] / df['qlen']).le(params['max_q'])
df = df[within_upper_bound]
max_treshold = str(len(df.index))
print('max treshold:', max_treshold)

max treshold: 789036


In [25]:
# Identity filter: keep hits whose pident is at least min_pident.
identity_ok = df['pident'].ge(params['min_pident'])
df = df.loc[identity_ok]
min_pident = str(len(df.index))
print('Min_pident: ' + min_pident)

Min_pident: 733546


In [26]:
# Coverage filter: keep hits whose qcovs is at least min_qcov.
coverage_ok = df['qcovs'].ge(params['min_qcov'])
df = df.loc[coverage_ok]
min_qcov = str(len(df.index))
print('Min qcov: ' + min_qcov)

Min qcov: 733546


In [27]:
# Order the subject coordinates so that sstart <= send on every row.
# Both new columns are computed from the untouched pair first, and `assign`
# returns a new frame, so no column is written into a filtered slice (the
# pattern that triggers SettingWithCopyWarning) and no temporary columns
# are needed.
subject_span = df[['sstart', 'send']]
df = df.assign(
    sstart=subject_span.min(axis=1),
    send=subject_span.max(axis=1),
)

# Split the hits by subject sequence in a single pass, so the overlap filter
# only has to look at hits on the same sequence.
dfs = {seq: hits for seq, hits in df.groupby('sseqid', sort=False)}


In [28]:
# Filter overlapped hits.
# Walk the hits in table order. A hit that has not been discarded is kept,
# and every hit on the same subject sequence whose start OR end lies within
# min_distance of it is marked as discarded.
rows = []           # kept rows, turned back into a DataFrame in the next cell
discard = set()     # index labels to skip; a set keeps the membership test O(1)
max_gap = params['min_distance']
total = len(df.index)
curr = 0            # last progress percentage that was printed
for count, (index, row) in enumerate(df.iterrows(), 1):
    # Progress: print each new whole percentage once.
    curr_new = int(count * 100.0 / total)
    if curr_new != curr:
        curr = curr_new
        print(curr_new)
    if index in discard:
        continue
    same_seq = dfs[row.sseqid]
    near = (((same_seq.sstart - row.sstart).abs() <= max_gap)
            | ((same_seq.send - row.send).abs() <= max_gap))
    hits = same_seq.index[near.values]
    # The current row always matches itself, so more than one match means
    # at least one other hit overlaps it.
    if len(hits) > 1:
        discard.update(hits)
    rows.append(row)

1
2
3
4
5
6


KeyboardInterrupt: 

In [None]:
# Rebuild the table from the kept rows, ordered by subject sequence and start.
df = pd.DataFrame(rows).sort_values(by=['sseqid', 'sstart'])
# Stored as a string; the summary cell prints it again.
non_overlapped = str(len(df.index))
print('Non overlapped: ' + non_overlapped)

In [None]:
# Write the filtered hits as a tab-separated file without the index column.
# `params` has no 'file' entry (looking it up raised KeyError), so the
# destination is the filtered-output path configured at the top of the notebook.
filename = path_blast_filtered
df.to_csv(filename, index=False, sep='\t')
filename

In [None]:
# Recap: number of hits left after each filtering stage, then the output path.
summary = [
    ('Initial: ', initial),
    ('Min len: ', min_length),
    ('Max len: ', max_length),
    ('Min treshold: ', min_treshold),
    ('Max treshold: ', max_treshold),
    ('Min pident: ', min_pident),
    ('Min qcov: ', min_qcov),
    ('Non overlapped: ', non_overlapped),
    ('Saved: ', path_blast_filtered),
]
for label, value in summary:
    print(label + str(value))

In [None]:
# Alternative overlap filter: sort the hits, then compare each row with the rows that follow it.

In [15]:
# Sort hits by subject sequence, then start, then end, and renumber the rows
# 0..n-1. drop=True discards the old index directly instead of inserting it
# as a column and dropping it by name, which fails when the index is named
# or an 'index' column already exists.
df = df.sort_values(by=['sseqid', 'sstart', 'send']).reset_index(drop=True)

In [17]:
# Neighbour-scan overlap filter.
# Assumes df is sorted by sseqid/sstart/send and has a freshly reset index,
# so the row positions collected here can be matched against df.index later.
max_gap = params['min_distance']
seq_ids = df.sseqid.values
starts = df.sstart.values
ends = df.send.values
total_len = len(df.index)

indexes = []    # positions of the rows to keep (the first row of each run)
curr = 0        # last progress percentage seen
my_index = 0
while my_index < total_len:
    # Skip every following row on the same subject sequence whose start or
    # end lies within max_gap of the anchor row. The bound check stops the
    # scan at the end of the table instead of indexing past it.
    scan = my_index + 1
    while (scan < total_len
           and seq_ids[scan] == seq_ids[my_index]
           and (abs(starts[scan] - starts[my_index]) <= max_gap
                or abs(ends[scan] - ends[my_index]) <= max_gap)):
        scan += 1
    indexes.append(my_index)
    # Resume at the first row that did NOT overlap the anchor, so that row
    # becomes the next anchor instead of being skipped.
    my_index = scan
    #just a counter
    curr_new = int(my_index * 100 * 1.0 / (total_len * 1.0))
    if curr_new != curr:
        curr = curr_new
        if curr_new % 5 == 0:
            print(curr_new)

KeyboardInterrupt: 

In [None]:
df = df[df.index.isin(indexes)]
df.sort_values(['sseqid', 'sstart'], inplace=True)
print('Non overlapped: ' + str(len(df.index
