In [1]:
from Bio import SeqIO
import pandas as pd
import re

In [2]:
# Base name of the FASTA file to clean (no directory, no extension).
in_fasta = 'dlh'

# Derived locations: the raw input file and the cleaned output file.
in_path = f'inputs/{in_fasta}.fasta.txt'
out_path = f'outputs/clean_{in_fasta}.fasta.txt'

In [3]:
# put sequences into a pandas dataframe
# The file is opened in a `with` block so the handle is closed as soon as all
# records have been read (the parser is lazy, so the records must be consumed
# while the file is still open).
with open(in_path) as in_file:
    fasta_seqs = SeqIO.parse(in_file, 'fasta')
    # One [id, sequence] row per record; surrounding whitespace is stripped
    # from the sequence string.
    data = [[fasta.id, str(fasta.seq).strip()] for fasta in fasta_seqs]

sequences_df = pd.DataFrame(data, columns = ['name', 'sequence'])

print('Total number of input sequences:', len(sequences_df))

sequences_df.head()

Total number of input sequences: 70110


Unnamed: 0,name,sequence
0,A0A009EPN2|IPR002925|21...241|DLH,MAAIKTREIQYTAQDGSTLIGYFAAPETDAPVAGVIVAPEWWGRND...
1,A0A009GF28|IPR002925|22...242|DLH,MSTAIKTREIQYTAPDGSHLIGYFAAPDSETPVAGVIVAPEWWGRN...
2,A0A009GFC1|IPR002925|16...232|Dienelactone,MAGQTIQIKTASGKQFSAYLATPETGKGPGVVLCQEIFGVNAAMRE...
3,A0A009GGE5|IPR002925|16...231|DLH,MAGQTVQIKTASGKQFSAYLATPETEKGPGVVLCQEIFGVNAAMRE...
4,A0A009H5J6|IPR002925|22...242|DLH,MSTAIKTREIQYTAPDGSHLIGYFAAPDSETPVAGVIVAPEWWGRN...


In [4]:
# remove sequences with length > 1000 AA's
MAX_SEQ_LEN = 1000

# Vectorised length test. The mask is aligned to the frame's own index, so the
# selection does not depend on the index being exactly 0..n-1.
too_long = sequences_df.sequence.str.len() > MAX_SEQ_LEN
rows2drop = sequences_df.index[too_long].tolist()

print('Total number of sequences dropped:', len(rows2drop))

# drop=True discards the old index rather than inserting it as an extra
# 'index' column, which keeps the frame's columns unchanged and makes the
# cell safe to re-run.
sequences_df = sequences_df.drop(rows2drop).reset_index(drop=True)

print('Total number of sequences remaining:', len(sequences_df))


Total number of sequences dropped: 231
Total number of sequences remaining: 69879


In [5]:
# remove sequences with invalid AA residues
#valid_alphabet = ['F','I','W','L','V','M','Y','C','A','T','H','G','S','Q','R','K','N','E','P','D']

def validate(seq, pattern=re.compile(r'^[FIWLVMYCATHGSQRKNEPD]+$')):
    """Return True if `seq` consists solely of characters allowed by `pattern`.

    Parameters
    ----------
    seq : str
        Sequence string to check.
    pattern : compiled regular expression, optional
        Pattern the whole string must satisfy. The default accepts one or more
        of the 20 upper-case letters F I W L V M Y C A T H G S Q R K N E P D.

    Returns
    -------
    bool
        True when the entire string matches, False otherwise (including for
        the empty string, since the default pattern requires at least one
        character).
    """
    # fullmatch() requires the match to span the entire string. With match(),
    # the '$' anchor also succeeds just before a trailing newline, so a value
    # such as 'ACD\n' would wrongly be accepted.
    return pattern.fullmatch(seq) is not None

# Apply validate() to every sequence; astype(bool) guarantees a boolean mask
# even when the frame is empty.
is_valid = sequences_df.sequence.map(validate).astype(bool)

# Index labels of the rows whose sequence failed validation.
invalid_seqs = sequences_df.index[~is_valid].tolist()

print('Total number of sequences dropped:', len(invalid_seqs))

# drop=True discards the old index rather than inserting it as an extra
# column, which keeps the frame's columns unchanged and makes the cell safe
# to re-run.
sequences_df = sequences_df.drop(invalid_seqs).reset_index(drop=True)

print('Total number of sequences remaining:', len(sequences_df))


Total number of sequences dropped: 852
Total number of sequences remaining: 69027


In [6]:
# Write each record as two lines: the name, then the sequence.
# NOTE(review): the name line is written without a leading '>' marker, so the
# output is not standard FASTA despite the file name - confirm whether the
# downstream consumer expects this plain two-line layout.
# The `with` block closes the file even if a write raises part-way through.
with open(out_path, "w") as out_file:
    for name, sequence in zip(sequences_df['name'], sequences_df['sequence']):
        out_file.write(name + '\n')
        out_file.write(sequence + '\n')