In [1]:
import Bio.SeqIO
import Bio.Alphabet
import matplotlib.pyplot as plt
import random
import tensorflow as tf
import collections
import numpy as np
from matplotlib import pyplot as plt

In [3]:
all_families = set()
family_counter = collections.Counter()
family_total_lengths = collections.Counter()
family_to_seqs = collections.defaultdict(list)
family_to_big_seqs = collections.defaultdict(list)
for i, rec in enumerate(Bio.SeqIO.parse('../input/Pfam-A.fasta', 'fasta', alphabet=Bio.Alphabet.ProteinAlphabet())):
    length = len(rec.seq)
    # Exclude peptides and extremely long domains
    if 50 <= length <= 512:
        name = rec.description
        words = name.split()
        family = words[2].split(';')[0].split('.')[0]

        family_counter[family] += 1
        family_total_lengths[family] += length
        all_families.add(family)
    
        if length <= 128:
            family_to_seqs[family].append(str(rec.seq))
        elif length <= 512:
            family_to_big_seqs[family].append(str(rec.seq))

In [4]:
selected_seqs = []
for seqs in family_to_seqs.values():
    selected_seqs.extend(seqs)
random.shuffle(selected_seqs)
    
selected_big_seqs = []
for seqs in family_to_big_seqs.values():
    selected_big_seqs.extend(seqs)
random.shuffle(selected_big_seqs)

In [5]:
def _bytes_feature(value):
    b = value.encode('ascii')
    a = np.frombuffer(b, dtype=np.uint8, count=len(b)) - 61
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[a.tobytes()]))

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def write_to_file(filename, data): 
    with tf.io.TFRecordWriter(filename) as fo:
        for seq in data:
            ex = tf.train.Example(features=tf.train.Features(feature={
                'sequence': _bytes_feature(seq),
            }))

            fo.write(ex.SerializeToString())

def write_seqs(name, seqs, K=10):
    chunk_size = len(seqs) // K
    for k, i in enumerate(range(0, len(seqs), chunk_size)):
        fname = "{0}-k{1}.tfrecord".format(name, k+1)
        print("Writing {0}:{1} to {2}...".format(i, i + chunk_size, fname))
        write_to_file('../output/' + fname, seqs[i:i + chunk_size])

In [6]:
write_seqs('bert-sequences', selected_seqs)

Writing 0:1404866 to bert-sequences-k1.tfrecord...
Writing 1404866:2809732 to bert-sequences-k2.tfrecord...
Writing 2809732:4214598 to bert-sequences-k3.tfrecord...
Writing 4214598:5619464 to bert-sequences-k4.tfrecord...
Writing 5619464:7024330 to bert-sequences-k5.tfrecord...
Writing 7024330:8429196 to bert-sequences-k6.tfrecord...
Writing 8429196:9834062 to bert-sequences-k7.tfrecord...
Writing 9834062:11238928 to bert-sequences-k8.tfrecord...
Writing 11238928:12643794 to bert-sequences-k9.tfrecord...
Writing 12643794:14048660 to bert-sequences-k10.tfrecord...
Writing 14048660:15453526 to bert-sequences-k11.tfrecord...


In [8]:
write_seqs('bert-sequences-big', selected_big_seqs, 1)

Writing 0:16946731 to bert-sequences-big-k1.tfrecord...
