In [3]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import matplotlib.pyplot as plt
%matplotlib inline
from tabulate import tabulate

##### generate mini dataset

In [47]:
with open('data/train_sequences.txt', 'r') as fpr:
    with open('data/train_sequences_mini.txt', 'w') as fpw:
        for _ in range(1024):
            ln = fpr.readline()
            fpw.write(ln)
            
with open('data/test_sequences.txt', 'r') as fpr:
    with open('data/test_sequences_mini.txt', 'w') as fpw:
        for _ in range(256):
            ln = fpr.readline()
            fpw.write(ln)

In [48]:
train = pd.read_csv('data/train_sequences.txt', sep='\t', header=None)
test = pd.read_csv('data/test_sequences.txt', sep='\t', header=None)

#### Embedding
(A, T, C , G) + N

quote: Each promoter sequence is comprised of the bases A, T, G, and C, and rarely includes an N (in the training data), when a base could not be confidently called during DNA sequencing.

In [None]:
# Passed! don't run again

# for s in tqdm(train[0].values):
#     for _ in s:
#         if _ not in ('A', 'T', 'C', 'G', 'N'):
#             print(s)

#### Statistics

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(train[1].values, bins=15)
plt.show()

In [None]:
report = tabulate([
    ('Max', train[1].values.max()),
    ('Avg', train[1].values.mean()),
    ('Med', np.median(train[1].values)),
    ('Min', train[1].values.min())
], tablefmt='fancy_grid'
)
print(report)

#### Preprocess
use mini set as example

In [50]:
import torch
import torch.nn as nn

In [210]:
train = pd.read_csv('data/train_sequences.txt', sep='\t', header=None)
test = pd.read_csv('data/test_sequences.txt', sep='\t', header=None)

In [211]:
# max length = 142 in train and length = 110 for all test sequences
maxlen = 150

##### pad sequence

In [212]:
pad_dict = {
    'A': 0,
    'T': 1,
    'C': 2,
    'G': 3,
    'N': 4,
    '<PAD>': 5,
}

In [213]:
def seq2tensor(seq, maxlength=150):
    tensor = torch.zeros(maxlength, dtype=torch.long) + pad_dict['<PAD>']
    for i in range(len(seq)):
        tensor[i] = pad_dict[seq[i]]
    return tensor

In [214]:
def get_data(df):
    seqs = [None] * len(df)
    scores = [None] * len(df)
    for i in tqdm(range(len(df))):
        seqs[i] = seq2tensor(df[0][i]).long()
        scores[i] = df[1][i]
    seqs = torch.stack(seqs)
    scores = torch.tensor(scores)
    return seqs, scores

In [224]:
train_seqs, train_scores = get_data(train)

100%|██████████| 6739258/6739258 [47:33<00:00, 2361.42it/s]  


In [225]:
test_seqs, test_scores = get_data(test)

100%|██████████| 71103/71103 [00:29<00:00, 2440.17it/s]


In [226]:
torch.save((train_seqs, train_scores), 'train_full.pt')

In [227]:
torch.save((test_seqs, test_scores), 'test_full.pt')

In [228]:
train_seqs

tensor([[1, 3, 2,  ..., 5, 5, 5],
        [1, 3, 2,  ..., 5, 5, 5],
        [1, 3, 2,  ..., 5, 5, 5],
        ...,
        [1, 3, 2,  ..., 5, 5, 5],
        [1, 3, 2,  ..., 5, 5, 5],
        [1, 3, 2,  ..., 5, 5, 5]])