In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import matplotlib.pyplot as plt
%matplotlib inline
from tabulate import tabulate

##### generate mini dataset

In [None]:
%%bash
# Build small "mini" subsets of the data: the first 1024 training lines and
# the first 256 test lines.
# Fail fast: without this, a failing first `head` (e.g. missing input file)
# leaves an empty mini file behind while the cell still exits successfully,
# because only the status of the last command is reported.
set -euo pipefail
head -n 1024 data/train_sequences.txt > data/train_sequences_mini.txt
head -n 256 data/test_sequences.txt > data/test_sequences_mini.txt

In [None]:
# Full datasets: tab-separated text without a header row, so pandas assigns
# the default integer column labels (0 and 1).
train = pd.read_csv(
    'data/train_sequences.txt',
    header=None,
    sep='\t',
)
test = pd.read_csv(
    'data/test_sequences.txt',
    header=None,
    sep='\t',
)

#### Embedding
Vocabulary: (A, T, C, G) + N

quote: Each promoter sequence is comprised of the bases A, T, G, and C, and rarely includes an N (in the training data), when a base could not be confidently called during DNA sequencing.

In [None]:
# Check already passed: every training sequence uses only A, T, C, G, N. No need to run it again.

# for s in tqdm(train[0].values):
#     for _ in s:
#         if _ not in ('A', 'T', 'C', 'G', 'N'):
#             print(s)

#### Statistics

In [None]:
# Histogram of column 1 of the training frame (the score column), 15 bins.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(train[1].values, bins=15)
# Label the figure so it can be understood on its own when the notebook is skimmed.
ax.set(
    title='Training score distribution',
    xlabel='Score',
    ylabel='Number of sequences',
)
plt.show()

In [None]:
# Summary statistics of column 1 of the training frame, rendered as a table.
# Each row pairs a label with the reducer applied to the raw value array.
report = tabulate(
    [
        (label, reducer(train[1].values))
        for label, reducer in (
            ('Max', np.max),
            ('Avg', np.mean),
            ('Med', np.median),
            ('Min', np.min),
        )
    ],
    tablefmt='fancy_grid',
)
print(report)

#### Preprocess
Use the mini set as an example.

In [None]:
import torch
import torch.nn as nn

In [None]:
# Reload `train` / `test` from the mini subsets, this time with named columns
# ("sequence", "score") instead of the default integer labels.
train = pd.read_csv(
    'data/train_sequences_mini.txt',
    names=["sequence", "score"],
    header=None,
    sep='\t',
)
test = pd.read_csv(
    'data/test_sequences_mini.txt',
    names=["sequence", "score"],
    header=None,
    sep='\t',
)

In [None]:
# Target padded length. The longest training sequence is 142 bases and every
# test sequence is 110 bases, so 150 covers both with a small margin.
# NOTE(review): `maxlen` is not referenced by the cells visible here;
# seq2tensor's `maxlength` default repeats the value 150 - keep them in sync.
maxlen = 150

##### pad sequence

In [None]:
# Token -> integer code lookup: the four bases, then 'N', then the padding
# token. Codes are the positions in this tuple (A=0 ... <PAD>=5).
pad_dict = {
    token: code
    for code, token in enumerate(('A', 'T', 'C', 'G', 'N', '<PAD>'))
}

In [None]:
def seq2tensor(seq, maxlength=150):
    """Encode a sequence string as a fixed-length tensor of integer codes.

    Each character of ``seq`` is looked up in ``pad_dict``; positions past the
    end of the sequence are filled with the ``'<PAD>'`` code.

    Args:
        seq: String of tokens, each of which must be a key of ``pad_dict``.
        maxlength: Length of the returned tensor.

    Returns:
        A 1-D ``torch.long`` tensor with ``maxlength`` elements.

    Raises:
        ValueError: If ``seq`` has more than ``maxlength`` characters.
        KeyError: If ``seq`` contains a character missing from ``pad_dict``.
    """
    # Reject over-long input up front with an explicit message; otherwise the
    # failure would surface as an out-of-range index error partway through.
    if len(seq) > maxlength:
        raise ValueError(
            f"sequence of length {len(seq)} does not fit in maxlength={maxlength}"
        )
    # Collect the codes in a Python list and build the tensor once, instead of
    # writing into the tensor one element at a time.
    codes = [pad_dict[base] for base in seq]
    codes.extend([pad_dict['<PAD>']] * (maxlength - len(codes)))
    return torch.tensor(codes, dtype=torch.long)

In [None]:
def get_data(df):
    """Convert a sequence/score DataFrame into a pair of tensors.

    Args:
        df: DataFrame with a "sequence" column (strings accepted by
            ``seq2tensor``) and a "score" column.

    Returns:
        A ``(seqs, scores)`` tuple: ``seqs`` stacks ``seq2tensor(sequence)``
        for every row, in row order; ``scores`` is ``torch.tensor`` applied to
        the list of score values in the same order.
    """
    # Walk the raw column values positionally. `df["sequence"][i]` is a label
    # lookup, so it raises KeyError (or silently picks the wrong row) as soon
    # as the index is not 0..n-1, e.g. after filtering or shuffling the frame.
    sequences = df["sequence"].to_numpy()
    seqs = [seq2tensor(seq) for seq in tqdm(sequences)]
    # Keep the scores as a list of per-row scalars, which is what the row-wise
    # loop used to collect, so torch.tensor infers the dtype the same way.
    scores = list(df["score"].to_numpy())
    return torch.stack(seqs), torch.tensor(scores)

In [None]:
# Encode the mini training frame into a stacked sequence tensor and a score tensor.
train_seqs, train_scores = get_data(train)

In [None]:
# Encode the mini test frame the same way.
test_seqs, test_scores = get_data(test)

In [None]:
# Persist the encoded mini training set as a (sequences, scores) tuple.
torch.save((train_seqs, train_scores), 'train_mini.pt')

In [None]:
# Persist the encoded mini test set as a (sequences, scores) tuple.
torch.save((test_seqs, test_scores), 'test_mini.pt')

In [None]:
# Display the encoded training sequence tensor for inspection.
train_seqs