In [2]:
# Mount Google Drive inside the Colab VM at /content/drive; the notebook later
# changes into a folder under this mount point to find its data files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Fetch the colab-biolab helper repository and run its setup script
# (presumably installs conda into the Colab VM — confirm against the repo).
!git clone https://github.com/hyeshik/colab-biolab.git
!cd colab-biolab && bash tools/setup.sh
# NOTE(review): this exec()s a script cloned from GitHub at run time directly in
# the kernel, so the notebook's behaviour depends on whatever that repo contains
# at the moment of the clone. Consider pinning a commit.
exec(open('colab-biolab/tools/activate_conda.py').read())
# bedtools is needed for the `bedtools getfasta` calls further down; bioawk is not
# used in the visible part of this notebook.
!conda install -y bedtools bioawk

In [7]:
# Make the data-pack folder on Drive the working directory so that the relative
# file names used by the following cells resolve there.
%cd /content/drive/MyDrive/binfo1-datapack01/

/content/drive/MyDrive/binfo1-datapack01


In [None]:
# Install extra Python packages and download the NLTK 'punkt' tokenizer data.
# NOTE(review): none of contractions / textsearch / tqdm / nltk is referenced in
# the visible part of this notebook — confirm they are still needed. The installs
# are unpinned, and `!pip` may target a different environment than the running
# kernel after the conda activation above — verify.
!pip install contractions
!pip install textsearch
!pip install tqdm
import nltk
nltk.download('punkt')

In [10]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder

# Fix NumPy's global random seed for reproducibility.
# NOTE(review): only NumPy's global RNG is seeded here; the RNGs of other
# libraries used in this notebook (e.g. TensorFlow) are not — confirm whether
# they need seeding as well.
seed = 42
np.random.seed(seed)

Prepare the train and test sets

In [8]:
# Extract the genome sequence of every interval listed in each BED file from the
# GRCm39 FASTA. `-tab` asks bedtools for tab-delimited "name<TAB>sequence" lines
# instead of FASTA records, which is the layout the next cell reads.
!bedtools getfasta -fi GRCm39.primary_assembly.genome.fa -bed fifty-over_entropy.bed -fo fifty-over_entropy.txt -tab
!bedtools getfasta -fi GRCm39.primary_assembly.genome.fa -bed fifty-under_entropy.bed -fo fifty-under_entropy.txt -tab

In [11]:
# Read the two tab-separated bedtools outputs (column names are supplied here) and
# tag every row with a `value` label saying which file it came from.
over = pd.read_csv(
    "fifty-over_entropy.txt", sep='\t', names=['position', 'seq']
).assign(value='over')
under = pd.read_csv(
    "fifty-under_entropy.txt", sep='\t', names=['position', 'seq']
).assign(value='under')

In [17]:
# Stack the two labelled frames into a single dataset. The index is not reset
# here, so each half keeps its own 0-based row labels until the later
# shuffle cell renumbers them.
dataset = pd.concat([over, under])

In [18]:
dataset

Unnamed: 0,position,seq,value
0,chr15:55822163-55822169,CAAGAG,over
1,chr5:134173076-134173082,AGAAAA,over
2,chr11:18277202-18277208,CATTCA,over
3,chr8:124328381-124328387,CCGAGA,over
4,chr16:17040076-17040082,AACTCT,over
...,...,...,...
49995,chr8:112735968-112735974,GCAGGC,under
49996,chr10:34048376-34048382,GGGCTT,under
49997,chr11:4770557-4770563,CAATTG,under
49998,chr12:80245895-80245901,CCATGT,under


In [22]:
# The original cell called `dataset.drop_duplicates(subset=['seq'])` without keeping
# the result. `drop_duplicates` returns a new frame, so that call removed nothing
# (the ~100k-row output displayed after this cell confirms it). The dead call is
# removed rather than assigned: the sequences shown are only 6 bases long, so
# keeping one row per distinct `seq` would leave far fewer rows than the fixed
# 70000-row training slice taken further down.
# NOTE(review): decide deliberately whether repeated sequences (possibly carrying
# conflicting 'over'/'under' labels) should be de-duplicated, and resize the
# train/test split accordingly if so.

# Shuffle every row, then renumber the index 0..n-1 so positional slicing is clean.
dataset = dataset.sample(frac=1).reset_index(drop=True)

In [23]:
dataset

Unnamed: 0,position,seq,value
0,chr8:93124747-93124753,TTCCAA,under
1,chr11:29689829-29689835,TAACTG,over
2,chr10:28060869-28060875,GAGGGG,over
3,chr3:157766178-157766184,ATTAGA,under
4,chr1:40020640-40020646,CCTTTG,under
...,...,...,...
99995,chr1:4882916-4882922,AAAAAC,over
99996,chr8:11608197-11608203,TTTTTT,over
99997,chr11:59396650-59396656,CACCTT,over
99998,chr2:113675656-113675662,CCAAAT,over


In [None]:
# Pull the sequence and label columns out as arrays.
seq, val = (dataset[column].values for column in ('seq', 'value'))

# The first 70000 (already shuffled) rows form the training set, the rest the test set.
train_seq, test_seq = seq[:70000], seq[70000:]
train_val, test_val = val[:70000], val[70000:]

In [None]:
def label_to_count(labels):
    '''
    Count how often each label occurs.

    ARGS:
        labels: an iterable of hashable class labels
    RETURNS:
        dict mapping each distinct label to its number of occurrences, with keys in
        order of first appearance
    '''
    counts = {}
    for item in labels:
        # start from 0 the first time a label is seen
        counts[item] = counts.get(item, 0) + 1
    return counts


def prepare_data(seqs):
    '''
    Turn a list of sequence strings into lists of integer tokens.

    The vocabulary starts with the fixed entries <pad>=0, <unk>=1, A=2, T=3, C=4, G=5.
    Sequences are upper-cased before tokenizing, so 'a' and 'A' share a token. Any
    other letter gets the next free token the first time it is seen (so <unk> is
    reserved but never emitted by this function).

    ARGS:
        seqs: a list of strings where every string is a sequence
    RETURNS:
        tokenized_seqs (list(list(int))): list of list of tokens
        voc2ind (dict) a dictionary where keys are letters, values are the corresponding token
    '''
    # build up a voc2ind (letters:token)
    # based on ATGC and include padding and unknown tokens
    voc2ind = {voc: ind for ind, voc in enumerate(['<pad>', '<unk>', 'A', 'T', 'C', 'G'])}

    # tokenize the sequences
    tokenized_seqs = []
    for seq in seqs:
        tokenized_seq = []
        # Upper-case BEFORE iterating. Rebinding `seq` inside the letter loop (as the
        # previous version did) does not affect the characters already being iterated,
        # so lower-case letters used to be treated as new vocabulary entries.
        for e in seq.upper():
            # if we haven't seen this letter before, add it to the vocabulary;
            # the next free token always equals the current vocabulary size
            if e not in voc2ind:
                voc2ind[e] = len(voc2ind)
            tokenized_seq.append(voc2ind[e])
        tokenized_seqs.append(tokenized_seq)

    return tokenized_seqs, voc2ind
        
# Smoke test for prepare_data: the known bases map to the fixed tokens 2-5, and
# letters outside the base vocabulary get the next free tokens in order of
# first appearance.
res = prepare_data(['ATCG', 'TAGA', 'APO'])
print(res)
assert res[0] == [
    [2, 3, 4, 5],  # ATCG
    [3, 2, 5, 2],  # TAGA
    [2, 6, 7],     # APO: 'P' and 'O' are new letters
], res[0]


def prepare_labels(labels):
    '''
    Map class labels to consecutive integer tokens.

    Tokens are handed out in order of first appearance, starting at 0.

    Args:
        labels: an iterable of hashable labels
    Returns:
        tokenized_labels: (list(int)) the token of every input label, in input order
        label2token: (dict) a dictionary where keys are labels, values are corresponding token
    '''
    label2token = {}
    tokenized_labels = []
    for item in labels:
        # an unseen label receives the next free id, i.e. the current mapping size
        token = label2token.setdefault(item, len(label2token))
        tokenized_labels.append(token)
    return tokenized_labels, label2token


def pad(tokenized_seqs, voc2ind):
    '''
    Right-pad every tokenized sequence with the <pad> token up to the longest length.

    ARGS:
        tokenized_seqs (list(list(int))): list of list of tokens
        voc2ind (dict) a dictionary where keys are letters, values are the corresponding
            token; only its '<pad>' entry is read here
    RETURNS:
        a numpy array (dtype float32) with one row per sequence, all rows padded to the
        same length.
        NOTE(review): token ids are stored as float32 — confirm the consumer expects
        floats rather than integer ids.
    '''
    # length of the longest sequence (0 when there are no sequences at all)
    width = max((len(tokens) for tokens in tokenized_seqs), default=0)

    # append as many <pad> tokens as each sequence is short of the longest one
    padded_seqs = [
        tokens + [voc2ind['<pad>']] * (width - len(tokens))
        for tokens in tokenized_seqs
    ]

    return np.array(padded_seqs, dtype=np.float32)


def data_loader(train_inputs, val_inputs, train_labels, val_labels,
                batch_size=50):
    """
    Wrap the train and validation data in torch DataLoaders.

    All four inputs are converted with torch.tensor. The training loader draws
    batches through a RandomSampler, the validation loader through a
    SequentialSampler; both use `batch_size`.

    Returns:
        (train_dataloader, val_dataloader)
    """
    # Convert everything to tensors first
    train_x = torch.tensor(train_inputs)
    val_x = torch.tensor(val_inputs)
    train_y = torch.tensor(train_labels)
    val_y = torch.tensor(val_labels)

    # Training data: randomly sampled batches
    train_set = TensorDataset(train_x, train_y)
    train_loader = DataLoader(train_set, sampler=RandomSampler(train_set),
                              batch_size=batch_size)

    # Validation data: batches in the original order
    val_set = TensorDataset(val_x, val_y)
    val_loader = DataLoader(val_set, sampler=SequentialSampler(val_set),
                            batch_size=batch_size)

    return train_loader, val_loader