<a href="https://colab.research.google.com/github/katarinagresova/benchmarks/blob/main/enhancers/Enhancers_Data_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook works with data from the paper [Enhancer Identification using Transfer and Adversarial Deep Learning of DNA Sequences](https://www.biorxiv.org/content/biorxiv/early/2018/02/14/264200.full.pdf).

In [None]:
# Mount Google Drive into the Colab filesystem; the cells below read and
# write their data under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# %pip (unlike !pip3) installs into the environment of the running kernel,
# so the package is importable from the cells below.
%pip install biopython

In [14]:
import os
import urllib
import urllib.request
from pathlib import Path

from fastai.text import *
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from sklearn.model_selection import train_test_split

In [None]:
# Source archive URL and its destination on the mounted Drive.
u = 'http://www.cs.huji.ac.il/~tommy//enhancer_CNN/Enhancers_vs_negative.tgz'
path = Path('/content/drive/My Drive/Benchmarks/enhancers/Enhancers_vs_negative.tgz')
# urlretrieve does not create missing directories, so make sure the
# destination folder exists before downloading.
path.parent.mkdir(parents=True, exist_ok=True)
urllib.request.urlretrieve(u, path)

In [None]:
# -p keeps this cell re-runnable: missing parents are created and an
# already existing directory is not an error.
!mkdir -p /content/drive/My\ Drive/Benchmarks/enhancers/unpacked/

In [None]:
# Extract the downloaded gzip-compressed archive into the unpacked/ directory
# (z: gzip, x: extract, v: list every extracted file, f: archive path).
!tar zxvf /content/drive/My\ Drive/Benchmarks/enhancers/Enhancers_vs_negative.tgz -C /content/drive/My\ Drive/Benchmarks/enhancers/unpacked/

In [11]:
def read_txt_to_list(txt_file):
  """Read a text file that holds one sequence per line.

  Args:
    txt_file: path of the text file to read.

  Returns:
    A list with one string per non-blank line, in file order. Surrounding
    whitespace is stripped and blank lines are skipped, so no empty
    sequence is handed on to the FASTA writer.
  """
  with open(txt_file, 'r') as file:
    stripped = (line.strip() for line in file.read().splitlines())
    return [line for line in stripped if line]

In [27]:
def split_train_val_test(data, train_ratio = 0.7, validation_ratio = 0.2, test_ratio = 0.1, random_state = 42):
  """Split data into train, validation and test parts using two chained random splits.

  Args:
    data: collection accepted by sklearn's train_test_split.
    train_ratio: intended share of the train part. It is not used by the
      computation itself (train receives whatever is left after the other
      two splits); it is only checked for consistency with the other ratios.
    validation_ratio: share of the whole dataset used for validation.
    test_ratio: share of the whole dataset used for testing.
    random_state: seed passed to both splits; the default 42 reproduces the
      previously hard-coded behaviour.

  Returns:
    Tuple (x_train, x_val, x_test).
  """
  import warnings

  # Inconsistent ratios used to go unnoticed because train_ratio is never
  # applied; warn instead of raising so existing callers keep working.
  if abs(train_ratio + validation_ratio + test_ratio - 1.0) > 1e-9:
    warnings.warn(
      'train_ratio + validation_ratio + test_ratio should equal 1; '
      'the train part receives whatever remains after the validation and test splits.'
    )

  # Produces test split.
  x_remaining, x_test = train_test_split(data, test_size=test_ratio, random_state=random_state)

  # Adjusts val ratio, w.r.t. remaining dataset.
  ratio_remaining = 1 - test_ratio
  ratio_val_adjusted = validation_ratio / ratio_remaining

  # Produces train and val splits.
  x_train, x_val = train_test_split(x_remaining, test_size=ratio_val_adjusted, random_state=random_state)

  return x_train, x_val, x_test

In [30]:
def save_to_fasta(filename, data):
  """Write the given sequences to a FASTA file.

  Each sequence becomes one record whose id is 'seq_<position>' (position
  counted from 0 in the order of data) and whose description is empty.

  Args:
    filename: path of the FASTA file to create or overwrite.
    data: iterable of sequence strings.
  """
  with open(filename, 'w') as fasta_file:
    # Records are built lazily and streamed through a single write call.
    records = (
      SeqRecord(Seq(sequence), 'seq_' + str(position), description="")
      for position, sequence in enumerate(data)
    )
    SeqIO.write(records, fasta_file, 'fasta')

In [32]:
def txt_to_fasta_with_split(txt_file, out_dir, prefix='positive'):
  """Read sequences from a text file, split them and save each part as FASTA.

  Args:
    txt_file: path of the input text file, read with read_txt_to_list.
    out_dir: directory that receives the output files.
    prefix: file name prefix; the outputs are named <prefix>_train.fa,
      <prefix>_valid.fa and <prefix>_test.fa.
  """
  sequences = read_txt_to_list(txt_file)
  train_part, valid_part, test_part = split_train_val_test(sequences)

  parts_by_suffix = (
    ('_train.fa', train_part),
    ('_valid.fa', valid_part),
    ('_test.fa', test_part),
  )
  for suffix, part in parts_by_suffix:
    save_to_fasta(Path(out_dir, prefix + suffix), part)

In [33]:
from pathlib import Path
def make_dir(dir):
  """Create the directory dir, including missing parents; an existing directory is left as is."""
  target = Path(dir)
  target.mkdir(parents=True, exist_ok=True)

In [36]:
# Input (unpacked archive) and output locations on the mounted Drive.
in_dir = '/content/drive/My Drive/Benchmarks/enhancers/unpacked/'
out_dir = '/content/drive/My Drive/Benchmarks/enhancers/prepared/'
# File names looked up inside every dataset directory.
enhancer_file = 'positive_samples'
negative_file = 'negative_samples'
# Directory name that must not be treated as a dataset.
exclude_dir = 'peaks_fasta_files'

for (root, subdirectories, files) in os.walk(in_dir):
  # Prune in place so os.walk neither reports nor descends into the
  # excluded directory (a plain name check would still walk its children).
  subdirectories[:] = [name for name in subdirectories if name != exclude_dir]

  for subdirectory in subdirectories:
    current_in_dir = os.path.join(root, subdirectory)
    positive_path = os.path.join(current_in_dir, enhancer_file)
    negative_path = os.path.join(current_in_dir, negative_file)

    # Only directories that hold both sample files are datasets; anything
    # else is reported and skipped instead of aborting the whole run with
    # a FileNotFoundError.
    if not (os.path.isfile(positive_path) and os.path.isfile(negative_path)):
      print('Skipping ' + current_in_dir + ': sample files not found')
      continue

    current_out_dir = os.path.join(out_dir, subdirectory)
    make_dir(current_out_dir)

    txt_to_fasta_with_split(positive_path, current_out_dir, 'positive')
    txt_to_fasta_with_split(negative_path, current_out_dir, 'negative')