In [1]:
from matplotlib import pyplot as plt
import numpy as np
import statsmodels.api as sm
from scipy import stats
import scipy
import csv
import pandas as pd
import math
import statsmodels
import itertools
import seaborn as sns
import random
import os
from textwrap import wrap
import pickle
import gzip
import random
from time import sleep
import sys

In [3]:
# Load the three E032 tagAlign tracks. The files are gzip-compressed and
# tab-separated with no header row, so pandas labels the columns 0, 1, 2, ...
# and the rest of the notebook addresses them by integer position
# (column 0 is compared against the chromosome name, columns 1 and 2 are used
# as interval start/end, column 5 is compared against '+').
E032_DNase = pd.read_csv('data/E032/E032-DNase.tagAlign.gz', sep='\t', header=None)
E032_H3K4me3 = pd.read_csv('data/E032/E032-H3K4me3.tagAlign.gz', sep='\t', header=None)
E032_H3K36me3 = pd.read_csv('data/E032/E032-H3K36me3.tagAlign.gz', sep='\t', header=None)
# get_data() takes the histone-mark tracks and the DNase tracks as two
# separate lists; each track becomes one feature column per window.
chromatin_marks = [E032_H3K4me3, E032_H3K36me3]
dnase = [E032_DNase]

In [4]:
def drawProgressBar(percent, barLen = 20):
    """Redraw a one-line text progress bar on stdout.

    percent -- completed fraction as a float (0 means empty, 1 means full).
    barLen  -- width of the bar between the brackets, in characters.

    The line starts with a carriage return so successive calls overwrite
    each other; no newline is ever written.
    """
    out = sys.stdout
    out.write("\r")
    filled = "=" * int(barLen * percent)
    out.write("[{:<{}}] {:.0f}%".format(filled, barLen, percent * 100))
    out.flush()

In [5]:
def make_spot(chromosome, threshold_hot=35, threshold_cold=0.0014):
    """Build fixed-size windows around high- and low-rate map positions.

    Reads 'data/rates/genetic_map_GRCh37_<chromosome>.txt' (tab-separated,
    with 'Position(bp)' and 'Rate(cM/Mb)' columns) and returns two DataFrames:

    hotspots  -- one row per position whose rate is >= threshold_hot
    coldspots -- one row per position whose rate is <= threshold_cold

    Both frames have the columns
    ['Chromosome', 'Centre', 'Start', 'End', 'Width(kb) '] where
    Start/End are Centre - 500 / Centre + 500 and the width is always 1.
    The trailing space in 'Width(kb) ' is deliberate: get_data() looks the
    column up under exactly that key.
    """
    rates = pd.read_csv('data/rates/genetic_map_GRCh37_{}.txt'.format(chromosome), sep='\t')
    rate = rates['Rate(cM/Mb)']
    header = ['Chromosome', 'Centre', 'Start', 'End', 'Width(kb) ']

    def _windows(selected):
        # One vectorised pass instead of a Python-level iterrows() loop; the
        # scalar entries (chromosome name, width) are broadcast to every row.
        centres = rates.loc[selected, 'Position(bp)'].values
        return pd.DataFrame(
            {
                'Chromosome': chromosome,
                'Centre': centres,
                'Start': centres - 500,
                'End': centres + 500,
                'Width(kb) ': 1,
            },
            columns=header,
        )

    hotspots = _windows(rate >= threshold_hot)
    coldspots = _windows(rate <= threshold_cold)
    return hotspots, coldspots

In [6]:
def flatten(data, start, end):
    """Return a 0/1 coverage list for the window [start, end).

    data  -- DataFrame whose column 1 holds interval starts and column 2
             holds interval ends (integer column labels).
    start -- first position of the window.
    end   -- one past the last position of the window.

    The result has end - start entries; entry i is 1 when position
    start + i lies inside at least one interval of `data` that is fully
    contained in the window, else 0.

    NOTE(review): intervals that only partially overlap the window are
    ignored, exactly as before - confirm that this is the intended rule.
    """
    inside = data[(start <= data[1]) & (data[2] <= end)]
    covered = [0] * (end - start)
    limit = len(covered)
    # Walk the two coordinate columns directly rather than iterrows():
    # iterrows() builds one Series per row and upcasts integers to float
    # whenever the frame is all-numeric with a float column, which made
    # range() below raise TypeError. It is also far slower.
    for lo, hi in zip(inside[1], inside[2]):
        # The filter above already guarantees 0 <= lo - start and
        # hi - start <= limit; the min() is kept as a cheap safety net.
        for offset in range(lo - start, min(hi - start, limit)):
            covered[offset] = 1
    return covered

def get_sequence(chromosome):
    """Load 'data/chromosome/<chromosome>.fa' as one continuous string.

    All newlines are removed and the first len(chromosome) + 1 characters
    are dropped, which strips a header of the exact form '>' + chromosome.

    NOTE(review): a header carrying anything beyond '>' + chromosome
    (e.g. a description) would leave stray characters at the start of the
    returned sequence and shift every coordinate - confirm the input files.
    """
    # Context manager so the handle is closed even if read() fails; the
    # previous version never closed the file.
    with open('data/chromosome/'+ chromosome + '.fa','r') as rawseq:
        seq = rawseq.read()
    seq = seq.replace("\n", "")
    return seq[len(chromosome) + 1:]

def get_nucleotides(seq, start, end):
    """Return seq[start], ..., seq[end - 1] as a list of normalised symbols.

    Lower-case a/c/g/t/n are upper-cased, '>' is mapped to 'N', and every
    other character is passed through unchanged. Positions are indexed one
    by one, so an index outside the sequence raises IndexError.
    """
    normalise = {'g':'G', 'a':'A', 't':'T', 'c': 'C', 'n':'N', '>':'N'}
    symbols = []
    for position in range(start, end):
        raw = seq[position]
        symbols.append(normalise.get(raw, raw))
    return symbols

def divide_chunks(l, n):
    """Lazily yield consecutive slices of `l`, each of length `n`.

    The final slice is shorter when len(l) is not a multiple of n; an
    empty `l` yields nothing.
    """
    yield from (l[lo:lo + n] for lo in range(0, len(l), n))

def one_hot_encode(seq):
    """One-hot encode an iterable of 'A'/'C'/'G'/'T' symbols.

    Returns a float array with one row per symbol and four columns in the
    order A, C, G, T. Any other symbol raises KeyError.
    """
    column_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    row_picks = []
    for symbol in seq:
        row_picks.append(column_of[symbol])
    identity = np.eye(4)
    return identity[row_picks]

In [7]:
def get_data(chromosome, hotspots, chromatin_marks, dnase):
    """Build one feature matrix per window listed in `hotspots`.

    chromosome      -- chromosome name, e.g. 'chr10'; used to filter every
                       input and to locate the sequence and SNP files.
    hotspots        -- DataFrame with 'Chromosome', 'Start', 'End' and
                       'Width(kb) ' columns (as produced by make_spot()).
    chromatin_marks -- list of DataFrames; column 0 is matched against the
                       chromosome, columns 1/2 are interval start/end and
                       column 5 must equal '+' (presumably the strand).
    dnase           -- list of DataFrames with the same layout.

    Returns a list of 2-D arrays, one per accepted window. Each array has
    end - start rows and 4 + len(chromatin_marks) + 1 + len(dnase) columns:
    the one-hot sequence, then one 0/1 coverage column per chromatin mark,
    one for SNPs and one per DNase track. Windows that fall outside the
    loaded sequence or contain an 'N' are skipped.

    Progress is written to stdout via drawProgressBar().
    """
    data = []
    hotspots = hotspots[hotspots['Chromosome'] == chromosome]
    total = len(hotspots)
    chromatin_marks = [mark[(mark[0]==chromosome) & (mark[5]=='+')] for mark in chromatin_marks]
    dnase = [track[(track[0]==chromosome) & (track[5]=='+')] for track in dnase]
    seq = get_sequence(chromosome)
    seq_len = len(seq)
    snps = pd.read_csv('data/snp/bed_{}_{}.bed.gz'.format(chromosome[:3], chromosome[3:]), skiprows=1, sep='\t', header=None)
    snps = snps[snps[5]=='+']

    for processed, (index, row) in enumerate(hotspots.iterrows(), start=1):

        # Grab the window coordinates and its declared width
        start, end, length = row['Start'], row['End'], row['Width(kb) ']

        # Regions whose width is not 1 are reduced to a single randomly
        # placed 1000-position window inside the region.
        # NOTE(review): randint raises ValueError when the region is shorter
        # than 1000 positions - make_spot() never produces such rows.
        if length != 1:
            start = random.randint(start, end-1000)
            end = start + 1000

        # The window must lie completely inside the sequence. A negative
        # start would otherwise wrap around through Python's negative
        # indexing, and a window ending exactly at len(seq) is still valid.
        if 0 <= start and end <= seq_len:
            nucleotide = get_nucleotides(seq, start, end)

            # We do not want broken sequences
            if 'N' not in nucleotide:
                # Coverage tracks are only computed for windows that are
                # kept; flatten() is pure, so the result is unchanged.
                marks = [flatten(mark, start, end) for mark in chromatin_marks]
                dna = [flatten(track, start, end) for track in dnase]
                snp = [flatten(snps, start, end)]
                data.append(np.hstack((one_hot_encode(nucleotide), np.array(marks+snp+dna).T)))

        # Report rows processed (not rows kept) so the bar always ends at 100%.
        drawProgressBar(processed / total, barLen = 20)

    return data

In [8]:
def create_data(chromosome, chromatin_marks, dnase):
    """Generate and pickle the hotspot and coldspot windows of a chromosome.

    Calls make_spot() once, then runs get_data() first on the hotspots and
    then on the coldspots, writing each result to
    'data/spots/<chromosome>_hotspot_data.pickle' and
    'data/spots/<chromosome>_coldspot_data.pickle' respectively.
    Returns nothing.
    """
    hotspots, coldspots = make_spot(chromosome)

    # (windows, file-name suffix) pairs, processed strictly in this order.
    jobs = (
        (hotspots, "_hotspot_data.pickle"),
        (coldspots, "_coldspot_data.pickle"),
    )
    for spots, suffix in jobs:
        features = get_data(chromosome, spots, chromatin_marks, dnase)
        with open("data/spots/"+chromosome+suffix, "wb") as fp:
            pickle.dump(features, fp)

In [None]:
# Driver cell: build and pickle the hot/cold-spot datasets for each selected
# chromosome number. range(10, 11) covers only chromosome 10; widen the range
# to process more chromosomes in one run.
for i in range(10, 11):
    # Echo the chromosome number so the output shows which one is running.
    print(i)
    create_data('chr'+str(i), chromatin_marks, dnase)

10