In [1]:
import pandas as pd
import numpy as np

#for executable
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.patches as patches
#%matplotlib inline 

import warnings
# both loading plastid and one of the lines below provoke some warnings
warnings.filterwarnings('ignore')

from plastid import *
from twobitreader import TwoBitFile
from docx import Document
from docx.shared import Inches
from collections import defaultdict
import six

#to make executable
import os

In [2]:
##All files are defined by their path rather than by a prefix.
os.system('pwd')
##clear out figures and fasta for new parameters
os.system('rm ./figures/*')
os.system('rm ./fastas/*')

256

In [3]:
# keep adding FASTAs for additional transcripts (in descending order of abundance)
# until this fraction of total reads is accounted for
GENE_COVERAGE_THRESHOLD = 0.8 
# fraction of reads to try to capture in each transcript
TRANSCRIPT_COVERAGE_THRESHOLD = 0.8
# kernel size for median filter used when smoothing read densities
KERNEL_SIZE = 49
# amount to pad the 5' end of each suggested capture region
PAD5 = 25
# amount to pad the 3' end of each suggested capture region
PAD3 = 400
# amount from end of transcripts to target for genes with inadequate sequencing
FAILED_LENGTH = 800
# maximum length, all sequences will be clipped to this with the clipping beginning from the 3' end
MAXIMUM_LENGTH = 2000


**Algorithm outline**

1. Using K562 ENCODE data, estimate the abundance of different transcript isoforms. Target a sufficient number of isoforms such that at least the fraction `GENE_COVERAGE_THRESHOLD` of reads (from the ENCODE data) would be captured.

2. In each targeted isoform, we perform a peak finding procedure to estimate the target region. Take all reads that are compatible with that isoform and then smooth them using a median filter (with width `KERNEL_SIZE`). Then set a threshold such that at least the fraction `TRANSCRIPT_COVERAGE_THRESHOLD` of reads is accounted for. Then pad the selected region on the 5' and 3' sides by the constants `PAD5` and `PAD3` (this will not extend target sequences past the annotated transcript ends). If the resulting sequence is too long (this happens e.g. if there is an extraneous peak of density early in a transcript), clip the sequence on the side closer to the beginning of the transcript such that the total targeted length does not exceed `MAXIMUM_LENGTH`. For isoforms that have insufficient sequencing coverage, just target `FAILED_LENGTH` nt from the annotated 3' ends. Diagram below:

    ```
                     /----\
    5'              /      \      _____         3'
                   /        \    /     \
    _______/-\____/          ----       \______
    
                         ||
                         ||
                         \/
                         
                |    /----\                    |
                |   /      \      _____        |
                |  /        \    /     \       |
    ______/-\___|_/          ----       \______|
                    
    ```
3. For each gene, compare the regions chosen across different isoforms. If one of the smaller regions is a strict subset of one of the bigger ones, eliminate the smaller region.

In [4]:
genome = TwoBitFile('./references/genome.2bit') 

In [5]:
# this line produces a lot of warnings about duplicate tags...
annotated_transcripts = {transcript.attr['transcript_id']: transcript for transcript 
                             in GTF2_TranscriptAssembler('./references/targeted_genes.gtf',return_type=Transcript)}

In [6]:
# what are the ensembl ids for these genes?
targeted_genes = pd.read_csv('target_gene_list.txt', header=None, names=['gene_name'])
feature_names = pd.read_csv('./references/features.tsv.gz',
                            sep='\t',
                            header=None,
                            names=['gene_id', 'gene_name', 'feature_type'])
targeted_genes = feature_names[feature_names['gene_name'].isin(targeted_genes['gene_name'])]

In [7]:
# make a table of all the unique transcripts for each gene
targeted_transcripts = pd.Series({transcript_id: transcript.get_gene() for transcript_id, transcript in annotated_transcripts.iteritems()}).reset_index()
targeted_transcripts.columns = ['transcript_id', 'gene_id']

name_mapper = dict(zip(targeted_genes['gene_id'], targeted_genes['gene_name']))
id_mapper = dict(zip(targeted_genes['gene_name'], targeted_genes['gene_id']))

targeted_transcripts['gene_name'] = targeted_transcripts['gene_id'].map(name_mapper)
targeted_transcripts = targeted_transcripts.sort_values('gene_name').reset_index(drop=True)

In [8]:
# construct dictionary that groups transcript models by gene
transcripts_by_gene = defaultdict(dict)

for name, x in targeted_transcripts.iterrows():
    transcript_id = x['transcript_id']
    transcripts_by_gene[x['gene_name']][transcript_id] = annotated_transcripts[transcript_id]

# Load sequencing data

In [9]:
# load aggregated sequencing data (from process_bams notebook)

import cPickle as pickle

def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [10]:
spanning_segments = load_obj('./counts/spanning_segments.pkl')
count_vectors = load_obj('./counts/merged_count_vectors.count.pkl')
start_count_vectors = load_obj('./counts/merged_start_count_vectors.count.pkl')
transcript_count_vectors = load_obj('./counts/merged_transcript_count_vectors.count.pkl')

In [11]:
start_read_counts = pd.Series({name: np.sum(c) for name, c in start_count_vectors.iteritems()})
start_read_counts.head()

ASNA1     62355.0
ATF4     386623.0
CDH3       1363.0
FOXO3    269350.0
ORC1      85226.0
dtype: float64

# Find isoforms to target for each gene

In [12]:
# ENCODE K562 transcript isoform estimates from RSEM
transcript_abundances = pd.read_csv('./references/ENCFF717EVE.tsv', sep='\t')
transcript_abundances['base_transcript_id'] = transcript_abundances['transcript_id'].map(lambda x: x.split('.')[0])
transcript_abundances.set_index('base_transcript_id', inplace=True)

In [13]:
# order transcripts for each gene according to RSEM estimates of abundance
transcript_sort_order = dict()

for gene, transcripts in transcripts_by_gene.iteritems():
    transcript_sort_order[gene] = transcript_abundances.loc[transcripts.keys()].sort_values('TPM', ascending=False)['TPM']

In [14]:
# select the top transcripts necessary to account for the majority of reads
transcript_cumulative_fraction = dict()
no_data = list()

for gene, abundance in transcript_sort_order.iteritems():
    data = abundance.copy()
    data = data/data.sum()
    data = data.cumsum()
    if (abundance == 0).all():
        no_data.extend([gene,])
        transcript_cumulative_fraction[gene] = data
        continue
    num_needed = np.max([1, len(data[data < GENE_COVERAGE_THRESHOLD]) + 1])
    transcript_cumulative_fraction[gene] = data.iloc[0:num_needed]

In [15]:
# genes that have no representation in ENCODE

In [16]:
for gene in no_data:
    print(gene)

CDH3


# Finding target sequences for top isoforms

In [17]:
from scipy.signal import medfilt

def get_threshold_for_coverage(density, coverage_threshold=0.95):
    levels = np.unique(density)
    density_by_level = pd.Series({level: density[density >= level].sum() for level in levels})
    thresh = density_by_level[density_by_level >= coverage_threshold].argmin()
    return thresh

def get_thresholded_density(density, coverage_threshold=0.95):
    thresh = get_threshold_for_coverage(density, coverage_threshold=coverage_threshold)
    thresholded_density = density.copy()
    thresholded_density[thresholded_density < thresh] = 0
    return thresholded_density

def find_runs(a):
    # Create an array that is 1 where a is `value`, and pad each end with an extra 0.
    isvalue = np.concatenate(([0], (a != 0).view(np.int8), [0]))
    absdiff = np.abs(np.diff(isvalue))
    # Runs start and end where absdiff is 1.
    ranges = np.where(absdiff == 1)[0].reshape(-1, 2)
    return ranges

def get_transcript_density(gene_name, transcript_id, coverage_threshold=TRANSCRIPT_COVERAGE_THRESHOLD, kernel_size=KERNEL_SIZE):
    counts = transcript_count_vectors[gene_name][transcript_id]
    filtered = medfilt(counts, kernel_size=kernel_size)
    density = filtered/filtered.sum()
    thresholded_density = get_thresholded_density(density, coverage_threshold=coverage_threshold)
    return thresholded_density, density

def get_transcript_cover(thresholded_density, pad3=PAD3, pad5=PAD5):
    runs = find_runs(thresholded_density)
    thresholded_start = runs[0][0]
    thresholded_end = runs[-1][1]
    transcript_end = len(thresholded_density) - 1
    padded_start = np.max([thresholded_start - pad5, 0])
    padded_end = np.min([thresholded_end + pad3, transcript_end])
    
    cover_length = padded_end - padded_start
    if cover_length > MAXIMUM_LENGTH:
        padded_start = padded_end - MAXIMUM_LENGTH
    
    return padded_start, padded_end

def get_transcript_subsequence(gene_name, transcript_id, start, end):
    transcript = transcripts_by_gene[gene_name][transcript_id]
    return transcript.get_sequence(genome)[start:end]

def get_transcript_cover_sequence(gene_name, transcript_id, coverage_threshold=TRANSCRIPT_COVERAGE_THRESHOLD, kernel_size=KERNEL_SIZE, pad3=PAD3, pad5=PAD5):
    thresholded_density, density = get_transcript_density(gene_name,
                                                          transcript_id,
                                                          coverage_threshold=coverage_threshold,
                                                          kernel_size=kernel_size)
    start, end = get_transcript_cover(thresholded_density, pad3=pad3, pad5=pad5)
    return get_transcript_subsequence(gene_name, transcript_id, start, end)

def get_transcript_cover_position_list(gene_name, transcript_id, coverage_threshold=TRANSCRIPT_COVERAGE_THRESHOLD, kernel_size=KERNEL_SIZE, pad3=PAD3, pad5=PAD5):
    thresholded_density, density = get_transcript_density(gene_name,
                                                          transcript_id,
                                                          coverage_threshold=coverage_threshold,
                                                          kernel_size=kernel_size)
    start, end = get_transcript_cover(thresholded_density, pad3=pad3, pad5=pad5)
    transcript = transcripts_by_gene[gene_name][transcript_id]
    if transcript.strand == '+':
        pos_list = transcript.get_position_list()[start:end]
    else:
        pos_list = transcript.get_position_list()[::-1][start:end][::-1]
    return pos_list

In [18]:
cover_positions = defaultdict(dict)
cover_sequences = defaultdict(dict)
no_cover = list()

for gene, transcripts in transcript_cumulative_fraction.iteritems():
    print(gene)
    print('=================================')
    for transcript_id in transcripts.index:
        print('\t{0}...'.format(transcript_id))
        try:
            cover_positions[gene][transcript_id] = get_transcript_cover_position_list(gene, transcript_id)
            cover_sequences[gene][transcript_id] = get_transcript_cover_sequence(gene, transcript_id)
        except ValueError:
            no_cover.extend([(gene, transcript_id),])
            print('\tFAILED')
            transcript = transcripts_by_gene[gene][transcript_id]
            cover_positions[gene][transcript_id] = transcript.get_position_list()[-FAILED_LENGTH:]
            cover_sequences[gene][transcript_id] = transcript.get_sequence(genome)[-FAILED_LENGTH:]
            continue

TOMM7
	ENST00000358435...
ASNA1
	ENST00000357332...
TUBB
	ENST00000327892...
ORC1
	ENST00000371568...
FOXO3
	ENST00000343882...
	ENST00000406360...
RPS5
	ENST00000196551...
ATF4
	ENST00000396680...
CDH3
	ENST00000569080...
	FAILED
	ENST00000569036...
	FAILED
	ENST00000567674...
	FAILED
	ENST00000542274...
	FAILED
	ENST00000569117...
	ENST00000429102...
	FAILED
	ENST00000264012...
	FAILED
	ENST00000568292...
	FAILED
	ENST00000566808...
	FAILED
	ENST00000565453...
	FAILED
SEC61A1
	ENST00000243253...


In [19]:
# transcripts where too little sequencing so just took piece from annotated end
no_cover

[('CDH3', 'ENST00000569080'),
 ('CDH3', 'ENST00000569036'),
 ('CDH3', 'ENST00000567674'),
 ('CDH3', 'ENST00000542274'),
 ('CDH3', 'ENST00000429102'),
 ('CDH3', 'ENST00000264012'),
 ('CDH3', 'ENST00000568292'),
 ('CDH3', 'ENST00000566808'),
 ('CDH3', 'ENST00000565453')]

# Removing target sequences that overlap

In [20]:
import itertools

def find_redundant_sequences(seqs):
    # order sequences by length as smaller must be in larger
    seq_lens = pd.Series({transcript_id: len(seq) for transcript_id, seq in seqs.iteritems()}).sort_values()
    # make all pairwise comparisons of smaller to larger ((1, 2), (1, 3), (1, 4), ..., (2, 3), (2, 4), ...)
    ordered_pairs = list(itertools.combinations(np.arange(len(seq_lens)), 2))    
    redundant_pairs = list()
    
    for pair in ordered_pairs:
        name1 = seq_lens.index[pair[0]]
        name2 = seq_lens.index[pair[1]]
        len1 = seq_lens.loc[name1]
        len2 = seq_lens.loc[name2]
        seq1 = seqs[name1]
        seq2 = seqs[name2]

        if seq1 in seq2:
            print('\t{0} (length: {1}) is contained in {2} (length: {3})'.format(name1, len1, name2, len2))
            redundant_pairs.extend([(name1, name2),])
    return redundant_pairs

In [21]:
redundant_sequences = dict()
for gene, seqs in cover_sequences.iteritems():
    print(gene)
    print('=================================')
    
    redundant_pairs = find_redundant_sequences(seqs)
    if len(redundant_pairs) > 0:
        redundant_sequences[gene] = redundant_pairs

TOMM7
ASNA1
TUBB
ORC1
FOXO3
RPS5
ATF4
CDH3
SEC61A1


In [22]:
redundant_covers = list()

for pairs in redundant_sequences.itervalues():
    for pair in pairs:
        redundant_covers.append(pair[0])

# list to keep for plots
redundant_covers = np.unique(redundant_covers)

In [23]:
# makes names summarizing all of the transcripts targeted by overlapping sequences
dominance_relations = dict()
redundant_names = dict()

for gene, pairs in redundant_sequences.iteritems():
    dominant_covers = np.unique([pair[1] for pair in pairs])
    for d in dominant_covers:
        dominance_relations[d] = reduce(np.union1d, [pair for pair in pairs if pair[1] == d])
        redundant_names[d] = '_'.join(dominance_relations[d])

In [24]:
# this has all sequences that are encapsulated in another removed
# and the name has been changed to be all transcripts targeted mashed together
filtered_cover_sequences = defaultdict(dict)

for gene, covers in cover_sequences.iteritems():
    for transcript_id, seq in covers.iteritems():
        if transcript_id not in redundant_covers:
            if transcript_id not in redundant_names.keys():
                filtered_cover_sequences[gene][transcript_id] = seq
            else:
                filtered_cover_sequences[gene][redundant_names[transcript_id]] = seq

# Plotting

In [25]:
vec_start = np.vectorize(lambda x: x.start)
vec_end = np.vectorize(lambda x: x.end)

def transcript_to_segments(transcript):
    starts = vec_start(transcript.segments)
    ends = vec_end(transcript.segments)
    return starts, ends

def plot_genome_coverage(gene_name, transcript_id, xlim=None, ylim=None):
    transcript = transcripts_by_gene[gene_name][transcript_id]
    strand = transcript.strand
    if (strand == '+'):
        positions = transcript.get_position_list()
    elif (strand == '-'):
        positions = transcript.get_position_list()[::-1]
    
    plt.step(positions, transcript_count_vectors[gene_name][transcript_id], linewidth=0.5, alpha=0.5)
    
    starts, ends = transcript_to_segments(transcript)

    for i in xrange(len(starts)):
        plt.plot([starts[i], ends[i]], [0, 0], linewidth=5, color='gray')

    utr3 = transcript.get_utr3()
    if len(utr3) > 0:
        starts, ends = transcript_to_segments(utr3)
        for i in xrange(len(starts)):
            plt.plot([starts[i], ends[i]], [0, 0], linewidth=5, color='r', alpha=0.6)
        
    ax = plt.gca()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    
    if xlim is not None:
        plt.xlim(xlim)
    if ylim is not None:
        plt.ylim(ylim)
        
    if transcript_id in cover_positions[gene_name].keys():
        cov_pos = np.array(cover_positions[gene_name][transcript_id])
        x = np.arange(cov_pos[0], cov_pos[-1] + 1)
        y = np.zeros(*x.shape)
        ymin, ymax = plt.gca().get_ylim()
        y[cov_pos - cov_pos[0]] = ymax
        color = 'g' if transcript_id not in redundant_covers else 'lightgreen'
        plt.fill_between(x, y, step='pre', alpha=0.3, facecolor=color)

    return ax

In [26]:
def plot_transcript_coverage(gene_name, transcript_id, xlim=None, ylim=None):
    transcript = transcripts_by_gene[gene_name][transcript_id]
    strand = transcript.strand

    genome_positions = transcript.get_position_list()
    transcript_length = len(genome_positions)
    positions = np.arange(transcript_length)

    if (strand == '+'):
        # line them all up at right side
        positions = positions + genome_positions[-1] - transcript_length
    elif (strand == '-'):
        # line them up at left side
        positions = positions[::-1] + genome_positions[0]

    plt.step(positions, transcript_count_vectors[gene_name][transcript_id], linewidth=0.5, alpha=0.5, color='red')

    transcript_to_genome = dict(enumerate(genome_positions))
    pos_mapper = {v: positions[k] for k, v in transcript_to_genome.iteritems()}

    starts, ends = transcript_to_segments(transcript)
    
    for i in xrange(len(starts)):
        plt.plot([pos_mapper[starts[i]], pos_mapper[ends[i] - 1]], [0, 0], linewidth=5, color='gray')

    ax = plt.gca()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
        
    if xlim is not None:
        plt.xlim(xlim)
    if ylim is not None:
        plt.ylim(ylim)
        
    if transcript_id in cover_positions[gene_name].keys():
        cov_pos = np.array(cover_positions[gene_name][transcript_id]) 
        x = np.arange(cov_pos[0], cov_pos[-1] + 1)
        y = np.zeros(*x.shape)
        ymin, ymax = plt.gca().get_ylim()
        y[cov_pos - cov_pos[0]] = ymax
        if strand == '+':
            genome_pos_mapper = dict(zip(genome_positions, positions))
        else:
            genome_pos_mapper = dict(zip(genome_positions, positions[::-1])) 
        x = np.vectorize(genome_pos_mapper.get, otypes=[float])(x)
        
        color = 'g' if transcript_id not in redundant_covers else 'lightgreen'
        plt.fill_between(x, y, step='pre', alpha=0.3, facecolor=color)
        
    return ax

In [27]:
def plot_aggregated_genome_coverage(gene_name, xlim=None, ylim=None):
    strand = spanning_segments[gene_name].strand
    chrom = spanning_segments[gene_name].chrom

    if (strand == '+'):
        positions = spanning_segments[gene_name].get_position_list()
    elif (strand == '-'):
        positions = spanning_segments[gene_name].get_position_list()[::-1]
      
    plt.step(positions, start_count_vectors[gene_name], linewidth=0.5, alpha=0.5)
    plt.plot(positions, count_vectors[gene_name], linewidth=1)

    if xlim is not None:
        plt.xlim(xlim)
    if ylim is not None:
        plt.ylim(ylim)
    
    ax = plt.gca()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

In [28]:
def plot_all_transcript_coverage(gene_name, sort=True, filename=None):
    transcripts = transcripts_by_gene[gene_name]
    strand = spanning_segments[gene_name].strand
    
    num_transcripts = len(transcripts)
    xlim = [spanning_segments[gene_name].get_position_list()[0], spanning_segments[gene_name].get_position_list()[-1]]
    
    max_counts = np.max([np.max(counts) for counts in transcript_count_vectors[gene_name].itervalues()])
    ylim = [0, max_counts]
    
    plt.figure(figsize=[15, num_transcripts])
    
    if not sort:
        keys = transcripts.iterkeys()
    else:
        keys = transcript_sort_order[gene_name].index.values
    
    for i, transcript_id in enumerate(keys):
        plt.subplot(num_transcripts, 1, i+1)
        ax = plot_transcript_coverage(gene_name, transcript_id, xlim=xlim, ylim=ylim)

        if sort:
            min_x, max_x = ax.get_xlim()
            x_range = max_x - min_x
            max_y = ax.get_ylim()[1]
            ax.text(max_x - 0.05*x_range, 0.7*max_y, transcript_sort_order[gene_name].loc[transcript_id])
        
        if i == 0:
            plt.title('{0} - {1} ({2})'.format(gene_name, id_mapper[gene_name], strand))
            
        if i < num_transcripts - 1:
            plt.xticks([])
    if filename is not None:
        plt.savefig('./figures/{0}.png'.format(filename))

In [29]:
def plot_all_genome_coverage(gene_name, sort=True, filename=None):
    transcripts = transcripts_by_gene[gene_name]
    strand = spanning_segments[gene_name].strand
    
    num_transcripts = len(transcripts)
    xlim = [spanning_segments[gene_name].get_position_list()[0], spanning_segments[gene_name].get_position_list()[-1]]
    
    max_counts = np.max([np.max(counts) for counts in transcript_count_vectors[gene_name].itervalues()])
    ylim = [0, max_counts]
    
    plt.figure(figsize=[15, 1*(num_transcripts + 1)])
    plt.subplot(num_transcripts + 1, 1, 1)
    plot_aggregated_genome_coverage(gene_name, xlim=xlim)
    plt.title('{0} - {1} ({2})'.format(gene_name, id_mapper[gene_name], strand))
    plt.xticks([])
    
    if not sort:
        keys = transcripts.iterkeys()
    else:
        keys = transcript_sort_order[gene_name].index.values
    
    for i, transcript_id in enumerate(keys):
        plt.subplot(num_transcripts + 1, 1, i+2)
        ax = plot_genome_coverage(gene_name, transcript_id, xlim=xlim, ylim=ylim)
        if sort:
            min_x, max_x = ax.get_xlim()
            x_range = max_x - min_x
            max_y = ax.get_ylim()[1]
            ax.text(max_x - 0.05*x_range, 0.7*max_y, transcript_sort_order[gene_name].loc[transcript_id])
            
        if i < num_transcripts - 1:
            plt.xticks([])
    if filename is not None:
        plt.savefig('./figures/{0}.png'.format(filename))

In [30]:
from docx import Document
from docx.shared import Inches
from docx.enum.section import WD_SECTION
from docx.enum.section import WD_ORIENT

In [31]:
document = Document()
summary_data = defaultdict(dict)

section = document.sections[-1]
new_width, new_height = section.page_height, section.page_width
section.orientation = WD_ORIENT.LANDSCAPE
section.page_width = new_width
section.page_height = new_height

for name, row in targeted_genes.sort_values('gene_name').iterrows():
    gene_name = row['gene_name']
    gene_id = row['gene_id']
    plot_all_genome_coverage(gene_name, filename=gene_name + '_genome')
    plot_all_transcript_coverage(gene_name, filename=gene_name + '_transcriptome')
    
    spanning_start = spanning_segments[gene_name].segments[0].start
    spanning_end = spanning_segments[gene_name].segments[0].end
    spanning_chrom = spanning_segments[gene_name].segments[0].chrom
    spanning_strand = spanning_segments[gene_name].segments[0].strand
    
    document.add_heading('{0} - {1}'.format(gene_name, gene_id), 0)
    document.add_picture('./figures/{0}_genome.png'.format(gene_name), width=Inches(9))
    document.add_picture('./figures/{0}_transcriptome.png'.format(gene_name), width=Inches(9))
    
    p = document.add_paragraph()
    run = p.add_run('Position: ' + str(spanning_chrom) + ':' + \
                              str(spanning_start) + '-' + \
                              str(spanning_end) + ' (' + \
                              str(spanning_strand) + ')')
    font = run.font
    font.name = 'Courier New'
    
    for transcript_target, seq in filtered_cover_sequences[gene_name].iteritems():
        document.add_heading(transcript_target, 1)
        p = document.add_paragraph()
        run = p.add_run(seq)
        font = run.font
        font.name = 'Courier New'
        
        try:
            fasta_name = '{0}__{1}'.format(gene_id, transcript_target)
            fasta_file = open('./fastas/{0}.fasta'.format(fasta_name), 'wb')
            fasta_file.write('>{0}\n'.format(fasta_name))
            fasta_file.write(seq)
            fasta_file.close()
        except IOError as ioerr:
            if ioerr.errno == 36:
                fasta_name = '{0}__{1}'.format(gene_id, transcript_target[0:3])
                fasta_file = open('./fastas/{0}.fasta'.format(fasta_name), 'wb')
                fasta_file.write('>{0}\n'.format(fasta_name))
                fasta_file.write(seq)
                fasta_file.close()
                
        
        # summary data spreadsheet
        summary_data[transcript_target]['gene_name'] =  gene_name
        summary_data[transcript_target]['gene_id'] =  gene_id
        summary_data[transcript_target]['strand'] =  spanning_strand
        summary_data[transcript_target]['length'] =  len(seq)
        summary_data[transcript_target]['seq'] =  seq
 
    document.add_page_break()
        
document.save('target_regions.docx')

In [32]:
pd.DataFrame(summary_data).T.sort_values('gene_name').to_excel('./target_regions_summary.xlsx')

In [33]:
print('Total length:')
print(pd.DataFrame(summary_data).T.sort_values('gene_name')['length'].sum())

Total length:
15736


In [34]:
pd.DataFrame(summary_data).T[pd.DataFrame(summary_data).T['gene_name']=='ATF4']

Unnamed: 0,gene_id,gene_name,length,seq,strand
ENST00000396680,ENSG00000128272,ATF4,411,ATAGGAGCCTCCCATCTCCAGGTGTTCTCTGTGGGTCTGCCCGTCC...,+
