In [1]:
import numpy as np
import pandas as pd

1.) Download Ensembl transcript annotations from [UCSC Table Browser](http://rohsdb.cmb.usc.edu/GBshape/cgi-bin/hgTables?hgsid=3960312_ZMTtI4bvavkuiWrNuR3OxAWB52dn&clade=mammal&org=Human&db=hg19&hgta_group=genes&hgta_track=ensGene&hgta_table=0&hgta_regionType=genome&position=chr21%3A33031597-33041570&hgta_outputType=primaryTable&hgta_outFileName=) using settings specified in the [wiki](https://github.com/keoughkath/ExcisionFinder/wiki/Get-gene-annotations) and name the resulting file ensembl_ucsc_output_grch37.tsv. Ensure you select the correct reference genome, and specify it in the filename in the "grch37" spot. Alternatively, get this file [here](http://lighthouse.ucsf.edu/public_files_no_password/excisionFinderData_public/gene_annots/).

2.) Download mappings from Ensembl gene IDs to gene symbols from the [HUGO website](https://www.genenames.org/cgi-bin/download) with boxes checked for "Approved Symbol" and "Ensembl Gene ID". Name the resulting file gene_to_id.tsv (this is the filename the cells below read). Alternatively, download this file [here](http://lighthouse.ucsf.edu/public_files_no_password/excisionFinderData_public/gene_annots/).

Files from steps 1 and 2 will need to be in your current directory for the notebook to run correctly.

Importantly, it's not required to complete these steps to use ExcisionFinder. You can just use the pre-generated gene annotations file "gene_annots_wsize" provided [here](http://lighthouse.ucsf.edu/public_files_no_password/excisionFinderData_public/gene_annots/). This is intended for documentation and reproducibility purposes as well as to allow users to generate their own gene annotations in order to analyze different reference genomes.

In [61]:
def get_gene_annots(ensembl, name_mapping, out):
    """Build a per-transcript annotation table with sizes and gene symbols.

    Reads a tab-separated Ensembl transcript table and a tab-separated
    symbol/Ensembl-ID mapping, adds a ``size`` column (``txEnd - txStart``),
    attaches the gene symbol by Ensembl gene ID, and writes the result to
    ``out`` as a tab-separated file without the index.

    Parameters
    ----------
    ensembl : str or path-like
        Tab-separated transcript table containing at least the columns
        ``name``, ``chrom``, ``txStart``, ``txEnd``, ``cdsStart``, ``cdsEnd``,
        ``exonStarts``, ``exonEnds`` and ``name2``. The columns may appear in
        any order and extra columns are ignored.
    name_mapping : str or path-like
        Tab-separated two-column file with one header row. The first column
        is read as the gene symbol (``name2``) and the second as the Ensembl
        gene ID (``ensembl_id``); the header text itself is discarded.
    out : str or path-like
        Destination path for the tab-separated output.

    Returns
    -------
    None
        The result is written to ``out``.

    Raises
    ------
    ValueError
        Raised by ``pandas.read_csv`` if a required column is missing from
        ``ensembl``.
    """
    transcript_cols = ['name', 'chrom', 'txStart', 'txEnd',
                       'cdsStart', 'cdsEnd', 'exonStarts',
                       'exonEnds', 'name2']

    # ``usecols`` ignores the order of the list and returns columns in file
    # order, so select by name afterwards instead of relabelling positionally;
    # a positional relabel silently mislabels columns if the file order differs.
    transcripts = pd.read_csv(ensembl, sep='\t', usecols=transcript_cols)
    transcripts = (
        transcripts[transcript_cols]
        # In the input, ``name2`` holds the Ensembl gene ID; rename it so that
        # ``name2`` is free for the gene symbol brought in by the merge below.
        .rename(columns={'name2': 'ensembl_id'})
        .assign(size=lambda df: df['txEnd'] - df['txStart'])
    )

    # header=0 together with ``names`` replaces the file's own header row.
    gene_to_id = pd.read_csv(name_mapping, sep='\t', header=0,
                             names=['name2', 'ensembl_id'])

    # Left join keeps every transcript; transcripts whose gene has no symbol
    # in the mapping get a missing ``name2``.
    annotated = transcripts.merge(gene_to_id, how='left', on='ensembl_id')
    annotated.to_csv(out, sep='\t', index=False)

# GRCh37

In [62]:
# Generate the GRCh37 annotation table from the two downloaded input files.
get_gene_annots(
    ensembl='ensembl_ucsc_output_grch37.tsv',
    name_mapping='gene_to_id.tsv',
    out='gene_annots_wsize_grch37.tsv',
)

# GRCh38 - not supported in UCSC yet

In [None]:
# get_gene_annots('ensembl_ucsc_output_grch38.tsv', 'gene_to_id.tsv', 'gene_annots_wsize_grch38.tsv')