In [1]:
import numpy as np
import pandas as pd

# GRCh37

Download RefSeq transcript annotations from the [UCSC Table Browser](http://rohsdb.cmb.usc.edu/GBshape/cgi-bin/hgTables?hgsid=3960312_ZMTtI4bvavkuiWrNuR3OxAWB52dn&clade=mammal&org=Human&db=hg19&hgta_group=genes&hgta_track=ensGene&hgta_table=0&hgta_regionType=genome&position=chr21%3A33031597-33041570&hgta_outputType=primaryTable&hgta_outFileName=) using the settings specified in the [wiki](https://github.com/keoughkath/ExcisionFinder/wiki/Get-gene-annotations). Alternatively, download the pre-generated file [here](http://lighthouse.ucsf.edu/public_files_no_password/excisionFinderData_public/gene_annots/).

# GRCh38

Download RefSeq transcript annotations from the [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) using the settings specified in the [wiki](https://github.com/keoughkath/ExcisionFinder/wiki/Get-gene-annotations). Alternatively, download the pre-generated file [here](http://lighthouse.ucsf.edu/public_files_no_password/excisionFinderData_public/gene_annots/).

In [4]:
def get_gene_annots(infile):
    """Load a tab-separated transcript annotation table into a DataFrame.

    Only the transcript/gene columns listed below are read from ``infile``;
    any other columns in the file are ignored. A ``size`` column holding the
    transcript span (``txEnd - txStart``) is appended.

    Parameters
    ----------
    infile : str, path object or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The selected columns plus the derived ``size`` column.
    """
    wanted_columns = [
        'name', 'chrom', 'txStart', 'txEnd', 'cdsStart',
        'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds', 'name2',
    ]
    annotations = pd.read_csv(infile, sep='\t', usecols=wanted_columns)
    # Transcript length in genomic coordinates, appended as the last column.
    transcript_span = annotations['txEnd'] - annotations['txStart']
    return annotations.assign(size=transcript_span)

In [2]:
def filter_gene_annots(in_df, chroms=None):
    """Pick one representative transcript per gene symbol.

    Rows are restricted to the chromosomes in ``chroms``, rows containing any
    missing value are dropped, and for every gene symbol (``name2``) the
    transcript with the largest ``size`` is kept.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Table with the columns ``name``, ``chrom``, ``txStart``, ``txEnd``,
        ``cdsStart``, ``cdsEnd``, ``exonCount``, ``exonStarts``, ``exonEnds``,
        ``size`` and ``name2``.
    chroms : iterable of str or str, optional
        Chromosome names to keep. When omitted, a notebook-level ``chroms``
        variable is used if one exists (the earlier version of this function
        silently required it); otherwise chr1-chr22, chrX and chrY are used.
        NOTE(review): the original value of ``chroms`` is not shown in this
        notebook -- confirm that this default matches what was intended.

    Returns
    -------
    pandas.DataFrame
        One row per gene, indexed by ``official_gene_symbol``.
    """
    if chroms is None:
        # Previously this function failed on a fresh kernel because `chroms`
        # was never defined; honor a pre-existing global, else use a default.
        chroms = globals().get('chroms')
        if chroms is None:
            chroms = ['chr' + str(i) for i in range(1, 23)] + ['chrX', 'chrY']
    if isinstance(chroms, str):
        # A bare string would otherwise be treated as a sequence of characters.
        chroms = [chroms]
    chroms = list(chroms)

    out_df = in_df[['name', 'chrom', 'txStart', 'txEnd', 'cdsStart', 'cdsEnd', 'exonCount',
       'exonStarts', 'exonEnds', 'size']].copy()
    out_df['official_gene_symbol'] = in_df['name2']

    gene_list_out = (
        out_df[out_df['chrom'].isin(chroms)]
        .dropna(axis=0)
        # Stable sort: transcripts of equal size keep their input order, so
        # the chosen transcript is reproducible between runs.
        .sort_values(by='size', ascending=False, kind='mergesort')
        .groupby('official_gene_symbol')
        .first()
    )
    return gene_list_out

# GRCh37

Load gene annotations (input is the file you generated following the instructions above from the UCSC Table Browser).

In [7]:
# Read the GRCh37 (hg19) annotation table; the path is relative to the notebook's working directory.
hg19_in = get_gene_annots('ncbi_ucsc_output_grch37.tsv')

Keep one canonical transcript per gene: the longest transcript (largest `txEnd - txStart`) for each gene symbol.

In [None]:
# Reduce the hg19 table to a single row per gene symbol.
gene_list_hg19 = filter_gene_annots(hg19_in)

Save to file.

In [None]:
# Write as TSV; the index (gene symbol) is written as the first column.
gene_list_hg19.to_csv('gene_list_hg19.tsv', sep='\t')

# GRCh38

Load gene annotations (input is the file you generated following the instructions above from the UCSC Table Browser).

In [6]:
# Read the GRCh38 (hg38) annotation table; the path is relative to the notebook's working directory.
hg38_in = get_gene_annots('ncbi_ucsc_output_grch38.tsv')

Keep one canonical transcript per gene: the longest transcript (largest `txEnd - txStart`) for each gene symbol.

In [None]:
# Reduce the hg38 table to a single row per gene symbol.
gene_list_hg38 = filter_gene_annots(hg38_in)

Save to file.

In [None]:
# Write as TSV; the index (gene symbol) is written as the first column.
gene_list_hg38.to_csv('gene_list_hg38.tsv', sep='\t')