# Prepare Gene Table From GTF

In [1]:
import pandas as pd

In [2]:
# gtf table has some spetial formats, we need to set more parameters when read it in,
# here I just provide you the answer, you can search pandas.read_csv() documentation for more information
gtf = pd.read_csv(
    '../../../data/ref/GENCODEvM24/gencode.vM24.annotation.gtf.gz',
    comment='#',
    sep='\t',
    header=None,
    names=[
        'chrom', 'source', 'feature', 'start', 'end', 'na1', 'strand', 'na2',
        'annotation'
    ])
gtf.head()

Unnamed: 0,chrom,source,feature,start,end,na1,strand,na2,annotation
0,chr1,HAVANA,gene,3073253,3074322,.,+,.,"gene_id ""ENSMUSG00000102693.1""; gene_type ""TEC..."
1,chr1,HAVANA,transcript,3073253,3074322,.,+,.,"gene_id ""ENSMUSG00000102693.1""; transcript_id ..."
2,chr1,HAVANA,exon,3073253,3074322,.,+,.,"gene_id ""ENSMUSG00000102693.1""; transcript_id ..."
3,chr1,ENSEMBL,gene,3102016,3102125,.,+,.,"gene_id ""ENSMUSG00000064842.1""; gene_type ""snR..."
4,chr1,ENSEMBL,transcript,3102016,3102125,.,+,.,"gene_id ""ENSMUSG00000064842.1""; transcript_id ..."


In [3]:
gene_gtf = gtf[gtf['feature'] == 'gene'].copy()
print(gene_gtf.shape)
gene_gtf.head()

(55385, 9)


Unnamed: 0,chrom,source,feature,start,end,na1,strand,na2,annotation
0,chr1,HAVANA,gene,3073253,3074322,.,+,.,"gene_id ""ENSMUSG00000102693.1""; gene_type ""TEC..."
3,chr1,ENSEMBL,gene,3102016,3102125,.,+,.,"gene_id ""ENSMUSG00000064842.1""; gene_type ""snR..."
6,chr1,HAVANA,gene,3205901,3671498,.,-,.,"gene_id ""ENSMUSG00000051951.5""; gene_type ""pro..."
24,chr1,HAVANA,gene,3252757,3253236,.,+,.,"gene_id ""ENSMUSG00000102851.1""; gene_type ""pro..."
27,chr1,HAVANA,gene,3365731,3368549,.,-,.,"gene_id ""ENSMUSG00000103377.1""; gene_type ""TEC..."


In [4]:
example_annotation = gene_gtf['annotation'][0]
example_annotation

'gene_id "ENSMUSG00000102693.1"; gene_type "TEC"; gene_name "4933401J01Rik"; level 2; mgi_id "MGI:1918292"; havana_gene "OTTMUSG00000049935.1";'

In [5]:
# how to split everything into a dict
anno_dict = {}
for pair in example_annotation.strip(';').split(';'):
    k, v = pair.strip().split(' ')
    anno_dict[k] = v
anno_dict

{'gene_id': '"ENSMUSG00000102693.1"',
 'gene_type': '"TEC"',
 'gene_name': '"4933401J01Rik"',
 'level': '2',
 'mgi_id': '"MGI:1918292"',
 'havana_gene': '"OTTMUSG00000049935.1"'}

In [12]:
def anno_to_dict(anno_str):
    anno_dict = {}
    for pair in anno_str.strip(';').split(';'):
        k, v = pair.strip().split(' ')
        anno_dict[k] = v.strip('"')
    return pd.Series(anno_dict)

In [13]:
gene_anno_df = gene_gtf['annotation'].apply(anno_to_dict)

In [14]:
gene_anno_df

Unnamed: 0,gene_id,gene_type,gene_name,level,mgi_id,havana_gene,tag
0,ENSMUSG00000102693.1,TEC,4933401J01Rik,2,MGI:1918292,OTTMUSG00000049935.1,
3,ENSMUSG00000064842.1,snRNA,Gm26206,3,MGI:5455983,,
6,ENSMUSG00000051951.5,protein_coding,Xkr4,2,MGI:3528744,OTTMUSG00000026353.2,
24,ENSMUSG00000102851.1,processed_pseudogene,Gm18956,1,MGI:5011141,OTTMUSG00000049958.1,pseudo_consens
27,ENSMUSG00000103377.1,TEC,Gm37180,2,MGI:5610408,OTTMUSG00000049960.1,
...,...,...,...,...,...,...,...
1870748,ENSMUSG00000064368.1,protein_coding,mt-Nd6,3,MGI:102495,,
1870755,ENSMUSG00000064369.1,Mt_tRNA,mt-Te,3,MGI:102488,,
1870758,ENSMUSG00000064370.1,protein_coding,mt-Cytb,3,MGI:102501,,
1870763,ENSMUSG00000064371.1,Mt_tRNA,mt-Tt,3,MGI:102473,,


In [17]:
gene_flat_table = pd.concat([gene_gtf, gene_anno_df], axis=1)
del gene_flat_table['annotation']
gene_flat_table.head()

Unnamed: 0,chrom,source,feature,start,end,na1,strand,na2,gene_id,gene_type,gene_name,level,mgi_id,havana_gene,tag
0,chr1,HAVANA,gene,3073253,3074322,.,+,.,ENSMUSG00000102693.1,TEC,4933401J01Rik,2,MGI:1918292,OTTMUSG00000049935.1,
3,chr1,ENSEMBL,gene,3102016,3102125,.,+,.,ENSMUSG00000064842.1,snRNA,Gm26206,3,MGI:5455983,,
6,chr1,HAVANA,gene,3205901,3671498,.,-,.,ENSMUSG00000051951.5,protein_coding,Xkr4,2,MGI:3528744,OTTMUSG00000026353.2,
24,chr1,HAVANA,gene,3252757,3253236,.,+,.,ENSMUSG00000102851.1,processed_pseudogene,Gm18956,1,MGI:5011141,OTTMUSG00000049958.1,pseudo_consens
27,chr1,HAVANA,gene,3365731,3368549,.,-,.,ENSMUSG00000103377.1,TEC,Gm37180,2,MGI:5610408,OTTMUSG00000049960.1,


In [18]:
# Let's save this into ref dir
gene_flat_table.to_csv('../../../data/ref/GENCODEvM24/gene_meta.csv.gz', index=None)

In [19]:
gene_bed = gene_flat_table[['chrom', 'start', 'end', 'gene_id']]
gene_bed.head()

Unnamed: 0,chrom,start,end,gene_id
0,chr1,3073253,3074322,ENSMUSG00000102693.1
3,chr1,3102016,3102125,ENSMUSG00000064842.1
6,chr1,3205901,3671498,ENSMUSG00000051951.5
24,chr1,3252757,3253236,ENSMUSG00000102851.1
27,chr1,3365731,3368549,ENSMUSG00000103377.1


In [20]:
gene_bed.to_csv('../../../data/ref/GENCODEvM24/gene_bed_with_id.bed', 
                sep='\t', header=None, index=None)

In [21]:
!head ../../../data/ref/GENCODEvM24/gene_bed_with_id.bed

chr1	3073253	3074322	ENSMUSG00000102693.1
chr1	3102016	3102125	ENSMUSG00000064842.1
chr1	3205901	3671498	ENSMUSG00000051951.5
chr1	3252757	3253236	ENSMUSG00000102851.1
chr1	3365731	3368549	ENSMUSG00000103377.1
chr1	3375556	3377788	ENSMUSG00000104017.1
chr1	3464977	3467285	ENSMUSG00000103025.1
chr1	3466587	3513553	ENSMUSG00000089699.1
chr1	3512451	3514507	ENSMUSG00000103201.1
chr1	3531795	3532720	ENSMUSG00000103147.1


## Let's do some filtering on high quality genes

- Remove blacklist
- Remove low confidence gene

### Remove blacklist


In [26]:
# Download ENCODE Blacklist
!wget https://raw.githubusercontent.com/Boyle-Lab/Blacklist/master/lists/mm10-blacklist.v2.bed.gz
!gunzip mm10-blacklist.v2.bed.gz

--2020-05-31 20:39:19--  https://raw.githubusercontent.com/Boyle-Lab/Blacklist/master/lists/mm10-blacklist.v2.bed.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.196.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.196.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30412 (30K) [application/octet-stream]
Saving to: ‘mm10-blacklist.v2.bed.gz’


2020-05-31 20:39:20 (217 KB/s) - ‘mm10-blacklist.v2.bed.gz’ saved [30412/30412]



In [34]:
!head mm10-blacklist.v2.bed

chr10	0	3135400	High Signal Region
chr10	3218900	3276600	Low Mappability
chr10	3576900	3627700	Low Mappability
chr10	4191100	4197600	Low Mappability
chr10	4613500	4615400	High Signal Region
chr10	4761300	4763900	High Signal Region
chr10	5080800	5096600	Low Mappability
chr10	5580100	5586600	Low Mappability
chr10	6281200	6286700	High Signal Region
chr10	6740200	6742100	High Signal Region


In [22]:
!bedtools intersect -a ../../../data/ref/GENCODEvM24/gene_bed_with_id.bed -b mm10-blacklist.v2.bed -v > gene_bed_white.bed

In [24]:
!wc -l gene_bed_white.bed

   50144 gene_bed_white.bed


In [25]:
!wc -l ../../../data/ref/GENCODEvM24/gene_bed_with_id.bed

   55385 ../../../data/ref/GENCODEvM24/gene_bed_with_id.bed


In [26]:
# Homework
# How to select the white gene rows from ../../../data/ref/GENCODEvM24/gene_meta.csv.gz 
# and save them into a table for white gene only?