# Prepare Gene Metadata

## Content
- A GTF file stores many kinds of gene annotation in a special tab-separated format; in particular, its last column packs a lot of information into a single string.
- Here we want to create a plain table that extracts all of this information for each gene, and save it as a gene metadata file.

## Important pandas functions
- pd.read_csv
- pd.DataFrame.to_csv
- pd.DataFrame.apply
- pd.concat

## Load GTF

In [1]:
import pandas as pd

In [2]:
# location of the GTF file; point this at your own copy
gtf_path = '../../../data/ref/GENCODEvM24/gencode.vM24.annotation.gtf.gz'

# A GTF table is not a plain CSV, so read_csv needs a few extra arguments.
# The answer is given here; see the pandas.read_csv() documentation for details.
gtf = pd.read_csv(
    gtf_path,
    sep='\t',      # GTF columns are separated by tabs
    comment='#',   # anything from '#' onward is ignored, which skips lines starting with '#'
    header=None,   # the file has no header row ...
    names=[        # ... so the nine column names are supplied manually
        'chrom', 'source', 'feature',
        'start', 'end', 'na1',
        'strand', 'na2', 'annotation',
    ],
)
gtf.head()

Unnamed: 0,chrom,source,feature,start,end,na1,strand,na2,annotation
0,chr1,HAVANA,gene,3073253,3074322,.,+,.,"gene_id ""ENSMUSG00000102693.1""; gene_type ""TEC..."
1,chr1,HAVANA,transcript,3073253,3074322,.,+,.,"gene_id ""ENSMUSG00000102693.1""; transcript_id ..."
2,chr1,HAVANA,exon,3073253,3074322,.,+,.,"gene_id ""ENSMUSG00000102693.1""; transcript_id ..."
3,chr1,ENSEMBL,gene,3102016,3102125,.,+,.,"gene_id ""ENSMUSG00000064842.1""; gene_type ""snR..."
4,chr1,ENSEMBL,transcript,3102016,3102125,.,+,.,"gene_id ""ENSMUSG00000064842.1""; transcript_id ..."


In [3]:
# this will show you documentation of the function, but search google for more examples!
?pd.read_csv

## Explore GTF content

In [4]:
# the GTF contains different types of features; count how many rows each type has
gtf['feature'].value_counts()

exon              842873
CDS               528267
UTR               185941
transcript        142552
start_codon        59973
stop_codon         55713
gene               55385
Selenocysteine        65
Name: feature, dtype: int64

In [5]:
# only the rows whose feature is 'gene' are needed here;
# reset_index(drop=True) renumbers the kept rows starting from 0
gene_gtf = (
    gtf.loc[gtf['feature'] == 'gene']
    .copy()
    .reset_index(drop=True)
)
print(f'the gene_gtf has {gene_gtf.shape[0]} rows')
gene_gtf.head()

the gene_gtf has 55385 rows


Unnamed: 0,chrom,source,feature,start,end,na1,strand,na2,annotation
0,chr1,HAVANA,gene,3073253,3074322,.,+,.,"gene_id ""ENSMUSG00000102693.1""; gene_type ""TEC..."
1,chr1,ENSEMBL,gene,3102016,3102125,.,+,.,"gene_id ""ENSMUSG00000064842.1""; gene_type ""snR..."
2,chr1,HAVANA,gene,3205901,3671498,.,-,.,"gene_id ""ENSMUSG00000051951.5""; gene_type ""pro..."
3,chr1,HAVANA,gene,3252757,3253236,.,+,.,"gene_id ""ENSMUSG00000102851.1""; gene_type ""pro..."
4,chr1,HAVANA,gene,3365731,3368549,.,-,.,"gene_id ""ENSMUSG00000103377.1""; gene_type ""TEC..."


In [6]:
# the last column packs a lot of information into one string;
# take the annotation of the first gene row (index label 0) as an example
example_annotation = gene_gtf.loc[0, 'annotation']
example_annotation

'gene_id "ENSMUSG00000102693.1"; gene_type "TEC"; gene_name "4933401J01Rik"; level 2; mgi_id "MGI:1918292"; havana_gene "OTTMUSG00000049935.1";'

## Extract gene_id from the annotation column

Before preparing the whole gene metadata table, let's start with something simpler.

In [7]:
# we need to extract the gene_id from this string. here is how I do this with a function

def extract_gene_id(annotation):
    """Return the gene_id value stored in a GTF annotation string.

    The annotation is a ';'-separated list of ``key value`` pairs, e.g.
    ``gene_id "ENSMUSG00000102693.1"; gene_type "TEC"; ...``.

    Parameters
    ----------
    annotation : str
        The annotation string of one GTF row.

    Returns
    -------
    str
        The value of the ``gene_id`` key with surrounding double quotes removed.

    Raises
    ------
    ValueError
        If the annotation contains no ``gene_id`` key.
    """
    for kv_pair in annotation.split(';'):  # iterate the key-value pairs
        # partition splits on the FIRST space only, so a value that itself
        # contains spaces cannot break the unpacking
        key, _, value = kv_pair.strip().partition(' ')
        # compare the whole key, so keys that merely start with 'gene_id' are not mistaken for it
        if key == 'gene_id':
            return value.strip().strip('"')  # strip removes the ""
    raise ValueError(f'no gene_id found in annotation: {annotation!r}')

# try the function on the example annotation string
extract_gene_id(example_annotation)

'ENSMUSG00000102693.1'

In [8]:
# now we need to apply this function to every value in the annotation column of gene_gtf
gene_ids = gene_gtf['annotation'].apply(extract_gene_id)
# it takes ~250 ms on my computer
# search for pandas.Series.apply / pandas.DataFrame.apply to learn more about this function
gene_ids

0        ENSMUSG00000102693.1
1        ENSMUSG00000064842.1
2        ENSMUSG00000051951.5
3        ENSMUSG00000102851.1
4        ENSMUSG00000103377.1
                 ...         
55380    ENSMUSG00000064368.1
55381    ENSMUSG00000064369.1
55382    ENSMUSG00000064370.1
55383    ENSMUSG00000064371.1
55384    ENSMUSG00000064372.1
Name: annotation, Length: 55385, dtype: object

In [9]:
?pd.DataFrame.apply

## Extract everything from the annotation column and create metadata

In [10]:
# the annotation string is basically a list of key-value pairs;
# we want to turn it into a dict, then create a pandas Series from that dict

# how to split everything into a dict
anno_dict = {}
for pair in example_annotation.strip(';').split(';'):  # strip and split are common string methods
    k, v = pair.strip().split(' ')  # each pair is a key and a value separated by one space
    anno_dict[k] = v  # the value is stored as-is, so its double quotes are still there
anno_dict

{'gene_id': '"ENSMUSG00000102693.1"',
 'gene_type': '"TEC"',
 'gene_name': '"4933401J01Rik"',
 'level': '2',
 'mgi_id': '"MGI:1918292"',
 'havana_gene': '"OTTMUSG00000049935.1"'}

In [11]:
# this function takes an annotation string and returns a pd.Series
def anno_to_dict(anno_str):
    """Parse a GTF annotation string into a pandas Series.

    The annotation is a ';'-separated list of ``key value`` pairs, e.g.
    ``gene_id "ENSMUSG00000102693.1"; gene_type "TEC"; level 2;``.

    Parameters
    ----------
    anno_str : str
        The annotation string of one GTF row.

    Returns
    -------
    pd.Series
        Keys as the index (in order of first appearance) and values as
        strings with surrounding double quotes removed. A key that occurs
        more than once keeps all of its values, joined by ','.
    """
    anno_dict = {}
    for pair in anno_str.split(';'):
        pair = pair.strip()
        if not pair:
            # a trailing ';' (or '; ') leaves an empty fragment; nothing to parse
            continue
        # split on the first space only, so values containing spaces stay intact
        k, _, v = pair.partition(' ')
        v = v.strip().strip('"')
        if k in anno_dict:
            # keep every value of a repeated key instead of overwriting the earlier ones
            anno_dict[k] = f'{anno_dict[k]},{v}'
        else:
            anno_dict[k] = v
    return pd.Series(anno_dict)

# check the result on the example annotation string
anno_to_dict(example_annotation)

gene_id        ENSMUSG00000102693.1
gene_type                       TEC
gene_name             4933401J01Rik
level                             2
mgi_id                  MGI:1918292
havana_gene    OTTMUSG00000049935.1
dtype: object

In [12]:
# now we can apply it to every row of gene_gtf; each returned Series becomes one row of the table
gene_anno_df = gene_gtf['annotation'].apply(anno_to_dict)
gene_anno_df.head()

# the columns come from the keys found in the annotation strings,
# so the table built from a human GTF can look different from this one

Unnamed: 0,gene_id,gene_type,gene_name,level,mgi_id,havana_gene,tag
0,ENSMUSG00000102693.1,TEC,4933401J01Rik,2,MGI:1918292,OTTMUSG00000049935.1,
1,ENSMUSG00000064842.1,snRNA,Gm26206,3,MGI:5455983,,
2,ENSMUSG00000051951.5,protein_coding,Xkr4,2,MGI:3528744,OTTMUSG00000026353.2,
3,ENSMUSG00000102851.1,processed_pseudogene,Gm18956,1,MGI:5011141,OTTMUSG00000049958.1,pseudo_consens
4,ENSMUSG00000103377.1,TEC,Gm37180,2,MGI:5610408,OTTMUSG00000049960.1,


In [13]:
# last step: place the GTF columns and the parsed annotation columns side by side
gene_flat_table = pd.concat([gene_gtf, gene_anno_df], axis=1)

# drop the redundant raw annotation string and the two unused columns
gene_flat_table = gene_flat_table.drop(columns=['annotation', 'na1', 'na2'])

gene_flat_table.head()

Unnamed: 0,chrom,source,feature,start,end,strand,gene_id,gene_type,gene_name,level,mgi_id,havana_gene,tag
0,chr1,HAVANA,gene,3073253,3074322,+,ENSMUSG00000102693.1,TEC,4933401J01Rik,2,MGI:1918292,OTTMUSG00000049935.1,
1,chr1,ENSEMBL,gene,3102016,3102125,+,ENSMUSG00000064842.1,snRNA,Gm26206,3,MGI:5455983,,
2,chr1,HAVANA,gene,3205901,3671498,-,ENSMUSG00000051951.5,protein_coding,Xkr4,2,MGI:3528744,OTTMUSG00000026353.2,
3,chr1,HAVANA,gene,3252757,3253236,+,ENSMUSG00000102851.1,processed_pseudogene,Gm18956,1,MGI:5011141,OTTMUSG00000049958.1,pseudo_consens
4,chr1,HAVANA,gene,3365731,3368549,-,ENSMUSG00000103377.1,TEC,Gm37180,2,MGI:5610408,OTTMUSG00000049960.1,


In [None]:
?pd.concat

In [14]:
# and we set gene_id as the index, which is expected to be unique and is more meaningful
# than a row number. verify_integrity=True makes pandas raise an error if any gene_id
# is duplicated, so the uniqueness is checked rather than assumed.
gene_flat_table.set_index('gene_id', inplace=True, verify_integrity=True)  # inplace must be true, why? Homework
gene_flat_table.head()

Unnamed: 0_level_0,chrom,source,feature,start,end,strand,gene_type,gene_name,level,mgi_id,havana_gene,tag
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSMUSG00000102693.1,chr1,HAVANA,gene,3073253,3074322,+,TEC,4933401J01Rik,2,MGI:1918292,OTTMUSG00000049935.1,
ENSMUSG00000064842.1,chr1,ENSEMBL,gene,3102016,3102125,+,snRNA,Gm26206,3,MGI:5455983,,
ENSMUSG00000051951.5,chr1,HAVANA,gene,3205901,3671498,-,protein_coding,Xkr4,2,MGI:3528744,OTTMUSG00000026353.2,
ENSMUSG00000102851.1,chr1,HAVANA,gene,3252757,3253236,+,processed_pseudogene,Gm18956,1,MGI:5011141,OTTMUSG00000049958.1,pseudo_consens
ENSMUSG00000103377.1,chr1,HAVANA,gene,3365731,3368549,-,TEC,Gm37180,2,MGI:5610408,OTTMUSG00000049960.1,


## Save gene metadata

In [15]:
# save the table under a relative path, i.e. in the current working directory;
# with to_csv's default compression='infer', the '.gz' suffix makes pandas gzip the output
gene_flat_table.to_csv('gene_metadata.csv.gz')

In [17]:
?pd.DataFrame.to_csv