In [1]:
pip install gffpandas

Collecting gffpandas
  Downloading gffpandas-1.2.0.tar.gz (178 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m178.8/178.8 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: gffpandas
  Building wheel for gffpandas (setup.py) ... [?25ldone
[?25h  Created wheel for gffpandas: filename=gffpandas-1.2.0-py2.py3-none-any.whl size=6248 sha256=6dd3ad884f41e21f24f899e25e8ae44b9f4d210d0530de4e27a1c5f62362dd92
  Stored in directory: /mnt/nfs01/home/j_wang@intra.igr.fr/.cache/pip/wheels/57/87/f1/1d0c74fbc5151562ba7953dc110a7d8c63c6c3229d025bc8cd
Successfully built gffpandas
Installing collected packages: gffpandas
Successfully installed gffpandas-1.2.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import gffpandas.gffpandas as gffpd

In [3]:
## Annotation source: GENCODE release 44, comprehensive gene annotation, GRCh37 mapping.
## GTF download:
## https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/GRCh37_mapping/gencode.v44lift37.annotation.gtf.gz
## GFF3 download (the format read below):
## https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/GRCh37_mapping/gencode.v44lift37.annotation.gff3.gz
## The local path under resources/ has no ".gz" suffix, while the download URL does.
## Only the parsed table (.df) of the gffpandas object is kept.
gencode_df = gffpd.read_gff3("resources/gencode.v44lift37.annotation.gff3").df

In [4]:
# All 64 trinucleotides, ordered by the MIDDLE base first (all "xAx", then "xCx",
# "xGx", "xTx"), then by the first base, then by the third base.
triplets = [first + middle + last
            for middle in "ACGT"
            for first in "ACGT"
            for last in "ACGT"]

# The same 64 trinucleotides in plain alphabetical order
# (first base, then middle base, then third base).
triplets_user = [first + middle + last
                 for first in "ACGT"
                 for middle in "ACGT"
                 for last in "ACGT"]

In [5]:
def get_gencode_attr_value(x, key):
    """Return the value stored under ``key`` in a GFF3 attributes string.

    Parameters
    ----------
    x : str
        Attribute column content: ``key=value`` pairs separated by ";".
    key : str
        Attribute name to look up.

    Returns
    -------
    str or None
        The value of the first pair whose name equals ``key``; ``None`` when
        no such pair exists.
    """
    for pair in x.split(";"):
        # partition() splits on the first "=" only, so values that themselves
        # contain "=" stay intact, and fragments without "=" (for example the
        # empty string left by a trailing ";") are skipped instead of raising.
        k, separator, v = pair.partition("=")
        if separator and k == key:
            return v
    return None

In [6]:
# Keep exon records only. The explicit .copy() makes the result an independent
# frame, so the column assignments below do not write into a slice of the full
# annotation table (which triggers pandas' SettingWithCopyWarning).
gencode_df = gencode_df[gencode_df.type == "exon"].copy()

# Pull the needed fields out of the raw "key=value;..." attributes column.
gencode_df["gene_name"]    = gencode_df.attributes.apply(lambda x: get_gencode_attr_value(x, "gene_name"))
gencode_df["exon_number"]  = gencode_df.attributes.apply(lambda x: get_gencode_attr_value(x, "exon_number"))
gencode_df["exon_id"]      = gencode_df.attributes.apply(lambda x: get_gencode_attr_value(x, "ID"))
# exon_id is taken to look like "exon:<transcript id>:<exon number>"; the transcript id is the second ":"-field.
gencode_df["transcript_id"]= gencode_df["exon_id"].apply(lambda x: x.split(":")[1])

### build bed file

In [8]:
def write_bed_dataframe(fd, bed_df):
    """Write one tab-separated BED line per row of ``bed_df`` to ``fd``.

    Each line holds, in order: Chromosome, chromStart_ext - 1, chromEnd_ext,
    gene_name, score and strand.

    ATTENTION: the start is shifted by one on output to convert the 1-based
    start into a 0-based BED start; the end is written unchanged.
    """
    line_format = "%s\t%d\t%d\t%s\t%s\t%s\n"
    for _, record in bed_df.iterrows():
        zero_based_start = record.chromStart_ext - 1
        fields = (record.Chromosome, zero_based_start, record.chromEnd_ext,
                  record.gene_name, record.score, record.strand)
        fd.write(line_format % fields)

def ext_chromStart_splice_sites(x):
    """Return the extended start coordinate for the exon row ``x``.

    The start always moves one base outward (context of the first codon).
    Every exon except exon number 1 moves two further bases outward so the
    region reaches into the splice site.

    NOTE(review): only ``exon_number`` and ``chromStart`` are read here; the
    strand is not consulted - confirm that is intended for "-" strand genes.
    """
    context_pad = 1
    # exon 1 gets no splice-site extension on its start side
    splice_pad = 0 if x.exon_number == 1 else 2
    return x.chromStart - context_pad - splice_pad
    
def ext_chromEnd_splice_sites(x):
    """Return the extended end coordinate for the exon row ``x``.

    The end always moves one base outward. Every exon except the one whose
    ``exon_number`` equals ``transcript_size`` moves two further bases outward
    so the region reaches into the splice site.

    NOTE(review): the strand is not consulted here - confirm that is intended
    for "-" strand genes.
    """
    context_pad = 1
    # the last exon of the transcript gets no splice-site extension on its end side
    splice_pad = 0 if x.exon_number == x.transcript_size else 2
    return x.chromEnd + context_pad + splice_pad

def assemble_into_bed_format(chm:str, start:int, start_ext:int, end:int, end_ext:int, group_size:int, 
                             index:int, name:str, score:str, strand:str):
    """Pack the fields of one exon into a list, in BED-like column order.

    Parameters
    ----------
    chm        : chromosome or scaffold name (e.g. chr3, chrY); every "chr" substring is removed.
    start      : start coordinate of the exon.
    start_ext  : start coordinate after extension.
    end        : end coordinate of the exon.
    end_ext    : end coordinate after extension.
    group_size : total number of exons of the transcript.
    index      : number of this exon within the transcript.
    name       : name of the line in the BED file.
    score      : BED score field.
    strand     : strand orientation ("+", "-" or "." if no strand).

    Returns
    -------
    list
        [chromosome, start, start_ext, end, end_ext, group_size, index, name, score, strand];
        all values except the chromosome are passed through unchanged.
    """
    chromosome = chm.replace("chr", "")
    coordinates = [start, start_ext, end, end_ext]
    exon_position = [group_size, index]
    annotation = [name, score, strand]
    return [chromosome] + coordinates + exon_position + annotation

def assemble_target_region_by(gene_df):
    """Build the BED-style table for one gene from its exon records.

    The transcript with the most exon rows is selected (the first one on ties,
    as returned by ``idxmax``). Each of its exons becomes one row named
    "gene_name:transcript_size:exon_number", and the extended start/end
    columns are then filled in by the two ``ext_*_splice_sites`` helpers.

    Parameters
    ----------
    gene_df : DataFrame with the exon rows of a single gene (columns
        transcript_id, seq_id, start, end, exon_number, gene_name, score, strand).

    Returns
    -------
    DataFrame with columns Chromosome, chromStart, chromStart_ext, chromEnd,
    chromEnd_ext, transcript_size, exon_number, gene_name, score, strand.
    """
    # exon count per transcript, computed once and used for both the id and the size
    exons_per_transcript = gene_df.groupby("transcript_id").size()
    exon_trans_id = exons_per_transcript.idxmax()
    transcript_size = exons_per_transcript.max()

    bed_columns = ["Chromosome", "chromStart", "chromStart_ext", "chromEnd", "chromEnd_ext",
                   "transcript_size", "exon_number", "gene_name", "score", "strand"]

    bed_rows = []
    for _, exon in gene_df[gene_df.transcript_id == exon_trans_id].iterrows():
        exon_index = int(exon.exon_number)
        # name: "gene_name:exons group size:exon number"
        bed_name = "%s:%d:%d"%(exon.gene_name, transcript_size, exon_index)
        # the *_ext columns start out equal to the plain coordinates
        bed_rows.append(assemble_into_bed_format(exon.seq_id, exon.start, exon.start, exon.end, exon.end,
                                                 transcript_size, exon_index, bed_name,
                                                 exon.score, exon.strand))

    bed_df = pd.DataFrame(bed_rows, columns = bed_columns)

    bed_df["chromStart_ext"] = bed_df.apply(ext_chromStart_splice_sites, axis = 1)
    bed_df["chromEnd_ext"]   = bed_df.apply(ext_chromEnd_splice_sites,   axis = 1)

    return bed_df


# Gene list: tab-separated, no header; only the second column ("gene") is used.
cbase_gene_df = pd.read_table("resources/gene_list.txt", header = None, names = ["xxx", "gene"])

# Group the exon table by gene once. The previous version rebuilt a boolean
# mask over the whole exon table for every gene in the list.
exons_by_gene = gencode_df.groupby("gene_name", sort = False)

with open("genes.bed", "w") as fd:
    for gene_name in cbase_gene_df.gene:
        ## Naive: taking only identified targets
        if gene_name not in exons_by_gene.groups:
            continue
        # get_group returns the gene's exon rows in their original table order
        gene_df = exons_by_gene.get_group(gene_name)
        bed_df = assemble_target_region_by(gene_df)
        write_bed_dataframe(fd, bed_df)

In [36]:
import os
import subprocess

# NOTE(review): user- and machine-specific absolute path (gvfs sftp mount);
# move it to a configurable location before sharing the notebook.
REFERENCE_FASTA = "/run/user/1000/gvfs/sftp:host=flamingo.intra.igr.fr,user=j_wang/mnt/beegfs/userdata/a_ivashkin/references/genome_data/gatk/human_g1k_v37.fasta"

# Extract the sequence of every region in genes.bed as "<name>\t<sequence>" lines.
# The command is passed as an argument list (no shell parsing of the path), and
# check=True raises CalledProcessError when bedtools exits non-zero, so later
# cells cannot silently continue with a missing or stale gencode_codons.tsv.
# The trailing .returncode keeps the cell output (0 on success).
subprocess.run(
    ["bedtools", "getfasta",
     "-fi", REFERENCE_FASTA,
     "-bed", "genes.bed",
     "-name", "-tab",
     "-fo", "gencode_codons.tsv"],
    check=True,
).returncode



0

In [4]:
# Tab-separated file without a header row: first column is the region name, second the sequence.
codons_df = pd.read_table(
    "gencode_codons.tsv",
    header=None,
    names=["gene_code", "exons_seq"],
)

In [5]:
# gene_code is ":"-separated; its first three fields are the gene name, the
# number of exons in the selected transcript, and this exon's number.
# All three stay strings.
codons_df["gene_name"] = codons_df["gene_code"].map(lambda code: code.split(":")[0])
codons_df["exon_group_size"] = codons_df["gene_code"].map(lambda code: code.split(":")[1])
codons_df["exon_group_id"] = codons_df["gene_code"].map(lambda code: code.split(":")[2])

In [8]:
# Preview the first rows to check the fields parsed out of gene_code.
codons_df.head()

Unnamed: 0,gene_code,exons_seq,gene_name,exon_group_size,exon_group_id
0,A3GALT2:5:1::1:33786675-33786702,CCTGAGTCCCTCCTTGAGAGCCATATG,A3GALT2,5,1
1,A3GALT2:5:2::1:33778404-33778494,TACCTGAATTTAGGGAGGCCATACAGAAACAGGCCTAAGAGGCCAA...,A3GALT2,5,2
2,A3GALT2:5:3::1:33778098-33778194,TACCAGGGACGCAGGGCACCTGTGAAGTTGTCTCTCAGCTGGGACA...,A3GALT2,5,3
3,A3GALT2:5:4::1:33777649-33777793,TACCTGCCTACAGCAAAGATAGTCAGCCCAATGGTGAGGTTCTGCT...,A3GALT2,5,4
4,A3GALT2:5:5::1:33772363-33773055,GCGCTAGTTCCGCAGCAGCCGGTACCCCTTGGGCGCCCACAGCAGT...,A3GALT2,5,5


In [12]:
## Problem: a splice site may be located inside a codon.
## Solution: store splice sites and exons separately.

def get_splice_sites_context(gene_group, codon_order=None):
    """Collect trinucleotide-context indices at the exon ends of one gene.

    For every exon row the extracted sequence ``exons_seq`` is inspected:

    * transcript with a single exon: nothing is recorded;
    * exon number 1: the two overlapping triplets at the END of the sequence
      (``seq[-4:-1]`` and ``seq[-3:]``);
    * last exon: the two overlapping triplets at the START of the sequence
      (``seq[0:3]`` and ``seq[1:4]``);
    * any other exon: the start pair followed by the end pair.

    Parameters
    ----------
    gene_group : DataFrame
        Rows of one gene with columns ``exon_group_size``, ``exon_group_id``,
        ``exons_seq`` and ``gene_name``.
    codon_order : sequence of str, optional
        Triplet list whose positions are used as the encoded values.
        Defaults to the notebook-level ``triplets`` list.

    Returns
    -------
    list of [int, int]
        One pair of triplet indices per recorded exon end, in ascending
        numeric exon order. A row whose triplet is not in ``codon_order``
        (for example a sequence containing N) is reported with a warning
        and contributes nothing for the side that failed.
    """
    if codon_order is None:
        codon_order = triplets

    def exon_rank(value):
        # exon ids arrive as strings; rank them numerically so "10" follows "9", not "1"
        try:
            return int(value)
        except (TypeError, ValueError):
            return float("inf")

    def start_context(seq):
        return [codon_order.index(seq[0:3]), codon_order.index(seq[1:4])]

    def end_context(seq):
        return [codon_order.index(seq[-4:-1]), codon_order.index(seq[-3:])]

    splice_sites = []

    ordered_rows = sorted((row for _, row in gene_group.iterrows()),
                          key=lambda row: exon_rank(row.exon_group_id))

    for row in ordered_rows:
        try:
            exon_count = int(row.exon_group_size)
            if exon_count == 1:
                # if there is only one exon,
                # then do nothing
                continue

            exon_id = int(row.exon_group_id)
            if exon_id == 1:
                # splice site of first exon
                splice_sites.append(end_context(row.exons_seq))

            elif exon_id == exon_count:
                # splice site of last exon
                splice_sites.append(start_context(row.exons_seq))

            else:
                # splice site on the left of exon
                splice_sites.append(start_context(row.exons_seq))

                # splice site on the right of exon
                splice_sites.append(end_context(row.exons_seq))

        except ValueError as err:
            print("gene name: %s, exon_group_size: %s, exons id: %s, sequence: %s"%(row.gene_name, row.exon_group_size, row.exon_group_id, row.exons_seq))
            print("[WARNING]:", err)

    return splice_sites
    
with open("splice_sites_by_gene.txt", "w") as fd:

    # Per gene: one "gene<TAB>name" header line, then one "<index>\t<index>" line per pair.
    for gene_name, gene_group in codons_df.groupby("gene_name"):
        splice_sites = get_splice_sites_context(gene_group)

        fd.write("gene\t%s\n"%gene_name)
        fd.writelines("%d\t%d\n"%(pair[0],pair[1]) for pair in splice_sites)

gene name: IL9R, exon_group_size: 10, exons id: 1, sequence: NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
gene name: IL9R, exon_group_size: 10, exons id: 10, sequence: NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN

In [26]:
# Show the exon records of a single gene as a spot check.
gencode_df.loc[gencode_df["gene_name"] == "A3GALT2"]

Unnamed: 0,seq_id,source,type,start,end,score,strand,phase,attributes,gene_name,exon_number,exon_id,transcript_id
68478,chr1,HAVANA,exon,33786677,33786699,.,-,.,ID=exon:ENST00000442999.3:1;Parent=ENST0000044...,A3GALT2,1,exon:ENST00000442999.3:1,ENST00000442999.3
68481,chr1,HAVANA,exon,33778408,33778491,.,-,.,ID=exon:ENST00000442999.3:2;Parent=ENST0000044...,A3GALT2,2,exon:ENST00000442999.3:2,ENST00000442999.3
68483,chr1,HAVANA,exon,33778102,33778191,.,-,.,ID=exon:ENST00000442999.3:3;Parent=ENST0000044...,A3GALT2,3,exon:ENST00000442999.3:3,ENST00000442999.3
68485,chr1,HAVANA,exon,33777653,33777790,.,-,.,ID=exon:ENST00000442999.3:4;Parent=ENST0000044...,A3GALT2,4,exon:ENST00000442999.3:4,ENST00000442999.3
68487,chr1,HAVANA,exon,33772367,33773054,.,-,.,ID=exon:ENST00000442999.3:5;Parent=ENST0000044...,A3GALT2,5,exon:ENST00000442999.3:5,ENST00000442999.3
