In [16]:
import numpy as np
import pandas as pd
import gffpandas.gffpandas as gffpd

In [17]:
## GRCh37, gencode, Release 44 
## Comprehensive gene annotation
## https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/GRCh37_mapping/gencode.v44lift37.annotation.gtf.gz
## https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/GRCh37_mapping/gencode.v44lift37.annotation.gff3.gz
gencode_df      = gffpd.read_gff3("gencode.v44lift37.annotation.gff3").df

In [58]:
# gencode_df_origin = gffpd.read_gff3("gencode.v44lift37.annotation.gff3").df
gencode_df_CDS    = gencode_df_origin[gencode_df_origin.type == "CDS"]

In [65]:
gencode_df_CDS["gene_name"] = gencode_df_CDS.attributes.apply(lambda x: get_gencode_attr_value(x, "gene_name"))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gencode_df_CDS["gene_name"] = gencode_df_CDS.attributes.apply(lambda x: get_gencode_attr_value(x, "gene_name"))


In [70]:
print("%d cds, %d exons"%(len(set(gencode_df_CDS["gene_name"])),len(set(gencode_df["gene_name"]))))

20537 cds, 62307 exons


In [71]:
cbase_gene_df = pd.read_table("gene_list.txt", header = None, names = ["xxx", "gene"])
len(cbase_gene_df.gene)

19620

In [75]:
set(cbase_gene_df.gene)-set(gencode_df_CDS["gene_name"])

{'C17orf76-AS1',
 'PRAC',
 'FLJ00322',
 'C17orf74',
 'HEATR2',
 'C3orf56',
 'SYT14L',
 'H2BFM',
 'C6orf48',
 'NSUN5P1',
 'PIPSL',
 'ST13P4',
 'FAM179B',
 'FAM179A',
 'TRAF3IP2-AS1',
 'BC023201',
 'SELK',
 'AX747402',
 'EFCAB4B',
 'AX746590',
 'WDR78',
 'ZNF204P',
 'C15orf38-AP3S2',
 'AGPAT6',
 'C2orf71',
 'AX748283',
 'LOC100616530',
 'AZI1',
 'FAM211B',
 'AES',
 'SGOL2',
 'ZNF876P',
 'C10orf137',
 'SEPT2',
 'C22orf46',
 'MKL2',
 'HIST1H3D',
 'CTGF',
 'LINC00488',
 'KIAA1598',
 'OFCC1',
 'HIST1H3I',
 'TCRBV1S1A1N1',
 'FAM69B',
 'BC126296',
 'CCDC108',
 'C16orf80',
 'DKFZP686I15217',
 'TCRBV5S2',
 'PVRL2',
 'C10orf107',
 'RARRES3',
 'SEPP1',
 'PTPLA',
 'FER1L4',
 'NACAP1',
 'EIF3IP1',
 'TTC18',
 'MYCNOS',
 'C1orf170',
 'C2orf53',
 'TCRBV15S1',
 'BC043529',
 'OR2M1P',
 'WAPAL',
 'QARS',
 'HIST1H2BC',
 'LINC00937',
 'PLA2G16',
 'C10orf54',
 'POU5F1P4',
 'BC037321',
 'FAM90A2P',
 'LOC285074',
 'FAM173B',
 'FBXL21',
 'KIAA0226L',
 'FTH1P18',
 'NUPR1L',
 'DNM1P46',
 'ZBED6CL',
 'LOC494141',


In [4]:
def get_gencode_attr_value(x, key):
    for pair in x.split(";"):
        (k,v) = pair.split("=")
        if k == key:
            return v

In [18]:
gencode_df = gencode_df[gencode_df.type == "exon"]

gencode_df["gene_name"]    = gencode_df.attributes.apply(lambda x: get_gencode_attr_value(x, "gene_name"))
gencode_df["exon_number"]  = gencode_df.attributes.apply(lambda x: get_gencode_attr_value(x, "exon_number"))
gencode_df["exon_id"]      = gencode_df.attributes.apply(lambda x: get_gencode_attr_value(x, "ID"))
gencode_df["transcript_id"]= gencode_df["exon_id"].apply(lambda x: x.split(":")[1])

In [44]:
triplets = ["AAA", "AAC", "AAG", "AAT", "CAA", "CAC", "CAG", "CAT",
            "GAA", "GAC", "GAG", "GAT", "TAA", "TAC", "TAG", "TAT", 
            "ACA", "ACC", "ACG", "ACT", "CCA", "CCC", "CCG", "CCT", 
            "GCA", "GCC", "GCG", "GCT", "TCA", "TCC", "TCG", "TCT", 
            "AGA", "AGC", "AGG", "AGT", "CGA", "CGC", "CGG", "CGT", 
            "GGA", "GGC", "GGG", "GGT", "TGA", "TGC", "TGG", "TGT", 
            "ATA", "ATC", "ATG", "ATT", "CTA", "CTC", "CTG", "CTT", 
            "GTA", "GTC", "GTG", "GTT", "TTA", "TTC", "TTG", "TTT"]

### build bed file

In [33]:
def write_bed_dataframe(fd, bed_df):
    #### ATTENTION !!!
    #### convert 1-based to 0-based 
    #### start - 1
    for ind,row in bed_df.iterrows():
        fd.write("%s\t%d\t%d\t%s\t%s\t%s\n"%(row.Chromosome, row.chromStart_ext - 1, row.chromEnd_ext, row.gene_name, row.score, row.strand))

def ext_chromStart_splicing_sites(x):
    if x.exon_number == 1:
        # extend only context of first codon
        return x.chromStart - 1
    else:
        # extend the start to splicing sites if it is NOT the first codon
        return x.chromStart - 1 - 2
    
def ext_chromEnd_splicing_sites(x):
    if x.exon_number == x.transcript_size:
        return x.chromEnd + 1
    else:
        # extend the end to splicing sites if it is NOT the last codon
        return x.chromEnd + 1 + 2

def assemble_into_bed_format(chm:str, start:int, start_ext:int, end:int, end_ext:int, group_size:int, 
                             index:int, name:str, score:str, strand:str):
    ## bed format: 
    ## Chromosome: (e.g. chr3, chrY, chr2_random) or scaffold (e.g. scaffold10671) name
    ## chromStart: Start coordinate on the chromosome or scaffold for the sequence considered (the first base on the chromosome is numbered 0)
    ## chromEnd:   End coordinate on the chromosome or scaffold for the sequence considered. This position is non-inclusive, unlike chromStart.
    ## group_size: total number of exons of the transcription
    ## index: index number of exons in the transcription
    ## name: Name of the line in the BED file
    ## score: Score between 0 and 1000
    ## strand: DNA strand orientation (positive ["+"] or negative ["-"] or "." if no strand)
    return [chm.replace("chr",""), start, start_ext, end, end_ext, group_size, index, name, score, strand]

def assemble_target_region_by(gene_df):
    # chose the group which has max number of exons
    exon_trans_id  = gene_df.groupby("transcript_id").size().idxmax()
    transcript_size= gene_df.groupby("transcript_id").size().max()
    gene_exons_df  = gene_df[gene_df.transcript_id == exon_trans_id]
    
    # name: "gene_name:exons group size:exon number"
    bed_df = pd.DataFrame([assemble_into_bed_format(row.seq_id,row.start,row.start,row.end,row.end,transcript_size,int(row.exon_number),
                            "%s:%d:%d"%(row.gene_name,transcript_size,int(row.exon_number)),row.score,row.strand) 
                                for ind,row in gene_exons_df.iterrows()],
                          columns = ["Chromosome", "chromStart", "chromStart_ext", "chromEnd", "chromEnd_ext", 
                                        "transcript_size", "exon_number", "gene_name", "score", "strand"])
    
    bed_df["chromStart_ext"] = bed_df.apply(ext_chromStart_splicing_sites, axis = 1)
    bed_df["chromEnd_ext"]   = bed_df.apply(ext_chromEnd_splicing_sites,   axis = 1)
    
    return bed_df 


with open("genes.bed", "w") as fd:
    cbase_gene_df = pd.read_table("gene_list.txt", header = None, names = ["xxx", "gene"])
    for gene_name in cbase_gene_df.gene:
        gene_df = gencode_df[gencode_df.gene_name == gene_name]
        ## Naive: taking only identified targets
        if len(gene_df) > 0:
            bed_df = assemble_target_region_by(gene_df)
            write_bed_dataframe(fd, bed_df)

In [36]:
import os

os.system('bedtools getfasta -fi /run/user/1000/gvfs/sftp:host=flamingo.intra.igr.fr,user=j_wang/mnt/beegfs/userdata/a_ivashkin/references/genome_data/gatk/human_g1k_v37.fasta -bed genes.bed -name -tab -fo gencode_codons.tsv')



0

In [38]:
codons_df = pd.read_csv("gencode_codons.tsv", header=None, names=["gene_code","exons_seq"], sep='\t')

In [39]:
codons_df["gene_name"]       = codons_df.gene_code.apply(lambda x: x.split(":")[0])
codons_df["exon_group_size"] = codons_df.gene_code.apply(lambda x: x.split(":")[1])
codons_df["exon_group_id"]   = codons_df.gene_code.apply(lambda x: x.split(":")[2])

In [41]:
codons_df.head()

Unnamed: 0,gene_code,exons_seq,gene_name,exon_group_size,exon_group_id
0,A3GALT2:5:1::1:33786675-33786702,CCTGAGTCCCTCCTTGAGAGCCATATG,A3GALT2,5,1
1,A3GALT2:5:2::1:33778404-33778494,TACCTGAATTTAGGGAGGCCATACAGAAACAGGCCTAAGAGGCCAA...,A3GALT2,5,2
2,A3GALT2:5:3::1:33778098-33778194,TACCAGGGACGCAGGGCACCTGTGAAGTTGTCTCTCAGCTGGGACA...,A3GALT2,5,3
3,A3GALT2:5:4::1:33777649-33777793,TACCTGCCTACAGCAAAGATAGTCAGCCCAATGGTGAGGTTCTGCT...,A3GALT2,5,4
4,A3GALT2:5:5::1:33772363-33773055,GCGCTAGTTCCGCAGCAGCCGGTACCCCTTGGGCGCCCACAGCAGT...,A3GALT2,5,5


In [56]:
## problem: splicing sites may be located in a codon. 
## solution: store splicing sites and exons seperately 

def get_splicing_sites_context(gene_group): 
    
    splicing_sites = []
    
    for ind, row in gene_group.sort_values(['exon_group_id'],ascending=True).iterrows(): 
        
        if row.exon_group_id == 1:
            # splicing site of first exon 
            splicing_site_context_1 = triplets.index(row.exons_seq[-4:-1])
            splicing_site_context_2 = triplets.index(row.exons_seq[-3:])
            splicing_sites.append([splicing_site_context_1, splicing_site_context_2])
            
        elif row.exon_group_id == row.exon_group_size:
            # splicing site of last exon 
            splicing_site_context_1 = triplets.index(row.exons_seq[0:3])
            splicing_site_context_2 = triplets.index(row.exons_seq[1:4])
            splicing_sites.append([splicing_site_context_1, splicing_site_context_2])
        
        else:
            # splicing site on the left of exon
            splicing_site_context_1 = triplets.index(row.exons_seq[0:3])
            splicing_site_context_2 = triplets.index(row.exons_seq[1:4])
            splicing_sites.append([splicing_site_context_1, splicing_site_context_2])
            
            # splicing site on the right of exon
            splicing_site_context_1 = triplets.index(row.exons_seq[-4:-1])
            splicing_site_context_2 = triplets.index(row.exons_seq[-3:])
            splicing_sites.append([splicing_site_context_1, splicing_site_context_2])
            
    return splicing_sites
    
def get_exons_context(gene_group): 
    
    exons = []
    
    for ind, row in gene_group.sort_values(['exon_group_id'], ascending=True).iterrows():
        
        print("gene_name: [%s], index: [%d], length: [%d] %s"%(row.gene_name, int(row.exon_group_id), len(row.exons_seq), row.exons_seq))
        
        if row.exon_group_id == 1 :
            # first exon
            exon_seq = row.exons_seq[:-2]
            exons = exons + [ triplets.index(exon_seq[i:i+3]) for i in range(len(exon_seq)-2) ]
            
        elif row.exon_group_id == row.exon_group_size :
            # last exon 
            exon_seq = row.exons_seq[2:]
            exons = exons + [ triplets.index(exon_seq[i:i+3]) for i in range(len(exon_seq)-2) ]
            
        else:
            exon_seq = row.exons_seq[2:-2]
            exons = exons + [ triplets.index(exon_seq[i:i+3]) for i in range(len(exon_seq)-2) ]
    
    if (len(exons)) % 3 != 0:
        # raise ValueError("%s has only %d nucleotide!"%(exons[1:-1], len(exons)))
        raise ValueError("it has only %d context !"%(len(exons)))
        
    exons = [[triplets.index(exons[i:i+3]), triplets.index(exons[i+1:i+4]), triplets.index(exons[i+2:i+5])]
                for i in range(0,len(exons_seq),3)]
    
    return exons

with open("gencode_codons_by_gene.txt", "w") as fd:
    
    for gene_name, gene_group in codons_df.groupby("gene_name"):
        splicing_sites = get_splicing_sites_context(gene_group)
        exons          = get_exons_context(gene_group)
                
        break
        
#         fd.write("gene\t%s\n"%gene_name)
        
#         for ss in splicing_sites:
#             fd.write("%d\t%d\n"%(ss[0],ss[1]))
            
#         for ex in exons:
#             fd.write("%d\t%d\t%d\n"%(ex[0],ex[1],ex[2]))
            

gene_name: [A1BG], index: [1], length: [93] CCCCACAGCAAGAGAAAGACCACGAGCATGGACATGATGGTCGCGCTCACTCCGGTGCAGTGAGTGTCTGGGGTGAGCGTCTGCAGCAATGAG
gene_name: [A1BG], index: [2], length: [42] CACATATGGCTGCTTCTGTCACTGGGCCCCAGGTGACACCTG
gene_name: [A1BG], index: [3], length: [276] CACTTGGCCCTGTCAGCTCCAGGAGCTTGCTCAGCTGGGTCCATCCTGTGGACAAGCCCGAGCGGCAGCGGTAGCGGCCCTGGGTGTCACCCGTCAGCAGGAACTGGTGCTTGATGGCAGGTGAGTCAAGGTGCACAGGCTCCTGGGCCACCCCATTCTTGAACAGCTGGAAGTCTGGAGTCTCCAGGTGGGCCTGGCACGTCAGCGTCACATTGGCCAAGGGTTTCAGCAGTGATTCGGACTCTGCCCACAGGCTGGGCTGCGTCTCATAAACTG
gene_name: [A1BG], index: [4], length: [279] CACCGAGCTCCTCAATGGTCACAGTAGCGCTGGGCTCAGAGAGGGCGCCTTCCCCATCGGTCCGGTAGCTGCAGCTGTAGTTGCCAGGCTGATGGACTGGAAAGGTGGCCTCCACATCCTCCTGGGCCTCAGGCACCTCCAGAAACTCATGGTCGCCCTCCCGCCTCAGCAGAAAAGTCACACCCCGCAGCACACCTCGGCACACTGCTGTTGTTTTCAGGCCGGGGGTGATCCAGGACACTGGCGCCATCGAGAGCCAGGGAGCAGGCAAGGACTCTG
gene_name: [A1BG], index: [5], length: [303] CACCATCGCTCAGAATCAGCTCGACCGGCGCGCTGTCCCCGGACCAGCCGTTTTGGTTGTCATGCAGCCGGTAGCGGCAGGTG

ValueError: it has only 3380 context !

In [26]:
gencode_df[gencode_df.gene_name=="A3GALT2"]

Unnamed: 0,seq_id,source,type,start,end,score,strand,phase,attributes,gene_name,exon_number,exon_id,transcript_id
68478,chr1,HAVANA,exon,33786677,33786699,.,-,.,ID=exon:ENST00000442999.3:1;Parent=ENST0000044...,A3GALT2,1,exon:ENST00000442999.3:1,ENST00000442999.3
68481,chr1,HAVANA,exon,33778408,33778491,.,-,.,ID=exon:ENST00000442999.3:2;Parent=ENST0000044...,A3GALT2,2,exon:ENST00000442999.3:2,ENST00000442999.3
68483,chr1,HAVANA,exon,33778102,33778191,.,-,.,ID=exon:ENST00000442999.3:3;Parent=ENST0000044...,A3GALT2,3,exon:ENST00000442999.3:3,ENST00000442999.3
68485,chr1,HAVANA,exon,33777653,33777790,.,-,.,ID=exon:ENST00000442999.3:4;Parent=ENST0000044...,A3GALT2,4,exon:ENST00000442999.3:4,ENST00000442999.3
68487,chr1,HAVANA,exon,33772367,33773054,.,-,.,ID=exon:ENST00000442999.3:5;Parent=ENST0000044...,A3GALT2,5,exon:ENST00000442999.3:5,ENST00000442999.3
