In [1]:
import Bio
import BCBio
import numpy as np
import pandas as pd
import gffpandas.gffpandas as gffpd

In [2]:
## GENCODE release 44, comprehensive gene annotation, GRCh37 mapping ("v44lift37").
## GTF:  https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/GRCh37_mapping/gencode.v44lift37.annotation.gtf.gz
## GFF3: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/GRCh37_mapping/gencode.v44lift37.annotation.gff3.gz
## NOTE: the path read below has no ".gz" suffix, unlike the download URLs above.

# Parse the GFF3 annotation with gffpandas and keep only its DataFrame (the `.df` attribute).
gencode_gffdf = gffpd.read_gff3("gencode.v44lift37.annotation.gff3").df
# Preview the first rows.
gencode_gffdf.head()

## GRCh37, ensembl human, release-110
## https://ftp.ensembl.org/pub/release-110/gff3/homo_sapiens/
## https://ftp.ensembl.org/pub/release-110/gtf/homo_sapiens/

# with open("Homo_sapiens.GRCh37.87.gtf") as fd:
#     for rec in BCBio.GFF.parse(fd):
#         print(rec)

Unnamed: 0,seq_id,source,type,start,end,score,strand,phase,attributes
0,chr1,HAVANA,gene,12010,13670,.,+,.,ID=ENSG00000223972.6;gene_id=ENSG00000223972.6...
1,chr1,HAVANA,transcript,12010,13670,.,+,.,ID=ENST00000450305.2;Parent=ENSG00000223972.6;...
2,chr1,HAVANA,exon,12010,12057,.,+,.,ID=exon:ENST00000450305.2:1;Parent=ENST0000045...
3,chr1,HAVANA,exon,12179,12227,.,+,.,ID=exon:ENST00000450305.2:2;Parent=ENST0000045...
4,chr1,HAVANA,exon,12613,12697,.,+,.,ID=exon:ENST00000450305.2:3;Parent=ENST0000045...


In [3]:
## HGNC (HUGO Gene Nomenclature Committee) complete gene set, tab-separated text.
## Archive page: https://www.genenames.org/download/archive/#!/#tocAnchor-1-2
## Direct file:  https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt
hgnc_df = pd.read_csv(
    "hgnc_complete_set.txt",
    sep="\t",
    header=0,
    # Do not apply pandas' default NaN markers when parsing this file.
    keep_default_na=False,
)

# gene_alias_df[["hgnc_id", "symbol","name","alias_symbol","alias_name","prev_symbol","prev_name"]][gene_alias_df.hgnc_id == "HGNC:33584"]

In [4]:
# Identifier, symbol and naming columns of the HGNC table, all cast to str.
alias_columns = [
    "hgnc_id",
    "symbol",
    "name",
    "alias_symbol",
    "alias_name",
    "prev_symbol",
    "prev_name",
]
gene_alias_df = hgnc_df.loc[:, alias_columns].astype(str)
gene_alias_df.head()

Unnamed: 0,hgnc_id,symbol,name,alias_symbol,alias_name,prev_symbol,prev_name
0,HGNC:5,A1BG,alpha-1-B glycoprotein,,,,
1,HGNC:37133,A1BG-AS1,A1BG antisense RNA 1,FLJ23569,,NCRNA00181|A1BGAS|A1BG-AS,non-protein coding RNA 181|A1BG antisense RNA ...
2,HGNC:24086,A1CF,APOBEC1 complementation factor,ACF|ASP|ACF64|ACF65|APOBEC1CF,,,
3,HGNC:7,A2M,alpha-2-macroglobulin,FWP007|S863-7|CPAMD5,,,
4,HGNC:27057,A2M-AS1,A2M antisense RNA 1,,,,A2M antisense RNA 1 (non-protein coding)|A2M a...


In [5]:
# Tab-separated file without a header row: the first column is named "xxx",
# the second column ("gene") holds the requested gene names.
gene_list = pd.read_csv(
    "gene_list.txt",
    sep="\t",
    header=None,
    names=["xxx", "gene"],
)
gene_list.head()

Unnamed: 0,xxx,gene
0,gene,A3GALT2
1,gene,AADACL3
2,gene,AADACL4
3,gene,ABCA4
4,gene,ABCB10


In [6]:
# Keep only the exon records. The explicit .copy() makes the result an
# independent frame, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning (which fires when columns are assigned on a
# boolean-mask slice of another DataFrame).
gencode_gffdf = gencode_gffdf[gencode_gffdf.type == "exon"].copy()

In [7]:
def get_value(x, key):
    """Return the value stored under ``key`` in a ``;``-separated attribute string.

    Parameters
    ----------
    x : str
        Attribute string made of ``tag=value`` pairs separated by ``;``
        (for example ``"ID=exon:ENST00000450305.2:1;gene_name=DDX11L1"``).
    key : str
        Tag whose value is wanted.

    Returns
    -------
    str or None
        The value of the first pair whose tag equals ``key``; ``None`` when
        no pair carries that tag.
    """
    for pair in x.split(";"):
        # partition() cuts at the FIRST "=" only, so values that themselves
        # contain "=" stay intact, and segments without any "=" (an empty
        # segment after a trailing ";", or a bare flag) are skipped instead
        # of raising ValueError on tuple unpacking.
        k, sep, v = pair.partition("=")
        if sep and k == key:
            return v
    return None

In [8]:
# Pull selected tags out of the raw attribute strings into their own columns.
attribute_strings = gencode_gffdf["attributes"]
gencode_gffdf["gene_name"] = attribute_strings.apply(get_value, key="gene_name")
gencode_gffdf["exon_number"] = attribute_strings.apply(get_value, key="exon_number")
gencode_gffdf["exon_id"] = attribute_strings.apply(get_value, key="ID")

# The exon ID looks like "exon:<group id>:<n>"; the middle field is used as
# the key that groups exons together.
gencode_gffdf["exon_group_id"] = [
    exon_id.split(":")[1] for exon_id in gencode_gffdf["exon_id"]
]
gencode_gffdf.head()

Unnamed: 0,seq_id,source,type,start,end,score,strand,phase,attributes,gene_name,exon_number,exon_id,exon_group_id
2,chr1,HAVANA,exon,12010,12057,.,+,.,ID=exon:ENST00000450305.2:1;Parent=ENST0000045...,DDX11L1,1,exon:ENST00000450305.2:1,ENST00000450305.2
3,chr1,HAVANA,exon,12179,12227,.,+,.,ID=exon:ENST00000450305.2:2;Parent=ENST0000045...,DDX11L1,2,exon:ENST00000450305.2:2,ENST00000450305.2
4,chr1,HAVANA,exon,12613,12697,.,+,.,ID=exon:ENST00000450305.2:3;Parent=ENST0000045...,DDX11L1,3,exon:ENST00000450305.2:3,ENST00000450305.2
5,chr1,HAVANA,exon,12975,13052,.,+,.,ID=exon:ENST00000450305.2:4;Parent=ENST0000045...,DDX11L1,4,exon:ENST00000450305.2:4,ENST00000450305.2
6,chr1,HAVANA,exon,13221,13374,.,+,.,ID=exon:ENST00000450305.2:5;Parent=ENST0000045...,DDX11L1,5,exon:ENST00000450305.2:5,ENST00000450305.2


In [9]:
import warnings

# ["hgnc_id", "symbol","name","alias_symbol","alias_name","prev_symbol","prev_name"]

def is_symbol(gene):
    """Tell whether ``gene`` occurs in the ``symbol`` column of ``gene_alias_df``.

    Parameters
    ----------
    gene : str
        Gene name to look up (exact, case-sensitive comparison).

    Returns
    -------
    bool
        ``True`` when at least one row has ``symbol == gene``, else ``False``.
    """
    exact_matches = gene_alias_df.symbol == gene
    return bool(exact_matches.any())

def convert2symbol(gene):
    """Map an alias / previous / descriptive gene name to its HGNC symbol.

    The columns of ``gene_alias_df`` are searched in a fixed priority order
    (``alias_symbol``, ``prev_symbol``, ``name``, ``alias_name``,
    ``prev_name``). Each cell is treated as a ``|``-separated list and
    ``gene`` must equal one of its entries exactly.

    Parameters
    ----------
    gene : str
        Name to translate.

    Returns
    -------
    str or None
        The ``symbol`` of the first matching row in the first column that
        yields a match; ``None`` (after emitting a warning) when nothing
        matches.
    """
    for symbol in ["alias_symbol", "prev_symbol", "name", "alias_name", "prev_name"]:
        rec = gene_alias_df[gene_alias_df[symbol].apply(lambda x: gene in x.split("|"))]
        if len(rec) > 0:
            return rec.symbol.iloc[0]

    # UserWarning instead of ResourceWarning: Python's default warning filters
    # ignore ResourceWarning, so the notice would never be displayed.
    warnings.warn("Symbol %s is not matched in HGNC!" % gene, UserWarning)
    return None

In [10]:
## correct gencode gene name to standard symbol
# symbol_convert_dict = {}
# for gene_name in set(gencode_gffdf.gene_name):
#     if not is_symbol(gene_name):
#         symbol_convert_dict[gene_name] = convert2symbol(gene_name)      

In [11]:
# print(symbol_convert_dict)
# gencode_gffdf[gencode_gffdf.gene_name == "ENSG00000283839"] #.attributes.iloc[0]

In [12]:
def assemble_into_bed_format(chm, start, end, name, score, strand):
    """Build one BED-style record as a six-element list.

    BED column meaning, for reference:
      * Chromosome: chromosome (e.g. chr3, chrY, chr2_random) or scaffold
        (e.g. scaffold10671) name
      * chromStart: start coordinate of the feature (the first base on the
        chromosome is numbered 0)
      * chromEnd:   end coordinate of the feature; non-inclusive, unlike
        chromStart
      * name:       name of the line in the BED file
      * score:      score between 0 and 1000
      * strand:     "+", "-" or "." when there is no strand

    This function does not shift coordinates: ``start`` and ``end`` are only
    cast to ``int``. Every occurrence of the substring "chr" is removed from
    ``chm``.

    Returns
    -------
    list
        ``[chromosome, chromStart, chromEnd, name, score, strand]``
    """
    chromosome = chm.replace("chr", "")
    chrom_start = int(start)
    chrom_end = int(end)
    return [chromosome, chrom_start, chrom_end, name, score, strand]

def assemble_target_region_by(gene_name):
    """Collect the exons of one gene from ``gencode_gffdf`` as BED records.

    Among all exon groups (``exon_group_id``) recorded for the gene, the one
    with the most exons is chosen (on a tie, the first one reported by
    ``idxmax``), and each of its exons becomes one BED row.

    Parameters
    ----------
    gene_name : str or None
        Value to match exactly against the ``gene_name`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Chromosome, chromStart, chromEnd, gene_name, score,
        strand``. The ``gene_name`` column holds the label
        ``"<gene name>:<exon group size>:<exon number>"``. The frame is empty
        when no exon matches ``gene_name``.
    """
    bed_columns = ["Chromosome", "chromStart", "chromEnd", "gene_name", "score", "strand"]

    gene_exons_df = gencode_gffdf[gencode_gffdf.gene_name == gene_name]
    if len(gene_exons_df) == 0:
        return pd.DataFrame([], columns=bed_columns)

    # Choose the group which has the max number of exons (group once, reuse).
    group_sizes = gene_exons_df.groupby("exon_group_id").size()
    exon_group_id = group_sizes.idxmax()
    exon_group_size = group_sizes.max()
    gene_exons_df = gene_exons_df[gene_exons_df.exon_group_id == exon_group_id]

    # GFF3 coordinates are 1-based and end-inclusive, while BED is 0-based and
    # end-exclusive: the start is shifted down by one and the end is kept.
    bed_rows = [
        assemble_into_bed_format(
            row.seq_id,
            int(row.start) - 1,
            row.end,
            # name: "gene_name:exon group size:exon number"
            "%s:%d:%d" % (row.gene_name, exon_group_size, int(row.exon_number)),
            row.score,
            row.strand,
        )
        for row in gene_exons_df.itertuples(index=False)
    ]
    return pd.DataFrame(bed_rows, columns=bed_columns)
    
def write_bed_dataframe(fd, bed_df):
    """Write every row of ``bed_df`` to ``fd`` as one tab-separated BED line.

    Parameters
    ----------
    fd : file-like
        Object with a ``write(str)`` method.
    bed_df : pandas.DataFrame
        Needs the columns ``Chromosome, chromStart, chromEnd, gene_name,
        score, strand``; ``chromStart`` and ``chromEnd`` are formatted as
        integers.
    """
    line_template = "%s\t%d\t%d\t%s\t%s\t%s\n"
    for _, record in bed_df.iterrows():
        fields = (
            record.Chromosome,
            record.chromStart,
            record.chromEnd,
            record.gene_name,
            record.score,
            record.strand,
        )
        fd.write(line_template % fields)
    

In [13]:
import warnings

# Requested genes for which no exon could be found, in input order.
gene_missmatch_list = []

with open("genes.bed", "w") as fd:
    for gene_name in gene_list.gene:
        # First attempt: look the requested name up directly.
        bed_df = assemble_target_region_by(gene_name)

        # Second attempt, only when nothing was found and the name is not
        # itself a standard symbol: translate it to the standard symbol and
        # look that up instead.
        if len(bed_df) == 0 and not is_symbol(gene_name):
            symbol = convert2symbol(gene_name)
            bed_df = assemble_target_region_by(symbol)

        if len(bed_df) > 0:
            write_bed_dataframe(fd, bed_df)
        else:
            # The gene name used in GENCODE is not the standard symbol.
            # TODO: match all the possible symbols
            # TODO: raise warning and correct HGNC (HUGO Gene Nomenclature Committee)
            # warnings.warn("gene name [%s] is mismatched in HGNC!\n"%gene_name)
            gene_missmatch_list.append(gene_name)
        

In [19]:
# One row per requested gene that produced no BED record.
mismatch_df = pd.DataFrame({"mismatch": gene_missmatch_list})

In [26]:
# Flag which of the unmatched names are nevertheless present as an HGNC symbol.
mismatch_df["is_symbol"] = mismatch_df["mismatch"].map(is_symbol)

In [47]:
# Display the unmatched genes. The commented-out suffix is an optional filter
# that keeps only names whose first two characters are "AK".
mismatch_df # [mismatch_df.mismatch.apply(lambda x: x[:2] == "AK")]

Unnamed: 0,mismatch,is_symbol
0,AK022898,False
1,AK095633,False
2,AK097814,False
3,AK125437,False
4,AK127270,False
...,...,...
604,LOC100129520,False
605,LOC100132831,False
606,LOC100133957,False
607,U00684,False


In [45]:
# Spot check: all exon rows recorded for gene "UXT-AS1".
gencode_gffdf.loc[gencode_gffdf["gene_name"] == "UXT-AS1"]

Unnamed: 0,seq_id,source,type,start,end,score,strand,phase,attributes,gene_name,exon_number,exon_id,exon_group_id
3328939,chrX,HAVANA,exon,47518232,47518579,.,+,.,ID=exon:ENST00000591832.1:1;Parent=ENST0000059...,UXT-AS1,1,exon:ENST00000591832.1:1,ENST00000591832.1
3328940,chrX,HAVANA,exon,47519294,47519510,.,+,.,ID=exon:ENST00000591832.1:2;Parent=ENST0000059...,UXT-AS1,2,exon:ENST00000591832.1:2,ENST00000591832.1
3328942,chrX,HAVANA,exon,47518252,47519776,.,+,.,ID=exon:ENST00000590504.1:1;Parent=ENST0000059...,UXT-AS1,1,exon:ENST00000590504.1:1,ENST00000590504.1
3328944,chrX,HAVANA,exon,47518490,47518579,.,+,.,ID=exon:ENST00000664698.1:1;Parent=ENST0000066...,UXT-AS1,1,exon:ENST00000664698.1:1,ENST00000664698.1
3328945,chrX,HAVANA,exon,47531903,47531992,.,+,.,ID=exon:ENST00000664698.1:2;Parent=ENST0000066...,UXT-AS1,2,exon:ENST00000664698.1:2,ENST00000664698.1
3328946,chrX,HAVANA,exon,47548461,47551725,.,+,.,ID=exon:ENST00000664698.1:3;Parent=ENST0000066...,UXT-AS1,3,exon:ENST00000664698.1:3,ENST00000664698.1
3328948,chrX,HAVANA,exon,47518634,47518800,.,+,.,ID=exon:ENST00000670339.1:1;Parent=ENST0000067...,UXT-AS1,1,exon:ENST00000670339.1:1,ENST00000670339.1
3328949,chrX,HAVANA,exon,47519294,47521298,.,+,.,ID=exon:ENST00000670339.1:2;Parent=ENST0000067...,UXT-AS1,2,exon:ENST00000670339.1:2,ENST00000670339.1
3328951,chrX,HAVANA,exon,47531939,47531992,.,+,.,ID=exon:ENST00000658400.1:1;Parent=ENST0000065...,UXT-AS1,1,exon:ENST00000658400.1:1,ENST00000658400.1
3328952,chrX,HAVANA,exon,47549074,47551705,.,+,.,ID=exon:ENST00000658400.1:2;Parent=ENST0000065...,UXT-AS1,2,exon:ENST00000658400.1:2,ENST00000658400.1


In [39]:
# Spot check: the full HGNC record whose symbol is "LEMD1-DT".
hgnc_df.loc[hgnc_df["symbol"] == "LEMD1-DT"]

Unnamed: 0,hgnc_id,symbol,name,locus_group,locus_type,status,location,location_sortable,alias_symbol,alias_name,...,cd,lncrnadb,enzyme_id,intermediate_filament_db,rna_central_ids,lncipedia,gtrnadb,agr,mane_select,gencc
14797,HGNC:27631,LEMD1-DT,LEMD1 divergent transcript,non-coding RNA,"RNA, long non-coding",Approved,1q32.1,01q32.1,,,...,,,,,,,,HGNC:27631,,
