## gencode.v39.annotation.gff3
gff["attributes"]に様々な情報が格納されている。各情報は";"で区切られているので`split(";")`で情報を抽出できる。

In [2]:
import pandas as pd
import gffpandas.gffpandas as gffpd

In [3]:
# Load the GENCODE v39 annotation (GFF3). The file was downloaded from
# https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_39/gencode.v39.annotation.gff3.gz
annotation = gffpd.read_gff3("gencode.v39.annotation.gff3")

# The parsed records are exposed as a DataFrame on the ``df`` attribute.
gff = annotation.df

# Report how many annotation records were read.
record_count = len(gff)
print(record_count)

# Last expression of the cell: preview the first rows.
gff.head()

3238846


Unnamed: 0,seq_id,source,type,start,end,score,strand,phase,attributes
0,chr1,HAVANA,gene,11869,14409,.,+,.,ID=ENSG00000223972.5;gene_id=ENSG00000223972.5...
1,chr1,HAVANA,transcript,11869,14409,.,+,.,ID=ENST00000456328.2;Parent=ENSG00000223972.5;...
2,chr1,HAVANA,exon,11869,12227,.,+,.,ID=exon:ENST00000456328.2:1;Parent=ENST0000045...
3,chr1,HAVANA,exon,12613,12721,.,+,.,ID=exon:ENST00000456328.2:2;Parent=ENST0000045...
4,chr1,HAVANA,exon,13221,14409,.,+,.,ID=exon:ENST00000456328.2:3;Parent=ENST0000045...


タンパク質をコードする遺伝子（`gene_type="protein_coding"`）については、GFFファイル中に3'UTRの情報(`type="three_prime_UTR"`)が含まれているため、その情報を抽出すればOK。一方、lncRNA（`gene_type="lncRNA"`）については、GFFファイル中に3'UTRの情報が含まれていないため、代わりに遺伝子全長の情報（`type="gene"`）を抽出する。

必要な情報は`gene_id`, `gene_name`, `gene_type` の3種類。これらの情報を抽出する関数を作成する。

In [4]:
def get_attributes(df):
    """Return ``gene_name|gene_id|gene_type`` parsed from a GFF3 attributes string.

    Args:
        df: The attributes field of one GFF3 record: ``key=value`` pairs
            separated by ``;``. Despite the parameter name this is a single
            string, not a DataFrame.

    Returns:
        The three values joined with ``|`` in the order name, id, type.

    Raises:
        IndexError: If any of ``gene_name``, ``gene_id`` or ``gene_type`` is
            absent from the string.
    """
    # Parse once into a key -> value mapping. Keys are compared exactly, so a
    # longer key that merely ends in e.g. "gene_id" cannot be mistaken for the
    # wanted one, which a plain substring test would allow.
    parsed = {}
    for field in df.split(";"):
        key, sep, value = field.partition("=")
        if sep:
            # Keep the first occurrence if a key is repeated.
            parsed.setdefault(key.strip(), value)

    wanted = ("gene_name", "gene_id", "gene_type")
    missing = [key for key in wanted if key not in parsed]
    if missing:
        # IndexError is kept so the failure type matches the earlier
        # list-indexing behaviour; the message now says what is missing.
        raise IndexError(f"missing attribute(s) {missing} in {df!r}")
    return "|".join(parsed[key] for key in wanted)

def get_gene_type(df):
    """Return the ``gene_type`` value from a GFF3 attributes string.

    Args:
        df: The attributes field of one GFF3 record: ``key=value`` pairs
            separated by ``;``. Despite the parameter name this is a single
            string, not a DataFrame.

    Returns:
        The value of the first ``gene_type`` attribute.

    Raises:
        IndexError: If the string has no ``gene_type`` attribute.
    """
    for field in df.split(";"):
        key, sep, value = field.partition("=")
        # Compare the key exactly: a substring test would also accept any
        # longer key that merely ends in "gene_type".
        if sep and key.strip() == "gene_type":
            return value
    # IndexError is kept so the failure type matches the earlier
    # list-indexing behaviour; the message now says what is missing.
    raise IndexError(f"missing attribute 'gene_type' in {df!r}")

In [5]:
# Derive two helper columns from the raw attribute strings:
#   "gene"      -> "gene_name|gene_id|gene_type" label
#   "gene_type" -> the gene_type value on its own, used for filtering
attribute_strings = gff["attributes"]
gff["gene"] = attribute_strings.map(get_attributes)
gff["gene_type"] = attribute_strings.map(get_gene_type)

In [6]:
# Keep 3'UTR records of protein-coding genes and whole-gene records of lncRNAs.
is_coding_utr3 = (gff["gene_type"] == "protein_coding") & (gff["type"] == "three_prime_UTR")
is_lncrna_gene = (gff["gene_type"] == "lncRNA") & (gff["type"] == "gene")

gff = gff[is_coding_utr3 | is_lncrna_gene]

In [8]:
# Restrict to protein-coding records. NOTE(review): this also drops the lncRNA
# gene rows kept by the preceding filter; confirm that is intended.
is_protein_coding = gff["gene_type"] == "protein_coding"
gff = gff[is_protein_coding]

# Last expression of the cell: preview the first rows.
gff.head()

Unnamed: 0,seq_id,source,type,start,end,score,strand,phase,attributes,gene,gene_type
68,chr1,HAVANA,three_prime_UTR,70009,71585,.,+,.,ID=UTR3:ENST00000641515.2;Parent=ENST000006415...,OR4F5|ENSG00000186092.7|protein_coding,protein_coding
971,chr1,HAVANA,three_prime_UTR,944154,944574,.,+,.,ID=UTR3:ENST00000616016.5;Parent=ENST000006160...,SAMD11|ENSG00000187634.13|protein_coding,protein_coding
1004,chr1,HAVANA,three_prime_UTR,944154,944574,.,+,.,ID=UTR3:ENST00000618323.5;Parent=ENST000006183...,SAMD11|ENSG00000187634.13|protein_coding,protein_coding
1050,chr1,HAVANA,three_prime_UTR,944154,944574,.,+,.,ID=UTR3:ENST00000342066.8;Parent=ENST000003420...,SAMD11|ENSG00000187634.13|protein_coding,protein_coding
1205,chr1,HAVANA,three_prime_UTR,944154,944575,.,+,.,ID=UTR3:ENST00000341065.8;Parent=ENST000003410...,SAMD11|ENSG00000187634.13|protein_coding,protein_coding


In [10]:
# Keep only the six columns used for the BED-style table, drop rows that are
# exact duplicates, renumber the index, and then overwrite the score with 0.
bed_like_columns = ["seq_id", "start", "end", "gene", "score", "strand"]
gff = (
    gff.loc[:, bed_like_columns]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(score=0)
)

# Last expression of the cell: preview the first rows.
gff.head()

Unnamed: 0,seq_id,start,end,gene,score,strand
0,chr1,70009,71585,OR4F5|ENSG00000186092.7|protein_coding,0,+
1,chr1,944154,944574,SAMD11|ENSG00000187634.13|protein_coding,0,+
2,chr1,944154,944575,SAMD11|ENSG00000187634.13|protein_coding,0,+
3,chr1,944154,944259,SAMD11|ENSG00000187634.13|protein_coding,0,+
4,chr1,944203,944693,NOC2L|ENSG00000188976.11|protein_coding,0,-


各遺伝子毎に、かぶっている領域をマージして1つの区間（`start`の最小値〜`end`の最大値）にまとめる

In [11]:
# Collapse the rows of each gene into a single row spanning from the smallest
# start to the largest end, taking chromosome and strand from the gene's first
# row.
#
# This is one grouped aggregation instead of a per-gene loop. The loop
# re-scanned the whole frame for every gene and grew the result with repeated
# pd.concat calls starting from an empty frame, which is quadratic and relies
# on concat behaviour that newer pandas deprecates.
gene_list = gff["gene"].unique()  # kept: genes in order of first appearance

output_df = (
    # sort=False keeps the groups in order of first appearance, i.e. the same
    # row order the loop over gene_list produced.
    gff.groupby("gene", sort=False)
    .agg(
        chr=("seq_id", "first"),
        start=("start", "min"),
        end=("end", "max"),
        strand=("strand", "first"),
    )
    .reset_index()
)
output_df["score"] = 0

# Same column layout as before: chr, start, end, gene, score, strand.
output_df = output_df[["chr", "start", "end", "gene", "score", "strand"]]

In [12]:
# Write the table as a headerless, tab-separated BED file.
#
# The coordinates come from GFF3, which is 1-based with inclusive ends, while
# BED is 0-based and half-open. Converting means subtracting 1 from the start;
# the end value is numerically unchanged. Writing the GFF3 start as-is would
# drop the first base of every interval.
#
# The conversion is applied to a copy so output_df keeps its GFF3 coordinates.
# The file name (including its spelling) is left unchanged so anything that
# already reads this path keeps working.
bed_df = output_df.copy()
bed_df["start"] = bed_df["start"] - 1
bed_df.to_csv("gencode.v39.costomized.bed", sep="\t", index=False, header=False)