# GTF

## API

In [1]:
from annokit.gtf import GTF

In [2]:
# Create a GTF object with no arguments; an annotation file is loaded later via gtf.read().
gtf = GTF()

In [3]:
gtf.name

In [4]:
gtf.version

In [5]:
gtf.URL

In [6]:
# Display the anno_map of the freshly constructed GTF object (before any file is read).
# Note that the 'other' key initially maps to the literal value 'other'.
gtf.anno_map

{'gene': 'gene',
 'trans': 'transcript',
 'exon': 'exon',
 'CDS': 'CDS',
 'start_codon': 'start_codon',
 'stop_codon': 'stop_codon',
 'UTR5': 'five_prime_utr',
 'UTR3': 'three_prime_utr',
 'other': 'other'}

### gtf file read

#### raw anno_map

In [7]:
# Load the Ensembl release-111 human GTF using the default anno_map, passing
# a name, a version and the source URL alongside the file path.
gtf_file = "./test_data/Homo_sapiens.GRCh38.111.gtf"
gtf.read(
    gtf_file,
    name="hg38",
    version="GRCh38",
    URL="https://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.GRCh38.111.gtf.gz",
)



In [8]:
# Inspect the first entry of gtf.err after reading with the default anno_map.
# The entry shown is a raw GTF line whose feature type is 'Selenocysteine', which is
# not among the default anno_map values — presumably why it was collected (confirm in annokit).
gtf.err[0]

'1\thavana\tSelenocysteine\t25802093\t25802095\t.\t+\t.\tgene_id "ENSG00000162430"; gene_version "18"; transcript_id "ENST00000361547"; transcript_version "7"; gene_name "SELENON"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "SELENON-202"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41282"; tag "seleno"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; transcript_support_level "1 (assigned to previous version 6)";'

#### new anno_map

In [9]:
# Rebind gtf to a new GTF object before re-reading the file with a custom anno_map.
gtf = GTF()

In [10]:
# Re-read the same file, this time passing anno_map as a "key,value" string:
# afterwards gtf.anno_map['other'] is 'Selenocysteine' and gtf.err is empty.
gtf.read(
    gtf_file,
    name="hg38",
    version="GRCh38",
    URL="https://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.GRCh38.111.gtf.gz",
    anno_map="other,Selenocysteine",
)

In [11]:
# With 'other' mapped to 'Selenocysteine', the error list is now empty.
gtf.err

[]

In [12]:
gtf.anno_map

{'gene': 'gene',
 'trans': 'transcript',
 'exon': 'exon',
 'CDS': 'CDS',
 'start_codon': 'start_codon',
 'stop_codon': 'stop_codon',
 'UTR5': 'five_prime_utr',
 'UTR3': 'three_prime_utr',
 'other': 'Selenocysteine'}

In [13]:
gtf.name

'hg38'

In [14]:
gtf.version

'GRCh38'

In [15]:
gtf.URL

'https://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.GRCh38.111.gtf.gz'

### interval search

#### raw key

In [16]:
# Query region, presumably written as "chromosome:start:end"; the chromosome is given
# without a "chr" prefix, matching the first column of the Ensembl GTF ("1").
loc = "1:200000:300000"

In [17]:
# Search the region; the result is a list of Interval objects whose third field
# (available as .data) is a gene ID.
genes = gtf.searchs(loc)
genes

[Interval(257864, 359681, 'ENSG00000292994'),
 Interval(258568, 259024, 'ENSG00000228463'),
 Interval(266855, 268655, 'ENSG00000286448')]

In [18]:
genes[0].data

'ENSG00000292994'

#### chr key

In [19]:
# Same region with a "chr" prefix on the chromosome name; the search that follows
# returns the same intervals as the unprefixed query.
loc = "chr1:200000:300000"

In [20]:
genes = gtf.searchs(loc)
genes

[Interval(257864, 359681, 'ENSG00000292994'),
 Interval(258568, 259024, 'ENSG00000228463'),
 Interval(266855, 268655, 'ENSG00000286448')]

### name2id or id2name

In [25]:
# Map gene IDs to gene names (mapType="i2n"). A gene that has no gene name in the
# GTF file is mapped to its own gene ID by default (see ENSG00000228463 in the output).
gtf.maps("ENSG00000188886,ENSG00000228463", mapType="i2n")

{'ENSG00000188886': 'ASTL', 'ENSG00000228463': 'ENSG00000228463'}

In [26]:
# Same lookup, but the second ID (ENSG00010228463) is one that is not found in the GTF file.
dict_maps = gtf.maps("ENSG00000188886,ENSG00010228463", mapType="i2n")



In [27]:
# A queried gene that is not found in the GTF file is mapped to the string 'None'
# (a str, not the Python None object).
dict_maps

{'ENSG00000188886': 'ASTL', 'ENSG00010228463': 'None'}

In [28]:
# Reverse direction: map gene names to gene IDs (mapType="n2i").
gtf.maps("ASTL,IGKV2-10", mapType="n2i")

{'ASTL': 'ENSG00000188886', 'IGKV2-10': 'ENSG00000253278'}

In [29]:
# A gene name that is not found (here IGKV2-11) is likewise mapped to the string 'None'.
gtf.maps("ASTL,IGKV2-11", mapType="n2i")



{'ASTL': 'ENSG00000188886', 'IGKV2-11': 'None'}

### gene inquires

In [30]:
# Query by gene ID (itype="id") at gene level: the resulting table has one row per gene
# with its ID, name, chromosome, start, end and strand.
gtf.inquires("ENSG00000188886,ENSG00000253278", itype="id", ilevel="gene")

Unnamed: 0,geneid,genename,chr,start,end,strand
0,ENSG00000188886,ASTL,2,96122818,96138502,-
1,ENSG00000253278,IGKV2-10,2,89019992,89020686,-


In [31]:
# Transcript level: one row per transcript, with transid/transname/transstart/transend
# appended to the gene columns. An ID that is not found (ENSG00010228463) yields a
# placeholder row filled with '-' and 0.
gtf.inquires("ENSG00000188886,ENSG00010228463", itype="id", ilevel="trans")



Unnamed: 0,geneid,genename,chr,start,end,strand,transid,transname,transstart,transend
0,ENSG00000188886,ASTL,2,96122818,96138502,-,ENST00000342380,ASTL-201,96122818,96138502
1,ENSG00000188886,ASTL,2,96122818,96138502,-,ENST00000470582,ASTL-202,96133684,96137777
2,ENSG00010228463,-,-,0,0,-,-,-,0,0


In [33]:
# Exon level: one row per exon, adding exonid/exonstart/exonend to the
# transcript-level columns.
gtf.inquires("ENSG00000188886,ENSG00010228463", itype="id", ilevel="exon")



Unnamed: 0,geneid,genename,chr,start,end,strand,transid,transname,transstart,transend,exonid,exonstart,exonend
0,ENSG00000188886,ASTL,2,96122818,96138502,-,ENST00000342380,ASTL-201,96122818,96138502,ENSE00001404132,96138382,96138502
1,ENSG00000188886,ASTL,2,96122818,96138502,-,ENST00000342380,ASTL-201,96122818,96138502,ENSE00001384861,96137575,96137700
2,ENSG00000188886,ASTL,2,96122818,96138502,-,ENST00000342380,ASTL-201,96122818,96138502,ENSE00003563298,96135351,96135412
3,ENSG00000188886,ASTL,2,96122818,96138502,-,ENST00000342380,ASTL-201,96122818,96138502,ENSE00001374492,96133965,96134058
4,ENSG00000188886,ASTL,2,96122818,96138502,-,ENST00000342380,ASTL-201,96122818,96138502,ENSE00001382523,96133425,96133542
5,ENSG00000188886,ASTL,2,96122818,96138502,-,ENST00000342380,ASTL-201,96122818,96138502,ENSE00001391218,96132540,96132721
6,ENSG00000188886,ASTL,2,96122818,96138502,-,ENST00000342380,ASTL-201,96122818,96138502,ENSE00001366112,96130064,96130145
7,ENSG00000188886,ASTL,2,96122818,96138502,-,ENST00000342380,ASTL-201,96122818,96138502,ENSE00001364407,96129824,96129978
8,ENSG00000188886,ASTL,2,96122818,96138502,-,ENST00000342380,ASTL-201,96122818,96138502,ENSE00001369654,96122818,96124271
9,ENSG00000188886,ASTL,2,96122818,96138502,-,ENST00000470582,ASTL-202,96133684,96137777,ENSE00001840774,96137575,96137777


## CLI

### searchs

In [4]:
# default anno_map with log
!AnnoGtf -t searchs -l 1:200000:300000 -g ./test_data/Homo_sapiens.GRCh38.111.gtf -od test_searchs -o test_hg38 -log test_hg38



In [2]:
# new anno_map
!AnnoGtf -t searchs -l 1:200000:300000 -g ./test_data/Homo_sapiens.GRCh38.111.gtf -od test_searchs -o test_hg38_map -log test_hg38_map -am other,Selenocysteine



### maps

In [10]:
!AnnoGtf -t maps -g test_data/Homo_sapiens.GRCh38.111.gtf -m i2n -gs ENSG00000188886,ENSG00010228463 -o test_hg38 -od test_maps -am other,Selenocysteine



In [1]:
# gene IDs supplied in a file (-gf) instead of a comma-separated list (-gs)
!AnnoGtf -t maps -g test_data/Homo_sapiens.GRCh38.111.gtf -m i2n -gf geneid_map.txt -o test_hg38_file -od test_maps -am other,Selenocysteine



### inquires

In [6]:
!AnnoGtf -t inquires -g test_data/Homo_sapiens.GRCh38.111.gtf -it id -il gene -gs  ENSG00000188886,ENSG00000253278 -o test_hg38_genes -od test_inquires -am other,Selenocysteine

In [7]:
# ilevel=trans
!AnnoGtf -t inquires -g test_data/Homo_sapiens.GRCh38.111.gtf -it id -il trans -gs  ENSG00000188886,ENSG00000253278 -o test_hg38_trans -od test_inquires -am other,Selenocysteine



In [9]:
# gene names supplied in a file (-gf) and queried by name (-it name)
!AnnoGtf -t inquires -g test_data/Homo_sapiens.GRCh38.111.gtf -it name -il trans -gf genename_inquires.txt -o test_hg38_file -od test_inquires -am other,Selenocysteine

