In [2]:
import os
from tqdm.notebook import tqdm
from Bio import SeqIO
import gffutils
import pandas as pd

In [1]:
# Both input files sit in the same assembly directory; the paths are
# relative, so they resolve against the notebook's working directory.
ASSEMBLY_DIR = "GCF_000146045.2"
GENOME_PATH = f"{ASSEMBLY_DIR}/GCF_000146045.2_R64_genomic.fna"  # genome sequence (FASTA)
GFF_PATH = f"{ASSEMBLY_DIR}/genomic.gff"  # feature annotations (GFF)

### Load Genome

In [3]:
print('loading genome...')
# Every FASTA record is materialised in RAM and keyed by its record id.
# That is fine for a small genome like this one, but it would NOT scale to
# a large genome such as human.
fasta_records = SeqIO.parse(GENOME_PATH, "fasta")
genome = SeqIO.to_dict(fasta_records)
print('done!')

loading genome...
done!


#### Genome Exploration

In [9]:
# Inspect the loaded records: a dict mapping each sequence id to its SeqRecord.
genome

{'BK006935.2': SeqRecord(seq=Seq('ccacaccacacccacacacccacacaccacaccacacaccacaccacacccaca...ggg'), id='BK006935.2', name='BK006935.2', description='BK006935.2 TPA_inf: Saccharomyces cerevisiae S288C chromosome I, complete sequence', dbxrefs=[]),
 'BK006936.2': SeqRecord(seq=Seq('AAATAGCCCTCATGTACGTCTCCTCCAAGCCCTGTTGTCTCTTACCCGGATGTT...tgt'), id='BK006936.2', name='BK006936.2', description='BK006936.2 TPA_inf: Saccharomyces cerevisiae S288C chromosome II, complete sequence', dbxrefs=[]),
 'BK006937.2': SeqRecord(seq=Seq('cccacacaccacacccacaccacacccacacaccacacacaccacacccacaca...gtg'), id='BK006937.2', name='BK006937.2', description='BK006937.2 TPA_inf: Saccharomyces cerevisiae S288C chromosome III, complete sequence', dbxrefs=[]),
 'BK006938.2': SeqRecord(seq=Seq('acaccacacccacaccacacccacacacaccacacccacacaccacacccacac...TGG'), id='BK006938.2', name='BK006938.2', description='BK006938.2 TPA_inf: Saccharomyces cerevisiae S288C chromosome IV, complete sequence', dbxrefs=[]),
 'BK006939.2': S

### Build GFF DB

In [5]:
# On-disk gffutils database built from GFF_PATH; reused across runs so the
# annotation file only has to be parsed once.
GFF_DB_PATH = "genome.db"

# NOTE(review): the cache check is existence-only. It does not verify that
# the file was built from the current GFF_PATH or that a previous build ran
# to completion; delete the file to force a rebuild.
if os.path.exists(GFF_DB_PATH):
    print("loading existing GFF database...")
    db = gffutils.FeatureDB(GFF_DB_PATH, keep_order=True)
else:
    print("creating GFF database...")
    create_options = dict(
        dbfn=GFF_DB_PATH,
        force=True,
        keep_order=True,
        merge_strategy="merge",
        sort_attribute_values=True,
    )
    db = gffutils.create_db(GFF_PATH, **create_options)
print("done!")

loading existing GFF database...
done!


#### DB Exploration

In [6]:
# List every feature type present in the annotation database.
print(list(db.featuretypes()))

['CDS', 'RNase_MRP_RNA', 'RNase_P_RNA', 'SRP_RNA', 'antisense_RNA', 'centromere', 'exon', 'gene', 'long_terminal_repeat', 'mRNA', 'mobile_genetic_element', 'ncRNA', 'origin_of_replication', 'pseudogene', 'rRNA', 'region', 'regulatory_region', 'sequence_feature', 'snRNA', 'snoRNA', 'tRNA', 'telomerase_RNA', 'telomere', 'transcript']


### Parse data from DB into .CSV