In [1]:
import os
from Bio import SeqIO
import gffutils
import pandas as pd
from collections import defaultdict

In [2]:
# Input files, relative to the notebook's working directory.
# Both live in a folder named after the assembly accession.
ASSEMBLY_DIR = "GCF_000146045.2"
GENOME_PATH = f"{ASSEMBLY_DIR}/{ASSEMBLY_DIR}_R64_genomic.fna"  # genome FASTA
GFF_PATH = f"{ASSEMBLY_DIR}/genomic.gff"  # feature annotation

### Load Genome

In [3]:
# Parse every FASTA record and index it by record ID.
# The whole genome is held in memory at once, which is fine here but would
# NOT work for a big genome such as human.
print('loading genome...')
fasta_records = SeqIO.parse(GENOME_PATH, "fasta")
genome = SeqIO.to_dict(fasta_records)
print('done!')

loading genome...
done!


#### Genome exploration

In [4]:
# Display the loaded mapping of sequence ID -> SeqRecord.
genome

{'NC_001133.9': SeqRecord(seq=Seq('ccacaccacacccacacacccacacaccacaccacacaccacaccacacccaca...ggg'), id='NC_001133.9', name='NC_001133.9', description='NC_001133.9 Saccharomyces cerevisiae S288C chromosome I, complete sequence', dbxrefs=[]),
 'NC_001134.8': SeqRecord(seq=Seq('AAATAGCCCTCATGTACGTCTCCTCCAAGCCCTGTTGTCTCTTACCCGGATGTT...tgt'), id='NC_001134.8', name='NC_001134.8', description='NC_001134.8 Saccharomyces cerevisiae S288C chromosome II, complete sequence', dbxrefs=[]),
 'NC_001135.5': SeqRecord(seq=Seq('cccacacaccacacccacaccacacccacacaccacacacaccacacccacaca...gtg'), id='NC_001135.5', name='NC_001135.5', description='NC_001135.5 Saccharomyces cerevisiae S288C chromosome III, complete sequence', dbxrefs=[]),
 'NC_001136.10': SeqRecord(seq=Seq('acaccacacccacaccacacccacacacaccacacccacacaccacacccacac...TGG'), id='NC_001136.10', name='NC_001136.10', description='NC_001136.10 Saccharomyces cerevisiae S288C chromosome IV, complete sequence', dbxrefs=[]),
 'NC_001137.3': SeqRecord(seq=Se

### Build GFF DB

In [5]:
# Open the gffutils feature database, building it from the GFF file on the
# first run and reusing the on-disk copy afterwards.
GFF_DB_PATH = "genome.db"

if os.path.exists(GFF_DB_PATH):
    print("loading existing GFF database...")
    db = gffutils.FeatureDB(GFF_DB_PATH, keep_order=True)
else:
    print("creating GFF database...")
    db = gffutils.create_db(
        GFF_PATH,
        dbfn=GFF_DB_PATH,
        force=True,
        keep_order=True,
        merge_strategy="merge",
        sort_attribute_values=True,
    )
print("done!")

loading existing GFF database...
done!


#### DB Exploration

In [6]:
# List every feature type present in the annotation database.
print(list(db.featuretypes()))

['CDS', 'RNase_MRP_RNA', 'RNase_P_RNA', 'SRP_RNA', 'antisense_RNA', 'centromere', 'exon', 'gene', 'long_terminal_repeat', 'mRNA', 'mobile_genetic_element', 'ncRNA', 'origin_of_replication', 'pseudogene', 'rRNA', 'region', 'regulatory_region', 'sequence_feature', 'snRNA', 'snoRNA', 'tRNA', 'telomerase_RNA', 'telomere', 'transcript']


In [7]:
# Take the first CDS feature and pull its bases out of the loaded genome.
feat = next(db.features_of_type('CDS'))
print(feat)

# Feature coordinates are treated as 1-based and inclusive, so the Python
# slice starts at start - 1 and stops at end.
# NOTE: feat.strand is not consulted, so the bases are printed exactly as
# stored in the FASTA record (no reverse-complementing).
chromosome_seq = genome[feat.seqid].seq
feature_sequence = chromosome_seq[feat.start - 1 : feat.end]
print(feature_sequence)

# Sanity check: the slice length must equal the feature's inclusive span.
expected_length = feat.end - feat.start + 1
print(len(feature_sequence) == expected_length)


NC_001133.9	RefSeq	CDS	1807	2169	.	-	0	ID=cds-NP_009332.1;Dbxref=SGD:S000002142,GeneID:851229,GenBank:NP_009332.1;Name=NP_009332.1;gbkey=CDS;Note=hypothetical protein%3B member of the seripauperin multigene family encoded mainly in subtelomeric regions;gene=PAU8;locus_tag=YAL068C;Parent=rna-NM_001180043.1;product=seripauperin PAU8;experiment=EXISTENCE:mutant phenotype:GO:0030437 ascospore formation [PMID:12586695];protein_id=NP_009332.1
CTAGTTTGCGATAGTGTAGATACCGTCCTTGGATAGAGCACTGGAGATGGCTGGCTTTAATCTGCTGGAGTACCATGGAACACCGGTGATCATTCTGGTCACTTGGTCTGGAGCAATACCGGTCAACATGGTGGTGAAGTCACCGTAGTTGAAAACGGCTTCAGCAACTTCGACTGGGTAGGTTTCAGTTGGGTGGGCGGCTTGGAACATGTAGTATTGGGCTAAGTGAGCTCTGATATCAGAGACGTAGACACCCAATTCCACCAAGTTGACTCTTTCGTCAGATTGAGCTAGAGTGGTGGTTGCAGAAGCAGTAGCAGCGATGGCAGCGACACCAGCGGCGATTGAAGTTAATTTGACCat
True


### Parse data from DB into .CSV

#### Get genome sequence and state sequence
For the purposes of this project, a base is considered coding ONLY if it lies within a feature labeled `CDS` in the GFF file, regardless of strand. Every other base is considered non-coding.

In [14]:
# Collect the (start, end) coordinates of every CDS feature, grouped by the
# ID of the sequence the feature lies on. Coordinates are stored exactly as
# the database reports them.
cds_intervals = defaultdict(list)

for cds_feature in db.features_of_type('CDS'):
    interval = (cds_feature.start, cds_feature.end)
    cds_intervals[cds_feature.seqid].append(interval)

# Number of sequences that carry at least one CDS.
len(cds_intervals)

17

In [20]:
# Build two parallel lists with one entry per sequence in the genome:
#   sequences  - the bases as a plain string
#   state_mask - a same-length string with '1' where the base lies inside a
#                CDS interval and '0' everywhere else
# Strand is not considered: a position is marked '1' whenever any CDS
# interval covers it.
sequences = []
state_mask = []

for seqid, seqinfo in genome.items():
    chromosome = seqinfo.seq
    chromosome_length = len(chromosome)

    # One ASCII byte per base, all '0' (non-coding) to begin with.
    labels = bytearray(b"0" * chromosome_length)

    for start, end in cds_intervals[seqid]:
        # Intervals are 1-based and inclusive. Reject anything outside the
        # sequence: a negative slice start would wrap around to the end, and
        # an end past the last base would silently lengthen the mask.
        if start < 1 or end > chromosome_length:
            raise ValueError(
                f"CDS interval {start}-{end} is outside {seqid} "
                f"(length {chromosome_length})"
            )
        # Same-length slice assignment marks the whole interval in one step
        # instead of looping over every base.
        labels[start - 1 : end] = b"1" * (end - start + 1)

    sequences.append(str(chromosome))
    state_mask.append(labels.decode("ascii"))

In [25]:
# Concatenate the per-sequence strings into one string for the bases and one
# for the states; the whole genome is held in memory.
# Bases are upper-cased because the FASTA contains lower-case letters.
entire_genome_sequence = "".join(sequences).upper()
# The mask is built only from '0' and '1', so it needs no case conversion.
entire_state_sequence = "".join(state_mask)
# Both strings must stay position-aligned, so their lengths have to match.
print(len(entire_genome_sequence) == len(entire_state_sequence))

True


#### Parse into k-mers and CSV
We use 3-mers (k = 3) here, but the k-mer length can easily be changed.

In [None]:
# Write one CSV row per genome position: the k-mer that starts there and a
# coding flag ('1' or '0').
#
# NOTE(review): the original author marked this cell as unfinished and
# possibly the wrong approach. Open points to confirm:
#   - TODO confirm labelling: each k-mer is labelled with the state of its
#     FIRST base; other choices (majority, all-coding, ...) are possible.
#   - Windows overlap (step of 1) and are taken from the concatenated genome,
#     so some k-mers span the junction between two sequences.
KMER_SIZE = 3  # window length; the CSV header is derived from it

with open("observation_and_states.csv", "w") as f:
    f.write(f"{KMER_SIZE}mer,coding\n")

    # Slicing past the end of a string returns a shorter string instead of
    # raising, so the loop is bounded to positions with a full-length window.
    window_count = max(0, len(entire_genome_sequence) - KMER_SIZE + 1)
    f.writelines(
        f"{entire_genome_sequence[idx:idx + KMER_SIZE]},{entire_state_sequence[idx]}\n"
        for idx in range(window_count)
    )
