In [1]:
import sqlite3
from Bio import SeqIO
import os
from torch.utils.data import Dataset,DataLoader
import torch
import random
from dataclasses import dataclass

Create a database with sqlite3 and name it something like 'spike_prot.db'.

In [2]:
# Open a connection to the SQLite database file "spike_prot.db" (path is relative to
# the notebook's working directory) and create the cursor used to run SQL statements.
conn = sqlite3.connect("spike_prot.db")
db_cursor = conn.cursor()

Now let's create a table for the training sequences, with its simple schema, and another one for the test sequences.

In [3]:
# (Re)create one table per data split. Both tables share the same schema:
#   id       - INTEGER PRIMARY KEY; SQLite fills it in automatically (1, 2, 3, ...)
#              whenever an INSERT omits it, so no explicit AUTOINCREMENT is needed
#   header   - FASTA record identifier
#   sequence - the sequence text
#
# The database file persists between kernel sessions, so a plain CREATE TABLE fails
# with "table ... already exists" on a second run. Dropping any previous copy first
# keeps this cell re-runnable and guarantees the load cells below start from empty tables.
for split_table in ("train_sequences", "test_sequences"):
    db_cursor.execute(f"DROP TABLE IF EXISTS {split_table}")
    db_cursor.execute(f'''CREATE TABLE {split_table}
             (id INTEGER PRIMARY KEY,
              header TEXT,
              sequence TEXT)''')

<sqlite3.Cursor at 0x7f26800ffce0>

Read the FASTA files in and distribute their records to the correct tables.

In [4]:
# Load every record of the training FASTA file into the train_sequences table.
training_fasta_path = os.path.abspath('../data/spikeprot0203.clean.uniq.training.fasta')

# The context manager guarantees the file handle is closed once all records are read
# (the bare open() used previously was never closed).
with open(training_fasta_path) as training_handle:
    training_seqs = SeqIO.parse(training_handle, 'fasta')
    # One executemany call replaces the per-row execute loop; ids are assigned by SQLite
    db_cursor.executemany(
        "INSERT INTO train_sequences (header, sequence) VALUES (?,?)",
        ((record.id, str(record.seq)) for record in training_seqs),
    )

# Persist the inserted rows, then release the connection
conn.commit()
conn.close()

In [5]:
# Load every record of the testing FASTA file into the test_sequences table.
# The previous cell closed its connection, so open a fresh one first.
conn = sqlite3.connect("spike_prot.db")
db_cursor = conn.cursor()

testing_fasta_path = os.path.abspath('../data/spikeprot0203.clean.uniq.testing.fasta')

# The context manager guarantees the file handle is closed once all records are read
# (the bare open() used previously was never closed).
with open(testing_fasta_path) as testing_handle:
    testing_seqs = SeqIO.parse(testing_handle, 'fasta')
    # One executemany call replaces the per-row execute loop; ids are assigned by SQLite
    db_cursor.executemany(
        "INSERT INTO test_sequences (header, sequence) VALUES (?,?)",
        ((record.id, str(record.seq)) for record in testing_seqs),
    )

# Persist the inserted rows, then release the connection
conn.commit()
conn.close()

Test query:

In [6]:
# Sanity check: reopen the database and print one stored training sequence.
conn = sqlite3.connect("spike_prot.db")
db_cursor = conn.cursor()

# execute() returns the cursor itself, so the first row can be fetched in one expression
train_result = db_cursor.execute("SELECT sequence FROM train_sequences").fetchone()
# Each row is a 1-tuple because only the sequence column is selected
(train_sequence,) = train_result
print(train_sequence)


MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGR

In [7]:
# Same sanity check for the test table, reusing the cursor opened in the previous cell.
test_result = db_cursor.execute("SELECT sequence FROM test_sequences").fetchone()
# Each row is a 1-tuple because only the sequence column is selected
(test_sequence,) = test_result
print(test_sequence)

MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSVLEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQGVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYXTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGR

Now let's see how to use this database as a way to load our data.

In [30]:
class FastaDataset(Dataset):
    """Map-style dataset that serves sequences from one table of an SQLite database.

    The table must have an integer ``id`` column and a text ``sequence`` column.
    Row ids are assumed to be the contiguous range ``1..N`` that SQLite assigns to an
    ``INTEGER PRIMARY KEY`` when rows are inserted without an explicit id; dataset
    index ``i`` (0-based, as requested by ``DataLoader``) maps to row id ``i + 1``.

    Args:
        db_path: path of the SQLite database file.
        table_name: table to read from; must be a plain identifier.

    Raises:
        ValueError: if ``table_name`` is not a plain identifier.

    NOTE(review): a single sqlite3 connection is opened in ``__init__``; confirm this
    is acceptable before using the dataset with ``num_workers > 0``.
    """

    def __init__(self, db_path: str, table_name: str):
        # Table names cannot be bound as SQL parameters, so refuse anything that is
        # not a plain identifier before interpolating it into a query.
        if not isinstance(table_name, str) or not table_name.isidentifier():
            raise ValueError(f"Invalid table name: {table_name!r}")
        self.table_name = table_name

        # Connect to the sqlite database
        self.conn = sqlite3.connect(db_path)
        self.db_cursor = self.conn.cursor()
        # Count rows of the requested table (previously hard-coded to train_sequences,
        # which reported the wrong length for any other table)
        self.db_length = self.db_cursor.execute(
            f'SELECT COUNT(*) AS total_seq FROM "{self.table_name}"'
        ).fetchone()[0]

    def __len__(self):
        """Return the number of rows in the table."""
        return self.db_length

    def __getitem__(self, index):
        """Return the sequence at 0-based position ``index`` with every '*' removed.

        Negative indices count from the end. Raises ``IndexError`` when the position
        is out of range or the expected row id is missing from the table.
        """
        position = int(index)
        if position < 0:
            position += self.db_length
        if not 0 <= position < self.db_length:
            raise IndexError(f"index {index} out of range for {self.db_length} sequences")

        # Row ids start at 1 while dataset indices start at 0
        row_id = position + 1
        row = self.db_cursor.execute(
            f'SELECT id, sequence FROM "{self.table_name}" WHERE id = ?', (row_id,)
        ).fetchone()
        if row is None:
            raise IndexError(f"no row with id {row_id} in table {self.table_name!r}")

        sequence = row[1].replace('*', '')
        return sequence

    def __del__(self):
        # Close the connection when the dataset object is deleted; the attribute may
        # be missing if __init__ raised before the connection was opened.
        conn = getattr(self, "conn", None)
        if conn is not None:
            conn.close()

In [31]:
class FastaDataLoader:
    """Convenience wrapper that pairs a ``FastaDataset`` with a torch ``DataLoader``.

    Attributes:
        dataset: the ``FastaDataset`` built from ``db_path`` and ``table_name``.
        dataloader: the ``DataLoader`` that batches (and optionally shuffles) ``dataset``.
    """

    def __init__(self, db_path: str, table_name: str, batch_size: int, shuffle=True):
        fasta_dataset = FastaDataset(db_path, table_name)
        self.dataset = fasta_dataset
        self.dataloader = DataLoader(fasta_dataset, shuffle=shuffle, batch_size=batch_size)

    def __iter__(self):
        # Iteration is delegated to the wrapped DataLoader, which yields batches
        return iter(self.dataloader)

    def __len__(self):
        # Size of the underlying dataset (individual items, not batches)
        return len(self.dataset)

In [34]:
@dataclass
class TrainingParams:
    """Default settings used to build the training ``FastaDataLoader``."""
    db_path: str = "spike_prot.db"  # SQLite database file to read from
    table_name: str = "train_sequences"  # table the loader reads sequences from
    batch_size: int = 10  # passed to the loader as batch_size
    shuffle: bool = True  # passed to the loader as shuffle

# Read the settings from a TrainingParams instance (all fields keep their defaults)
training_params = TrainingParams()

train_loader = FastaDataLoader(
    db_path=training_params.db_path,
    table_name=training_params.table_name,
    batch_size=training_params.batch_size,
    shuffle=training_params.shuffle,
)

# Preview only the first two batches (i == 0 and i == 1), then stop
for i, batch in enumerate(train_loader):
    print(i, batch)
    if i == 1:
        break

0 ['MFVFLVLLPLVSSQCVNLTTRTQLPPAYTXXXXRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGIXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWXXXXXXSKXXXXXXXXXXXFRKSNLKPFERDISTEIYQAGSTPCNXXXXXXXXXXXXSYGFQPTNGVGYQPYRVVVLSFELLHXXXXXXXPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQGVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSXLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRL