In [1]:
import re
import csv
import math
import random
from itertools import combinations
from difflib import SequenceMatcher
from Bio import SeqIO, Seq
from tqdm import tqdm

In [2]:
corona = list(SeqIO.parse("sequences.fasta", "fasta"))[0]

In [3]:
class Sequence:
    def __init__(self, reverse, frame, codons):
        self.reverse = reverse
        self.frame = frame
        self.codons = codons

In [4]:
class OpenReadingFrame:
    def __init__(self, sequence, codons):
        self.sequence = sequence
        self.codons = codons
        
    @property
    def length(self):
        return len(''.join(self.codons))

In [5]:
def get_codons(nucleotides):
    return [nucleotides[0+i:3+i] for i in range(0, len(nucleotides), 3)]

In [6]:
def get_six_sequences(nucleotides):
    sequences = []
    coding_strand = str(nucleotides.reverse_complement().seq)
    template_strand = str(nucleotides.seq)
    for i in range(3):
        sequences.append(Sequence(True, i, get_codons(coding_strand[i:])))
        sequences.append(Sequence(False, i, get_codons(template_strand[i:])))
    return sequences

In [7]:
sequences = get_six_sequences(corona)

In [8]:
def get_reading_frames(sequence):
    start_codons = ['ATG']
    stop_codons = ['TAG', 'TAA', 'TGA']
    found_start_codon = False
    reading_frames = []
    for codon in sequence.codons:
        if found_start_codon:
            reading_frames[-1].codons.append(codon)
            if codon in stop_codons:
                found_start_codon = False
        elif codon in start_codons:
            found_start_codon = True
            reading_frames.append(OpenReadingFrame(sequence, []))
            reading_frames[-1].codons.append(codon)
    return reading_frames

In [9]:
def get_all_reading_frames(sequences, min_length):
    all_reading_frames = []
    for sequence in sequences:
        orfs = get_reading_frames(sequence)
        for orf in orfs:
            if orf.length > min_length + 6:  # +6 to require nt's that aren't start/stop codons
                all_reading_frames.append(str(Seq.Seq(''.join(orf.codons)).translate()))
    return all_reading_frames


In [10]:
all_reading_frames = get_all_reading_frames(sequences, 70)
len(all_reading_frames)

159

In [11]:
def get_similarity_score(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [12]:
def load_samples(samples_path):
    return list(SeqIO.parse("samples/sequences.fasta", "fasta"))

In [13]:
def get_orfs_from_samples(samples, save_to_file):
    samples_orfs = []
    for sample in tqdm(samples):
        sequences = get_six_sequences(sample)
        all_reading_frames = get_all_reading_frames(sequences, 1)
        samples_orfs.append(all_reading_frames)
    if save_to_file:
        with open("samples/samples_orfs.csv","w") as f:
            wr = csv.writer(f)
            wr.writerows(samples_orfs)
    return samples_orfs

In [14]:
def load_samples_orfs_from_csv(path):
    with open('samples/samples_orfs.csv', newline='\n') as f:
        reader = csv.reader(f)
        samples_orfs = list(reader)
        return samples_orfs

In [15]:
def find_in_list_of_lists(list_of_lists, item):
    for i in range(len(list_of_lists)):
        for j in range(len(list_of_lists[i])):
            if list_of_lists[i][j] == item:
                return i
    return -1

In [17]:
def get_random_elements(full_list, portion):
    amount_of_elements = math.floor(len(full_list) * portion)
    return random.choices(full_list, k=amount_of_elements)

In [50]:
def get_similarity_groups(samples_orfs, min_similarity_score=0.9):
    # random_samples = get_random_elements(samples_orfs, 0.3)
    random_samples = samples_orfs[0:2]
    orfs_similarity_groups = []
    for sample_orfs in tqdm(random_samples):
        for orf in sample_orfs:
            is_orf_in_similarity_group = False
            for similarity_group in orfs_similarity_groups:
                similarity_sum = sum(get_similarity_score(orf, similiar_orf) for similiar_orf in similarity_group)
                average_similarity = similarity_sum / len(similarity_group)
                if average_similarity > min_similarity_score:
                    similarity_group.append(orf)
                    is_orf_in_similarity_group = True
            if not is_orf_in_similarity_group:
                orfs_similarity_groups.append([orf])
    return orfs_similarity_groups

In [19]:
samples_orfs = load_samples_orfs_from_csv('./samples/samples_orfs.csv')

In [51]:
orfs_similarity_groups = get_similarity_groups(samples_orfs[0:2], 0.1)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 2000.62it/s]


In [41]:
import time

start = time.time()
for 
end = time.time()
print(end - start)

KeyboardInterrupt: 

In [45]:
for similarity_group in orfs_similarity_groups:
    if len(similarity_group) > 1:
        print(similarity_group)

['MGIALLKLILHIRALPYRQLSLALFTVHSIVLRVASVKMWWLFQVLPNVTH*', 'MFCMRQYAYSAK*', 'ML*', 'MRNEKRLDCRLCSLLRRSLLAMLFLEEVVARLQHC*', 'MIL*', 'MLST*', 'MILQFK*', 'MNSLGETDLARN*', 'MFV*', 'MRFMM*', 'METLKVLIIISNISAIVT*', 'MVCV*', 'MVSSNTKMSTKDHSSDYEFTF*', 'MEWHVEKNVSFWIE*', 'MASVTSKNTTKARKRSTLLTINVPVSSETNEYISSYSSACAYKGTLVVVVGSS*', 'MKKVTCSTPVSVLN*', 'MTIEEVTLL*', 'MILEAL*', 'MCNVI*', 'MAILYTAHTFQVLGDR*', 'MYDS*', 'MSLIQVAKDHKLCC*', 'MCCL*', 'MSANFEVCL*', 'MVFVMQHLHQRSNQK*', 'MTSCLVLMDLVIFVKN*', 'MSWS*', 'MSC*', 'MV*', 'MVGNQHH*', 'MIVKESNN*', 'MNLHRH*', 'MMRNYIGQNNQQHSC*', 'MQIWWR*', 'MVLKPKNPEGDHALN*', 'MDHYKIEIHRL*', 'MVGQGYQTS*', 'MQGVIEFWL*', 'MSNKE*', 'MT*', 'MSL*', 'MRSHLLAVCTVAQSPIKVESASETKSLRSESTSSVPVGNHCLKTAVPGATPLSEPAPKCITLIL*', 'MPLGNVALSP*', 'MK*', 'MNSSIANSKKSISI*', 'MSVIEQTPIVDICAGASLL*', 'MSTPKLLSIFTSGTGLMLRLAQSSNATLTGNVVLFSNNSTSTPSTFV*', 'MIETGTSPCCPSKCPLFTTLKATFSKL*', 'ML*', 'MELAGYLSTLQFQNRHTPSVNLSECVA*', 'MS*', 'MC*', 'MIASQLATCALPWT*', 'MRCLN*', 'MLG*', 'MSLL*', 'MSLGMPGMSTHKPSVL

In [42]:
sum = 0
for sample in samples_orfs:
    sum += len(sample)
print(sum)

17509533


In [49]:
a = 'MFLLTTKRTMFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT'
b = 'MFLLTTKRTMFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQGVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDXLQPELDSFKEELDKYFKNHTSPDVDLVDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT'
get_similarity_score(a, b)

0.9773790951638065