micromamba create -n rh_env python=3.10.12 pip

micromamba activate rh_env

git clone https://github.com/tomalf2/recombinhunt-cov.git

pip install recombinhunt-7.1.0-py3-none-any.whl

In [1]:
from pathlib import Path

from recombinhunt.core.method import *
from recombinhunt.core.environment import Environment

# Build the recombinhunt Environment from the identifier "env_nextstrain_2023_03_30".
# NOTE(review): presumably this names a data directory resolved relative to the
# kernel's working directory — confirm against the recombinhunt documentation.
env = Environment("env_nextstrain_2023_03_30")

In [2]:
def parse_fasta(filepath):
    """
    Lazily iterate over the records of a FASTA file.

    Parameters
    ----------
    filepath : str or path-like
        Location of the FASTA file; it is opened in text mode.

    Yields
    ------
    tuple of (str, str)
        ``(header, sequence)`` for each record. ``header`` is the text after
        the leading '>' and ``sequence`` is the concatenation of all of the
        record's sequence lines with surrounding whitespace stripped.

    Notes
    -----
    Blank lines are ignored. Sequence lines that appear before the first
    header line belong to no record and are discarded.
    """
    with open(filepath, "r") as f:
        header = None  # None means "no record has been opened yet"
        seq_chunks = []
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip empty lines
            if line.startswith(">"):
                # Compare against None rather than relying on truthiness, so
                # that a record with an empty header (a bare '>') is kept.
                if header is not None:
                    yield (header, "".join(seq_chunks))
                header = line[1:]  # remove '>'
                seq_chunks = []
            elif header is not None:
                seq_chunks.append(line)
        if header is not None:
            yield (header, "".join(seq_chunks))

In [3]:
def get_mutations(ref_seq, alt_seq):
    """
    List the differences between two aligned sequences.

    Coordinates are 1-based and count reference bases only: alignment columns
    where the reference holds '-' (insertions in ``alt_seq``) are skipped and
    do not advance the coordinate.

    Each returned string is one of:
      - 'pos_ref|alt' for a substitution at reference coordinate ``pos``
      - 'start_end' for a run of '-' in ``alt_seq`` (inclusive coordinates)

    Raises ValueError when the two sequences differ in length.
    """
    if len(ref_seq) != len(alt_seq):
        raise ValueError("Aligned sequences must be the same length.")

    calls = []
    gap_open_at = None  # reference coordinate where the open deletion began
    coord = 0           # coordinate of the most recently consumed reference base

    for ref_base, alt_base in zip(ref_seq, alt_seq):
        if ref_base == "-":
            # Column exists only in alt (insertion): no reference base consumed.
            continue
        coord += 1

        if alt_base == "-":
            # Open a deletion run unless one is already in progress.
            if gap_open_at is None:
                gap_open_at = coord
            continue

        if gap_open_at is not None:
            # The previous reference base was the last deleted one.
            calls.append(f"{gap_open_at}_{coord - 1}")
            gap_open_at = None

        if alt_base != ref_base:
            calls.append(f"{coord}_{ref_base}|{alt_base}")

    # A deletion that reaches the end of the alignment still has to be emitted.
    if gap_open_at is not None:
        calls.append(f"{gap_open_at}_{coord}")

    return calls

In [4]:
# Read the reference genome. If the FASTA file holds several records, the last
# one is the one that is kept.
ref_file = Path("../../sc2ts/sc2ts/data/reference.fasta")
ref_name, ref_seq = "", ""
for header, seq in parse_fasta(ref_file):
    ref_name, ref_seq = header, seq
# Fail loudly on an empty or header-only file: without this check the gap
# assertion below would pass vacuously on an empty string.
assert ref_seq, f"No sequence could be read from {ref_file}"
# The reference defines the coordinate system, so it must not contain gaps.
assert "-" not in ref_seq, "Reference sequence must not contain '-' characters"
print(ref_name)
print(ref_seq)

MN908947 (Wuhan-Hu-1/2019)
ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCAT

In [5]:
# Pull the record whose header is exactly "sample" out of the triplet FASTA.
in_dir = Path("../../sc2ts-paper/3seq/sc2ts_v1_2023-02-21_pr_pp_mp_aph_bps_pango_dated_3seq_triplets")
in_file = in_dir / "478620.fasta"
targ_name, targ_seq = "", ""
for header, seq in parse_fasta(in_file):
    if header == "sample":
        targ_name, targ_seq = header, seq
        break
else:
    # Without this branch a missing "sample" record would silently leave the
    # last record of the file as the target.
    raise ValueError(f'No record named "sample" found in {in_file}')
print(targ_name)
print(targ_seq)

sample
ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTTTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGA

In [6]:
# Create the Experiment bound to `env`; this one object is reused for every
# set_target()/run() call in the cells that follow.
# NOTE(review): confirm that recombinhunt supports re-targeting an Experiment
# without state from an earlier run carrying over.
experiment = Experiment(environment=env)

In [7]:
# Derive the target's mutation list by comparing it with the reference.
# get_mutations raises ValueError unless both strings have the same length, so
# ref_seq and targ_seq must already be aligned to each other.
clone_muts = get_mutations(ref_seq, targ_seq)
# Hand the mutation list to the experiment, run it, and print the result.
experiment.set_target(clone_muts)
result = experiment.run()
print(result)

  target length : 53 
  designated candidates :  AY.4  
  region details :   1 
                       pos_start_in_t : 1 
                       pos_end_in_t : 53 
                       designated :  AY.4  
                       good alternative candidates :       
  AIK :   AY.4 : 510.43666610875954   general_sc2_model : 451.5576915621987  
  p_values :   AY.4 vs general_sc2_model :  6.10e+12   
  flags :  Model_1BP_L1eqL2  Model_2BP_Bad_L2  SingleCandidateGenome  


In [8]:
# Hard-coded mutation list for a second target. The entries use the same string
# formats that get_mutations produces: 'start_end' for deletions and
# 'pos_ref|alt' for substitutions.
# NOTE(review): '22204_.|GAGCCAGAA' presumably encodes an insertion, which
# get_mutations never emits — confirm the notation against the recombinhunt docs.
recomb_muts = ['22029_22034', '28248_28253', '28271_28271', '22204_.|GAGCCAGAA', '210_G|T', '241_C|T', '1321_A|C', '3037_C|T', '4181_G|T', '4890_C|T', '6402_C|T', '7124_C|T', '7851_C|T', '8723_A|G', '8986_C|T', '9053_G|T', '10029_C|T', '11201_A|G', '11332_A|G', '14407_C|T', '14408_C|T', '15264_T|C', '15451_G|A', '16466_C|T', '18366_A|G', '19220_C|T', '20032_C|T', '21618_C|G', '21641_G|T', '21846_C|T', '21987_G|A', '22578_G|A', '22673_T|C', '22674_C|T', '22679_T|C', '22686_C|T', '22813_G|T', '22882_T|G', '22898_G|A', '22992_G|A', '22995_C|A', '23013_A|C', '23040_A|G', '23048_G|A', '23055_A|G', '23063_A|T', '23075_T|C', '23202_C|A', '23403_A|G', '23525_C|T', '23599_T|G', '23604_C|A', '23854_C|A', '23948_G|T', '24130_C|A', '24424_A|T', '24469_T|A', '24503_C|T', '25000_C|T', '25667_C|T', '25855_G|T', '26767_T|C', '27638_T|C', '27752_C|T', '27874_C|T', '28461_A|G', '28881_G|T', '28916_G|T', '29402_G|T', '29540_G|A', '29645_G|T', '29742_G|T'] # = list of mutations in the target sequence
# Re-target the existing experiment with this list, run it, and print the result.
experiment.set_target(recomb_muts)
result = experiment.run()
print(result)

  target length : 72 
  designated candidates :  AY.4 + BA.1.15.3 + AY.4  
  region details :   1 
                       pos_start_in_t : 1 
                       pos_end_in_t : 28 
                       designated :  AY.4  
                       good alternative candidates :      
                     2 
                       pos_start_in_t : 29 
                       pos_end_in_t : 57 
                       designated :  BA.1.15.3  
                       good alternative candidates :   BA.1     
                     3 
                       pos_start_in_t : 58 
                       pos_end_in_t : 72 
                       designated :  AY.4  
                       good alternative candidates :       
  AIK :   AY.4 : 423.8493440198204 
          BA.1.15.3 : 1551.1371774244783 
          general_sc2_model : 378.86002833860846 
          AY.4 + BA.1.15.3 + AY.4 : -361.39185665772266  
  p_values :   AY.4 + BA.1.15.3 + AY.4 vs AY.4 :  3.07e-171  
               AY.4 + BA.1.