### Experiment 6 - Evaluate sourmash distances in the context of recombination

It looks like this approach WILL cause recombinant viruses to merge into the same cluster as their parents, because the recombinants still have a high sourmash similarity (i.e. a low distance) to them. 

i.e. if two viruses that are 99% similar to each other recombine, the recombinant will still appear to be ~99% similar to both originals, and therefore we would collapse them all into one cluster.

In [36]:
from Bio import SeqIO
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import time
import math

%matplotlib inline

In [37]:
filename = "../refs/rhinovirus_NC_001617.fasta" #"chkv_NC_004162.fasta" #"GCF_900618125.1_NCTC11032_genomic.fna"

# Seed the stdlib RNG so the randomly mutated sequences generated in this notebook
# (and everything derived from them) are reproducible across Restart & Run All.
SEED = 42
random.seed(SEED)

original_record = SeqIO.read(filename, "fasta") # NOTE: this only works with single-sequence .fasta files

# Target fractional identities (1.0 == identical) for the simulated mutants.
id_thresholds = [0.99, 0.97, 0.95, 0.92, 0.9, 0.85, 0.8]  # specified list of thresholds

def mutate_sequence(sequence, pid):
    '''
    Given a sequence and a % ID threshold, mutate the sequence at random positions to produce
    a new sequence with the specified % ID. Return the new sequence string.

    Parameters:
        sequence: the sequence to mutate (anything supporting len() and list(), e.g. a str).
        pid: target identity to the input as a fraction between 0 and 1 (e.g. 0.97).

    Returns:
        A str of the same length as `sequence` in which round(len * (1 - pid)) distinct
        positions have been replaced by a different base drawn from A/C/G/T.

    Raises:
        ValueError: if `pid` is not within [0, 1].
    '''
    if not 0 <= pid <= 1:
        raise ValueError("pid must be between 0 and 1, got " + str(pid))

    swap_values = ['A','C','G','T']

    len_of_seq = len(sequence)
    # Round the *product* to the nearest integer. Truncating instead drops a mutation
    # whenever float error lands just below a whole number
    # (e.g. 100 * (1 - 0.9) == 9.999999999999998).
    num_bp_to_change = int(round(len_of_seq * (1 - pid)))
    # Sample positions without replacement so every chosen position is mutated exactly once.
    bp_locs_to_change = random.sample(range(len_of_seq), num_bp_to_change)

    # loop through indices of bases to change and swap out values
    new_seq = list(sequence)
    for bp in bp_locs_to_change:
        # Compare case-insensitively so a lowercase base is never "mutated" into
        # its own uppercase form.
        current_base = new_seq[bp].upper()
        new_seq[bp] = random.choice([b for b in swap_values if b != current_base])

    return "".join(new_seq)

# Curate a dictionary of mutated sequences at the specified %ID thresholds.
# The unmutated reference is stored under its record id; each mutant is keyed
# "<threshold>-<replicate>" with two independent replicates per threshold.
sequence_dict = {original_record.id : original_record.seq}
for pid in id_thresholds:
    for replicate in range(1, 3):
        sequence_dict[str(pid) + '-' + str(replicate)] = mutate_sequence(original_record.seq, pid)

# Write the reference and mutated sequences to a fasta file.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("all_simulated_seqs.fasta", 'w') as f:
    for name, seq in sequence_dict.items():
        f.write('>' + name + '\n' + str(seq) + '\n')

In [38]:
def generate_recombinant(seq1, seq2, proportion):
    '''
    Build a single-crossover recombinant from two sequences.

    The crossover index is floor(proportion * len(seq1)). The result is seq1 up to
    (not including) that index followed by seq2 from that index onward.
    Return the concatenated sequence.
    '''
    crossover = int(math.floor(len(seq1) * proportion))
    return seq1[:crossover] + seq2[crossover:]

#generate_recombinant('AAAAAAAAAA', 'TTTTTTTTTT', .7)

In [39]:
# Build a recombinant for every ordered pair of simulated sequences, keyed
# "<left parent>--<right parent>--<crossover proportion>".
# Self-pairs (s1 == s2) are included; they simply reproduce the parent sequence.
recombinant_sequence_dict = {}
threshold = .5  # crossover position as a fraction of the first parent's length

for s1, seq1 in sequence_dict.items():
    for s2, seq2 in sequence_dict.items():
        recombinant_key = s1 + '--' + s2 + '--' + str(threshold)
        recombinant_sequence_dict[recombinant_key] = generate_recombinant(seq1, seq2, threshold)

# Write the parental sequences followed by the recombined sequences to a fasta file.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("all_simulated_recombined_seqs.fasta", 'w') as f:
    for name, seq in sequence_dict.items():
        f.write('>' + name + '\n' + str(seq) + '\n')
    for name, seq in recombinant_sequence_dict.items():
        f.write('>' + name + '\n' + str(seq) + '\n')

In [40]:
K = 31
ST = 1000

! rm cmp*;
! rm *sig;
# num was scaled
! sourmash sketch dna -p k={K},scaled={ST} --singleton all_simulated_recombined_seqs.fasta;
! sourmash compare *.sig --containment -o cmp.dist;
! sourmash compare *.sig --containment -o cmp.dist --csv cmp.csv;


df = pd.read_csv("cmp.csv")
df.index = df.columns

[K
== This is sourmash version 4.6.1. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kcomputing signatures for files: all_simulated_recombined_seqs.fasta
[KComputing a total of 1 signature(s) for each input.
[Kcalculated 240 signatures for 240 sequences in all_simulated_recombined_seqs.fasta
[Ksaved 240 signature(s) to 'all_simulated_recombined_seqs.fasta.sig'. Note: signature license is CC0.
[K
== This is sourmash version 4.6.1. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloaded 240 signatures total.                                                   
[K
min similarity in matrix: 0.000
[Ksaving labels to: cmp.dist.labels.txt
[Ksaving comparison matrix to: cmp.dist
[K
== This is sourmash version 4.6.1. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloaded 240 signatures total.                                                   
[K
min similarity in matrix: 0.000
[Ksaving labels to: cmp.

In [52]:
# Lift the row-display limit so the whole column is rendered, then show every
# sequence's value against the reference genome column.
pd.options.display.max_rows = None
df.loc[:, ['NC_001617.1']]

Unnamed: 0,NC_001617.1
NC_001617.1,1.0
0.99-1,0.666667
0.99-2,0.666667
0.97-1,0.5
0.97-2,0.333333
0.95-1,0.166667
0.95-2,0.333333
0.92-1,0.0
0.92-2,0.0
0.9-1,0.0
