In [34]:
from Bio import SeqIO
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import time
import seaborn as sns

%matplotlib inline

In [35]:
def greedy_partition(df, threshold, debug=False):
    """Greedily group the labels of a pairwise score matrix into parent -> children clusters.

    Rows are visited in index order. A row whose label has not been assigned yet becomes a
    parent; every still-unassigned column whose value in that row is strictly greater than
    ``threshold`` becomes one of its children. Parent and children are then excluded from
    all later iterations, so each label ends up in exactly one cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Score matrix. The row labels are expected to be used as column labels as well
        (the caller sets ``df.index = df.columns``).
    threshold : float
        A column joins the current parent's cluster when its score is ``> threshold``.
    debug : bool, optional
        When True, print the row being visited, the labels assigned so far and the
        intermediate row shapes.

    Returns
    -------
    dict
        Maps each parent label to the list of its child labels, in column order.
        The list is empty for a parent without children.
    """

    # A set gives constant-time "already assigned?" checks; a list made every check a linear scan.
    assigned = set()
    tally_dict = {}

    for row_id in df.index:

        if debug:
            print(row_id)
            print(assigned)

        # if this label was already assigned as a child, its row does not need to be considered
        if row_id in assigned:
            continue

        # get the full dataframe row for this sample
        this_row_init = df.loc[row_id]

        # remove columns that are already part of another cluster
        # (unlike Series.drop, the mask does not raise if an assigned label is not a column)
        this_row = this_row_init[~this_row_init.index.isin(assigned)]

        if debug:
            print(this_row_init.shape)
            print(this_row.shape)

        # children of this node are those with values greater than the threshold,
        # not including the node itself
        children = [
            label for label in this_row[this_row > threshold].index if label != row_id
        ]

        if debug:
            print("len of children_filtered: " + str(len(children)) + '\n\n')

        # record the cluster, then exclude the node and its children from future iterations
        tally_dict[row_id] = children
        assigned.update(children)
        assigned.add(row_id)

    return tally_dict

In [None]:
# Taxon-specific data dumps are downloaded from GenBank using the corresponding query, written to
# <this_dump>.fasta, and put in a location available to this notebook (later cells open
# this_dump + ".fasta" with a relative path).
#
# NOTE(review): the outputs recorded further down name "moraxellacatarhalis_29497_dump", and the
# read-simulation cell loads a Moraxella catarrhalis reference genome, so the selection below does
# not match the recorded results -- confirm which dump is intended before re-running.

#this_dump = "moraxellacatarhalis_29497_dump"
#this_dump = "candidaauris_498019_dump"
#this_dump = "rhinovirusc_463676_dump"
this_dump = "chkv_txid37124_dump"

In [38]:
# Run Sourmash on the input fasta containing all accessions for that species

K = 31
ST = 1000
    
start = time.time()

! mkdir {this_dump}
! rm cmp*;
! rm *sig;
! sourmash sketch dna -p k={K},scaled={ST} --singleton {this_dump}.fasta;
! sourmash compare *.sig --containment -o {this_dump}/cmp.dist;
! sourmash compare *.sig --containment -o {this_dump}/cmp.dist --csv {this_dump}/cmp.csv;
! mv {this_dump}.fasta.sig {this_dump}

end = time.time()
elapsed = end - start


mkdir: moraxellacatarhalis_29497_dump: File exists
zsh:1: no matches found: cmp*
zsh:1: no matches found: *sig
[K
== This is sourmash version 4.6.1. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kcomputing signatures for files: moraxellacatarhalis_29497_dump.fasta
[KComputing a total of 1 signature(s) for each input.
[Kcalculated 19717 signatures for 19717 sequences in moraxellacatarhalis_29497_dump.fasta
[Ksaved 19717 signature(s) to 'moraxellacatarhalis_29497_dump.fasta.sig'. Note: signature license is CC0.
[K
== This is sourmash version 4.6.1. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloaded 19717 signatures total.                                                 
[K
min similarity in matrix: 0.000
[Ksaving labels to: moraxellacatarhalis_29497_dump/cmp.dist.labels.txt
[Ksaving comparison matrix to: moraxellacatarhalis_29497_dump/cmp.dist
[K
== This is sourmash version 4.6.1. ==
[K== Please cite Brown and Irber

In [39]:
# Load the sourmash comparison matrix, label its rows with the column names so it can be
# addressed by label on both axes, then cluster it greedily.
cmp_csv_path = this_dump + "/cmp.csv"
result_df = pd.read_csv(cmp_csv_path)
result_df = result_df.set_axis(result_df.columns, axis="index")

# parent -> children mapping; a column joins a cluster when its score exceeds 0.6
result_dict = greedy_partition(result_df, 0.6)

In [40]:
# Print every cluster (parent, then its children) for manual inspection (via BLAST, etc)
# and collect the number of children per parent.
len_of_key_pair = []
for parent, members in result_dict.items():
    # parent, children list and a blank spacer, each on its own line
    print(parent, members, '\n', sep='\n')
    len_of_key_pair.append(len(members))

NZ_WSZE01000055.1 Moraxella catarrhalis strain 65/74 contig00055, whole genome shotgun sequence
['NZ_WSZE01000050.1 Moraxella catarrhalis strain 65/74 contig00050, whole genome shotgun sequence', 'NZ_NXCT01000039.1 Moraxella catarrhalis strain 4431503J 4431503J_NODE_45, whole genome shotgun sequence']


NZ_WSZE01000054.1 Moraxella catarrhalis strain 65/74 contig00054, whole genome shotgun sequence
['NZ_WSZE01000041.1 Moraxella catarrhalis strain 65/74 contig00041, whole genome shotgun sequence', 'NZ_NXCT01000026.1 Moraxella catarrhalis strain 4431503J 4431503J_NODE_33, whole genome shotgun sequence', 'NZ_JVRZ01000026.1 Moraxella catarrhalis strain 157.rep1_MCAT 268_959_64130_169+,32_, whole genome shotgun sequence', 'NZ_QZHR01000026.1 Moraxella catarrhalis strain COPD_M32 NODE_27_length_1068_cov_78.3092, whole genome shotgun sequence', 'NZ_AERE01000075.1 Moraxella catarrhalis 103P14B1 ctg00102, whole genome shotgun sequence', 'NZ_JVRY01000023.1 Moraxella catarrhalis strain 157.rep2_MCA

[]


NZ_RCJY01000074.1 Moraxella catarrhalis strain 3/22/2001 NODE_74_length_1296_cov_13.733105, whole genome shotgun sequence
[]


NZ_RCJY01000073.1 Moraxella catarrhalis strain 3/22/2001 NODE_73_length_1298_cov_26.004270, whole genome shotgun sequence
['NZ_RCJN01000050.1 Moraxella catarrhalis strain 76/204/3 NODE_50_length_1318_cov_30.177162, whole genome shotgun sequence', 'NZ_RCJJ01000027.1 Moraxella catarrhalis strain 78/325/3 NODE_27_length_1312_cov_31.086076, whole genome shotgun sequence', 'NZ_RCJX01000042.1 Moraxella catarrhalis strain 3/7/2001 NODE_42_length_1298_cov_35.935098, whole genome shotgun sequence', 'NZ_RCJO01000035.1 Moraxella catarrhalis strain 73/187/1 NODE_35_length_1312_cov_45.919831, whole genome shotgun sequence', 'NZ_RCKJ01000034.1 Moraxella catarrhalis strain 130/563/4 NODE_34_length_1318_cov_19.476071, whole genome shotgun sequence']


NZ_RCJY01000070.1 Moraxella catarrhalis strain 3/22/2001 NODE_70_length_1527_cov_3.906429, whole genome shotgun sequence
[

[]


NZ_RCJC01000039.1 Moraxella catarrhalis strain K117 NODE_39_length_514_cov_0.689922, whole genome shotgun sequence
[]


NZ_RCJC01000038.1 Moraxella catarrhalis strain K117 NODE_38_length_514_cov_0.961240, whole genome shotgun sequence
[]


NZ_RCJC01000037.1 Moraxella catarrhalis strain K117 NODE_37_length_583_cov_1.629386, whole genome shotgun sequence
[]


NZ_RCJC01000036.1 Moraxella catarrhalis strain K117 NODE_36_length_617_cov_1.008163, whole genome shotgun sequence
[]


NZ_RCJC01000034.1 Moraxella catarrhalis strain K117 NODE_34_length_647_cov_0.946154, whole genome shotgun sequence
[]


NZ_RCJC01000031.1 Moraxella catarrhalis strain K117 NODE_31_length_1595_cov_5.491144, whole genome shotgun sequence
[]


NZ_RCJA01000030.1 Moraxella catarrhalis strain T12 NODE_30_length_546_cov_0.885442, whole genome shotgun sequence
[]


NZ_RCJA01000029.1 Moraxella catarrhalis strain T12 NODE_29_length_577_cov_82.746667, whole genome shotgun sequence
[]


NZ_RCJA01000028.1 Moraxella catarrh

NZ_RCJO01000044.1 Moraxella catarrhalis strain 73/187/1 NODE_44_length_600_cov_0.782241, whole genome shotgun sequence
[]


NZ_RCJO01000043.1 Moraxella catarrhalis strain 73/187/1 NODE_43_length_630_cov_0.759443, whole genome shotgun sequence
[]


NZ_RCJO01000041.1 Moraxella catarrhalis strain 73/187/1 NODE_41_length_708_cov_0.850258, whole genome shotgun sequence
[]


NZ_RCJO01000040.1 Moraxella catarrhalis strain 73/187/1 NODE_40_length_726_cov_0.826377, whole genome shotgun sequence
[]


NZ_RCJO01000039.1 Moraxella catarrhalis strain 73/187/1 NODE_39_length_735_cov_0.794408, whole genome shotgun sequence
[]


NZ_RCJO01000038.1 Moraxella catarrhalis strain 73/187/1 NODE_38_length_829_cov_0.881766, whole genome shotgun sequence
[]


NZ_RCJO01000032.1 Moraxella catarrhalis strain 73/187/1 NODE_32_length_1943_cov_23.475771, whole genome shotgun sequence
[]


NZ_RCJF01000089.1 Moraxella catarrhalis strain 96/281/2 NODE_89_length_501_cov_1.312834, whole genome shotgun sequence
[]


NZ_RCJ



NZ_QZGQ01000033.1 Moraxella catarrhalis strain COPD_M133 NODE_33_length_466_cov_1.02065, whole genome shotgun sequence
['QZGQ01000033.1 Moraxella catarrhalis strain COPD_M133 NODE_33_length_466_cov_1.02065, whole genome shotgun sequence']


NZ_QZGQ01000032.1 Moraxella catarrhalis strain COPD_M133 NODE_32_length_467_cov_0.935294, whole genome shotgun sequence
['QZGQ01000032.1 Moraxella catarrhalis strain COPD_M133 NODE_32_length_467_cov_0.935294, whole genome shotgun sequence']


NZ_QZGQ01000031.1 Moraxella catarrhalis strain COPD_M133 NODE_31_length_474_cov_0.850144, whole genome shotgun sequence
[]


NZ_QZGQ01000030.1 Moraxella catarrhalis strain COPD_M133 NODE_30_length_483_cov_0.719101, whole genome shotgun sequence
[]


NZ_QZGQ01000029.1 Moraxella catarrhalis strain COPD_M133 NODE_29_length_489_cov_0.748619, whole genome shotgun sequence
['QZGQ01000029.1 Moraxella catarrhalis strain COPD_M133 NODE_29_length_489_cov_0.748619, whole genome shotgun sequence']


NZ_QZGQ01000028.1 Mor

[]


NZ_LXHB01000089.1 Moraxella catarrhalis strain Z7546 Z7546_ctg-8000002, whole genome shotgun sequence
[]


NZ_LXHB01000083.1 Moraxella catarrhalis strain Z7546 Z7546_ctg-11000000, whole genome shotgun sequence
[]


NZ_LXHB01000077.1 Moraxella catarrhalis strain Z7546 Z7546_ctg-8000009, whole genome shotgun sequence
[]


NZ_LXHB01000074.1 Moraxella catarrhalis strain Z7546 Z7546_ctg-9000004, whole genome shotgun sequence
[]


NZ_LXHB01000069.1 Moraxella catarrhalis strain Z7546 Z7546_ctg-14000012, whole genome shotgun sequence
[]


NZ_LXHB01000067.1 Moraxella catarrhalis strain Z7546 Z7546_ctg-8000012, whole genome shotgun sequence
[]


NZ_LXHB01000066.1 Moraxella catarrhalis strain Z7546 Z7546_ctg-8000007, whole genome shotgun sequence
[]


NZ_LXHB01000060.1 Moraxella catarrhalis strain Z7546 Z7546_ctg-6000006, whole genome shotgun sequence
[]


NZ_LXHB01000055.1 Moraxella catarrhalis strain Z7546 Z7546_ctg-7000004, whole genome shotgun sequence
[]


NZ_LXHB01000045.1 Moraxella ca



NZ_JAABLD010000067.1 Moraxella catarrhalis strain AS012765 AS012765_67, whole genome shotgun sequence
[]


NZ_JAABLD010000066.1 Moraxella catarrhalis strain AS012765 AS012765_66, whole genome shotgun sequence
[]


NZ_JAABLD010000061.1 Moraxella catarrhalis strain AS012765 AS012765_61, whole genome shotgun sequence
[]


NZ_JAABLD010000060.1 Moraxella catarrhalis strain AS012765 AS012765_60, whole genome shotgun sequence
[]


NZ_JAABLD010000057.1 Moraxella catarrhalis strain AS012765 AS012765_57, whole genome shotgun sequence
['JAABLD010000057.1 Moraxella catarrhalis strain AS012765 AS012765_57, whole genome shotgun sequence']


NZ_JAABLD010000055.1 Moraxella catarrhalis strain AS012765 AS012765_55, whole genome shotgun sequence
[]


NZ_JAABLD010000054.1 Moraxella catarrhalis strain AS012765 AS012765_54, whole genome shotgun sequence
[]


NZ_JAABLD010000053.1 Moraxella catarrhalis strain AS012765 AS012765_53, whole genome shotgun sequence
['JAABLD010000053.1 Moraxella catarrhalis strai

[]


JAABKZ010000272.1 Moraxella catarrhalis strain AS012769 AS012769_272, whole genome shotgun sequence
[]


JAABKZ010000273.1 Moraxella catarrhalis strain AS012769 AS012769_273, whole genome shotgun sequence
[]


JAABKZ010000274.1 Moraxella catarrhalis strain AS012769 AS012769_274, whole genome shotgun sequence
[]


JAABKZ010000275.1 Moraxella catarrhalis strain AS012769 AS012769_275, whole genome shotgun sequence
[]


JAABKZ010000276.1 Moraxella catarrhalis strain AS012769 AS012769_276, whole genome shotgun sequence
[]


JAABKZ010000277.1 Moraxella catarrhalis strain AS012769 AS012769_277, whole genome shotgun sequence
[]


JAABKZ010000278.1 Moraxella catarrhalis strain AS012769 AS012769_278, whole genome shotgun sequence
[]


JAABKZ010000279.1 Moraxella catarrhalis strain AS012769 AS012769_279, whole genome shotgun sequence
[]


JAABKZ010000280.1 Moraxella catarrhalis strain AS012769 AS012769_280, whole genome shotgun sequence
[]


JAABKZ010000281.1 Moraxella catarrhalis strain AS0

[]


VLEN01000261.1 Moraxella catarrhalis strain AS012771 AS012771_263, whole genome shotgun sequence
[]


VLEN01000262.1 Moraxella catarrhalis strain AS012771 AS012771_264, whole genome shotgun sequence
[]


VLEN01000263.1 Moraxella catarrhalis strain AS012771 AS012771_265, whole genome shotgun sequence
[]


VLEN01000264.1 Moraxella catarrhalis strain AS012771 AS012771_266, whole genome shotgun sequence
[]


VLEN01000265.1 Moraxella catarrhalis strain AS012771 AS012771_267, whole genome shotgun sequence
[]


VLEN01000266.1 Moraxella catarrhalis strain AS012771 AS012771_268, whole genome shotgun sequence
[]


VLEN01000267.1 Moraxella catarrhalis strain AS012771 AS012771_269, whole genome shotgun sequence
[]


VLEN01000268.1 Moraxella catarrhalis strain AS012771 AS012771_270, whole genome shotgun sequence
[]


VLEN01000269.1 Moraxella catarrhalis strain AS012771 AS012771_271, whole genome shotgun sequence
[]


VLEN01000270.1 Moraxella catarrhalis strain AS012771 AS012771_272, whole geno


VLEN01000787.1 Moraxella catarrhalis strain AS012771 AS012771_790, whole genome shotgun sequence
[]


VLEN01000788.1 Moraxella catarrhalis strain AS012771 AS012771_791, whole genome shotgun sequence
[]


VLEN01000789.1 Moraxella catarrhalis strain AS012771 AS012771_792, whole genome shotgun sequence
[]


VLEN01000790.1 Moraxella catarrhalis strain AS012771 AS012771_793, whole genome shotgun sequence
[]


VLEN01000791.1 Moraxella catarrhalis strain AS012771 AS012771_794, whole genome shotgun sequence
[]


VLEN01000792.1 Moraxella catarrhalis strain AS012771 AS012771_795, whole genome shotgun sequence
[]


VLEN01000793.1 Moraxella catarrhalis strain AS012771 AS012771_796, whole genome shotgun sequence
[]


VLEN01000794.1 Moraxella catarrhalis strain AS012771 AS012771_797, whole genome shotgun sequence
[]


VLEN01000795.1 Moraxella catarrhalis strain AS012771 AS012771_798, whole genome shotgun sequence
[]


VLEN01000796.1 Moraxella catarrhalis strain AS012771 AS012771_799, whole genome s

VLEN01001147.1 Moraxella catarrhalis strain AS012771 AS012771_1153, whole genome shotgun sequence
[]


JAABLD010000182.1 Moraxella catarrhalis strain AS012765 AS012765_182, whole genome shotgun sequence
[]


VLEN01001148.1 Moraxella catarrhalis strain AS012771 AS012771_1154, whole genome shotgun sequence
[]


JAABLD010000183.1 Moraxella catarrhalis strain AS012765 AS012765_183, whole genome shotgun sequence
[]


VLEN01001149.1 Moraxella catarrhalis strain AS012771 AS012771_1155, whole genome shotgun sequence
[]


JAABLD010000184.1 Moraxella catarrhalis strain AS012765 AS012765_184, whole genome shotgun sequence
[]


VLEN01001150.1 Moraxella catarrhalis strain AS012771 AS012771_1156, whole genome shotgun sequence
[]


VLEN01001151.1 Moraxella catarrhalis strain AS012771 AS012771_1157, whole genome shotgun sequence
[]


JAABLD010000186.1 Moraxella catarrhalis strain AS012765 AS012765_186, whole genome shotgun sequence
[]


VLEN01001152.1 Moraxella catarrhalis strain AS012771 AS012771_115


VLEN01001472.1 Moraxella catarrhalis strain AS012771 AS012771_1479, whole genome shotgun sequence
[]


VLEN01001473.1 Moraxella catarrhalis strain AS012771 AS012771_1480, whole genome shotgun sequence
[]


VLEN01001474.1 Moraxella catarrhalis strain AS012771 AS012771_1481, whole genome shotgun sequence
[]


VLEN01001475.1 Moraxella catarrhalis strain AS012771 AS012771_1482, whole genome shotgun sequence
[]


VLEN01001476.1 Moraxella catarrhalis strain AS012771 AS012771_1483, whole genome shotgun sequence
[]


VLEN01001477.1 Moraxella catarrhalis strain AS012771 AS012771_1484, whole genome shotgun sequence
[]


VLEN01001478.1 Moraxella catarrhalis strain AS012771 AS012771_1485, whole genome shotgun sequence
[]


VLEN01001479.1 Moraxella catarrhalis strain AS012771 AS012771_1486, whole genome shotgun sequence
[]


VLEN01001480.1 Moraxella catarrhalis strain AS012771 AS012771_1487, whole genome shotgun sequence
[]


VLEN01001481.1 Moraxella catarrhalis strain AS012771 AS012771_1488, whol



JAABKX010000086.1 Moraxella catarrhalis strain AS012772 AS012772_86, whole genome shotgun sequence
[]


JAABKX010000088.1 Moraxella catarrhalis strain AS012772 AS012772_88, whole genome shotgun sequence
[]


JAABKX010000091.1 Moraxella catarrhalis strain AS012772 AS012772_91, whole genome shotgun sequence
[]


JAABKX010000092.1 Moraxella catarrhalis strain AS012772 AS012772_93, whole genome shotgun sequence
[]


JAABKX010000093.1 Moraxella catarrhalis strain AS012772 AS012772_94, whole genome shotgun sequence
[]


JAABKX010000094.1 Moraxella catarrhalis strain AS012772 AS012772_95, whole genome shotgun sequence
[]


JAABKX010000096.1 Moraxella catarrhalis strain AS012772 AS012772_97, whole genome shotgun sequence
[]


JAABKX010000097.1 Moraxella catarrhalis strain AS012772 AS012772_98, whole genome shotgun sequence
[]


JAABKX010000098.1 Moraxella catarrhalis strain AS012772 AS012772_99, whole genome shotgun sequence
[]


JAABKX010000099.1 Moraxella catarrhalis strain AS012772 AS0127

[]


QZIB01000142.1 Moraxella catarrhalis strain COPD_M4 NODE_144_length_901_cov_2.1615, whole genome shotgun sequence
[]


QZIB01000149.1 Moraxella catarrhalis strain COPD_M4 NODE_153_length_725_cov_1.48328, whole genome shotgun sequence
[]


QZIB01000150.1 Moraxella catarrhalis strain COPD_M4 NODE_154_length_674_cov_1.82084, whole genome shotgun sequence
[]


QZIB01000152.1 Moraxella catarrhalis strain COPD_M4 NODE_156_length_556_cov_0.787879, whole genome shotgun sequence
[]


QZIB01000153.1 Moraxella catarrhalis strain COPD_M4 NODE_157_length_541_cov_0.768116, whole genome shotgun sequence
[]


QZIB01000154.1 Moraxella catarrhalis strain COPD_M4 NODE_158_length_531_cov_1.81683, whole genome shotgun sequence
[]


QZIB01000155.1 Moraxella catarrhalis strain COPD_M4 NODE_159_length_498_cov_0.938005, whole genome shotgun sequence
[]


QZIB01000157.1 Moraxella catarrhalis strain COPD_M4 NODE_161_length_459_cov_1.65361, whole genome shotgun sequence
[]


QZIA01000086.1 Moraxella catarrha

[]


LXHF01000010.1 Moraxella catarrhalis strain S11 S11ctg116, whole genome shotgun sequence
[]


LXHG01000106.1 Moraxella catarrhalis strain MX1 MX1ctg186, whole genome shotgun sequence
[]


LXHF01000019.1 Moraxella catarrhalis strain S11 S11ctg93, whole genome shotgun sequence
[]


LXHB01000001.1 Moraxella catarrhalis strain Z7546 Z7546_ctg-6000007, whole genome shotgun sequence
[]


LXHB01000005.1 Moraxella catarrhalis strain Z7546 Z7546_ctg-6000003, whole genome shotgun sequence
[]


LXHB01000009.1 Moraxella catarrhalis strain Z7546 Z7546_ctg-8000004, whole genome shotgun sequence
[]


LXHB01000011.1 Moraxella catarrhalis strain Z7546 Z7546_ctg-8000006, whole genome shotgun sequence
[]


LXHB01000012.1 Moraxella catarrhalis strain Z7546 Z7546_ctg-7000009, whole genome shotgun sequence
[]


LXHG01000139.1 Moraxella catarrhalis strain MX1 MX1ctg187, whole genome shotgun sequence
[]


LXHB01000014.1 Moraxella catarrhalis strain Z7546 Z7546_ctg-9000007, whole genome shotgun sequence
[

[]


DJ047819.1 SPECIFIC AND UNIVERSAL PROBES AND AMPLIFICATION PRIMERS TO RAPIDLY DETECT AND IDENTIFY COMMON BACTERIAL PATHOGENS AND ANTIBIOTIC RESISTANCE GENES FROM CLINICAL SPECIMENS FOR ROUTINE DIAGNOSIS IN MICROBIOLOGY LABORATORIES
[]


DJ047818.1 SPECIFIC AND UNIVERSAL PROBES AND AMPLIFICATION PRIMERS TO RAPIDLY DETECT AND IDENTIFY COMMON BACTERIAL PATHOGENS AND ANTIBIOTIC RESISTANCE GENES FROM CLINICAL SPECIMENS FOR ROUTINE DIAGNOSIS IN MICROBIOLOGY LABORATORIES
[]


DJ047739.1 SPECIFIC AND UNIVERSAL PROBES AND AMPLIFICATION PRIMERS TO RAPIDLY DETECT AND IDENTIFY COMMON BACTERIAL PATHOGENS AND ANTIBIOTIC RESISTANCE GENES FROM CLINICAL SPECIMENS FOR ROUTINE DIAGNOSIS IN MICROBIOLOGY LABORATORIES
[]


DJ047738.1 SPECIFIC AND UNIVERSAL PROBES AND AMPLIFICATION PRIMERS TO RAPIDLY DETECT AND IDENTIFY COMMON BACTERIAL PATHOGENS AND ANTIBIOTIC RESISTANCE GENES FROM CLINICAL SPECIMENS FOR ROUTINE DIAGNOSIS IN MICROBIOLOGY LABORATORIES
[]


EF032481.1 Moraxella catarrhalis strain ATCC 25

In [47]:
# Compression ratio = number of accessions before clustering / number of clusters after

# children-per-cluster counts in ascending order, kept for the sanity checks below
len_of_key_pair.sort()
#print(sum(len_of_key_pair))
#print(len(result_df.index) - sum(len_of_key_pair))
#len_of_key_pair[-10:]

# every key of result_dict is one retained (parent) accession
n_initial = len(result_df.index)
n_clustered = len(result_dict.keys())

print("initial count of accession: " + str(n_initial))
print("final count after clustering: " + str(n_clustered))
print("compression ratio: " + str(n_initial / n_clustered))

11733
7984
initial count of accession: 19717
final count after clustering: 7984
compression ratio: 2.469564128256513


### Take the clustered data and generate a new .fasta file of accessions

In [54]:
# Write the clustered dump: keep only records whose full header line is a cluster parent
# (a key of result_dict); records that were assigned as children are left out.
new_dump_fasta = this_dump + '.new.fasta'
with open(new_dump_fasta, 'w') as f:
    # Stream the records instead of materialising the whole dump in a list first;
    # each record is needed only once.
    for r in SeqIO.parse(this_dump + ".fasta", "fasta"):
        seqid = r.description
        if seqid in result_dict:
            # header line followed by the sequence on a single line
            f.write('>' + seqid + '\n' + str(r.seq) + '\n')
# the `with` block has already closed the file; no explicit close() is needed

ID: NZ_WSZE01000054.1
Name: NZ_WSZE01000054.1
Description: NZ_WSZE01000054.1 Moraxella catarrhalis strain 65/74 contig00054, whole genome shotgun sequence
Number of features: 0
Seq('TTCATTGTCCCTTAACTTACAGTGGGCTAATAAACCGCTACATCTGTCCAAAGT...CCA')


### vv remove the below section when it completes in experiment-5 vv ----

### Evaluate the time difference required to run minimap2 against the initial vs. the new .fasta file

In [75]:
# Reference genome that the simulated reads are derived from
this_genome = "refs_to_simulate_reads_from/moraxellacatarhalis_NZ_CP018059.fasta"

# Parse all records of the reference and report the length of the first one
records = [record for record in SeqIO.parse(this_genome, "fasta")]
first_record = records[0]
print(len(first_record.seq))

def mutate_sequence(sequence, pid):
    '''
    Given a sequence and a % ID threshold, mutate the sequence at random positions to produce
    a new sequence with the specified % ID. Return the new sequence string.

    sequence: any sized, iterable sequence of single-character bases (a str works).
    pid:      target fraction of identical positions, between 0 and 1 (e.g. .998).

    The number of substituted positions is len(sequence) * (1 - pid), rounded to the nearest
    integer. Each chosen position is replaced by one of A/C/G/T that differs from the base
    currently at that position, so exactly that many positions differ from the input.

    Raises ValueError when pid is outside [0, 1].
    '''
    if not 0 <= pid <= 1:
        raise ValueError("pid must be between 0 and 1, got " + str(pid))

    swap_values = ['A','C','G','T']

    len_of_seq = len(sequence)
    # Round the product, not the length: 10 * (1 - 0.9) evaluates to 0.9999999999999998,
    # which plain int() truncation would turn into 0 substitutions instead of 1.
    num_bp_to_change = int(round(len_of_seq * (1 - pid)))
    # distinct positions, sampled without replacement
    bp_locs_to_change = random.sample(range(len_of_seq), num_bp_to_change)

    # loop through indices of bases to change and swap out values
    new_seq = list(sequence)
    for bp in bp_locs_to_change:
        new_seq[bp] = random.choice([base for base in swap_values if base != new_seq[bp]])

    return "".join(new_seq)

# Mutate the first reference record (target identity .998) and save it as a one-record
# FASTA file that serves as the template for read simulation.
seq_to_simulate = mutate_sequence(records[0].seq, .998)
with open("simulation_reference.fasta", 'w') as f:
    f.writelines([
        '>' + records[0].id + ' -- mutated\n',
        str(seq_to_simulate) + '\n',
    ])

1954607


In [76]:
# Simulate reads with `iss generate` (MiSeq model, --n_reads 100000) from the mutated reference
# written to simulation_reference.fasta. Outputs use the prefix `miseq_reads`; the mapping cell
# below reads miseq_reads_R1.fastq and miseq_reads_R2.fastq.
# Alternative kept for reference: simulate directly from the unmutated genome.
#! iss generate --genomes {this_genome} --model miseq --output miseq_reads --n_reads 100000
! iss generate --genomes simulation_reference.fasta --model miseq --output miseq_reads --n_reads 100000


INFO:iss.app:Starting iss generate
INFO:iss.app:Using kde ErrorModel
INFO:iss.util:Stitching input files together
INFO:iss.app:Using lognormal abundance distribution
INFO:iss.app:Using 2 cpus for read generation
INFO:iss.app:Generating 100000 reads
INFO:iss.app:Generating reads for record: NZ_CP018059.1
INFO:iss.util:Stitching input files together
INFO:iss.util:Stitching input files together
INFO:iss.util:Cleaning up
INFO:iss.app:Read generation complete


In [77]:
# map to original dump

this_dump_fasta = this_dump + '.fasta'

#! minimap2 -ax sr {this_dump_fasta} miseq_reads_R1.fastq miseq_reads_R2.fastq > aln.sam
# separate indexing from alignment time
print("CREATING INDEX")
! minimap2 -x sr -d ref.mmi {this_dump_fasta} 
print("RUNNING ALIGNMENT")
start = time.time()  # start after indexing, but before alignment
! minimap2 -ax sr ref.mmi miseq_reads_R1.fastq miseq_reads_R2.fastq > aln.sam
end = time.time()   # end directly after alignment to capture ONLY alignment time
print("ALIGNMENT COMPLETE, collecting stats")
! samtools stats aln.sam > alnstats.txt
unmapped_reads_string = ! grep "reads unmapped:" alnstats.txt
unmapped_reads_count = int(unmapped_reads_string[0].split(':')[-1].strip())    
orig_elapsed = end - start
print('\n\n')
print("unmapped_reads_count: " + str(unmapped_reads_count))
print("time elapsed: " + str(orig_elapsed))
print('\n\n')


# map to clustered dump

start = time.time()

#! minimap2 -ax sr {new_dump_fasta} miseq_reads_R1.fastq miseq_reads_R2.fastq > aln.sam
# separate indexing from alignment time
print("CREATING INDEX")
! minimap2 -x sr -d ref.mmi {new_dump_fasta} 
print("RUNNING ALIGNMENT")
start = time.time()  # start after indexing, but before alignment
! minimap2 -ax sr ref.mmi miseq_reads_R1.fastq miseq_reads_R2.fastq > aln.sam
end = time.time()   # end directly after alignment to capture ONLY alignment time
print("ALIGNMENT COMPLETE, collecting stats")
! samtools stats aln.sam > alnstats.txt
unmapped_reads_string = ! grep "reads unmapped:" alnstats.txt
unmapped_reads_count = int(unmapped_reads_string[0].split(':')[-1].strip())
elapsed = end - start

print('\n\n')
print("unmapped_reads_count: " + str(unmapped_reads_count))
print("time elapsed: " + str(elapsed))
print('\n\n')

CREATING INDEX
[M::mm_idx_gen::12.307*1.52] collected minimizers
[M::mm_idx_gen::15.251*1.78] sorted minimizers
[M::main::17.307*1.68] loaded/built the index for 19717 target sequence(s)
[M::mm_idx_stat] kmer size: 21; skip: 11; is_hpc: 0; #seq: 19717
[M::mm_idx_stat::17.331*1.67] distinct minimizers: 1884822 (17.20% are singletons); average occurrences: 57.418; average spacing: 5.996; total length: 648952447
[M::main] Version: 2.22-r1101
[M::main] CMD: minimap2 -x sr -d ref.mmi moraxellacatarhalis_29497_dump.fasta
[M::main] Real time: 17.841 sec; CPU: 29.515 sec; Peak RSS: 2.959 GB
RUNNING ALIGNMENT
[M::main::0.836*1.02] loaded/built the index for 19717 target sequence(s)
[M::mm_mapopt_update::0.836*1.02] mid_occ = 1000
[M::mm_idx_stat] kmer size: 21; skip: 11; is_hpc: 0; #seq: 19717
[M::mm_idx_stat::0.866*1.02] distinct minimizers: 1884822 (17.20% are singletons); average occurrences: 57.418; average spacing: 5.996; total length: 648952447
[M::worker_pipeline::75.546*2.94] mapped 100