In [5]:
from Bio import SeqIO
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import time
import seaborn as sns

%matplotlib inline

In [3]:
def greedy_partition(df, threshold):
    """Greedily group the labels of a pairwise score matrix into clusters.

    Rows are visited in ``df.index`` order. The first row that has not yet
    been assigned to any cluster becomes a cluster representative; every
    column whose value in that row is strictly greater than ``threshold``
    (and that is not already assigned, and is not the row itself) becomes
    one of its children. Representatives and children are then excluded
    from all later rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Pairwise score matrix. Row labels are looked up with ``df.loc`` and
        children are reported by column label, so the two label sets are
        expected to match (a square matrix).
    threshold : float
        A column is a child only when its score is ``> threshold``.

    Returns
    -------
    dict
        Maps each representative's row label to the list of its children's
        column labels, in column order. Rows with no children map to ``[]``.
    """
    # A set gives O(1) membership tests; the previous list-based bookkeeping
    # rescanned and re-dropped every assigned label for every row.
    assigned = set()
    tally_dict = {}

    for row_id in df.index:
        # Already a representative or a child of an earlier one: skip its row.
        if row_id in assigned:
            continue

        row = df.loc[row_id]

        # Candidate children are the columns scoring strictly above the threshold...
        above_threshold = row[row > threshold].index
        # ...excluding the node itself and anything claimed by an earlier cluster.
        children = [
            label
            for label in above_threshold
            if label != row_id and label not in assigned
        ]

        tally_dict[row_id] = children
        assigned.update(children)
        assigned.add(row_id)

    return tally_dict

In [8]:
# Iteratively run sourmash at varying *scaled* parameters...
# and plot the sourmash distances as a function of the known percent identity

this_dump = "chkv_txid37124_dump"

#%%capture
K = 31
ST = 1000
    
start = time.time()

! mkdir {this_dump}
! rm cmp*;
! rm *sig;
! sourmash sketch dna -p k={K},scaled={ST} --singleton {this_dump}.fasta;
! sourmash compare *.sig --containment -o {this_dump}/cmp.dist;
! sourmash compare *.sig --containment -o {this_dump}/cmp.dist --csv {this_dump}/cmp.csv;

end = time.time()
elapsed = end - start


mkdir: chkv_txid37124_dump: File exists
zsh:1: no matches found: cmp*
zsh:1: no matches found: *sig
[K
== This is sourmash version 4.6.1. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kcomputing signatures for files: chkv_txid37124_dump.fasta
[KComputing a total of 1 signature(s) for each input.
[Kcalculated 6947 signatures for 6947 sequences in chkv_txid37124_dump.fasta
[Ksaved 6947 signature(s) to 'chkv_txid37124_dump.fasta.sig'. Note: signature license is CC0.
[K
== This is sourmash version 4.6.1. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloaded 6947 signatures total.                                                  
[K
min similarity in matrix: 0.000
[Ksaving labels to: chkv_txid37124_dump/cmp.dist.labels.txt
[Ksaving comparison matrix to: chkv_txid37124_dump/cmp.dist
[K
== This is sourmash version 4.6.1. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloaded 6947 signatures total

In [9]:
# Load the all-vs-all containment matrix written by `sourmash compare --csv`.
result_df = pd.read_csv(f"{this_dump}/cmp.csv")
# read_csv takes the sequence labels from the header row and gives the rows a
# default integer index; reuse the column labels as row labels so the matrix
# can be addressed by sequence label on both axes.
result_df.index = result_df.columns

# A sequence becomes a child of a cluster representative when its score in the
# representative's row is strictly greater than this value.
CONTAINMENT_THRESHOLD = 0.6
result_dict = greedy_partition(result_df, CONTAINMENT_THRESHOLD)

In [15]:
# Dump every cluster: the representative's label, its list of children,
# then a blank separator.
for key, children in result_dict.items():
    print(key)
    print(children)
    print('\n')

gi|2440741213|gb|ON453887.1| Chikungunya virus isolate SGEHIKJ71albY08 envelope 1 gene, partial cds
[]


gi|2440740930|gb|ON406435.1| Chikungunya virus isolate TM198 structural polyprotein gene, partial cds
[]


gi|2440740928|gb|ON406434.1| Chikungunya virus isolate TM033 structural polyprotein gene, partial cds
[]


gi|2440740926|gb|ON406433.1| Chikungunya virus isolate TM026 structural polyprotein gene, partial cds
[]


gi|2440740924|gb|ON406432.1| Chikungunya virus isolate TM025 structural polyprotein gene, partial cds
[]


gi|2440740922|gb|ON406431.1| Chikungunya virus isolate TM015 structural polyprotein gene, partial cds
[]


gi|2440740920|gb|ON406430.1| Chikungunya virus isolate TM008 structural polyprotein gene, partial cds
[]


gi|2440740918|gb|ON406429.1| Chikungunya virus isolate TM007 structural polyprotein gene, partial cds
[]


gi|2440740916|gb|ON406428.1| Chikungunya virus isolate TM004 structural polyprotein gene, partial cds
[]


gi|2440740914|gb|ON406427.1| Chikunguny

gi|1846226772|gb|MT348984.1| Chikungunya virus isolate BK1133 structural protein E1 gene, partial cds
[]


gi|1846226770|gb|MT348983.1| Chikungunya virus isolate BK1131 structural protein E1 gene, partial cds
[]


gi|1846226768|gb|MT348982.1| Chikungunya virus isolate BK1128 structural protein E1 gene, partial cds
[]


gi|1846226766|gb|MT348981.1| Chikungunya virus isolate BK1120 structural protein E1 gene, partial cds
[]


gi|1846226764|gb|MT348980.1| Chikungunya virus isolate BK1118 structural protein E1 gene, partial cds
[]


gi|1846226762|gb|MT348979.1| Chikungunya virus isolate BK1116 structural protein E1 gene, partial cds
[]


gi|1846226760|gb|MT348978.1| Chikungunya virus isolate BK1113 structural protein E1 gene, partial cds
[]


gi|1846226758|gb|MT348977.1| Chikungunya virus isolate BK1108 structural protein E1 gene, partial cds
[]


gi|1846226756|gb|MT348976.1| Chikungunya virus isolate BK1106 structural protein E1 gene, partial cds
[]


gi|1846226754|gb|MT348975.1| Chikungu

gi|1818765670|gb|MN191602.1| Chikungunya virus isolate 15-1854 E1 protein gene, partial cds
[]


gi|1818765668|gb|MN191601.1| Chikungunya virus isolate 15-1880 E1 protein gene, partial cds
[]


gi|1818765666|gb|MN191600.1| Chikungunya virus isolate 15-1884 E1 protein gene, partial cds
[]


gi|1818765664|gb|MN191599.1| Chikungunya virus isolate 15-1907 E1 protein gene, partial cds
[]


gi|1811086422|gb|MK355076.1| Chikungunya virus isolate PREDICT_GVF-CM-ECO50751 nonstructural protein 4 gene, partial cds
[]


gi|1806552996|gb|MN080498.1| Chikungunya virus strain Chik/ex-Thailand2018 envelope protein gene, partial cds
[]


gi|1802801129|gb|MN022239.1| Chikungunya virus isolate 19CG103 E1 protein gene, partial cds
[]


gi|1802801127|gb|MN022238.1| Chikungunya virus isolate 19CG94 E1 protein gene, partial cds
[]


gi|1802801125|gb|MN022237.1| Chikungunya virus isolate 19CG75 E1 protein gene, partial cds
[]


gi|1802801123|gb|MN022236.1| Chikungunya virus isolate 19CG68 E1 protein gene, par

[]


gi|1066401407|dbj|LC147063.1| Chikungunya virus E1 gene for E1 envelope protein, partial cds, isolate: 2013-CPSC27
[]


gi|1066401405|dbj|LC147062.1| Chikungunya virus E1 gene for E1 envelope protein, partial cds, isolate: 2013-CPSC5
[]


gi|1059844099|gb|KU727181.1| Chikungunya virus isolate patient14 nonstructural protein 2 gene, partial cds
[]


gi|1059844097|gb|KU727180.1| Chikungunya virus isolate patient13 nonstructural protein 2 gene, partial cds
[]


gi|1059844095|gb|KU727179.1| Chikungunya virus isolate patient12 nonstructural protein 2 gene, partial cds
[]


gi|1059844093|gb|KU727178.1| Chikungunya virus isolate patient11 nonstructural protein 2 gene, partial cds
[]


gi|1059844091|gb|KU727177.1| Chikungunya virus isolate patient14 envelope protein 2 gene, partial cds
[]


gi|1059844089|gb|KU727176.1| Chikungunya virus isolate patient13 envelope protein 2 gene, partial cds
[]


gi|1059844087|gb|KU727175.1| Chikungunya virus isolate patient12 envelope protein 2 gene, part

[]


gi|260907771|gb|GQ889486.1| Chikungunya virus strain M125 E1 protein gene, partial cds
[]


gi|256374006|gb|FJ617290.1| Chikungunya virus isolate DK-2 envelope glycoprotein 1 gene, partial cds
[]


gi|256374004|gb|FJ617289.1| Chikungunya virus isolate DK-1 envelope glycoprotein 1 gene, partial cds
[]


gi|256374002|gb|FJ617288.1| Chikungunya virus isolate KSGD-8 envelope glycoprotein 1 gene, partial cds
[]


gi|256374000|gb|FJ617287.1| Chikungunya virus isolate KSGD-7 envelope glycoprotein 1 gene, partial cds
[]


gi|256373998|gb|FJ617286.1| Chikungunya virus isolate KSGD-6 envelope glycoprotein 1 gene, partial cds
[]


gi|256373996|gb|FJ617285.1| Chikungunya virus isolate KSGD-4 envelope glycoprotein 1 gene, partial cds
[]


gi|256373994|gb|FJ617284.1| Chikungunya virus isolate KSGD-3 envelope glycoprotein 1 gene, partial cds
[]


gi|256373992|gb|FJ617283.1| Chikungunya virus isolate KSGD-2 envelope glycoprotein 1 gene, partial cds
[]


gi|256373990|gb|FJ617282.1| Chikungunya vir


gi|750273707|emb|JC991330.1| Sequence 93 from Patent EP2808405
[]


gi|750273704|emb|JC991329.1| Sequence 92 from Patent EP2808405
[]


gi|750273702|emb|JC991328.1| Sequence 91 from Patent EP2808405
[]


gi|750273697|emb|JC991326.1| Sequence 89 from Patent EP2808405
[]


gi|750273695|emb|JC991325.1| Sequence 88 from Patent EP2808405
[]


gi|750273693|emb|JC991324.1| Sequence 87 from Patent EP2808405
[]


gi|750273690|emb|JC991323.1| Sequence 86 from Patent EP2808405
[]


gi|750273688|emb|JC991322.1| Sequence 85 from Patent EP2808405
[]


gi|750273686|emb|JC991321.1| Sequence 84 from Patent EP2808405
[]


gi|750273684|emb|JC991320.1| Sequence 83 from Patent EP2808405
[]


gi|750273682|emb|JC991319.1| Sequence 82 from Patent EP2808405
[]


gi|750273681|emb|JC991318.1| Sequence 81 from Patent EP2808405
[]


gi|750273679|emb|JC991317.1| Sequence 80 from Patent EP2808405
[]


gi|750273677|emb|JC991316.1| Sequence 79 from Patent EP2808405
[]


gi|750273675|emb|JC991315.1| Sequence 78 from P



gi|422036362|gb|JX839794.1| Chikungunya virus isolate 26chik10 E1 protein gene, partial cds
[]


gi|422036360|gb|JX839793.1| Chikungunya virus isolate 24chik10 E1 protein gene, partial cds
[]


gi|422036358|gb|JX839792.1| Chikungunya virus isolate 22chik10 E1 protein gene, partial cds
[]


gi|422036356|gb|JX839791.1| Chikungunya virus isolate 19chik10 E1 protein gene, partial cds
[]


gi|422036354|gb|JX839790.1| Chikungunya virus isolate 16chik10 E1 protein gene, partial cds
[]


gi|422036352|gb|JX839789.1| Chikungunya virus isolate 13chik10 E1 protein gene, partial cds
[]


gi|422036350|gb|JX839788.1| Chikungunya virus isolate 9chik10 E1 protein gene, partial cds
[]


gi|422036348|gb|JX839787.1| Chikungunya virus isolate 7chik10 E1 protein gene, partial cds
[]


gi|422036346|gb|JX839786.1| Chikungunya virus isolate 4chik10 E1 protein gene, partial cds
[]


gi|422036344|gb|JX839785.1| Chikungunya virus isolate 1chik10 E1 protein gene, partial cds
[]


gi|422036342|gb|JX839784.1| Chik

gi|218101259|emb|GM839466.1| Sequence 95 from Patent EP1986009
[]


gi|218101230|emb|GM839465.1| Sequence 94 from Patent EP1986009
[]


gi|218101229|emb|GM839464.1| Sequence 93 from Patent EP1986009
[]


gi|218101227|emb|GM839463.1| Sequence 92 from Patent EP1986009
[]


gi|218101226|emb|GM839462.1| Sequence 91 from Patent EP1986009
[]


gi|218101225|emb|GM839461.1| Sequence 90 from Patent EP1986009
[]


gi|218101224|emb|GM839460.1| Sequence 89 from Patent EP1986009
[]


gi|218101223|emb|GM839459.1| Sequence 88 from Patent EP1986009
[]


gi|218101222|emb|GM839458.1| Sequence 87 from Patent EP1986009
[]


gi|218101218|emb|GM839453.1| Sequence 82 from Patent EP1986009
[]


gi|218101217|emb|GM839452.1| Sequence 81 from Patent EP1986009
[]


gi|218101216|emb|GM839385.1| Sequence 14 from Patent EP1986009
[]


gi|218101215|emb|GM839384.1| Sequence 13 from Patent EP1986009
[]


gi|218101214|emb|GM839383.1| Sequence 12 from Patent EP1986009
[]


gi|218101213|emb|GM839382.1| Sequence 11 from Pa

In [14]:
# Number of sequences in the matrix, number of clusters found, and the
# resulting mean number of sequences per cluster.
n_sequences = len(result_df.index)
n_clusters = len(result_dict)

print(n_sequences)
print(n_clusters)

print(n_sequences / n_clusters)

6947
4132
1.6812681510164569
