In [22]:
import glob
import os
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from Bio import SeqIO

%matplotlib inline

In [23]:
def greedy_partition(df, threshold, debug=False):
    """Greedily group the labels of a pairwise score matrix into clusters.

    Rows are visited in ``df.index`` order. The first row that has not yet
    been claimed becomes a cluster representative; every still-unclaimed
    column whose value in that row is strictly greater than ``threshold``
    becomes one of its children. Representatives and their children are
    skipped for the rest of the scan.

    Parameters
    ----------
    df : pandas.DataFrame
        Score matrix. The row labels are expected to be the same labels as
        the columns, so that a column claimed as a child can later be
        recognised and skipped when it comes up as a row.
    threshold : float
        A column joins the current row's cluster only when its value is
        strictly greater than this number.
    debug : bool, optional
        When True, print progress diagnostics for every row. Defaults to
        False, which matches the previous hard-coded behaviour.

    Returns
    -------
    dict
        Maps each representative row label to the list of its child labels
        (in column order, possibly empty). The representative itself is
        never listed among its own children.
    """
    # A set gives constant-time membership checks; the result only depends on
    # *whether* a label has been claimed, never on the order it was claimed in.
    claimed = set()
    tally_dict = {}

    for row_id in df.index:

        if debug:
            print(row_id)
            print("labels already claimed: " + str(len(claimed)))

        # If this label was already assigned as a child, its row is not considered.
        if row_id in claimed:
            continue

        # Full score row for this label.
        this_row = df.loc[row_id]

        # Candidate children are the columns scoring above the threshold ...
        above_threshold = this_row[this_row > threshold].index

        # ... excluding the label itself and anything already claimed elsewhere.
        children_filtered = [
            label
            for label in above_threshold
            if label != row_id and label not in claimed
        ]

        if debug:
            print("len of children_filtered: " + str(len(children_filtered)) + '\n\n')

        # Record the cluster, then mark the representative and its children
        # so that later iterations ignore them.
        tally_dict[row_id] = children_filtered
        claimed.update(children_filtered)
        claimed.add(row_id)

    return tally_dict

In [24]:
# Taxon-specific data dumps are downloaded from GenBank using the corresponding query, written to .fasta,
# and put in a location available to this script.
# `this_dump` is the base name of the selected dump: the cells below read "<this_dump>.fasta"
# and write their results into the "<this_dump>/" directory.

# Exactly one of the assignments below should be active; the others stay commented out.
#this_dump = "moraxellacatarhalis_29497_dump"
this_dump = "candidaauris_498019_dump"
#this_dump = "rhinovirusc_463676_dump"
#this_dump = "chkv_txid37124_dump"

In [25]:
# Run Sourmash on the input fasta containing all accessions for that species

K = 31
ST = 1000
    
start = time.time()

! mkdir {this_dump}
! rm cmp*;
! rm *sig;
! sourmash sketch dna -p k={K},scaled={ST} --singleton {this_dump}.fasta;
! sourmash compare *.sig --containment -o {this_dump}/cmp.dist --csv {this_dump}/cmp.csv;
! mv {this_dump}.fasta.sig {this_dump}

end = time.time()
elapsed = end - start


mkdir: candidaauris_498019_dump: File exists
zsh:1: no matches found: cmp*
zsh:1: no matches found: *sig
[K
== This is sourmash version 4.6.1. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kcomputing signatures for files: candidaauris_498019_dump.fasta
[KComputing a total of 1 signature(s) for each input.
[Kcalculated 8585 signatures for 8585 sequences in candidaauris_498019_dump.fasta
[Ksaved 8585 signature(s) to 'candidaauris_498019_dump.fasta.sig'. Note: signature license is CC0.
[K
== This is sourmash version 4.6.1. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloaded 8585 signatures total.                                                  
[K
min similarity in matrix: 0.000
[Ksaving labels to: candidaauris_498019_dump/cmp.dist.labels.txt
[Ksaving comparison matrix to: candidaauris_498019_dump/cmp.dist


In [26]:
# Read in the sourmash results and apply greedy clustering to generate the clustered result
result_df = pd.read_csv(this_dump + "/cmp.csv")
result_df.index = result_df.columns
result_df.shape

result_dict = greedy_partition(result_df, .6) # do greedy clustering

In [27]:
# Write out the clustered result for inspection (via BLAST, etc)
len_of_key_pair = []
for key in result_dict:
    print(key)
    print(result_dict[key])
    print('\n')
    len_of_key_pair.append(len(result_dict[key]))

OQ513466.1 [Candida] auris isolate CAu10_KSU small subunit ribosomal RNA gene, partial sequence; internal transcribed spacer 1, 5.8S ribosomal RNA gene, and internal transcribed spacer 2, complete sequence; and large subunit ribosomal RNA gene, partial sequence
[]


OQ513465.1 [Candida] auris isolate CAu9_KSU small subunit ribosomal RNA gene, partial sequence; internal transcribed spacer 1, 5.8S ribosomal RNA gene, and internal transcribed spacer 2, complete sequence; and large subunit ribosomal RNA gene, partial sequence
[]


OQ513464.1 [Candida] auris isolate CAu8_KSU small subunit ribosomal RNA gene, partial sequence; internal transcribed spacer 1, 5.8S ribosomal RNA gene, and internal transcribed spacer 2, complete sequence; and large subunit ribosomal RNA gene, partial sequence
[]


OQ513463.1 [Candida] auris isolate CAu7_KSU small subunit ribosomal RNA gene, partial sequence; internal transcribed spacer 1, 5.8S ribosomal RNA gene, and internal transcribed spacer 2, complete seque

[]


XM_029036903.1 [Candida] auris tubulin alpha chain (CJI97_004953), partial mRNA
[]


XM_029036901.1 [Candida] auris D-arabinono-1,4-lactone oxidase (CJI97_004951), partial mRNA
[]


XM_029036897.1 [Candida] auris uncharacterized protein (CJI97_004947), partial mRNA
[]


XM_029036896.1 [Candida] auris uncharacterized protein (CJI97_004946), partial mRNA
[]


XM_029036894.1 [Candida] auris ubiquitin-conjugating enzyme E2 7 (CJI97_004944), partial mRNA
[]


XM_029036892.1 [Candida] auris uncharacterized protein (CJI97_004942), partial mRNA
[]


XM_029036891.1 [Candida] auris uncharacterized protein (CJI97_004941), partial mRNA
[]


XM_029036890.1 [Candida] auris uncharacterized protein (CJI97_004940), partial mRNA
[]


XM_029036887.1 [Candida] auris uncharacterized protein (CJI97_004937), partial mRNA
[]


XM_029036885.1 [Candida] auris uncharacterized protein (CJI97_004935), partial mRNA
[]


XM_029036883.1 [Candida] auris uncharacterized protein (CJI97_004933), partial mRNA
[]


XM

[]


XM_029035436.1 [Candida] auris uncharacterized protein (CJI97_003448), partial mRNA
[]


XM_029035433.1 [Candida] auris uncharacterized protein (CJI97_003445), partial mRNA
[]


XM_029035432.1 [Candida] auris histone H4 (CJI97_003444), partial mRNA
[]


XM_029035431.1 [Candida] auris histone (CJI97_003443), partial mRNA
[]


XM_029035430.1 [Candida] auris uncharacterized protein (CJI97_003442), partial mRNA
[]


XM_029035426.1 [Candida] auris uncharacterized protein (CJI97_003436), partial mRNA
[]


XM_029035422.1 [Candida] auris proteasome regulatory particle lid subunit RPN13 (CJI97_003432), partial mRNA
[]


XM_029035415.1 [Candida] auris uncharacterized protein (CJI97_003425), partial mRNA
[]


XM_029035414.1 [Candida] auris tryptophan synthase (CJI97_003424), partial mRNA
[]


XM_029035411.1 [Candida] auris uncharacterized protein (CJI97_003421), partial mRNA
[]


XM_029035410.1 [Candida] auris uncharacterized protein (CJI97_003420), partial mRNA
[]


XM_029035409.1 [Candida]

XM_029033956.1 [Candida] auris uncharacterized protein (CJI97_001922), partial mRNA
[]


XM_029033954.1 [Candida] auris saccharopine dehydrogenase (NADP+, L-glutamate-forming) (CJI97_001920), partial mRNA
[]


XM_029033951.1 [Candida] auris uncharacterized protein (CJI97_001917), partial mRNA
[]


XM_029033949.1 [Candida] auris mediator complex subunit MED7 (CJI97_001915), partial mRNA
[]


XM_029033948.1 [Candida] auris SCF ubiquitin ligase complex subunit HRT1 (CJI97_001914), partial mRNA
[]


XM_029033945.1 [Candida] auris cytochrome-c peroxidase (CJI97_001911), partial mRNA
[]


XM_029033943.1 [Candida] auris uncharacterized protein (CJI97_001909), partial mRNA
[]


XM_029033939.1 [Candida] auris protein disulfide-isomerase domain (CJI97_001905), partial mRNA
[]


XM_029033938.1 [Candida] auris uncharacterized protein (CJI97_001904), partial mRNA
[]


XM_029033936.1 [Candida] auris uncharacterized protein (CJI97_001902), partial mRNA
[]


XM_029033930.1 [Candida] auris magnesium-de

[]


XM_029032182.1 [Candida] auris tubulin-binding prefolding complex subunit PAC10 (CJI97_000093), partial mRNA
[]


XM_029032181.1 [Candida] auris MCM DNA helicase complex subunit MCM5 (CJI97_000092), partial mRNA
[]


XM_029032178.1 [Candida] auris uncharacterized protein (CJI97_000089), partial mRNA
[]


XM_029032177.1 [Candida] auris uncharacterized protein (CJI97_000088), partial mRNA
[]


XM_029032170.1 [Candida] auris uncharacterized protein (CJI97_000081), partial mRNA
[]


XM_029032169.1 [Candida] auris inosine-5'-monophosphate dehydrogenase (CJI97_000080), partial mRNA
[]


XM_029032168.1 [Candida] auris protein phosphatase 2A regulatory subunit RTS1 (CJI97_000079), partial mRNA
[]


XM_029032167.1 [Candida] auris uncharacterized protein (CJI97_000078), partial mRNA
[]


XM_029032164.1 [Candida] auris uncharacterized protein (CJI97_000075), partial mRNA
[]


XM_029032160.1 [Candida] auris uncharacterized protein (CJI97_000071), partial mRNA
[]


XM_029032159.1 [Candida] aur

MN338178.1 [Candida] auris isolate Clin-54-1 internal transcribed spacer 1, partial sequence; 5.8S ribosomal RNA gene, complete sequence; and internal transcribed spacer 2, partial sequence
[]


MN338177.1 [Candida] auris isolate Clin-96-1 internal transcribed spacer 1, partial sequence; 5.8S ribosomal RNA gene, complete sequence; and internal transcribed spacer 2, partial sequence
[]


MN338176.1 [Candida] auris isolate Clin-95-1 internal transcribed spacer 1, partial sequence; 5.8S ribosomal RNA gene, complete sequence; and internal transcribed spacer 2, partial sequence
[]


MN338175.1 [Candida] auris isolate Clin-94-1 internal transcribed spacer 1, partial sequence; 5.8S ribosomal RNA gene, complete sequence; and internal transcribed spacer 2, partial sequence
[]


MN338174.1 [Candida] auris isolate Clin-53-1 internal transcribed spacer 1, partial sequence; 5.8S ribosomal RNA gene, complete sequence; and internal transcribed spacer 2, partial sequence
[]


MN338173.1 [Candida] auri


PYGM01000033.1 [Candida] auris strain B11243 scaffold00033, whole genome shotgun sequence
[]


PYGM01000038.1 [Candida] auris strain B11243 scaffold00038, whole genome shotgun sequence
[]


PYGM01000042.1 [Candida] auris strain B11243 scaffold00042, whole genome shotgun sequence
[]


PYGM01000052.1 [Candida] auris strain B11243 scaffold00052, whole genome shotgun sequence
[]


PYGM01000054.1 [Candida] auris strain B11243 scaffold00054, whole genome shotgun sequence
[]


PYGM01000055.1 [Candida] auris strain B11243 scaffold00055, whole genome shotgun sequence
[]


PYGM01000070.1 [Candida] auris strain B11243 scaffold00070, whole genome shotgun sequence
[]


PYGM01000077.1 [Candida] auris strain B11243 scaffold00077, whole genome shotgun sequence
[]


PYGM01000079.1 [Candida] auris strain B11243 scaffold00079, whole genome shotgun sequence
[]


PYGM01000080.1 [Candida] auris strain B11243 scaffold00080, whole genome shotgun sequence
[]


PYGM01000084.1 [Candida] auris strain B11243 scaf

JX459779.1 Candida auris strain KCTC17810 26S ribosomal RNA gene, partial sequence
[]


JX459679.1 Candida auris strain KCTC17810 18S ribosomal RNA gene, partial sequence; internal transcribed spacer 1 and 5.8S ribosomal RNA gene, complete sequence; and internal transcribed spacer 2, partial sequence
[]


HE797775.1 Candida auris partial 28S rRNA gene, strain CH2
[]


HE797774.1 Candida auris partial 28S rRNA gene, strain CH1
[]


HE797773.1 Candida auris genomic DNA containing 18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2, 28S rRNA gene, strain CH2
[]


HE797772.1 Candida auris genomic DNA containing 18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2, 28S rRNA gene, strain CH1
[]


JQ219332.1 Candida auris strain TA296 26S ribosomal RNA gene, partial sequence
[]


JQ219331.1 Candida auris strain BJ113 26S ribosomal RNA gene, partial sequence
[]


AB375773.1 Candida auris gene for 26S ribosomal RNA, partial sequence
[]


AB375772.1 Candida auris genes for 18S rRNA, ITS1, 5.8S rRNA, ITS2, 28S rRNA, 

In [28]:
# Compression ratio = number of accessions before clustering / number of clusters after.

# Sort the per-cluster child counts in place; handy for the optional sanity checks below.
len_of_key_pair.sort()
#print(sum(len_of_key_pair))
#print(len(result_df.index) - sum(len_of_key_pair))
#len_of_key_pair[-10:]

# Report the sizes before and after clustering, and their ratio.
print(f"initial count of accession: {len(result_df.index)}")
print(f"final count after clustering: {len(result_dict)}")
print(f"compression ratio: {len(result_df.index) / len(result_dict)}")

initial count of accession: 8585
final count after clustering: 3796
compression ratio: 2.26159114857745


### Take the clustered data and generate a new .fasta file of accessions

In [29]:
# Write a reduced FASTA that keeps only the cluster representatives,
# i.e. the records whose description line is a key of result_dict.
records = list(SeqIO.parse(this_dump + ".fasta", "fasta"))
new_dump_fasta = this_dump + '.new.fasta'
with open(new_dump_fasta, 'w') as f:
    for r in records:
        seqid = r.description
        # Dict membership tests the keys directly; no .keys() view is needed.
        if seqid in result_dict:
            f.write('>' + seqid + '\n' + str(r.seq) + '\n')
# The `with` block closes the file on exit, so no separate close() call is needed.