# Diversity notebook

The purpose of this notebook is to illustrate diversity measures. The input for this notebook is a matrix-style file along the lines of either an MLST profile file or a Roary pangenome file.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Test data: an inline, whitespace-separated table used in place of an input file.
# Columns, per the header row below:
#   header - row label (appears to be the isolate/sample identifier; confirm)
#   ST     - presumably the MLST sequence type (confirm)
#   adk, fumC, gyrB, icd, mdh, purA, recA - seven locus columns of integer values
#            (presumably allele numbers, given the MLST context)
# Rows with the same ST value carry the same seven locus values in this sample.

inputdata = """
header   ST adk fumC gyrB icd mdh purA recA
97-Q04-03_ACTCGCTA-AAGGAGTA      393 18 106 17 6 5 5 4
194-2014-01-7050_TAAGGCGA-AAGGCTAT       752 10 11 4 8 8 8 49
128-Q03-53_TACGCTGC-ACTGCATA     131 53 40 47 13 36 28 29
188-E4-60_TCGACGTC-TCTCTCCG      131 53 40 47 13 36 28 29
161-Q03-44_CCTAAGAC-AAGGAGTA     131 53 40 47 13 36 28 29
116-Q01-25_GCGTAGTA-TCTCTCCG     1193 14 14 10 200 17 7 10
138-Q01-08_ATGCGCAG-CTAAGCCT     216 10 11 57 8 7 18 6
234-2014-01-6815_TAGGCATG-AAGGCTAT       355 36 24 10 13 17 10 25
29-2014-01-6_TCCTGAGC-AAGGAGTA   355 36 24 10 13 17 10 25
149-Q02-57_ACTGAGCG-CTCTCTAT     69 21 35 27 6 5 5 4
107-Q04-14_GGAGCTAC-CGTCTAAT     69 21 35 27 6 5 5 4
145-Q01-27_TAGCGCTC-AAGGAGTA     95 37 38 19 37 17 11 26
142-Q02-75_TAGCGCTC-TATCCTCT     131 53 40 47 13 36 28 29
228-2014-01-2451_GGACTCCT-TTATGCGA       5840 164 210 174 597 5 16 4
196-2014-01-6041_TAAGGCGA-TTATGCGA       349 34 36 39 87 67 16 4
227-2014-01-4267_GGACTCCT-GAGCCTTA       602 6 19 33 26 11 8 6
162-Q02-54_CCTAAGAC-CTAAGCCT     69 21 35 27 6 5 5 4
18-2014-01-4270_AGGCAGAA-TATCCTCT        1246 6 4 15 56 11 26 6
41-2014-01-7136_TAGGCATG-CTCTCTAT        162 9 65 5 1 9 13 6
125-Q02-45_TACGCTGC-CTCTCTAT     131 53 40 47 13 36 28 29
218-2014-01-7490_TCCTGAGC-AAGGCTAT       10 10 11 4 8 8 8 2
9-2014-01-2144_CGTACTAG-CTCTCTAT         117 20 45 41 43 5 32 2
204-2014-01-4991_CGTACTAG-TTATGCGA       57 6 31 5 28 1 1 2
124-Q02-15_CGGAGCCT-TCTCTCCG     131 53 40 47 13 36 28 29
38-2014-01-6926_GGACTCCT-CTAAGCCT        10 10 11 4 8 8 8 2
148-Q03-78_TAGCGCTC-TCTCTCCG     69 21 35 27 6 5 5 4
118-Q02-78_CGGAGCCT-TATCCTCT     69 21 35 27 6 5 5 4
35-2014-01-6710_GGACTCCT-GTAAGGAG        162 9 65 5 1 9 13 6
153-Q03-55_ACTGAGCG-AAGGAGTA     131 53 40 47 13 36 28 29
243-2014-01-3802_CTCTCTAC-GAGCCTTA       355 36 24 10 13 17 10 25
208-2014-01-3683_AGGCAGAA-GCGTAAGA       355 36 24 10 13 17 10 25
109-Q03-01_GCGTAGTA-CTCTCTAT     69 21 35 27 6 5 5 4
46-2014-01-7512_TAGGCATG-CTAAGCCT        355 36 24 10 13 17 10 25
21-2014-01-4901_AGGCAGAA-AAGGAGTA        453 99 6 33 33 24 8 7
102-Q02-17_GGAGCTAC-TATCCTCT     155 6 4 14 16 24 8 14
229-2014-01-6044_TAGGCATG-TCGACTAG       117 20 45 41 43 5 32 2
122-Q03-33_CGGAGCCT-CTAAGCCT     136 38 39 30 13 17 11 28
129-Q02-44_TACGCTGC-AAGGAGTA     131 53 40 47 13 36 28 29
146-Q01-56_TAGCGCTC-CTAAGCCT     131 53 40 47 13 36 28 29
20-2014-01-4821_AGGCAGAA-ACTGCATA        155 6 4 14 16 24 8 14
183-E5-42_TCGACGTC-GTAAGGAG      841 6 4 5 1 9 2 7
168-Q02-33_CGATCAGT-ACTGCATA     38 4 26 2 25 5 5 19
239-2014-01-6043_CTCTCTAC-CCTAGAGT       355 36 24 10 13 17 10 25
12-2014-01-2992_CGTACTAG-ACTGCATA        1011 6 4 159 44 112 1 17
119-Q03-47_CGGAGCCT-GTAAGGAG     7330 687 103 19 36 23 44 26
159-Q01-60_CCTAAGAC-GTAAGGAG     636 13 108 10 97 18 68 93
203-2014-01-1675_CGTACTAG-GAGCCTTA       1914 83 306 245 260 209 16 181
115-Q02-40_GCGTAGTA-CGTCTAAT     88 6 4 12 1 20 12 7
215-2014-01-1755_TCCTGAGC-CCTAGAGT       131 53 40 47 13 36 28 29
31-2014-01-6291_TCCTGAGC-CGTCTAAT        115 4 26 39 25 5 31 19
13-2014-01-3123_CGTACTAG-AAGGAGTA        10 10 11 4 8 8 8 2
158-Q01-24_CCTAAGAC-TATCCTCT     69 21 35 27 6 5 5 4
232-2014-01-1754_TAGGCATG-GCGTAAGA       355 36 24 10 13 17 10 25
103-Q02-41_GGAGCTAC-GTAAGGAG     405 35 37 29 25 4 5 73
216-2014-01-7291_TCCTGAGC-GCGTAAGA       752 10 11 4 8 8 8 49
209-2014-01-3685_AGGCAGAA-CTATTAAG       10 10 11 4 8 8 8 2
121-Q02-39_CGGAGCCT-AAGGAGTA     10 10 11 4 8 8 8 2
112-Q03-23_GCGTAGTA-ACTGCATA     57 6 31 5 28 1 1 2
176-E7-47_TGCAGCTA-ACTGCATA      38 4 26 2 25 5 5 19
14-2014-01-3384_CGTACTAG-CTAAGCCT        10 10 11 4 8 8 8 2
244-2014-01-6040_CTCTCTAC-TTATGCGA       355 36 24 10 13 17 10 25
156-Q01-57_ACTGAGCG-TCTCTCCG     131 53 40 47 13 36 28 29
217-2014-01-1674_TCCTGAGC-CTATTAAG       117 20 45 41 43 5 32 2
7-2014-01-2070_TAAGGCGA-CGTCTAAT         355 36 24 10 13 17 10 25
10-2014-01-2366_CGTACTAG-TATCCTCT        10 10 11 4 8 8 8 2
241-2014-01-5912_CTCTCTAC-CTATTAAG       355 36 24 10 13 17 10 25
233-2014-01-6814_TAGGCATG-CTATTAAG       355 36 24 10 13 17 10 25
193-2014-01-1336_TAAGGCGA-CTATTAAG       1079 6 19 14 16 11 12 2
226-2014-01-6611_GGACTCCT-AAGGCTAT       453 99 6 33 33 24 8 7
166-Q02-11_CGATCAGT-TATCCTCT     95 37 38 19 37 17 11 26
224-2014-01-860_GGACTCCT-GCGTAAGA        115 4 26 39 25 5 31 19
231-2014-01-5565_TAGGCATG-CCTAGAGT       349 34 36 39 87 67 16 4
143-Q01-04_TAGCGCTC-GTAAGGAG     58 6 4 4 16 24 8 14
185-E7-05_TCGACGTC-AAGGAGTA      117 20 45 41 43 5 32 2
175-E4-64_TGCAGCTA-GTAAGGAG      10 10 11 4 8 8 8 2
136-Q01-38_ATGCGCAG-ACTGCATA     4434 6 19 32 135 11 8 6
120-Q02-38_CGGAGCCT-ACTGCATA     295 6 4 12 1 9 2 7
186-E6-27_TCGACGTC-CTAAGCCT      38 4 26 2 25 5 5 19
110-Q04-13_GCGTAGTA-TATCCTCT     453 99 6 33 33 24 8 7
44-2014-01-7375_TAGGCATG-ACTGCATA        453 99 6 33 33 24 8 7
27-2014-01-5917_TCCTGAGC-GTAAGGAG        155 6 4 14 16 24 8 14
100-Q02-32_ACTCGCTA-TCTCTCCG     12 13 13 9 13 16 10 9
219-2014-01-4651_TCCTGAGC-GAGCCTTA       10 10 11 4 8 8 8 2
245-2014-01-5916_CGAGGCTG-TCGACTAG       355 36 24 10 13 17 10 25
147-Q01-52_TAGCGCTC-CGTCTAAT     131 53 40 47 13 36 28 29
36-2014-01-6838_GGACTCCT-ACTGCATA        4994 52 116 55 101 113 31 38
132-Q02-29_TACGCTGC-TCTCTCCG     131 53 40 47 13 36 28 29
95-Q03-48_ACTCGCTA-GTAAGGAG      393 18 106 17 6 5 5 4
99-Q01-17_ACTCGCTA-CGTCTAAT      73 36 24 9 13 17 11 25
113-Q03-42_GCGTAGTA-AAGGAGTA     131 53 40 47 13 36 28 29
178-E8-06_TGCAGCTA-CTAAGCCT      2554 13 39 50 13 16 230 25
165-Q01-73_CGATCAGT-CTCTCTAT     12 13 13 9 13 16 10 9
24-2014-01-5748_AGGCAGAA-TCTCTCCG        752 10 11 4 8 8 8 49
11-2014-01-2773_CGTACTAG-GTAAGGAG        162 9 65 5 1 9 13 6
198-2014-01-12_CGTACTAG-TTCTAGCT         131 53 40 47 13 36 28 29
192-E7-44_TAAGGCGA-GCGTAAGA      52 6 11 5 1 20 8 2
236-2014-01-5751_TAGGCATG-TTATGCGA       355 36 24 10 13 17 10 25
6-2014-01-2069_TAAGGCGA-CTAAGCCT         453 99 6 33 33 24 8 7
213-2014-01-911_TCCTGAGC-TCGACTAG        10 10 11 4 8 8 8 2
30-2014-01-6122_TCCTGAGC-CTAAGCCT        355 36 24 10 13 17 10 25
40-2014-01-7010_GGACTCCT-TCTCTCCG        2079 6 4 4 18 11 7 7
96-Q04-12_ACTCGCTA-ACTGCATA      393 18 106 17 6 5 5 4
238-2014-01-8_CTCTCTAC-TTCTAGCT  131 53 40 47 13 36 28 29
17-2014-01-3681_AGGCAGAA-CTCTCTAT        162 9 65 5 1 9 13 6
190-E6-60_TAAGGCGA-TTCTAGCT      38 4 26 2 25 5 5 19
135-Q03-37_ATGCGCAG-GTAAGGAG     998 13 52 156 14 17 25 17
105-Q01-80_GGAGCTAC-AAGGAGTA     410 6 4 12 1 20 18 7
32-2014-01-6321_TCCTGAGC-TCTCTCCG        355 36 24 10 13 17 10 25
131-Q03-67_TACGCTGC-CGTCTAAT     131 53 40 47 13 36 28 29
201-2014-01-1041_CGTACTAG-CTATTAAG       131 53 40 47 13 36 28 29
187-E6-65_TCGACGTC-CGTCTAAT      131 53 40 47 13 36 28 29
2-2014-01-1499_TAAGGCGA-TATCCTCT         641 9 6 33 131 24 8 7
8-2014-01-2071_TAAGGCGA-TCTCTCCG         10 10 11 4 8 8 8 2
3-2014-01-1649_TAAGGCGA-GTAAGGAG         155 6 4 14 16 24 8 14
240-2014-01-3677_CTCTCTAC-GCGTAAGA       355 36 24 10 13 17 10 25
48-2014-01-861_TAGGCATG-TCTCTCCG         10 10 11 4 8 8 8 2
106-Q04-11_GGAGCTAC-CTAAGCCT     162 9 65 5 1 9 13 6
177-E7-71_TGCAGCTA-AAGGAGTA      746 10 7 4 8 12 8 2
45-2014-01-7443_TAGGCATG-AAGGAGTA        191 6 29 5 18 11 8 41
26-2014-01-5915_TCCTGAGC-TATCCTCT        3190 6 69 12 18 9 8 7
195-2014-01-1043_TAAGGCGA-GAGCCTTA       2309 271 26 39 25 5 31 19
230-2014-01-6123_TAGGCATG-TTCTAGCT       349 34 36 39 87 67 16 4
170-E8-08_CGATCAGT-CTAAGCCT      131 53 40 47 13 36 28 29
182-E5-78_TCGACGTC-TATCCTCT      69 21 35 27 6 5 5 4
173-E8-09_TGCAGCTA-CTCTCTAT      10 10 11 4 8 8 8 2
98-Q01-44_ACTCGCTA-CTAAGCCT      38 4 26 2 25 5 5 19
47-2014-01-856_TAGGCATG-CGTCTAAT         155 6 4 14 16 24 8 14
152-Q03-45_ACTGAGCG-ACTGCATA     10 10 11 4 8 8 8 2
214-2014-01-3128_TCCTGAGC-TTCTAGCT       162 9 65 5 1 9 13 6
202-2014-01-7584_CGTACTAG-AAGGCTAT       355 36 24 10 13 17 10 25
199-2014-01-5792_CGTACTAG-CCTAGAGT       349 34 36 39 87 67 16 4
15-2014-01-3678_CGTACTAG-CGTCTAAT        752 10 11 4 8 8 8 49
220-2014-01-6811_TCCTGAGC-TTATGCGA       349 34 36 39 87 67 16 4
169-Q02-02_CGATCAGT-AAGGAGTA     69 21 35 27 6 5 5 4
141-Q03-56_TAGCGCTC-CTCTCTAT     73 36 24 9 13 17 11 25
23-2014-01-5572_AGGCAGAA-CGTCTAAT        1642 6 4 5 18 11 8 6
34-2014-01-6625_GGACTCCT-TATCCTCT        162 9 65 5 1 9 13 6
130-Q02-49_TACGCTGC-CTAAGCCT     131 53 40 47 13 36 28 29
134-Q03-20_ATGCGCAG-TATCCTCT     131 53 40 47 13 36 28 29
144-Q02-07_TAGCGCTC-ACTGCATA     969 13 43 13 14 128 94 92
172-E7-75_CGATCAGT-TCTCTCCG      127 13 14 19 36 23 11 10
111-Q03-58_GCGTAGTA-GTAAGGAG     162 9 65 5 1 9 13 6
28-2014-01-5983_TCCTGAGC-ACTGCATA        371 6 31 83 28 1 1 67
222-2014-01-3676_GGACTCCT-TTCTAGCT       355 36 24 10 13 17 10 25
211-2014-01-10_AGGCAGAA-GAGCCTTA         6635 602 11 4 10 7 8 2
94-Q04-02_ACTCGCTA-TATCCTCT      69 21 35 27 6 5 5 4
174-E5-22_TGCAGCTA-TATCCTCT      131 53 40 47 13 36 28 29
5-2014-01-1742_TAAGGCGA-AAGGAGTA         453 99 6 33 33 24 8 7
191-E7-51_TAAGGCGA-CCTAGAGT      3877 10 11 369 8 8 8 2
37-2014-01-6924_GGACTCCT-AAGGAGTA        453 99 6 33 33 24 8 7
180-E6-47_TGCAGCTA-TCTCTCCG      744 10 11 135 8 8 8 2
163-Q02-55_CCTAAGAC-CGTCTAAT     131 53 40 47 13 36 28 29
140-Q03-52_ATGCGCAG-TCTCTCCG     4381 6 11 5 1 20 306 2
16-2014-01-3680_CGTACTAG-TCTCTCCG        355 36 24 10 13 17 10 25
22-2014-01-4921_AGGCAGAA-CTAAGCCT        10 10 11 4 8 8 8 2
25-2014-01-5749_TCCTGAGC-CTCTCTAT        453 99 6 33 33 24 8 7
108-Q02-31_GGAGCTAC-TCTCTCCG     69 21 35 27 6 5 5 4
93-Q01-81_ACTCGCTA-CTCTCTAT      162 9 65 5 1 9 13 6
154-Q01-03_ACTGAGCG-CTAAGCCT     131 53 40 47 13 36 28 29
225-2014-01-6035_GGACTCCT-CTATTAAG       1642 6 4 5 18 11 8 6
210-2014-01-7479_AGGCAGAA-AAGGCTAT       602 6 19 33 26 11 8 6
242-2014-01-1042_CTCTCTAC-AAGGCTAT       355 36 24 10 13 17 10 25
39-2014-01-7009_GGACTCCT-CGTCTAAT        162 9 65 5 1 9 13 6
221-2014-01-5657_GGACTCCT-TCGACTAG       23 6 4 12 1 20 13 7
207-2014-01-6135_AGGCAGAA-CCTAGAGT       115 4 26 39 25 5 31 19
171-E4-71_CGATCAGT-CGTCTAAT      131 53 40 47 13 36 28 29
117-Q02-16_CGGAGCCT-CTCTCTAT     38 4 26 2 25 5 5 19
179-E7-64_TGCAGCTA-CGTCTAAT      6338 290 54 55 352 514 323 38
114-Q03-54_GCGTAGTA-CTAAGCCT     1642 6 4 5 18 11 8 6
126-Q02-30_TACGCTGC-TATCCTCT     14 14 14 10 14 17 7 10
33-2014-01-6530_GGACTCCT-CTCTCTAT        3107 10 11 5 8 7 219 2
139-Q03-49_ATGCGCAG-CGTCTAAT     131 53 40 47 13 36 28 29
123-Q02-36_CGGAGCCT-CGTCTAAT     88 6 4 12 1 20 12 7
235-2014-01-1039_TAGGCATG-GAGCCTTA       135 13 39 50 13 16 37 25
160-Q03-59_CCTAAGAC-ACTGCATA     69 21 35 27 6 5 5 4
164-Q02-03_CCTAAGAC-TCTCTCCG     636 13 108 10 97 18 68 93
181-E6-02_TCGACGTC-CTCTCTAT      162 9 65 5 1 9 13 6
43-2014-01-7234_TAGGCATG-GTAAGGAG        453 99 6 33 33 24 8 7
167-Q03-68_CGATCAGT-GTAAGGAG     7331 10 11 4 8 631 8 2
197-2014-01-1182_CGTACTAG-TCGACTAG       93 6 11 4 10 7 8 6
200-2014-01-7207_CGTACTAG-GCGTAAGA       3107 10 11 5 8 7 219 2
237-2014-01-4539_CTCTCTAC-TCGACTAG       355 36 24 10 13 17 10 25
19-2014-01-4271_AGGCAGAA-GTAAGGAG        118 31 4 42 44 15 33 17
137-Q03-64_ATGCGCAG-AAGGAGTA     636 13 108 10 97 18 68 93
42-2014-01-7150_TAGGCATG-TATCCTCT        355 36 24 10 13 17 10 25
184-E7-61_TCGACGTC-ACTGCATA      117 20 45 41 43 5 32 2
157-Q01-74_CCTAAGAC-CTCTCTAT     636 13 108 10 97 18 68 93
223-2014-01-2995_GGACTCCT-CCTAGAGT       162 9 65 5 1 9 13 6
189-E5-45_TAAGGCGA-TCGACTAG      69 21 35 27 6 5 5 4
205-2014-01-2728_AGGCAGAA-TCGACTAG       10 10 11 4 8 8 8 2
101-Q01-28_GGAGCTAC-CTCTCTAT     1147 6 4 4 18 135 7 6
206-2014-01-7543_AGGCAGAA-TTCTAGCT       752 10 11 4 8 8 8 49
133-Q01-36_ATGCGCAG-CTCTCTAT     706 88 24 49 36 17 11 91
212-2014-01-5688_AGGCAGAA-TTATGCGA       1140 83 23 164 181 80 1 42
150-Q02-58_ACTGAGCG-TATCCTCT     778 4 26 10 25 5 5 19
"""
# Echo the raw text so the input can be inspected in the cell output.
print(inputdata)

In [None]:
from io import StringIO

# Wrap the inline text in a file-like object so pandas can read it like a file.
mlstdata = StringIO(inputdata)

# Columns are separated by runs of whitespace. sep=r"\s+" is the supported
# spelling of that; the older delim_whitespace=True flag is deprecated in
# recent pandas releases. The first column becomes the row index.
mlst_frame = pd.read_csv(mlstdata, sep=r"\s+", index_col=0)

# Drop the ST column so that only the per-locus columns remain for the
# pairwise comparison done in the following cell.
mlst_edit = mlst_frame.drop(columns="ST")

In [None]:
from scipy.spatial.distance import pdist, squareform

# pdist's "hamming" metric returns the *fraction* of columns that differ
# between two rows; multiplying by the number of columns turns it into a count
# of differing loci. The column count is read from the frame instead of being
# hard-coded as 7, so profiles with a different number of loci also work.
n_loci = mlst_edit.shape[1]

# Round before casting: astype(int) truncates toward zero, so a float product
# that lands just below a whole number (e.g. 2.9999999999999996) would
# otherwise be stored as 2 instead of 3.
distances = (pdist(mlst_edit, metric="hamming") * n_loci).round().astype(int)

# Expand the condensed pairwise vector into a square, symmetric matrix with
# one row and one column per input row.
dist_matrix = squareform(distances)

In [None]:
from collections import Counter

print(distances)

# Tally how many pairs fall at each distance value. tolist() converts the
# NumPy integers to plain Python ints so the keys print as 0, 1, 2, ... rather
# than as NumPy scalar reprs, and sorting by key lists the tally in order of
# increasing distance instead of first-seen order.
counts = dict(sorted(Counter(distances.tolist()).items()))
print(counts)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style('darkgrid')

# sns.distplot is deprecated; histplot is its replacement for a plain histogram
# (no KDE is drawn unless requested). discrete=True gives each integer distance
# its own bar centred on the value instead of arbitrary bin edges.
ax = sns.histplot(distances, discrete=True)
ax.set(
    xlabel="Number of differing loci",
    ylabel="Number of isolate pairs",
    title="Pairwise distances between isolates",
)
plt.show()

Conclusions so far: Most of our isolates are singletons. 

Next, we will try counting clusters instead.

In [None]:
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

# Toy input: eight one-dimensional observations, one single-element row each.
X = [[2], [8], [0], [4], [1], [9], [9], [0]]

# Single-linkage hierarchical clustering; print the resulting linkage matrix.
Z = linkage(X, method='single')
print(Z)

# Open a wide figure, draw the dendrogram into it, and show it.
fig = plt.figure(figsize=(25, 10))
dn = dendrogram(Z)
plt.show()

# dendrogram() also returns a dict describing the drawn tree; print it.
print(dn)