In [74]:
# https://realpython.com/python-scipy-cluster-optimize/

from pathlib import Path
import numpy as np
from scipy.cluster.vq import whiten, kmeans, vq

In [75]:
data = Path("SMSSpamCollection").read_text()
data = data.strip()
data = data.split("\n")

In [76]:
digit_counts = np.empty((len(data), 2), dtype=int)
digit_counts

array([[      8192,        352],
       [         0,   30813144],
       [         1,   30813456],
       ...,
       [1750540334,  543712105],
       [1969692777,  544437093],
       [1953395561, 1684103712]])

In [77]:
for i, line in enumerate(data):
    case, message = line.split("\t")
    num_digits = sum(c.isdigit() for c in message)
    digit_counts[i, 0] = 0 if case == "ham" else 1
    digit_counts[i, 1] = num_digits

In [78]:
digit_counts

array([[ 0,  0],
       [ 0,  0],
       [ 1, 25],
       ...,
       [ 0,  0],
       [ 0,  0],
       [ 0,  0]])

In [79]:
unique_counts = np.unique(digit_counts[:, 1], return_counts=True)
unique_counts

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 40, 41, 47]),
 array([4110,  486,  160,   78,   42,   39,   16,   14,   28,   17,   16,
          34,   30,   31,   37,   29,   35,   33,   41,   47,   18,   31,
          28,   36,   34,   16,   16,   13,   19,    9,    2,    6,    3,
           4,    3,    4,    1,    1,    4,    2,    1]))

In [80]:
unique_counts = np.transpose(np.vstack(unique_counts))

In [81]:
unique_counts

array([[   0, 4110],
       [   1,  486],
       [   2,  160],
       [   3,   78],
       [   4,   42],
       [   5,   39],
       [   6,   16],
       [   7,   14],
       [   8,   28],
       [   9,   17],
       [  10,   16],
       [  11,   34],
       [  12,   30],
       [  13,   31],
       [  14,   37],
       [  15,   29],
       [  16,   35],
       [  17,   33],
       [  18,   41],
       [  19,   47],
       [  20,   18],
       [  21,   31],
       [  22,   28],
       [  23,   36],
       [  24,   34],
       [  25,   16],
       [  26,   16],
       [  27,   13],
       [  28,   19],
       [  29,    9],
       [  30,    2],
       [  31,    6],
       [  32,    3],
       [  33,    4],
       [  34,    3],
       [  35,    4],
       [  36,    1],
       [  37,    1],
       [  40,    4],
       [  41,    2],
       [  47,    1]])

In [82]:
whitened_counts = whiten(unique_counts)
whitened_counts

array([[0.00000000e+00, 6.49364346e+00],
       [8.11755468e-02, 7.67861489e-01],
       [1.62351094e-01, 2.52793906e-01],
       [2.43526640e-01, 1.23237029e-01],
       [3.24702187e-01, 6.63584003e-02],
       [4.05877734e-01, 6.16185146e-02],
       [4.87053281e-01, 2.52793906e-02],
       [5.68228827e-01, 2.21194668e-02],
       [6.49404374e-01, 4.42389335e-02],
       [7.30579921e-01, 2.68593525e-02],
       [8.11755468e-01, 2.52793906e-02],
       [8.92931014e-01, 5.37187050e-02],
       [9.74106561e-01, 4.73988574e-02],
       [1.05528211e+00, 4.89788193e-02],
       [1.13645765e+00, 5.84585908e-02],
       [1.21763320e+00, 4.58188955e-02],
       [1.29880875e+00, 5.52986669e-02],
       [1.37998430e+00, 5.21387431e-02],
       [1.46115984e+00, 6.47784384e-02],
       [1.54233539e+00, 7.42582099e-02],
       [1.62351094e+00, 2.84393144e-02],
       [1.70468648e+00, 4.89788193e-02],
       [1.78586203e+00, 4.42389335e-02],
       [1.86703758e+00, 5.68786288e-02],
       [1.948213

In [83]:
codebook, _ = kmeans(whitened_counts, 3)
codebook

array([[2.52050073, 0.01840656],
       [0.        , 6.49364346],
       [0.85234324, 0.09724666]])

In [84]:
codes, _ = vq(unique_counts, codebook)
codes

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [85]:
print(unique_counts[codes == 0][-1])

[47  1]


In [86]:
print(unique_counts[codes == 1][-1])

[28 19]


In [87]:
print(unique_counts[codes == 2][-1])

IndexError: index -1 is out of bounds for axis 0 with size 0