In [21]:
import librosa
import numpy as np
import os
import math
from sklearn.cluster import KMeans
import hmmlearn.hmm
import noisereduce as nr
import sounddevice as sd
import soundfile as sf
import pickle

In [22]:
def get_mfcc(file_path):
    y, sr = librosa.load(file_path) # read .wav file
    hop_length = math.floor(sr*0.010) # 10ms hop
    win_length = math.floor(sr*0.025) # 25ms frame
    # mfcc is 12 x T matrix
    mfcc = librosa.feature.mfcc(
        y, sr, n_mfcc=12, n_fft=1024,
        hop_length=hop_length, win_length=win_length)
    # substract mean from mfcc --> normalize mfcc
    mfcc = mfcc - np.mean(mfcc, axis=1).reshape((-1,1)) 
    # delta feature 1st order and 2nd order
    delta1 = librosa.feature.delta(mfcc, order=1)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # X is 36 x T
    X = np.concatenate([mfcc, delta1, delta2], axis=0) # O^r
    # return T x 36 (transpose of X)
    return X.T # hmmlearn use T x N matrix

def get_class_data(data_dir):
    files = os.listdir(data_dir)
    mfcc = [get_mfcc(os.path.join(data_dir,f)) for f in files if f.endswith(".wav")]
    print('data_dir: ', data_dir)
    print(f'mfcc.shape: {np.array(mfcc).shape}')
    return mfcc

def clustering(X, n_clusters=10):
    kmeans = KMeans(n_clusters=n_clusters, n_init=50, random_state=0, verbose=0)
    kmeans.fit(X)
    print("centers", kmeans.cluster_centers_.shape)
    return kmeans  


In [23]:
class_names = ["nguoi", "toi", "khong", "mot", "co_the", "test_khong" , "test_toi" , "test_mot", "test_nguoi", "test_co_the"]
dataset = {}
for cname in class_names:
    print(f"Load {cname} dataset")
    dataset[cname] = get_class_data(os.path.join("data", cname))

Load nguoi dataset
data_dir:  data/nguoi
mfcc.shape: (76,)
Load toi dataset
data_dir:  data/toi
mfcc.shape: (76,)
Load khong dataset
data_dir:  data/khong
mfcc.shape: (76,)
Load mot dataset
data_dir:  data/mot
mfcc.shape: (76,)
Load co_the dataset
data_dir:  data/co_the
mfcc.shape: (99,)
Load test_khong dataset
data_dir:  data/test_khong
mfcc.shape: (25,)
Load test_toi dataset
data_dir:  data/test_toi
mfcc.shape: (25,)
Load test_mot dataset
data_dir:  data/test_mot
mfcc.shape: (24,)
Load test_nguoi dataset
data_dir:  data/test_nguoi
mfcc.shape: (25,)
Load test_co_the dataset
data_dir:  data/test_co_the
mfcc.shape: (21,)


In [24]:
# Get all vectors in the datasets
all_vectors = np.concatenate([np.concatenate(v, axis=0) for k, v in dataset.items()], axis=0)
print("vectors", all_vectors.shape)

vectors (24050, 36)


In [25]:
# for k, v in dataset.items():
#     print(v)

In [26]:
# Run K-Means algorithm to get clusters
kmeans = clustering(all_vectors, n_clusters=21)
print("centers", kmeans.cluster_centers_.shape)

centers (21, 36)
centers (21, 36)


In [27]:
# kmeans.cluster_centers_

In [28]:
for key, val in dataset.items():
    print(key,'\n', np.array(val).shape)

nguoi 
 (76,)
toi 
 (76,)
khong 
 (76,)
mot 
 (76,)
co_the 
 (99,)
test_khong 
 (25,)
test_toi 
 (25,)
test_mot 
 (24,)
test_nguoi 
 (25,)
test_co_the 
 (21,)


In [29]:
models = {}
original_dataset = {}

In [30]:
original_dataset['nguoi'] = dataset['nguoi'].copy()
cname = 'nguoi'
# convert all vectors to the cluster index
dataset[cname] = list([kmeans.predict(v).reshape(-1,1) for v in original_dataset[cname]])

hmm = hmmlearn.hmm.MultinomialHMM(
    n_components=9, random_state=0, n_iter=1000, verbose=True,
    params='te',
    init_params='e'
)
hmm.startprob_=np.array([0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
hmm.transmat_=np.array([
    [0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
    
])
if cname[:4] != 'test':
    X = np.concatenate(dataset[cname])
    lengths = list([len(x) for x in dataset[cname]])
    print("training class", cname)
    print(X.shape, lengths, len(lengths))
    hmm.fit(X, lengths=lengths)
    models[cname] = hmm

training class nguoi
(1875, 1) [20, 17, 23, 21, 19, 18, 27, 20, 27, 30, 21, 24, 21, 19, 31, 30, 17, 21, 44, 22, 20, 19, 42, 21, 23, 28, 17, 35, 37, 28, 30, 37, 50, 40, 12, 13, 29, 30, 23, 19, 22, 26, 16, 20, 22, 27, 18, 23, 15, 28, 22, 22, 23, 40, 25, 28, 19, 25, 33, 23, 31, 27, 16, 29, 20, 31, 14, 23, 14, 44, 20, 19, 22, 17, 22, 24] 76


         1       -5366.8436             +nan
         2       -3809.6719       +1557.1717
         3       -3447.3243        +362.3476
         4       -3180.8315        +266.4928
         5       -3006.1050        +174.7265
         6       -2941.5421         +64.5629
         7       -2924.8400         +16.7021
         8       -2916.1652          +8.6747
         9       -2908.2422          +7.9230
        10       -2901.6454          +6.5968
        11       -2891.9911          +9.6542
        12       -2882.5403          +9.4509
        13       -2878.8958          +3.6445
        14       -2876.6682          +2.2276
        15       -2874.7817          +1.8865
        16       -2873.1176          +1.6641
        17       -2871.8468          +1.2708
        18       -2870.9824          +0.8644
        19       -2870.4606          +0.5217
        20       -2870.1290          +0.3316
        21       -2869.8798          +0.2492
        22       -2869.6726          +0.2072
        23

In [31]:
np.set_printoptions(precision=2, suppress=True)
print(models['nguoi'].transmat_)

[[0.65 0.17 0.18 0.   0.   0.   0.   0.   0.  ]
 [0.   0.68 0.24 0.07 0.   0.   0.   0.   0.  ]
 [0.   0.   0.81 0.01 0.18 0.   0.   0.   0.  ]
 [0.   0.   0.   0.83 0.17 0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.77 0.06 0.17 0.   0.  ]
 [0.   0.   0.   0.   0.   0.83 0.1  0.08 0.  ]
 [0.   0.   0.   0.   0.   0.   0.8  0.13 0.07]
 [0.   0.   0.   0.   0.   0.   0.   0.9  0.1 ]
 [0.   0.   0.   0.   0.   0.   0.   0.   1.  ]]


In [32]:
original_dataset['toi'] = dataset['toi'].copy()
cname = 'toi'
class_vectors = dataset[cname]
# convert all vectors to the cluster index

dataset[cname] = list([kmeans.predict(v).reshape(-1,1) for v in original_dataset[cname]])

hmm = hmmlearn.hmm.MultinomialHMM(
    n_components= 9, random_state=0, n_iter=1000, verbose=True,
    params='te',
    init_params='e'
)
hmm.startprob_ = np.array([0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
hmm.transmat_ = np.array([ 
    [0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
    ])

if cname[:4] != 'test':
    X = np.concatenate(dataset[cname])
    lengths = list([len(x) for x in dataset[cname]])
    print("training class", cname)
    print(X.shape, lengths, len(lengths))
    hmm.fit(X, lengths=lengths)
    models[cname] = hmm

training class toi
(1736, 1) [17, 16, 23, 27, 26, 17, 27, 17, 10, 17, 16, 13, 20, 18, 23, 15, 37, 28, 15, 14, 15, 17, 30, 12, 37, 27, 17, 15, 38, 24, 39, 29, 18, 42, 16, 16, 41, 13, 16, 18, 16, 25, 32, 15, 23, 32, 15, 20, 19, 21, 24, 33, 14, 22, 25, 20, 43, 14, 24, 34, 14, 25, 17, 14, 35, 14, 12, 33, 32, 27, 33, 47, 27, 18, 21, 20] 76


         1       -5023.9957             +nan
         2       -3497.6255       +1526.3702
         3       -3082.6689        +414.9566
         4       -2910.4618        +172.2071
         5       -2801.1612        +109.3006
         6       -2741.7024         +59.4589
         7       -2704.7587         +36.9437
         8       -2677.8936         +26.8651
         9       -2667.0326         +10.8610
        10       -2660.1904          +6.8422
        11       -2656.0216          +4.1688
        12       -2653.5953          +2.4264
        13       -2651.8432          +1.7521
        14       -2650.2162          +1.6269
        15       -2648.8903          +1.3259
        16       -2648.2399          +0.6504
        17       -2647.9235          +0.3164
        18       -2647.7230          +0.2005
        19       -2647.5845          +0.1385
        20       -2647.4861          +0.0984
        21       -2647.4153          +0.0708
        22       -2647.3642          +0.0511
        23

In [33]:
np.set_printoptions(precision=2, suppress=True)
print(models['toi'].transmat_)

[[0.58 0.19 0.23 0.   0.   0.   0.   0.   0.  ]
 [0.   0.65 0.   0.35 0.   0.   0.   0.   0.  ]
 [0.   0.   0.6  0.09 0.31 0.   0.   0.   0.  ]
 [0.   0.   0.   0.64 0.36 0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.69 0.27 0.04 0.   0.  ]
 [0.   0.   0.   0.   0.   0.77 0.11 0.12 0.  ]
 [0.   0.   0.   0.   0.   0.   0.75 0.09 0.15]
 [0.   0.   0.   0.   0.   0.   0.   0.89 0.11]
 [0.   0.   0.   0.   0.   0.   0.   0.   1.  ]]


In [34]:
original_dataset['khong'] = dataset['khong'].copy()
cname = 'khong'
class_vectors = dataset[cname]
# convert all vectors to the cluster index
dataset[cname] = list([kmeans.predict(v).reshape(-1,1) for v in dataset[cname]])

hmm = hmmlearn.hmm.MultinomialHMM(
    n_components=9, random_state=0, n_iter=1000, verbose=True,
    startprob_prior=np.array([0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]),
    transmat_prior=np.array([ 
    [0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
    ])
)

if cname[:4] != 'test':
    X = np.concatenate(dataset[cname])
    lengths = list([len(x) for x in dataset[cname]])
    print("training class", cname)
    print(X.shape, lengths, len(lengths))
    hmm.fit(X, lengths=lengths)
    models[cname] = hmm

training class khong
(1898, 1) [23, 17, 38, 28, 18, 33, 22, 20, 33, 21, 27, 32, 24, 19, 27, 22, 30, 29, 35, 21, 17, 25, 26, 20, 26, 31, 20, 23, 24, 27, 36, 25, 19, 25, 25, 26, 40, 25, 28, 20, 15, 27, 20, 27, 16, 20, 17, 21, 14, 23, 20, 33, 20, 19, 45, 34, 23, 25, 25, 25, 26, 23, 27, 22, 28, 34, 20, 22, 18, 30, 19, 24, 28, 27, 21, 33] 76


         1       -5742.6467             +nan
         2       -4385.8729       +1356.7738
         3       -4367.8389         +18.0341
         4       -4333.2786         +34.5603
         5       -4249.0132         +84.2654
         6       -4010.2591        +238.7541
         7       -3534.2108        +476.0483
         8       -3126.9941        +407.2167
         9       -2941.2530        +185.7411
        10       -2874.6902         +66.5628
        11       -2791.6033         +83.0869
        12       -2660.6335        +130.9697
        13       -2586.1014         +74.5321
        14       -2553.6454         +32.4561
        15       -2530.5855         +23.0599
        16       -2510.7502         +19.8353
        17       -2501.7652          +8.9850
        18       -2495.3477          +6.4175
        19       -2483.5573         +11.7904
        20       -2467.7544         +15.8029
        21       -2451.5855         +16.1689
        22       -2441.8776          +9.7079
        23

In [35]:
original_dataset['mot'] = dataset['mot'].copy()
cname = 'mot'
# convert all vectors to the cluster index
dataset[cname] = list([kmeans.predict(v).reshape(-1,1) for v in original_dataset[cname]])


hmm = hmmlearn.hmm.MultinomialHMM(
    n_components=9, random_state=0, n_iter=1000, verbose=True,
    params='te',
    init_params='e'
)
hmm.startprob_=np.array([0.7,0.2,0.1,0.0,0.0,0.0,0.0,0.0,0.0])
hmm.transmat_=np.array([ 
    [0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
    ])


if cname[:4] != 'test':
    X = np.concatenate(dataset[cname])
    lengths = list([len(x) for x in dataset[cname]])
    print("training class", cname)
    print(X.shape, lengths, len(lengths))
    hmm.fit(X, lengths=lengths)
    models[cname] = hmm

         1       -4329.0513             +nan
         2       -3073.3410       +1255.7103


training class mot
(1472, 1) [18, 23, 23, 20, 12, 21, 16, 18, 24, 20, 20, 23, 23, 20, 18, 12, 21, 18, 25, 11, 18, 23, 20, 15, 18, 15, 16, 22, 12, 11, 16, 36, 16, 16, 24, 16, 22, 15, 14, 14, 27, 36, 24, 21, 20, 18, 23, 23, 16, 23, 19, 16, 18, 26, 26, 17, 18, 18, 21, 22, 24, 21, 22, 20, 19, 17, 17, 10, 14, 15, 19, 16, 20, 18, 20, 23] 76


         3       -2819.3242        +254.0168
         4       -2716.7284        +102.5958
         5       -2668.3005         +48.4279
         6       -2642.1019         +26.1986
         7       -2626.6464         +15.4555
         8       -2615.6213         +11.0251
         9       -2606.9437          +8.6776
        10       -2600.2115          +6.7322
        11       -2594.7296          +5.4819
        12       -2589.5835          +5.1461
        13       -2585.0362          +4.5474
        14       -2581.7121          +3.3240
        15       -2579.5094          +2.2027
        16       -2578.1329          +1.3765
        17       -2577.2432          +0.8897
        18       -2576.6043          +0.6389
        19       -2576.0946          +0.5098
        20       -2575.6479          +0.4467
        21       -2575.2309          +0.4170
        22       -2574.8411          +0.3898
        23       -2574.4948          +0.3463
        24       -2574.1992          +0.2956
        25

In [36]:
original_dataset['co_the'] = dataset['co_the'].copy()
cname = 'co_the'

# convert all vectors to the cluster index
dataset[cname] = list([kmeans.predict(v).reshape(-1,1) for v in original_dataset[cname]])

hmm = hmmlearn.hmm.MultinomialHMM(
    n_components=12, random_state=0, n_iter=1000, verbose=True,
    params='te',
    init_params='e'
)
hmm.startprob_=np.array([0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
hmm.transmat_=np.array([ 
    [0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
    ])

if cname[:4] != 'test':
    X = np.concatenate(dataset[cname])
    lengths = list([len(x) for x in dataset[cname]])
    print("training class", cname)
    print(X.shape, lengths, len(lengths))
    hmm.fit(X, lengths=lengths)
    models[cname] = hmm

training class co_the
(12074, 1) [137, 117, 156, 128, 122, 122, 111, 112, 125, 122, 108, 122, 124, 110, 111, 118, 105, 131, 119, 111, 152, 116, 122, 110, 125, 119, 109, 115, 117, 112, 125, 108, 137, 106, 116, 111, 115, 129, 122, 124, 152, 125, 119, 129, 128, 122, 127, 129, 119, 121, 118, 108, 113, 124, 152, 156, 124, 108, 152, 128, 129, 118, 129, 152, 116, 109, 119, 116, 125, 152, 127, 129, 128, 111, 109, 111, 124, 152, 117, 103, 114, 118, 109, 118, 123, 116, 115, 122, 118, 117, 115, 112, 119, 114, 141, 109, 124, 128, 111] 99


         1      -33949.6983             +nan
         2       -8497.3349      +25452.3634
         3       -6604.8045       +1892.5303
         4       -6261.1607        +343.6438
         5       -5983.6686        +277.4921
         6       -5792.6534        +191.0152
         7       -5719.0786         +73.5749
         8       -5691.9053         +27.1733
         9       -5681.2695         +10.6357
        10       -5675.7894          +5.4801
        11       -5672.3597          +3.4298
        12       -5670.1861          +2.1735
        13       -5668.3351          +1.8511
        14       -5664.4432          +3.8919
        15       -5653.8979         +10.5453
        16       -5641.8188         +12.0791
        17       -5637.6337          +4.1851
        18       -5636.2089          +1.4248
        19       -5635.7053          +0.5036
        20       -5635.5401          +0.1652
        21       -5635.4654          +0.0747
        22       -5635.4229          +0.0425
        23

In [37]:
np.set_printoptions(precision=2, suppress=True)
print(models['co_the'].transmat_)

[[0.94 0.06 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.92 0.03 0.05 0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.79 0.07 0.14 0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.56 0.22 0.21 0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.94 0.   0.06 0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.92 0.04 0.04 0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.16 0.   0.84 0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.06 0.   0.94]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.  ]]


In [38]:
pickle.dump(models, open('./model/model.pkl','wb'))

In [39]:
dataset["test_nguoi"] = list([kmeans.predict(v).reshape(-1,1) for v in dataset['test_nguoi']])
dataset['test_toi'] = list([kmeans.predict(v).reshape(-1,1) for v in dataset['test_toi']])
dataset['test_khong'] = list([kmeans.predict(v).reshape(-1,1) for v in dataset['test_khong']])
dataset['test_mot'] = list([kmeans.predict(v).reshape(-1,1) for v in dataset['test_mot']])
dataset['test_co_the'] = list([kmeans.predict(v).reshape(-1,1) for v in dataset['test_co_the']])

In [40]:
print("Accuracy:")
mapping = ["nguoi", "toi", "khong", "mot", "co_the"]
class_names = ["test_nguoi", "test_toi", "test_khong", "test_mot", "test_co_the"]
count = 0
correct = 0
for true_cname in class_names:
    score = []
    for i in dataset[true_cname]:
        score = [model.score(i, [len(i)]) for cname, model in models.items() if cname[:4] != 'test']
        res = mapping[score.index(max(score))] 
        if res == true_cname[5:]:
            correct += 1
        count += 1 
print(100*correct/count, "%")

Accuracy:
85.0 %
