## Continuous and Discrete Hidden Markvov Models
Kaela Nelson

Volume 3A

In [3]:
import numpy as np
import MFCC
from scipy.io import wavfile
from collections import defaultdict
import os
import gmmhmm
import random
import pickle

We first implement and sample from a Gaussian mixture model continuous hidden markov model.

In [1]:
def sample_gmmhmm(gmmhmm, n_sim):
    """
    Inputs
    ------
    gmmhmm : list of parameters for gaussian mixture model
    n_sim: number of simulations to iterate over
    
    Simulates sampling from a Gaussian mixture model hidden markov model.
    
    Returns
    -------
    states : ndarray of shape (n_sim,)
        The sequence of states
    obs : ndarray of shape (n_sim, K)
        The generated observations (column vectors of length K)
    """
    #n_sim different observations
    pi = gmmhmm[-1]
    A = gmmhmm[0]
    states = []
    obs_ls = []
    
    #select initial random state
    state = np.argmax(np.random.multinomial(1, pi))
    states.append(state)
    
    #n simmulations
    for i in range(n_sim):
        #select initial sample
        state = states[-1]
        sample_component = np.argmax(states)
        # sample gmmhmm
        obs = np.random.multivariate_normal(means[1, sample_component, :], covars[1, sample_component, :, :])
        state = np.argmax(np.random.multinomial(1, A[state,:]))
        #save observation
        obs_ls.append(obs)
        states.append(state)
    return states, obs_ls

In [4]:
#initial conditions
A = np.array([[.65, .35], [.15, .85]])
pi = np.array([.8, .2])
weights = np.array([[.7, .2, .1], [.1, .5, .4]])
means1 = np.array([[0., 17., -4.], [5., -12., -8.], [-16., 22., 2.]])
means2 = np.array([[-5., 3., 23.], [-12., -2., 14.], [15., -32., 0.]])
means = np.array([means1, means2])
covars1 = np.array([5*np.eye(3), 7*np.eye(3), np.eye(3)])
covars2 = np.array([10*np.eye(3), 3*np.eye(3), 4*np.eye(3)])
covars = np.array([covars1, covars2])
gmmhmm = [A, weights, means, covars, pi]
n_sim = 8

#sample gmmhmm
states, obs = sample_gmmhmm(gmmhmm, n_sim)

We now implement our method in order to classify each school subject within our data set.

In [42]:
# int, np array
mfccs = defaultdict(list)
subjects = ["Mathematics", "Psychology", "Statistics", "PoliticalScience", "Biology"]

for subject in subjects:
    #get 30 samples
    for num in range(1, 31):
        sample = wavfile.read(f"Samples/{subject} ({num}).wav")
        rate, data = sample[0],sample[1]
        #check if correct length
        if len(data) != 88200:
            print(f"Skipped {subject} of length {len(data)}")
        else:
            mfccs[subject].append(MFCC.extract(data))

Skipped Statistics of length 88186
Skipped Statistics of length 97792
Skipped Statistics of length 85422


In [43]:
folders = ["biology", "mathematics", "polysci", "psychology", "statistics"]
for subject, folder in zip(subjects, folders):
    for num in range(1, 10):
        #read in file
        sample = wavfile.read(f"CDHMMSoundFiles/{folder}/{folder}_0{num}.wav")
        rate, data = sample[0],sample[1]
        #check if correct length
        if len(data) != 88200:
            print(f"Skipped {subject} of length {len(data)}")
        else:
            mfccs[subject].append(MFCC.extract(data))

Skipped Biology of length 619776
Skipped Biology of length 543744


In [74]:
def initialize(n):
    """n: number of states
    Returns: 
    start_prob: a random initial state distribution
    transmat: a (row-stochastic) transition matrix"""
    #initial state
    rand_start = np.random.rand(n)
    start_prob = rand_start/rand_start.sum()
    
    #transition matrix
    matrix = np.random.rand(n,n)
    transmat = matrix/matrix.sum(axis=1)[:,None]
    
    return start_prob, transmat

In [225]:
best_models = defaultdict(list)
tests = defaultdict(list)
for word in mfccs.keys():
    #random re-starts
    models = []
    logs = []
    for i in range(10):
        #partition into train and test
        train = mfccs[word][:20]
        test = mfccs[word][20:30]

        #randomly initialize startprob and transition matrix
        startprob, transmat = initialize(5)
        model = gmmhmm.GMMHMM(n_components=5, n_mix=3, transmat=transmat, startprob 
            =startprob, cvtype='diag')
        
        # these values for covars_prior and var should work well for this problem
        model.covars_prior = 0.01
        model.fit(train, init_params='mc', var=0.1)
        
        #save log probs
        logs.append(model.logprob)
        models.append(model)
    best_models[word].append([max(logs), models[np.argmax(max(logs))]])
    tests[word] = test

In [236]:
#pickle test data and trained models
with open("best_models", "wb") as pickle_file:
    pickle.dump(best_models,pickle_file)
with open('best_models', 'rb') as f:
    new_models = pickle.load(f)
    
with open("tests", "wb") as pickle_file:
    pickle.dump(tests,pickle_file)
with open('tests', 'rb') as f:
    new_tests = pickle.load(f)

We calculate the accuracy within each model for each subject. The results are recorded below.

In [361]:
accuracy = defaultdict(list)
#10 test samples
subjects = ['Mathematics', 'Psychology', 'Statistics', 'PoliticalScience', 'Biology']

#for each sets of tests corresponding to each model
for test in new_tests.keys():
    scores = []
    #for each of the 5 models
    for model in subjects:       
        score = [new_models[model][0][1].score(sample) for sample in tests[test]]
        scores.append(score)
    #find model with max log probs score
    max_score = np.argmax(scores, axis=0)
    classification = [subjects[i] for i in max_score]
#     print(classification)
    accuracy = np.mean([classif == test for classif in classification])
    print(f"The model {test} has an accuracy of {accuracy}.")

The model Mathematics has an accuracy of 1.0.
The model Psychology has an accuracy of 0.9.
The model Statistics has an accuracy of 0.7.
The model PoliticalScience has an accuracy of 0.9.
The model Biology has an accuracy of 0.6.


According to my models, Biology is the hardest to classify, while Mathematics is the easiest to classify.