# Demo for data_prep, hmm_training, classifier, and hmm_test

In [1]:
import numpy as np
import pandas as pd
import scipy.stats
from matplotlib import pyplot as plt

from CharacterFeatureExtractor import featureExtractor
from dataprep import *
from modeltrain import *
from classifier import *
from hmm_test import *
from createDataSet import *
from DrawCharacter import DrawCharacter
from PattRecClasses import HMM_TA
from hmm_gen import hmm_gen


In [2]:
class multigaussD:
    """Multivariate Gaussian distribution with a fixed mean and covariance.

    The constructor validates the arguments and stores a regularised private
    copy of the covariance matrix. When validation fails, an error message is
    printed and the instance keeps the class-level fallback ``mean`` / ``cov``
    (no exception is raised).
    """

    # Class-level fallbacks used by instances whose constructor arguments
    # failed validation.
    mean = np.array([0])
    cov = np.array([[0]])

    def __init__(self, mu, C):
        """Store the mean vector ``mu`` and a cleaned copy of covariance ``C``.

        Parameters
        ----------
        mu : array-like, shape (d,)
            Mean vector.
        C : array-like, shape (d, d)
            Covariance matrix. It is never modified; a float copy is stored.

        Cleaning applied to the stored copy, in this order:
        1. if any diagonal entry is exactly 0, ``1/10000`` is added to every
           diagonal entry;
        2. every NaN entry is replaced by 1.
        """
        mu = np.asarray(mu)
        # Private float copy: keeps the caller's array untouched and lets
        # integer-typed matrices receive the fractional diagonal jitter.
        C = np.array(C, dtype=float)
        # Compare dimensions by value; identity (`is`) only happens to work
        # for small integers.
        if C.ndim != 2 or C.shape[0] != C.shape[1]:
            print("error, non-square covariance matrix supplied")
            return
        if mu.shape[0] != C.shape[0]:
            print("error, mismatched mean vector and covariance matrix dimensions")
            return
        self.mean = mu
        if np.any(np.diag(C) == 0):
            C += np.eye(C.shape[0]) / 10000
        C[np.isnan(C)] = 1
        self.cov = C

    def _frozen(self):
        """Return the frozen scipy distribution for the stored parameters."""
        # allow_singular=True so that singular covariance matrices are accepted.
        return scipy.stats.multivariate_normal(
            self.mean, self.cov, allow_singular=True
        )

    def random(self, num):
        """Draw ``num`` samples; returns an array of shape (num, d)."""
        return np.random.multivariate_normal(self.mean, self.cov, num)

    def rand(self):
        """Draw a single sample; returns an array of shape (d,)."""
        return np.random.multivariate_normal(self.mean, self.cov, 1)[0]

    def likelihood(self, X):
        """Return the probability density evaluated at ``X``."""
        return self._frozen().pdf(X)

    def loghood(self, X):
        """Return the log probability density evaluated at ``X``.

        Computed directly in log space so that points far from the mean give
        a finite value instead of ``log(0) = -inf`` caused by pdf underflow.
        """
        return self._frozen().logpdf(X)

    def getmean(self):
        """Return the stored mean vector."""
        return self.mean

    def getcov(self):
        """Return the stored (cleaned) covariance matrix."""
        return self.cov
    
def prob(x, B):
    """Evaluate every distribution in ``B`` on every observation in ``x``.

    Parameters
    ----------
    x : sequence of observations, indexed along its first axis.
    B : sequence (list or array) of objects exposing ``likelihood(obs)``.

    Returns
    -------
    res : ndarray, shape (len(x), len(B))
        ``res[t, j] = B[j].likelihood(x[t])``.
    scaled : ndarray, same shape
        ``res`` with each row divided by that row's maximum. A row whose
        maximum is 0 yields NaN (with a NumPy runtime warning).
    """
    T = len(x)
    N = len(B)
    res = np.zeros((T, N))
    for t in range(T):
        obs = x[t]
        for j in range(N):
            res[t, j] = B[j].likelihood(obs)

    scaled = np.zeros(res.shape)
    if N > 0:
        # One maximum per row, computed once instead of once per element.
        row_peak = res.max(axis=1, keepdims=True)
        scaled = res / row_peak
    return res, scaled


def logprob(x, B):
    """Return the element-wise natural log of both arrays produced by ``prob(x, B)``.

    The first returned array is the log of the raw likelihoods, the second
    the log of the row-scaled likelihoods.
    """
    likelihoods, normalised = prob(x, B)
    log_likelihoods = np.log(likelihoods)
    log_normalised = np.log(normalised)
    return log_likelihoods, log_normalised

## Training Model

In [3]:
### data prep
# Load the pickled feature and label databases for `db_name` from the data/ directory.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only point it at database files from a trusted source.
db_name = "database_inc_sampchar"
data_features = pd.read_pickle(r'data/' + db_name + '_features.cdb')
data_labels = pd.read_pickle(r'data/' + db_name + '_labels.cdb')
# Ten entries; presumably one HMM state count per label, in label order
# (the printed log reports 2 states for 'A' and 5 for 'C') — TODO confirm.
class_state_nums = np.array([2,5,5,5,5,5,5,6,5,5])

# NOTE(review): the `data_labels` loaded above is replaced by dataprep's third
# return value here, and `data_features` is not used again in this cell.
train_data, test_data, data_labels = dataprep(db_name, nr_test=5)
# NOTE(review): the meaning of the positional `13` is not visible in this
# notebook (the commented-out cell below passes 20 in the same position) —
# confirm against modeltrain and consider naming it.
hm_learn, train_acc = modeltrain(train_data, data_labels, 13, class_state_nums, longest_sample = True, useprint=False)
# Evaluate the trained models on the held-out test data.
accuracies, result_labels_list = hmm_test(hm_learn,test_data, data_labels)

    


Database read is  database_inc_sampchar
Labels used are  ['A', 'C', 'K', 'P', 'X', 'T', '+', 'N', 'V', '4']
Total training samples are  15  and testing samples are  5 


 ------------ CHARACTER  A ------------
Number of states:  2
Avg probability for entire sequence over test samples is -3.297579721394798  (log),  3.697254305555696 %
Success score:  369.7254305555696

 ------------ CHARACTER  C ------------
Number of states:  5
Avg probability for entire sequence over test samples is -12.728209172857998  (log),  0.00029662584726259983 %
Success score:  29.662584726259983

 ------------ CHARACTER  K ------------
Number of states:  5
Avg probability for entire sequence over test samples is -16.376561683499343  (log),  7.722352614732563e-06 %
Success score:  0.7722352614732564

 ------------ CHARACTER  P ------------
Number of states:  5
Avg probability for entire sequence over test samples is -13.666182143525589  (log),  0.00011610542335769848 %
Success score:  11.610542335769848

 ----

In [4]:
# ### data prep
# db_name = "database_inc_sampchar"
# data_features = pd.read_pickle(r'data/' + db_name + '_features.cdb')
# data_labels = pd.read_pickle(r'data/' + db_name + '_labels.cdb')

# num_state_list = np.array([2, 3, 4, 5, 6, 7])
# train_acc_list = []
# big_result_labels_list = []
# test_acc_list = []
# for i in num_state_list:
#     print("++++++++++++++++++++++TRAINING OF HMM MODELS WITH " + str(i) + " STATES++++++++++++++++++++++")
#     train_data, test_data, data_labels = dataprep(db_name, nr_test=5)
#     hm_learn, train_acc = modeltrain(train_data, data_labels, 20, i, longest_sample= True, useprint=False)
#     accuracies, result_labels_list = hmm_test(hm_learn,test_data, data_labels)
#     train_acc_list.append(train_acc)
#     test_acc_list.append(accuracies)
#     big_result_labels_list.append(result_labels_list)
    


# #print(result_labels_list)

In [5]:
# Reload the same feature/label databases as the training cell and print a
# quick summary.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load database files from a trusted source.
db_name = "database_inc_sampchar"
data_features = pd.read_pickle(r'data/' + db_name + '_features.cdb')
data_labels = pd.read_pickle(r'data/' + db_name + '_labels.cdb')
print(data_labels)
print(len(data_features))
# `train_acc` is not defined in this cell: it comes from the modeltrain call
# in the training cell, which must have been run first.
print(train_acc)


['A', 'C', 'K', 'P', 'X', 'T', '+', 'N', 'V', '4']
10
[-3.297579721394798, -12.728209172857998, -16.376561683499343, -13.666182143525589, -11.071580998280288, -13.43206417504319, -12.183900861745903, -22.815841376115092, -10.00731780907363, -12.820583907986208]
