# Demo for the dataprep, modeltrain, classifier, and hmm_test modules

In [1]:
import numpy as np
import pandas as pd
import scipy.stats
from matplotlib import pyplot as plt

from CharacterFeatureExtractor import featureExtractor
from dataprep import *
from modeltrain import *
from classifier import *
from hmm_test import *
from createDataSet import *
from DrawCharacter import DrawCharacter
from PattRecClasses import HMM_TA
from hmm_gen import hmm_gen


In [2]:
'''
Multivariate Gaussian Distribution Class
'''
class multigaussD:
    """Multivariate Gaussian distribution defined by a mean vector and a covariance matrix.

    The class-level ``mean`` and ``cov`` below are the defaults an instance
    keeps when ``__init__`` rejects its arguments (it prints a message and
    returns instead of raising).
    """

    # Fallback parameters used when construction is rejected.
    mean = np.array([0])
    cov = np.array([[0]])

    def __init__(self, mu, C):
        """Store the distribution parameters after basic validation.

        Args:
            mu: 1-D array holding the mean vector.
            C: 2-D square array holding the covariance matrix. It is copied,
                so the caller's array is never modified.

        On a shape mismatch an error message is printed and the instance is
        left with the class-level default ``mean`` and ``cov``.
        """
        # Compare sizes by value. Identity (`is`) only happens to work for
        # small cached integers and rejects valid inputs of dimension > 256.
        if C.shape[0] != C.shape[1]:
            print("error, non-square covariance matrix supplied")
            return
        if mu.shape[0] != C.shape[0]:
            print("error, mismatched mean vector and covariance matrix dimensions")
            return
        self.mean = mu
        # Work on a private float copy: the regularisation below must not leak
        # into the caller's matrix, and in-place float addition would fail on
        # an integer array.
        C = np.array(C, dtype=float)
        # A zero on the diagonal means a zero-variance dimension; add a small
        # ridge to the whole diagonal in that case.
        if np.any(np.diag(C) == 0):
            C += np.eye(C.shape[0]) / 10000
        # Replace undefined (NaN) entries with 1.
        C[np.isnan(C)] = 1
        self.cov = C
        return

    def random(self, num):
        """Draw ``num`` samples; returns an array with one sample per row."""
        return np.random.multivariate_normal(self.mean, self.cov, num)

    def rand(self):
        """Draw a single sample and return it as a 1-D array."""
        return np.random.multivariate_normal(self.mean, self.cov, 1)[0]

    def likelihood(self, X):
        """Return the probability density evaluated at ``X``."""
        # allow_singular was previously passed as a bare positional `1`;
        # spell it out so the intent is visible.
        dist = scipy.stats.multivariate_normal(
            self.mean, self.cov, allow_singular=True
        )
        return dist.pdf(X)

    def loghood(self, X):
        """Return the log of the probability density evaluated at ``X``.

        Computed with ``logpdf`` rather than ``log(pdf)`` so that densities too
        small to represent as a float do not collapse to ``-inf``.
        """
        dist = scipy.stats.multivariate_normal(
            self.mean, self.cov, allow_singular=True
        )
        return dist.logpdf(X)

    def getmean(self):
        """Return the stored mean vector."""
        return self.mean

    def getcov(self):
        """Return the stored covariance matrix."""
        return self.cov
    
def prob(x, B):
    """Evaluate every emission distribution on every observation.

    Args:
        x: Sequence of observations; ``x[t]`` is passed to ``likelihood``.
        B: Sequence of objects exposing ``likelihood(obs)``, one per state.
            Any sequence supporting ``len`` and integer indexing works.

    Returns:
        A tuple ``(res, scaled)`` of arrays shaped ``(len(x), len(B))``:
        ``res[t, j]`` is ``B[j].likelihood(x[t])`` and ``scaled`` is ``res``
        with each row divided by that row's maximum.

    Note:
        A row whose likelihoods are all zero yields NaN in ``scaled`` (0/0),
        together with a NumPy RuntimeWarning.
    """
    T = len(x)
    N = len(B)
    res = np.zeros((T, N))
    for t in range(T):
        for j in range(N):
            res[t, j] = B[j].likelihood(x[t])
    if res.size == 0:
        # Nothing to normalise; also avoids taking the max of an empty axis.
        return res, np.zeros(res.shape)
    # One maximum per row, computed once rather than once per element.
    row_max = res.max(axis=1, keepdims=True)
    scaled = res / row_max
    return res, scaled


def logprob(x, B):
    """Return the element-wise natural log of both arrays produced by ``prob``.

    Args:
        x: Observations, forwarded unchanged to ``prob``.
        B: Emission distributions, forwarded unchanged to ``prob``.

    Returns:
        A tuple ``(log_res, log_scaled)`` matching the two outputs of ``prob``.
    """
    likelihoods, normalised = prob(x, B)
    log_likelihoods = np.log(likelihoods)
    log_normalised = np.log(normalised)
    return log_likelihoods, log_normalised

## Training Model

### DIFFERENT NUMBER OF STATES FOR EACH CLASS

In [3]:
# ### data prep
# db_name = "database_inc_sampchar"
# data_features = pd.read_pickle(r'data/' + db_name + '_features.cdb')
# data_labels = pd.read_pickle(r'data/' + db_name + '_labels.cdb')
# class_state_nums = np.array([2,5,5,5,5,5,5,6,5,5])

# train_data, test_data, data_labels = dataprep(db_name, nr_test=5)
# hm_learn, train_acc = modeltrain(train_data, data_labels, 13, class_state_nums, longest_sample = True, useprint=False)
# accuracies, result_labels_list = hmm_test(hm_learn,test_data, data_labels)

    

## The same number of states for all classes

In [None]:
### data prep
# Load the pickled feature and label tables for the chosen database.
db_name = "Bigdata"
data_features = pd.read_pickle(f"data/{db_name}_features.cdb")
data_labels = pd.read_pickle(f"data/{db_name}_labels.cdb")

# State counts to sweep over; every class gets the same count in a given run.
num_state_list = np.array([2, 3, 4, 5, 6, 7])

# One entry is appended to each list per state count.
train_acc_list, big_result_labels_list, test_acc_list = [], [], []

for i in num_state_list:
    print(f"++++++++++++++++++++++TRAINING OF HMM MODELS WITH {i} STATES++++++++++++++++++++++")
    # dataprep is called again on every iteration and rebinds data_labels.
    train_data, test_data, data_labels = dataprep(db_name, nr_test=10)
    # NOTE(review): the positional 15 is interpreted by modeltrain, which is
    # not visible here - confirm its meaning against that module.
    hm_learn, train_acc = modeltrain(
        train_data, data_labels, 15, i, longest_sample=True, useprint=False
    )
    accuracies, result_labels_list = hmm_test(hm_learn, test_data, data_labels)
    train_acc_list.append(train_acc)
    test_acc_list.append(accuracies)
    big_result_labels_list.append(result_labels_list)
    


#print(result_labels_list)

++++++++++++++++++++++TRAINING OF HMM MODELS WITH 2 STATES++++++++++++++++++++++

Database read is  Bigdata
Labels used are  ['A', 'C', 'K', 'P', 'X', 'T', '+', 'N', 'V', '4']
Total training samples are  20  and testing samples are  10 


 ------------ CHARACTER  A ------------
Number of states:  2
Avg probability for entire sequence over test samples is -3.5459065580283786  (log),  2.884246328739876 %
Normalized score:  288.4246328739876

 ------------ CHARACTER  C ------------
Number of states:  2
Avg probability for entire sequence over test samples is -6.32728603525559  (log),  0.17868767147434447 %
Normalized score:  17.868767147434447

 ------------ CHARACTER  K ------------
Number of states:  2
Avg probability for entire sequence over test samples is -3.2295322305693417  (log),  3.9576006905513466 %
Normalized score:  395.76006905513464

 ------------ CHARACTER  P ------------
Number of states:  2
Avg probability for entire sequence over test samples is -3.2295322305693426  (log

  scaled[i, j] = res[i, j] / np.amax(res[i])


Classification accuracy of test samples of character K is: 50.0%
Classification accuracy of test samples of character P is: 0.0%
Classification accuracy of test samples of character X is: 50.0%
Classification accuracy of test samples of character T is: 0.0%
Classification accuracy of test samples of character + is: 0.0%
Classification accuracy of test samples of character N is: 30.0%
Classification accuracy of test samples of character V is: 40.0%
Classification accuracy of test samples of character 4 is: 40.0%
++++++++++++++++++++++TRAINING OF HMM MODELS WITH 3 STATES++++++++++++++++++++++

Database read is  Bigdata
Labels used are  ['A', 'C', 'K', 'P', 'X', 'T', '+', 'N', 'V', '4']
Total training samples are  20  and testing samples are  10 


 ------------ CHARACTER  A ------------
Number of states:  3
Avg probability for entire sequence over test samples is -7.048571812755469  (log),  0.08686486647720931 %
Normalized score:  86.86486647720932

 ------------ CHARACTER  C -----------

  scaled[i, j] = res[i, j] / np.amax(res[i])


Classification accuracy of test samples of character K is: 20.0%
Classification accuracy of test samples of character P is: 60.0%
Classification accuracy of test samples of character X is: 90.0%
Classification accuracy of test samples of character T is: 20.0%
Classification accuracy of test samples of character + is: 10.0%
Classification accuracy of test samples of character N is: 30.0%
Classification accuracy of test samples of character V is: 60.0%
Classification accuracy of test samples of character 4 is: 50.0%
++++++++++++++++++++++TRAINING OF HMM MODELS WITH 4 STATES++++++++++++++++++++++

Database read is  Bigdata
Labels used are  ['A', 'C', 'K', 'P', 'X', 'T', '+', 'N', 'V', '4']
Total training samples are  20  and testing samples are  10 


 ------------ CHARACTER  A ------------
Number of states:  4
Avg probability for entire sequence over test samples is -10.141579537319135  (log),  0.003940650958873664 %
Normalized score:  39.40650958873664

 ------------ CHARACTER  C ------

  scaled[i, j] = res[i, j] / np.amax(res[i])


Classification accuracy of test samples of character A is: 90.0%
Classification accuracy of test samples of character C is: 100.0%
Classification accuracy of test samples of character K is: 40.0%
Classification accuracy of test samples of character P is: 50.0%
Classification accuracy of test samples of character X is: 80.0%
Classification accuracy of test samples of character T is: 0.0%
Classification accuracy of test samples of character + is: 70.0%
Classification accuracy of test samples of character N is: 100.0%
Classification accuracy of test samples of character V is: 20.0%
Classification accuracy of test samples of character 4 is: 80.0%
++++++++++++++++++++++TRAINING OF HMM MODELS WITH 5 STATES++++++++++++++++++++++

Database read is  Bigdata
Labels used are  ['A', 'C', 'K', 'P', 'X', 'T', '+', 'N', 'V', '4']
Total training samples are  20  and testing samples are  10 


 ------------ CHARACTER  A ------------
Number of states:  5
Avg probability for entire sequence over test sam

  scaled[i, j] = res[i, j] / np.amax(res[i])


Classification accuracy of test samples of character K is: 60.0%
Classification accuracy of test samples of character P is: 50.0%


  alpha[0, :] = (self.q * scaled[0]) / c[0]
  clog = np.log(c)


Classification accuracy of test samples of character X is: 70.0%
Classification accuracy of test samples of character T is: 0.0%
Classification accuracy of test samples of character + is: 40.0%
Classification accuracy of test samples of character N is: 90.0%
Classification accuracy of test samples of character V is: 10.0%
Classification accuracy of test samples of character 4 is: 80.0%
++++++++++++++++++++++TRAINING OF HMM MODELS WITH 6 STATES++++++++++++++++++++++

Database read is  Bigdata
Labels used are  ['A', 'C', 'K', 'P', 'X', 'T', '+', 'N', 'V', '4']
Total training samples are  20  and testing samples are  10 


 ------------ CHARACTER  A ------------
Number of states:  6
Avg probability for entire sequence over test samples is -14.91726352112945  (log),  3.322880840873165e-05 %
Normalized score:  33.228808408731645

 ------------ CHARACTER  C ------------
Number of states:  6
Avg probability for entire sequence over test samples is -15.541479452346454  (log),  1.78000504507516

In [None]:
# Load a different database and print a few values for inspection.
db_name = "database_inc_sampchar"
data_features = pd.read_pickle(f"data/{db_name}_features.cdb")
data_labels = pd.read_pickle(f"data/{db_name}_labels.cdb")

print(data_labels)
print(len(data_features))
# train_acc is not defined in this cell; it must already exist from an
# earlier training cell.
print(train_acc)
