In [1]:
%config IPCompleter.greedy=True

In [2]:
#Import Libraries and Data
import numpy as np
import pandas as pd
from scipy.stats import rv_discrete
from sklearn.feature_extraction.text import CountVectorizer

shakespeare=pd.read_csv('../data/external/Shakespeare_data.csv')
shakespeare.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [3]:
#Clean Data
shakespeare=shakespeare.dropna(subset=['Player'])
shakespeare=shakespeare.dropna(subset=['PlayerLine'])
print(shakespeare.head())
print(shakespeare.shape)

   Dataline      Play  PlayerLinenumber ActSceneLine         Player  \
3         4  Henry IV               1.0        1.1.1  KING HENRY IV   
4         5  Henry IV               1.0        1.1.2  KING HENRY IV   
5         6  Henry IV               1.0        1.1.3  KING HENRY IV   
6         7  Henry IV               1.0        1.1.4  KING HENRY IV   
7         8  Henry IV               1.0        1.1.5  KING HENRY IV   

                                       PlayerLine  
3          So shaken as we are, so wan with care,  
4      Find we a time for frighted peace to pant,  
5  And breathe short-winded accents of new broils  
6         To be commenced in strands afar remote.  
7       No more the thirsty entrance of this soil  
(111389, 6)


In [4]:
#----------------------------Define the Hidden States and Transitions Between----------------------------
hiddenstates=shakespeare.ActSceneLine.unique()

#Auto Generate initial state
initialStateDist=pd.Series(data=np.random.uniform(size=len(hiddenstates)),index=hiddenstates)

#Auto Generate the transitions between these states
hiddenStateProb=pd.DataFrame(np.random.uniform(size=(len(hiddenstates),
                                                     len(hiddenstates))),columns=hiddenstates, index=hiddenstates)
hiddenStateProb.head()

Unnamed: 0,1.1.1,1.1.2,1.1.3,1.1.4,1.1.5,1.1.6,1.1.7,1.1.8,1.1.9,1.1.10,...,4.4.920,4.4.921,4.4.922,4.4.923,4.4.924,4.4.925,4.4.926,4.4.927,4.4.928,4.4.929
1.1.1,0.09518,0.414167,0.378004,0.769673,0.51099,0.601169,0.603081,0.159358,0.219004,0.877844,...,0.865916,0.528664,0.295282,0.138024,0.392683,0.346478,0.222289,0.711241,0.170063,0.643743
1.1.2,0.284495,0.154968,0.882633,0.054388,0.107948,0.367598,0.427567,0.486834,0.15339,0.089449,...,0.419613,0.388491,0.463514,0.209003,0.522351,0.377687,0.132087,0.642959,0.436291,0.660961
1.1.3,0.034839,0.826555,0.363361,0.711913,0.912885,0.958149,0.110935,0.217932,0.752278,0.299046,...,0.414823,0.106982,0.810081,0.922351,0.866484,0.270458,0.281048,0.304293,0.533485,0.197409
1.1.4,0.637037,0.854455,0.903073,0.588968,0.871993,0.2094,0.232473,0.051833,0.95401,0.932097,...,0.276138,0.32931,0.790561,0.867215,0.31356,0.675387,0.733827,0.425965,0.468025,0.209919
1.1.5,0.967525,0.017826,0.430729,0.164976,0.162447,0.030287,0.613126,0.310196,0.370726,0.919195,...,0.14276,0.727371,0.563233,0.778693,0.493363,0.957392,0.492526,0.381325,0.142486,0.374152


In [5]:
#-------------------------Observable state calculation----------------------------
obsstates=shakespeare[['ActSceneLine','PlayerLine']]
obsstates['PlayerLine2']=obsstates.PlayerLine.str.replace(pat='[,.:]', repl='').astype(str)
#Figure out initial probabilities through frequency
vectorizer = CountVectorizer()
corpus=obsstates['PlayerLine2'].values
X = vectorizer.fit_transform(corpus)
X.toarray() #To grab this value, vectorizer.get_feature_names()
observablestates=vectorizer.get_feature_names()
print(len(observablestates))
observableStateDF=pd.DataFrame(data=X.toarray(),columns=vectorizer.get_feature_names())
print(observableStateDF.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


22572
   10  2d  2s  4d  5s  6d  8d  aaron  abaissiez  abandon    ...      \
0   0   0   0   0   0   0   0      0          0        0    ...       
1   0   0   0   0   0   0   0      0          0        0    ...       
2   0   0   0   0   0   0   0      0          0        0    ...       
3   0   0   0   0   0   0   0      0          0        0    ...       
4   0   0   0   0   0   0   0      0          0        0    ...       

   zenelophon  zenith  zephyrs  zir  zo  zodiac  zodiacs  zone  zounds  \
0           0       0        0    0   0       0        0     0       0   
1           0       0        0    0   0       0        0     0       0   
2           0       0        0    0   0       0        0     0       0   
3           0       0        0    0   0       0        0     0       0   
4           0       0        0    0   0       0        0     0       0   

   zwaggered  
0          0  
1          0  
2          0  
3          0  
4          0  

[5 rows x 22572 columns]


In [6]:
#Observation here
obs=['to', 'be', 'or', 'not']

In [11]:
#----Viterbi-----

#Initialize viterbi probs
v=pd.DataFrame(np.zeros(shape=(len(hiddenstates),len(hiddenstates))),columns=hiddenstates, index=hiddenstates)
v.iloc[:,0]=initialStateDist*observableStateDF.iloc[:,observableStateDF.columns.get_loc(obs[0])]

#Initialize path indexes
B=pd.DataFrame(np.zeros(shape=(len(hiddenstates),len(hiddenstates))),columns=hiddenstates, index=hiddenstates)
B.iloc[:,0]=0

#Induction
#Based on http://practicalcryptography.com/miscellaneous/machine-learning/hidden-markov-model-hmm-tutorial/ 
#and http://www.blackarbs.com/blog/introduction-hidden-markov-models-python-networkx-sklearn/2/9/2017
def probs(current,viterb,bvals,hiddenTransit,stateOfObs,obs):
    if current==1: #go back through obs starting at 2
        return
    probs(current-1,viterb,bvals,hiddenTransit,stateOfObs,obs)
    for statej, row in enumerate(hiddenStateProb.itertuples(index=False)):
        calculation=viterb.iloc[current-1,:]*hiddenTransit.iloc[:,statej]
        viterb.iloc[current,statej]= max(calculation)*stateOfObs.iloc[statej,stateOfObs.columns.get_loc(obs[current])]
        bvals.iloc[current,statej]=(calculation).idxmax()

# Recurse through
pathoutcomeprob=probs(len(obs)-1,v,B,hiddenStateProb,observableStateDF,obs)
thestate=B.iloc[len(obs)-1].idxmax()

print(thestate)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86


KeyboardInterrupt: 

In [None]:
#------FWD, BKWD------
#Based on http://practicalcryptography.com/miscellaneous/machine-learning/hidden-markov-model-hmm-tutorial/
# and https://github.com/alexsosn/MarslandMLAlgo/blob/master/Ch16/HMM.py
#Initialize fwd probs
fwda=pd.DataFrame(np.zeros(shape=(len(hiddenstates),len(hiddenstates))),columns=hiddenstates, index=hiddenstates)
fwda.iloc[:,0]=initialStateDist*observableStateDF.iloc[:,observableStateDF.columns.get_loc(obs[0])]

def probs(current,fwda,hiddenTransit,stateOfObs,obs):
    if current==1: #go back through obs starting at 2
        return
    probs(current-1,fwda,bvals,hiddenTransit,stateOfObs,obs)
    for statej, row in enumerate(hiddenStateProb.itertuples(index=False)):
        calculation=fwda.iloc[current-1,:]*hiddenTransit.iloc[:,statej]
        fwda.iloc[current,statej]= sumß(calculation)*stateOfObs.iloc[statej,stateOfObs.columns.get_loc(obs[current])]

# Recurse through
pathoutcomeprob=probs(len(obs)-1,fwda,hiddenStateProb,observableStateDF,obs)
summation=sum(fwda.iloc[len(obs)-1])

#BKWD
#Based on
https://en.wikipedia.org/wiki/Forward%E2%80%93backward_algorithm#Backward_probabilities

## References
Various Pandas and Matrix Code Snippets
* http://www.blackarbs.com/blog/introduction-hidden-markov-models-python-networkx-sklearn/2/9/2017
* https://stackoverflow.com/questions/25292838/applying-regex-to-a-pandas-dataframe 
* https://stackoverflow.com/questions/48474442/python-from-list-of-list-of-tokens-to-bag-of-words?rq=1
* https://stackoverflow.com/questions/47297585/building-a-transition-matrix-using-words-in-python-numpy
* https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/
* https://datascience.stackexchange.com/questions/37329/how-to-convert-an-array-of-numbers-into-probability-values
* https://stackoverflow.com/questions/26537878/pandas-sum-across-columns-and-divide-each-cell-from-that-value

## Unused Code Snippets
#Calculate Initial Probabilities for observation states (words) and hidden states (player)

#First, observation states
obsstates=shakespeare[['Player','PlayerLine']]
#Parse out player line
obsstates['PlayerLine2']=obsstates.PlayerLine.str.replace(pat='[,.:]', repl='').astype(str)

#Figure out initial probabilities through frequency
vectorizer = CountVectorizer()
corpus=obsstates['PlayerLine2'].values
X = vectorizer.fit_transform(corpus)
X.toarray() #To grab this value, vectorizer.get_feature_names()
toDivide=len(vectorizer.get_feature_names())
initProb=X/toDivide
print(initProb.toarray())
print(vectorizer.get_feature_names())
