In [1]:
%config IPCompleter.greedy=True

In [2]:
#Import Libraries and Data
import numpy as np
import pandas as pd
from scipy.stats import rv_discrete
from sklearn.feature_extraction.text import CountVectorizer

shakespeare=pd.read_csv('../data/external/Shakespeare_data.csv')
shakespeare.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [3]:
#Clean Data
shakespeare=shakespeare.dropna() #drop all NaN as typically represent scene transitions
obsstatesInit=shakespeare['PlayerLine'].astype(str)
obsstatesInit=obsstatesInit.str.replace(pat=r"[,.:!?]", repl=" ") #remove punctuation and place space
obsstatesInit=obsstatesInit.str.replace(pat=r"--|\t|  ", repl=" ") #remove excessive white space and replace with one space
obsstates=obsstatesInit.str.replace(pat=r"\'|\-|\[|\]|\$|\(|\)", repl="") #remove parenthesis or dashes
obsstates=obsstates.dropna() #drop any strings that are now empty
obsstates=obsstates.str.lower() #lower all capitalization
obsstates=obsstates.to_frame() #back to frame 
print(obsstates.head())
obsstates.to_csv(path_or_buf='../data/internal/Shakespeare_data.csv') #save to data file

                                      PlayerLine
3          so shaken as we are so wan with care 
4     find we a time for frighted peace to pant 
5  and breathe shortwinded accents of new broils
6        to be commenced in strands afar remote 
7      no more the thirsty entrance of this soil


In [4]:
#-------------------------Determine Amount of Unique States (words)------------------
#Figure out initial probabilities through frequency
vectorizer = CountVectorizer(token_pattern=u"(?u)\\b\\w+\\b")
corpus=obsstates.PlayerLine.values
X = vectorizer.fit_transform(corpus)
observablestates=vectorizer.get_feature_names()
StateWords=len(observablestates)
print("Number of Unique Words (aka states)")
print(StateWords)

Number of Unique Words (aka states)
26307


In [5]:
#Create Transition Matrix
initTransit=pd.Series(data=np.zeros(shape=(StateWords)),index=observablestates)
obsTransit=pd.DataFrame(np.zeros(shape=(StateWords,StateWords)),index=observablestates, columns=observablestates)

print(initTransit.head())
print(obsTransit.head())

1     0.0
10    0.0
2     0.0
2d    0.0
2s    0.0
dtype: float64
      1   10    2   2d   2s    3    4   4d    5   5s    ...      zenelophon  \
1   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...             0.0   
10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...             0.0   
2   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...             0.0   
2d  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...             0.0   
2s  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...             0.0   

    zenith  zephyrs  zir   zo  zodiac  zodiacs  zone  zounds  zwaggered  
1      0.0      0.0  0.0  0.0     0.0      0.0   0.0     0.0        0.0  
10     0.0      0.0  0.0  0.0     0.0      0.0   0.0     0.0        0.0  
2      0.0      0.0  0.0  0.0     0.0      0.0   0.0     0.0        0.0  
2d     0.0      0.0  0.0  0.0     0.0      0.0   0.0     0.0        0.0  
2s     0.0      0.0  0.0  0.0     0.0      0.0   0.0     0.0        0.0  

[5 rows x 26307

In [6]:
#For each line, 
obsstates.loc[:,'PlayerLine']=obsstates.PlayerLine.str.lower()
for row in obsstates['PlayerLine']:
    #Split row for easy counting
    new=row.split(" ")
    new=list(filter(None, new))
    # Check if a list
    if len(new)==0:
        break
    # Grab the first word and add count to series
    initTransit[new[0]]=initTransit[new[0]]+1

    #Count the transitions
    for i in range(len(new)-1):
        obsTransit.loc[new[i],new[i+1]]=obsTransit.loc[new[i],new[i+1]]+1
print(obsTransit.head())

      1   10    2   2d   2s    3    4   4d    5   5s    ...      zenelophon  \
1   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...             0.0   
10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...             0.0   
2   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...             0.0   
2d  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...             0.0   
2s  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0    ...             0.0   

    zenith  zephyrs  zir   zo  zodiac  zodiacs  zone  zounds  zwaggered  
1      0.0      0.0  0.0  0.0     0.0      0.0   0.0     0.0        0.0  
10     0.0      0.0  0.0  0.0     0.0      0.0   0.0     0.0        0.0  
2      0.0      0.0  0.0  0.0     0.0      0.0   0.0     0.0        0.0  
2d     0.0      0.0  0.0  0.0     0.0      0.0   0.0     0.0        0.0  
2s     0.0      0.0  0.0  0.0     0.0      0.0   0.0     0.0        0.0  

[5 rows x 26307 columns]


In [7]:
#Convert to probabilities
newInitTransit= initTransit.div(initTransit.sum())
print(newInitTransit.head())
#Sum each row and divide by result
newobsTransit=obsTransit.div(obsTransit.sum(axis=1), axis=0)
newobsTransit.fillna(value=0,inplace=True) #meaning the entire row was zero thus division by zero introduced NaN
print(newobsTransit.head())

1     0.000015
10    0.000015
2     0.000015
2d    0.000000
2s    0.000000
dtype: float64
      1   10    2   2d   2s    3    4   4d    5   5s    ...      zenelophon  \
1   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...             0.0   
10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...             0.0   
2   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...             0.0   
2d  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...             0.0   
2s  0.0  0.0  0.0  0.5  0.0  0.0  0.0  0.0  0.0  0.0    ...             0.0   

    zenith  zephyrs  zir   zo  zodiac  zodiacs  zone  zounds  zwaggered  
1      0.0      0.0  0.0  0.0     0.0      0.0   0.0     0.0        0.0  
10     0.0      0.0  0.0  0.0     0.0      0.0   0.0     0.0        0.0  
2      0.0      0.0  0.0  0.0     0.0      0.0   0.0     0.0        0.0  
2d     0.0      0.0  0.0  0.0     0.0      0.0   0.0     0.0        0.0  
2s     0.0      0.0  0.0  0.0     0.0      0.0   0.0     0.0     

In [8]:
#------Forward Algorithm x Viterbi with Markov Model to Generate text given the length value------------
length=10

outputSequence=pd.Series(np.zeros(shape=length),index=range(length))
outputIndices=pd.Series(np.zeros(shape=length),index=range(length))
                            
for i in range(length):
    if i == 0:
        outputSequence[i]=initTransit.max() #Return most frequent starting word probability
        outputIndices[i]=initTransit.idxmax() #Save that word
    else:
        inQuestion=outputSequence[i-1]*newobsTransit.loc[outputIndices[i-1],:]
        outputSequence[i]=inQuestion.max()
        outputIndices[i]=inQuestion.idxmax()
    print(outputIndices[i])

and
i
am
i
am
i
am
i
am
i


In [9]:
#Some checks
print(newInitTransit.loc['and'])
print(newInitTransit.loc['the'])
print(newobsTransit.loc['the','and'])
print(newobsTransit.loc['and','the'])
print(newobsTransit.loc['and', 'i'])
print(newobsTransit.loc['i','am'])
print(newobsTransit.loc['am','i'])

0.06737789432905206
0.036564651190511666
0.0
0.03240109140518418
0.03315143246930423
0.09365509316260363
0.10526315789473684


In [10]:
#----------------------------Define the Hidden States----------------------------
hiddenstates=obsstates.index

In [11]:
#-------------------------Observable hidden state calculation----------------------------
observableStateDF=pd.DataFrame(data=X.toarray(),index=hiddenstates,columns=vectorizer.get_feature_names())
#fill any NaN with 0
observableStateDF.fillna(0)

#add columns for total word sums
eachWordCount=observableStateDF.values.sum(axis=0)
print("Total Word Sums:" + str(len(eachWordCount)))

#divide each row by the sum vector to grab overall word probabilities
observableStateDF=observableStateDF.values / eachWordCount
observableStateDF=pd.DataFrame(data=observableStateDF,index=hiddenstates, columns=vectorizer.get_feature_names())
print(observableStateDF.head())


Total Word Sums:26307
     1   10    2   2d   2s    3    4   4d    5   5s    ...      zenelophon  \
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...             0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...             0.0   
5  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...             0.0   
6  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...             0.0   
7  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    ...             0.0   

   zenith  zephyrs  zir   zo  zodiac  zodiacs  zone  zounds  zwaggered  
3     0.0      0.0  0.0  0.0     0.0      0.0   0.0     0.0        0.0  
4     0.0      0.0  0.0  0.0     0.0      0.0   0.0     0.0        0.0  
5     0.0      0.0  0.0  0.0     0.0      0.0   0.0     0.0        0.0  
6     0.0      0.0  0.0  0.0     0.0      0.0   0.0     0.0        0.0  
7     0.0      0.0  0.0  0.0     0.0      0.0   0.0     0.0        0.0  

[5 rows x 26307 columns]


In [15]:
#Observation here
obs=["to","be","or","not"]
#For a little bit of fun, loop through a couple of times to see a variation depending on initial state probs~
lengths=10

In [16]:
for i in range(lengths):
    #Auto generate initial states as these are unknown
    initHState=pd.Series(data=np.random.uniform(size=len(hiddenstates)),index=hiddenstates)

    #----------Viterbi Here----------
    viterbi=pd.DataFrame(data=np.zeros(shape=(len(obs),len(hiddenstates))),index=obs,columns=hiddenstates)
    hStates=pd.Series(data=np.zeros(shape=len(obs)))

    for index,ob in enumerate(obs):
        if ob==obs[0]:
            viterbi.loc[ob]=initHState*observableStateDF.loc[:,ob]
            hStates[index]=0
        else:
            prevObs=obs[index-1]
            viterbi.loc[ob]=viterbi.loc[prevObs]*observableStateDF.loc[:,ob]*newobsTransit.loc[prevObs,ob]
            hStates[index]=viterbi.loc[ob].idxmax()
    calculatedState=hStates[len(obs)-1]
    calculatedState=int(calculatedState)
    print(calculatedState)
    print(shakespeare.loc[calculatedState,:])
    

103117
Dataline                                                      103118
Play                                                   Twelfth Night
PlayerLinenumber                                                  53
ActSceneLine                                                  1.3.95
Player                                                    SIR ANDREW
PlayerLine          will not be seen, or if she be, it's four to one
Name: 103117, dtype: object
34229
Dataline                                                 34230
Play                                                    Hamlet
PlayerLinenumber                                            19
ActSceneLine                                            3.1.64
Player                                                  HAMLET
PlayerLine          To be, or not to be: that is the question:
Name: 34229, dtype: object
34229
Dataline                                                 34230
Play                                                    Hamlet
PlayerLi

## References
Various Pandas and Matrix Code Snippets
* http://www.blackarbs.com/blog/introduction-hidden-markov-models-python-networkx-sklearn/2/9/2017
* https://stackoverflow.com/questions/25292838/applying-regex-to-a-pandas-dataframe 
* https://stackoverflow.com/questions/48474442/python-from-list-of-list-of-tokens-to-bag-of-words?rq=1
* https://stackoverflow.com/questions/47297585/building-a-transition-matrix-using-words-in-python-numpy
* https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/
* https://datascience.stackexchange.com/questions/37329/how-to-convert-an-array-of-numbers-into-probability-values
* https://stackoverflow.com/questions/26537878/pandas-sum-across-columns-and-divide-each-cell-from-that-value
* https://web.stanford.edu/~jurafsky/slp3/A.pdf