In [2]:
import pandas as pd
import numpy as np

In [3]:
raw_data = pd.read_csv('./SouthParkData-master/All-seasons.csv')
raw_data.head()

Unnamed: 0,Season,Episode,Character,Line
0,10,1,Stan,"You guys, you guys! Chef is going away. \n"
1,10,1,Kyle,Going away? For how long?\n
2,10,1,Stan,Forever.\n
3,10,1,Chef,I'm sorry boys.\n
4,10,1,Stan,"Chef said he's been bored, so he joining a gro..."


In [4]:
raw_data['Line'][4]

"Chef said he's been bored, so he joining a group called the Super Adventure Club. \n"

## We're going to attempt to implement the Viterbi algorithm
From https://en.wikipedia.org/wiki/Viterbi_algorithm

### Here's what we need:

#### parameters:

K => number of hidden states

T => length of sequence of observations

N => number of possible observations 

S => the "state space", i.e. all possible words (s1, s2, ... , sK)

Priors => an array of prior probabilities for each state (ie. how likely is a word to occur w/o context?)

Transition Matrix A => K x K matrix where A[i, j] stores probability of transitioning from si to sj

Emission Matrix   B => K x N matrix where B[i, j] stores probability of observing oj from state si
    
#### output:

X - a sequence of states (x1, x2, ..., xT)

Let's first see how large of a vocabulary we're working with

I'm going to include all individual words, plus some basic punctuation like ',' '.' '!' and '?'

In [34]:
# Takes a string, tokenizes it 
# Returns a list of the tokens
def tokenize_str(string):
    
    string = string.lower()
    
    # remove punctuation except for newline
    string = string.replace(',', '')
    string = string.replace('.', '')
    string = string.replace('!', '')
    string = string.replace('?', '')
    string = string.replace('-', '')
    
    # split string and add newline at end
    string = string.split()
    string += ['\n']
    
    return string

print(tokenize_str(raw_data['Line'][4]))

['chef', 'said', "he's", 'been', 'bored', 'so', 'he', 'joining', 'a', 'group', 'called', 'the', 'super', 'adventure', 'club', '\n']


In [6]:
# first I think I want one array of the entire corpus, this will be useful for all 3 of these remaining variables

def get_corpus(dialogue):
    # dialogue <=> a column of strings in a DataFrame

    return [word for sentence in dialogue for word in tokenize_str(sentence)]

corpus = get_corpus(raw_data['Line'][:500])

We would have a vocab of 32834 words, but I'm going to just use the first 500 lines of dialogue for this first go. We'll see if we can feasibly increase it later.

Since we're trying to generate text from this corpus, our # of observations N will be the same as K (vocab_size)

Now we have our states, and also our # of states K, and our # of observations N. In our application, K == N


In [7]:
states = list(set(corpus))
vocab_size = len(states)
corpus_size = len(corpus)
print("corpus size: {cs} \nvocab size: {vs}".format(cs=corpus_size, vs=vocab_size))

corpus size: 5837 
vocab size: 1181


1181 words will be more manageable

We still need the transition matrix A, our emission matrix B, and our array of priors for each observation

Let's do our array of priors, which I believe is simply the frequency of the words in the corpus

In [8]:
import collections # <= for counting the frequency of each word in the corpus (O(n) implementation)

def get_priors(corpus):
    counter = collections.Counter(corpus)
    words = list(counter.keys())
    probabilities = [c/len(corpus) for c in list(counter.values())]
    return dict(zip(words, probabilities))

priors = get_priors(corpus)

# now we have a priors array where the prior probability of word states[i] = priors[i]
print(priors['\n'])

0.08566044200788075


In [9]:
# I want the counts given a word for the next step so I'll define that here
def get_counts(corpus):
    counter = collections.Counter(corpus)
    return dict(zip(list(counter.keys()), list(counter.values())))

In [10]:
# I'm currently confused about what the transition matrix and emission matrix are, and if they're even different. 
# Since we only have the corpus to work with, (I assume this is where the transition matrix comes from),
# as long as I decide to include all words of the corpus in the observation space, the emission matrix
# should be the same as the transition matrix...? I could be wrong on this, but I'm going to roll with it

# So I'm going to essentially compute the probability of transitioning from one word to another w/o further context

def get_transition_matrix (corpus, states):
    K = len(states)
    N = len(corpus)
    
    # initialize with zeros
    transition_matrix = np.zeros((K, K))
    
    state_index = {word:i for i, word in enumerate(states)}

    for word_index in range(1, N):
        to = corpus[word_index]
        frm = corpus[word_index-1]
        
        transition_matrix[state_index[frm]][state_index[to]] += 1
    
    counts = get_counts(corpus)
    for frm in range(K):
        for to in range(K):
            transition_matrix[frm][to] /= counts[states[frm]]
            
    return transition_matrix
        
transition_matrix = get_transition_matrix(corpus, states)

In [11]:
i = 17
print(max(transition_matrix[i]))
print(states[i])
print(get_counts(corpus)[states[i]])

1.0
brains
1


# I messed up
For some reason I thought I'd be able to generate text using the Viterbi algorithm. It's actually the Forward-Backward algorithm I want to implement - we might try to use Viterbi later 

Luckily, the work done so far is still useful

So, 
# Implementing the Forward-Backward algorithm
From https://en.wikipedia.org/wiki/Forward%E2%80%93backward_algorithm



In [12]:
def forward_backward(observations, states, priors, transition_matrix, emission_matrix, end_state):
    '''
    Forward-Backward Algorithm
    
    observations - sequence of observations (list of strings) : this is the corpus
    states - set of hidden states (list of unique strings)
    priors - dictionary of prior probabilities of each state (string -> prior prob)
    transition_matrix - tm[i][j] gives prob of transitioning from states[i] to states[j]
    emission_matrix - em[i][j] gives prob of emitting observation j given hidden state states[i]
    '''
    
    
    K = len(observations)
    S = len(states)
    
    # forward algorithm
    forward = []
    
    # make a pass thru the observations to compute joint prob P(states[k], observations[1:k])
    for i, observation in enumerate(observations):
        f_current = {}
        for s in range(S):
            if i == 0:
                prev_sum = priors[states[s]]
            else:
                prev_sum = sum(f_prev[states[k]] * transition_matrix[k][s] for k in range(S))
            f_current[states[s]] = emission_matrix[s][states.index(observations[i])] * prev_sum

        forward.append(f_current)
        f_prev = f_current
        
    p_forward = sum(f_current[states[k]] * transition_matrix[k][states.index(end_state)] for k in range(S))
    
    # backward algorithm
    backward = []

    for i, observation in enumerate(reversed(observations[1:] + [None,])):
        b_current = {}
        for s in range(S):
            if i == 0:
                b_current[states[s]] = transition_matrix[s][states.index(end_state)]
            else:
                b_current[states[s]] = sum(transition_matrix[s][l] * emission_matrix[l][states.index(observations[i])] * b_prev[states[l]] for l in range(S))
        
        backward.insert(0, b_current)
        b_prev = b_current
    
    p_backward = sum(priors[states[l]] * emission_matrix[l][states.index(observations[0])] * b_current[states[l]] for l in range(S))
    
    posterior = []
    for i in range(len(observations)):
        posterior.append({state: forward[i][state] * backward[i][state] / p_forward for state in states})
        
    # assert p_forward == p_backward
    return forward, backward, posterior

small_corpus = get_corpus(raw_data['Line'][:50])
small_states = list(set(small_corpus))
small_priors = get_priors(small_corpus)
small_transition_matrix = get_transition_matrix(small_corpus, small_states)
print(len(small_corpus), len(small_states))


fwd, bkwd, post = forward_backward(small_corpus, small_states, small_priors, small_transition_matrix, small_transition_matrix.transpose(1,0), '\n')
print(max(post[1].values()))

428 174
nan


  posterior.append({state: forward[i][state] * backward[i][state] / p_forward for state in states})


### I'm having trouble keeping track of all the states vs. indices of the states so I'm going to make a HMM class

#### hopefully this improves understanding & clean up the code a little

In [19]:
class HMM():
    def __init__(self, corpus):
        self.Corpus = corpus
        self.States = self.get_states()
        self.Observations = self.States
        self.TransitionMatrix = self.dictify(self.get_transition_matrix())
        self.EmissionMatrix = self.dictify(self.get_transition_matrix())
        self.Priors = self.get_priors()
        
        
    def dictify(self, matrix2d):
        return {frm: {to: matrix2d[i][j] for j, to in enumerate(self.States)} for i, frm in enumerate(self.States)}
    
    
    # self.Corpus is assumed to be an ordered list of all words in the corpus 
    def get_states(self):
        return list(set(self.Corpus))
    
    
    def get_counts(self):
        counter = collections.Counter(self.Corpus)
        return dict(zip(list(counter.keys()), list(counter.values())))
        
        
    def get_transition_matrix(self):
        K = len(self.States)
        N = len(self.Corpus)

        # initialize with zeros
        transition_matrix = np.zeros((K, K))
        
        # iterate over observations and increase word transition counts
        for word_index in range(1, N):
            to = self.States.index(self.Corpus[word_index])
            frm = self.States.index(self.Corpus[word_index-1])

            transition_matrix[frm][to] += 1

        # divide by word count
        counts = self.get_counts()
        for frm in range(K):
            for to in range(K):
                transition_matrix[frm][to] /= counts[self.States[frm]]

        return self.laplace_smooth(transition_matrix, K)
    
    
    def get_priors(self):
        counter = collections.Counter(corpus)
        words = list(counter.keys())
        probabilities = [c/len(corpus) for c in list(counter.values())]
        return dict(zip(words, probabilities))
    
    # trying laplace smoothing after my forward-backward algorithm kept
    # running into division by zero
    def laplace_smooth(self, matrix, total_count):
        matrix[matrix != 0] = matrix[matrix != 0] + (1 / total_count)
        matrix[matrix == 0] = 1 / total_count
        return matrix

In [20]:
small_corpus = get_corpus(raw_data['Line'][:30])
Model = HMM(small_corpus)

In [21]:
print(Model.States)
print(Model.TransitionMatrix)


['is', 'away', 'back', 'said', "can't", 'guys', 'he', 'reverse', 'there', 'life', 'hope', 'meaning', 'great', 'bored', 'him', 'and', 'our', 'making', 'to', "it's", 'adventure', 'sorry', 'forever', 'tells', 'adventuring', 'draw', 'dude', 'questions', 'gonna', 'you', 'called', 'right', 'group', 'good-bye', 'time', 'are', 'fatass', 'so', 'going', 'all', 'been', 'joining', 'have', "don't", 'go', 'heart', 'here', 'children', 'your', 'around', 'do', 'for', 'but', 'two', 'a', 'think', 'iand', 'good', "i'm", 'well', 'i', 'on', 'how', 'card', 'long', 'boys', 'club', 'get', 'tell', 'wow', 'must', 'fuhf-ffriend', "i'll", 'answer', 'believe', '\n', 'it', 'choice', 'bye-bye', 'know', 'kind', 'true', 'the', 'what', 'super', "he's", 'hello', 'yeah', "you're", 'of', "that's", "what's", 'we', 'will', 'world', 'was', 'chef', 'why', 'jew', 'with', 'miss']
{'is': {'is': 0.009900990099009901, 'away': 0.009900990099009901, 'back': 0.009900990099009901, 'said': 0.009900990099009901, "can't": 0.00990099009900

In [33]:
def forward_backward(HMM, observations, end_state):
    
    # forward pass
    forward = []
    f_prev = {}
    
    # make a pass thru the observations to compute joint prob P(states[k], observations[1:k])
    for i, observation in enumerate(HMM.Observations):
        f_current = {}
        for state in HMM.States:
            if i == 0:
                prev_sum = HMM.Priors[state]
            else:
                prev_sum = sum(f_prev[k] * HMM.TransitionMatrix[k][state] for k in HMM.States)
            
            f_current[state] = HMM.EmissionMatrix[state][observation] * prev_sum

        forward.append(f_current)
        f_prev = f_current
        
    p_forward = sum(f_current[k] * HMM.TransitionMatrix[k][end_state] for k in HMM.States)
    print(p_forward)
    
        
    # backward algorithm
    backward = []

    for i, observation in enumerate(reversed(HMM.Observations[1:] + [None,])):
        b_current = {}
        for state in HMM.States:
            if i == 0:
                b_current[state] = HMM.TransitionMatrix[state][end_state]
            else:
                b_current[state] = sum(HMM.TransitionMatrix[state][l] * HMM.EmissionMatrix[l][observation] * b_prev[l] for l in HMM.States)
        
        backward.insert(0, b_current)
        b_prev = b_current
    
    p_backward = sum(HMM.Priors[l] * HMM.EmissionMatrix[l][HMM.Observations[0]] * b_current[l] for l in HMM.States)
    
    posterior = []
    for i in range(len(HMM.Observations)):
        posterior.append({state: forward[i][state] * backward[i][state] / p_forward for state in HMM.States})
        
    return posterior

In [24]:
medium_corpus = get_corpus(raw_data['Line'][:50])
MedModel = HMM(medium_corpus)
post = forward_backward(MedModel, medium_corpus, '\n')

3.4688125972193125e-301


In [30]:
import operator

def generate_text(posteriors):
    wordPath = []
    for item in posteriors:
        highest_value = max(item.items(), key=operator.itemgetter(1))[0]
        wordPath.append(highest_value)

    for word in wordPath:
        print(word, end=' ')

generate_text(post)

is going you buy chef see i you you so south 
 hello uh of i the well a like it 
 i'm gonna miss chef well hope you're welcome to the super adventure club 
 school that heart think adventuring 
 everybody of questions do you group that's right 
 great to why card bored very hey everybody will he's he so your still can club you i with another in but in gonna get your you live here there children chef i- you're what you guess adventuring can 
 questions do care chef 
 draw back for have you and for oh finally 
 was 
 we'll 
 really be you me 
 iand come dude two happy so interesting sorry adventure i'll to 
 you our 
 sound like 
 also can't 
 yeah 
 get moved back don't friends what they it's you around just the said 
 
 believe you're kind 
 feeling 
 why are we gonna park chef 
 life chef you you're you time gonna answer 

#### Alright... interesting... Let's try doubling the corpus size

In [31]:
corpus_2 = get_corpus(raw_data['Line'][:100])
Model2 = HMM(corpus_2)
post = forward_backward(Model2, corpus_2, '\n')

generate_text(post)

0.0
arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest arrest ar

  posterior.append({state: forward[i][state] * backward[i][state] / p_forward for state in HMM.States})


# :(
I'm getting this consistent problem where the forward pass reduces p_forward to zero 

I'm currently having this crazy confusion about the distinction between the transition matrix and the emission matrix. Aren't they the same in this application?

Let's try Viterbi for text prediction now

In [42]:
def viterbi(HMM, observations):
    O = len(observations)
    K = len(HMM.States)
    
    dynamic = np.zeros((K, O))
    
    # get each state's probability for observation zero
    for state in HMM.States:
        dynamic = HMM.Priors[state] * HMM.EmissionMatrix[state][observations[0]]
    
    for o in range(1, O):
        for state in HMM.States:
            state_index = HMM.States.index(state)
            k_max = np.argmax(dynamic[state_index][o-1] 
                                        * HMM.TransitionMatrix[k][state] 
                                        * HMM.EmissionMatrix[state][observations[o]] for k in states)
            print(k_max)
            k_max_state = HMM.States[k_max]
            print(k_max_state)
            dynamic[state_index][o] = (dynamic[k_max][o-1] 
                                                   * HMM.TransitionMatrix[k_max_state][state] 
                                                   * HMM.EmissionMatrix[state][observations[o]])
    
    best_path = []
    for o in range(-1, -O, -1):
        print(o)
        

viterbi(MedModel, ['everybody', 'will', 'still'])

0
is


IndexError: invalid index to scalar variable.

In [246]:
a = np.array([0, 0, 0, 1, 1,1,2,3,4,4,5])
print({'a': 'b'}.items())
print(list(np.where(a == 1)[0]))

dict_items([('a', 'b')])
[3, 4, 5]
