#### Markovian Analysis of text
The functions for the Markovian analysis of the text are from the book *Think Python*.

In [1]:
import string
import numpy as np
import os

In [2]:
def word_analysis(file_name):
    """Read a text file and return its words as a list of lowercase strings.

    Every '-' is removed from each line, the line is split on whitespace,
    and each token is stripped of leading/trailing whitespace and
    punctuation and then lowercased. A token made up only of punctuation
    therefore ends up in the list as an empty string.

    Prints the number of lines that were read.

    file_name: path of the text file to read

    returns: list of strings, in the order they appear in the file

    raises: OSError if the file cannot be opened (a notice is printed first)
    """
    try:
        fin = open(file_name)
    except OSError:
        # Report the failure, then surface the real error. Falling through
        # here would hit the unbound `fin` below and hide the actual cause.
        print('The file can not be opened')
        raise

    # The context manager closes the file even if reading raises.
    with fin:
        lines = fin.readlines()
    print('Number of lines of file "%s" is %d' %(file_name, len(lines)))

    # Hoisted: the set of characters to strip is the same for every word.
    strip_chars = string.whitespace + string.punctuation
    word_list = []
    for line in lines:
        # NOTE(review): hyphens are deleted rather than replaced by a space,
        # so "well-known" becomes "wellknown" and "a--b" becomes "ab".
        # Confirm this joining is intended.
        line = line.replace('-', '')
        for word in line.split():
            word_list.append(word.strip(strip_chars).lower())
    return word_list

In [3]:
def shift(t, word):
    """Slide a prefix forward by one word.

    t: tuple of strings (the current prefix)
    word: string to place at the end

    returns: new tuple holding every element of t except the first,
    followed by word
    """
    remainder = t[1:]
    return remainder + (word,)

In [4]:
def word_process(word_list, order=2):
    """Build a map from each prefix to the words that follow it.

    The first `order` words only fill up the prefix. From then on, each
    word is recorded as a suffix of the current prefix, and the prefix is
    advanced by one word with shift().

    word_list: iterable of strings
    order: number of words in a prefix

    returns: dict mapping a tuple of strings (prefix) to a list of strings
    (every word seen directly after that prefix, in order, with repeats)
    """
    followers = {}
    window = ()

    for current in word_list:
        if len(window) >= order:
            # setdefault creates the list the first time a prefix is seen
            followers.setdefault(window, []).append(current)
            window = shift(window, current)
        else:
            # still collecting the initial prefix
            window += (current, )

    return followers
        

In [5]:
# Smoke test: build the order-2 suffix map for one book and report its size.
file_name = './Jane_Austin/Emma.txt'
emma_words = word_analysis(file_name)
sdics = word_process(emma_words, order=2)
print('the length of suffix map dictionary: %d' %len(sdics))
# sdics
# print(len(word_analysis(file_name)))
# words = ['this', 'that', 'which', 'element', 'those', 'coffee']
# prefix = ('a', 'b')
# dls = {}
# word = 'this'
# for word in words:
#     try:
#         dls[prefix].append(word)
#     except:
#         dls[prefix] = [word]
    
# print(dls)
# for key in sdics.keys():
#     print(key, sdics[key])

Number of lines of file "./Jane_Austin/Emma.txt" is 16633
the length of suffix map dictionary: 73170
