In [None]:
import nltk
import glob

In [17]:
def tokenize(s):
    """
    Split a string into a list of whitespace-separated tokens.

    Input:
        string s
    Output:
        list of strings
    """
    # split() with no argument splits on any run of whitespace and never
    # yields empty strings, so punctuation stays attached to its word
    return s.split()
    
def preprocess(s, lowercase=True, strip_punctuation=True):
    """
    Normalise text into a list of cleaned-up tokens.

    Input:
        string s (a list of tokens is also accepted and used as is)
        boolean lowercase
        boolean strip_punctuation
    Output:
        list of strings
    """
    punctuation='.,?<>:;"\'!%'

    # a raw string has to be split into tokens first; anything else is
    # taken to be a sequence of tokens already
    tokens = tokenize(s) if isinstance(s, str) else s

    # collect the requested per-token clean-up steps in the order they run
    steps = []
    if lowercase:
        steps.append(lambda tok: tok.lower())
    if strip_punctuation:
        # strip only trims the two ends of a string, which is why it is
        # applied to the individual tokens and not to the whole text
        steps.append(lambda tok: tok.strip(punctuation))

    # every step is applied to the whole token list before the next one starts
    for step in steps:
        tokens = [step(tok) for tok in tokens]
    return tokens
    
def token_frequency(tokens, tf=None, relative=False):
    """
    Count how often each token occurs, optionally on top of earlier counts.

    Raw frequency is the default because raw counts from several texts can
    simply be added together by passing the previous result back in as tf.

    Input:
        tokens = list of strings or None (None is treated as no tokens)
        tf = dict of earlier counts {t:f} or None; when given, the new
             counts are added to this same dict in place
        relative = boolean, return relative instead of raw frequencies
    Return:
        dictionary of tokens and frequency {t:f}
    """
    if tokens is None:
        tokens = ()

    # start from the previous token frequency dictionary when one is given
    counts = {} if tf is None else tf

    # raw counts cannot be meaningfully added to relative frequencies, so
    # when relative output is requested and tf already holds floats it is
    # returned untouched (only the first value of tf is inspected)
    if counts and relative:
        if isinstance(next(iter(counts.values())), float):
            print('warning, adding raw counts to relative frequency')
            return tf

    # counting up the words
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1

    if not relative:
        # want raw frequency
        return counts

    # relative frequency: divide every count by the total number of words
    # in the new tokens plus the tf dictionary; with no words at all the
    # comprehension is empty, so there is no division by zero
    total_words = sum(counts.values())
    return {k: v / total_words for k, v in counts.items()}

In [16]:
# Print the raw token frequencies and a short token sample for every play.
files = glob.glob("./shakespeare/*.txt")
for file in files:
    # read the whole text; the with-block closes the file handle even if
    # reading fails
    with open(file, "r") as handle:
        text = handle.read()
    # tokenize text
    tokens = tokenize(text)
    print(token_frequency(tokens))
    # sample of tokens 1 through 9 (the slice skips the token at index 0)
    print(tokens[1:10])

['shaken', 'as', 'we', 'are,', 'so', 'wan', 'with', 'care,', 'Find']
['be', 'the', 'heavens', 'with', 'black,', 'yield', 'day', 'to', 'night!']
['your', 'ears;', 'for', 'which', 'of', 'you', 'will', 'stop', 'The']
['by', 'your', 'high', 'imperial', 'majesty', 'I', 'had', 'in', 'charge']
['wonder', 'how', 'the', 'king', 'escaped', 'our', 'hands.', 'While', 'we']
['delivering', 'my', 'son', 'from', 'me,', 'I', 'bury', 'a', 'second']
['but', 'this', 'dotage', 'of', 'our', "general's", "O'erflows", 'the', 'measure:']
['I', 'remember,', 'Adam,', 'it', 'was', 'upon', 'this', 'fashion', 'bequeathed']
['fair', 'Hippolyta,', 'our', 'nuptial', 'hour', 'Draws', 'on', 'apace;', 'four']
['Solinus,', 'to', 'procure', 'my', 'fall', 'And', 'by', 'the', 'doom']
['we', 'proceed', 'any', 'further,', 'hear', 'me', 'speak.', 'Speak,', 'speak.']
['do', 'not', 'meet', 'a', 'man', 'but', 'frowns:', 'our', 'bloods']
['there?', 'Nay,', 'answer', 'me:', 'stand,', 'and', 'unfold', 'yourself.', 'Long']
['home,', '