In [None]:
# If we are going to work with data, e.g., text files,
# we need to be able to read data from files:

# Read the whole file in one go.  The with-block closes the handle even if
# read() raises, which a separate open()/close() pair does not guarantee.
with open("text/gettysburg.txt", "r") as f:
    data = f.read()
# Parenthesised single-argument print gives the same output on Python 2 and 3.
print(data)
print(len(data))

In [None]:
# Let's define a function to extract words from text:
import re

def words(text):
    """Return the whitespace-separated tokens of `text` as a list."""
    return text.split()

In [None]:
# Prefix the text with "This. " so the first token makes it obvious that
# split() leaves trailing punctuation attached to the word.
words("This. " + data)

In [None]:
# Oops, we have to remove punctuation at the end of words

def depunctuate(word):
    """Strip trailing punctuation (any run of . , : ; ! ?) from `word`.

    Only the end of the word is touched: interior marks survive
    ("U.S" stays "U.S") and a run of marks is removed as a whole
    ("Wait..." becomes "Wait").  Words without trailing punctuation
    are returned unchanged.
    """
    # `$` restricts the match to the end of the word; `+` takes the whole run.
    return re.sub(r"[.,:;!?]+$", "", word)
    
# Quick check on a word with a trailing comma; the comma should be removed.
depunctuate("FOO,")

In [None]:
# OK, 'depunctuate' works, so we map it over the list of words:

def words(text):
    """Split `text` on whitespace and run each token through depunctuate().

    Returns a list of the processed tokens, in their original order.
    """
    # A list comprehension yields a real list on every Python version;
    # map() returns a one-shot iterator on Python 3, which has no len().
    return [depunctuate(token) for token in text.split()]

# Tokenise the loaded text; each token has been passed through depunctuate().
words(data)

In [None]:
#  We should also map words to lower case 
# (though this will not matter for word-length frequencies)

def normalize(word):
    """Return `word` lower-cased after passing it through depunctuate()."""
    stripped = depunctuate(word)
    return stripped.lower()
    
# Check on a capitalised word with trailing punctuation; expected result: 'foo'.
normalize("Foo!")

In [None]:
def words(text):
    """Split `text` on whitespace and run each token through normalize().

    Returns a list of the normalized tokens, in their original order.
    """
    # A list comprehension yields a real list on every Python version;
    # map() returns a one-shot iterator on Python 3, which has no len().
    return [normalize(token) for token in text.split()]

# Tokenise the loaded text again; each token now goes through normalize().
words(data)

In [None]:
# We can now count words, and, more interestingly, analyze word frequencies and word-length statistics:

def word_count(data):
    """Return how many tokens words() extracts from `data`."""
    tokens = words(data)
    return len(tokens)

def word_lengths(data):
    """Return a list holding the length of every token words() finds in `data`."""
    # Build a concrete list: word_stats() hands the result to both mean() and
    # stdev(), and a lazy map object (Python 3) would already be exhausted
    # by the time the second call runs.
    return [len(word) for word in words(data)]

# Our little statistics package
from statistics import mean, stdev

def word_stats(data):
    """Return [mean, stdev] of the word lengths found in `data`.

    The lengths come from word_lengths(); mean and stdev are the functions
    imported from the `statistics` module.
    """
    lengths = word_lengths(data)
    average = mean(lengths)
    spread = stdev(lengths)
    return [average, spread]

# "%s" formatting reproduces the space-separated output of the Python 2
# print statement and is also valid Python 3.  Each value is wrapped in a
# one-element tuple so that a list result is formatted as a single item.
print("Word count %s" % (word_count(data),))

print("Word lengths %s" % (word_lengths(data),))

print("Word length stats %s" % (word_stats(data),))

In [None]:
# We are now going to work towards making a table of word-length
# frequencies. For this we will use another data structure:
# the dictionary

Z = dict()   # empty dictionary
print(Z)     # single-argument form prints identically on Python 2 and 3

In [None]:
# Insert two key-value pairs, then show the dictionary as the cell's result.
Z['hydrogen'] = 1
Z['helium'] = 2
Z

In [None]:
# Look up the value stored under the key 'hydrogen'.
Z['hydrogen']

In [None]:
# Membership test: `in` checks whether 'helium' is one of the dictionary's keys.
'helium' in Z

In [None]:
# OK, we know how to create a dictionary and how to add
# entries (key-value pairs) to the dictionary.

# Let's work towards a word-frequency dictionary

fake_data = [3, 2, 2, 4, 7, 1, 3]   # small sample to tally by hand
freq = dict()                       # frequency table, initially empty
print(freq)                         # same output on Python 2 and 3

In [None]:
# Add an entry by hand: key 3 with a count of 1.
freq[3] = 1

In [None]:
# Display the dictionary as it stands now.
freq

In [None]:
# Create an entry for key 2, then add one to it to show that an existing
# value can be updated; the final bare name displays the dictionary.
freq[2] = 1
freq[2] += 1
freq

In [None]:
# Aha!  We can modify entries as well as add them.
# This gives us a clue as to how to populate a dictionary programmatically.
# First, we devise an 'add_entry' function:

def add_entry(dictionary, key):
    """Add one to the count stored under `key`, creating it at 1 if absent.

    `dictionary` is modified in place; nothing is returned.
    """
    # get() supplies 0 for an unseen key, so both cases share one assignment.
    dictionary[key] = dictionary.get(key, 0) + 1
        
# Start over with an empty table.
freq = dict()

# Key 3 is not present yet, so add_entry creates it with a count of 1.
add_entry(freq, 3)

# Display the table.
freq

In [None]:
# Tally one occurrence of 2, then display the table.
add_entry(freq, 2)
freq

In [None]:
# Tally another occurrence of 2, then display the table.
add_entry(freq, 2)
freq

In [None]:
# OK, it is working, but we are still doing too much work by hand.
# Let's automate things.

def frequencies(data):
    """Tally the items of `data`.

    Returns a new dictionary mapping each distinct item to the number of
    times it occurs; every item is counted via add_entry().
    """
    table = {}
    for item in data:
        add_entry(table, item)
    return table

# Build the frequency table for the sample data in one call.
ftable = frequencies(fake_data)

# Display it.
ftable

In [None]:
# OK, there it is -- a "table" of word frequencies
# BUT ... maybe what is more useful is a table of __relative frequencies__

# The counts on their own, without the keys they belong to.
ftable.values()

In [None]:
# Sum of all counts in the table, shown as the cell's result.
total = sum(ftable.values())
total

In [None]:
def rescale(dictionary):
    """Return a new dictionary with every value divided by the sum of all values.

    The keys are kept as they are and the input dictionary is not modified.
    """
    # float() keeps the division from truncating when the values are integers.
    denominator = float(sum(dictionary.values()))
    return {key: count / denominator for key, count in dictionary.items()}

# Convert the raw counts in ftable into relative frequencies.
rftable = rescale(ftable)

# Display the rescaled table.
rftable

In [None]:
# Let's apply what we have done to the Gettysburg address:

# Show the text that was read from the file at the top of the notebook.
data

In [None]:
# Length of every word in the text, shown as the cell's result.
w = word_lengths(data)
w

In [None]:
# Chain the pieces: word lengths -> count per length -> relative frequency.
rfw = rescale(frequencies(word_lengths(data)))
rfw

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Plot relative frequency against word length.
# The x axis is derived from the table itself rather than hard-coded:
# dictionary order is not guaranteed to be ascending, and the number of
# distinct lengths need not be 11, so each y value is looked up by its x.
xvalues = list(range(min(rfw), max(rfw) + 1))
yvalues = [rfw.get(length, 0.0) for length in xvalues]  # absent lengths plot as 0
plt.plot(xvalues, yvalues)
plt.xlabel("Word length")
plt.ylabel("Relative frequency")
plt.savefig("gettysburg.png")
plt.show()
