In [3]:
# Week 2:  Bigram Frequencies and Mutual Information
# This file has small examples that are meant to be run individually
#   in the Python interpreter or jupyter notebook cells



## Part 1: word frequency distribution

# Getting started to process a text example
import nltk

In [4]:
# import the Gutenberg corpus reader; the tokens of the book Emma are read from it
# and reduced to lowercase in the cells below.
from nltk.corpus import gutenberg

In [5]:
# list the file identifiers of the texts available in the Gutenberg corpus
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [6]:
# read the tokens of the book Emma from the Gutenberg corpus
emma = gutenberg.words(fileids='austen-emma.txt')
# reduce every token to lowercase so that later counts ignore capitalisation
emmawords = [token.lower() for token in emma]

In [7]:
# count the total number of tokens (words and punctuation) in the lowercased text
len(emmawords)

192427

In [8]:
# what does this code do?
# It displays the first 30 tokens of the lowercased text
emmawords[:30]

['[',
 'emma',
 'by',
 'jane',
 'austen',
 '1816',
 ']',
 'volume',
 'i',
 'chapter',
 'i',
 'emma',
 'woodhouse',
 ',',
 'handsome',
 ',',
 'clever',
 ',',
 'and',
 'rich',
 ',',
 'with',
 'a',
 'comfortable',
 'home',
 'and',
 'happy',
 'disposition',
 ',',
 'seemed']

In [12]:
# Creating a frequency distribution of words:
#   FreqDist maps each distinct token to the number of times it occurs in emmawords
from nltk import FreqDist
ndist = FreqDist(emmawords)
# display a summary of the distribution (the most frequent tokens and their counts)
ndist

FreqDist({',': 11454, '.': 6928, 'to': 5239, 'the': 5201, 'and': 4896, 'of': 4291, 'i': 3178, 'a': 3129, 'it': 2528, 'her': 2469, ...})

In [13]:
# get the top 30 tokens by frequency as a list of (token, count) pairs,
#   most frequent first, and display it
nitems = ndist.most_common(30)
nitems

[(',', 11454),
 ('.', 6928),
 ('to', 5239),
 ('the', 5201),
 ('and', 4896),
 ('of', 4291),
 ('i', 3178),
 ('a', 3129),
 ('it', 2528),
 ('her', 2469),
 ('was', 2398),
 ('she', 2340),
 (';', 2199),
 ('in', 2188),
 ('not', 2140),
 ('"', 2004),
 ('you', 1980),
 ('be', 1975),
 ('that', 1806),
 ('he', 1806),
 ('had', 1624),
 ('but', 1441),
 ('as', 1436),
 ('--', 1382),
 ('for', 1347),
 ('have', 1320),
 ('is', 1240),
 ('with', 1217),
 ('very', 1202),
 ('mr', 1153)]

In [14]:
## Part 2: Define Python functions  


# the function doublesum takes 2 numbers as parameters, either int or float
#  and returns a result which is the sum of those numbers multiplied by 2
def doublesum(x, y):
    """Return twice the sum of x and y.

    x and y are numbers (int or float); the result is 2 * (x + y).
    """
    total = x + y
    return 2 * total



In [17]:
# test our self-defined function: 2 * (3 + 5) should give 16
doublesum(3, 5)



16

In [18]:
# this following function takes a string and a list of words as parameters.
#   It will return all the words in the list that contain the string as a substring
def searchstring(substring, wordlist):
    """Return the words of wordlist that contain substring.

    The result is a new list; words appear in their original order and a
    word that occurs several times in wordlist is returned each time.
    """
    # keep every word for which the substring test succeeds
    return [candidate for candidate in wordlist if substring in candidate]


In [21]:
# find every token in Emma that contains the substring 'zz' (repeated tokens are kept)
searchstring('zz', emmawords)

['drizzle',
 'puzzled',
 'puzzles',
 'puzzling',
 'puzzled',
 'puzzle',
 'puzzle',
 'puzzled']

In [22]:
# multiple variable assignment and use:
#   the three values on the right are unpacked, in order, into the three names on the left
name, phone, location = 'Zack', '22-15', 'Room 159'
# print each value on its own line
print(name, phone, location, sep='\n')

Zack
22-15
Room 159


In [24]:
# Part 3: Bigrams and Bigram frequency distribution

# build the list of all pairs of adjacent tokens (bigrams) in the text
emmabigrams = list(nltk.bigrams(emmawords))
# show the first 20 tokens, for comparison with the first bigrams displayed in the next cell
emmawords[:20]


['[',
 'emma',
 'by',
 'jane',
 'austen',
 '1816',
 ']',
 'volume',
 'i',
 'chapter',
 'i',
 'emma',
 'woodhouse',
 ',',
 'handsome',
 ',',
 'clever',
 ',',
 'and',
 'rich']

In [25]:
# the first 19 bigrams: the adjacent pairs formed by the 20 tokens shown above
emmabigrams[:19]

[('[', 'emma'),
 ('emma', 'by'),
 ('by', 'jane'),
 ('jane', 'austen'),
 ('austen', '1816'),
 ('1816', ']'),
 (']', 'volume'),
 ('volume', 'i'),
 ('i', 'chapter'),
 ('chapter', 'i'),
 ('i', 'emma'),
 ('emma', 'woodhouse'),
 ('woodhouse', ','),
 (',', 'handsome'),
 ('handsome', ','),
 (',', 'clever'),
 ('clever', ','),
 (',', 'and'),
 ('and', 'rich')]

In [26]:
# setup for bigrams and bigram measures
# NOTE(review): this wildcard import is what brings BigramCollocationFinder into scope
#   for the cells below; importing the needed names explicitly would be clearer, but
#   other cells may depend on further names it provides -- confirm before narrowing it.
from nltk.collocations import *
# bigram_measures supplies the scoring functions (raw_freq, pmi) passed to score_ngrams below
bigram_measures = nltk.collocations.BigramAssocMeasures()



In [27]:
# create the bigram finder from the token list and score the bigrams by frequency,
#   i.e. each bigram's count divided by the total number of tokens
finder = BigramCollocationFinder.from_words(emmawords)
# the resulting list is ordered from the highest score to the lowest
scored = finder.score_ngrams(bigram_measures.raw_freq)


In [28]:
# scored is a list of (bigram, score) pairs; confirm that the container is a list
type(scored)


list

In [30]:
# take the first (highest-scoring) element of scored and check its type
first = scored[0]
type(first)


tuple

In [31]:
# let's look at first -- what does it look like?
# It is a pair: the bigram itself (a tuple of two tokens) and its score
first

((',', 'and'), 0.00977513550593212)

In [33]:
# What does this line do?
# It shows the 20 highest-scoring (most frequent) bigrams
scored[:20]


[((',', 'and'), 0.00977513550593212),
 (('mr', '.'), 0.005991882636012618),
 (("'", 's'), 0.0048433951576441975),
 ((';', 'and'), 0.004500407946909737),
 (('."', '"'), 0.003933959371605855),
 (('mrs', '.'), 0.0036325463682331484),
 (('to', 'be'), 0.003154442983572991),
 (('.', 'i'), 0.0029621622745248846),
 ((',', 'i'), 0.0029517687226844466),
 (('of', 'the'), 0.002904997739402475),
 (('in', 'the'), 0.0023125652844974975),
 (('it', 'was'), 0.0023073685085772786),
 (('.', 'weston'), 0.002219023317933554),
 ((';', 'but'), 0.002219023317933554),
 (('.', '"'), 0.002156662006890925),
 (('.', 'she'), 0.0021462684550504867),
 ((',', '"'), 0.0020683168162472),
 (('i', 'am'), 0.002052726488486543),
 (('.', 'elton'), 0.0019799716256034758),
 ((',', 'that'), 0.0018656425553586556)]

In [34]:
# the 20 lowest-scoring bigrams; each score is 1/192427, i.e. the bigram occurs only once
scored[-20:]

[(('yourself', 'harriet'), 5.196775920219096e-06),
 (('yourself', 'how'), 5.196775920219096e-06),
 (('yourself', 'mistaken'), 5.196775920219096e-06),
 (('yourself', 'one'), 5.196775920219096e-06),
 (('yourself', 'only'), 5.196775920219096e-06),
 (('yourself', 'out'), 5.196775920219096e-06),
 (('yourself', 'particularly'), 5.196775920219096e-06),
 (('yourself', 'this'), 5.196775920219096e-06),
 (('yourself', 'too'), 5.196775920219096e-06),
 (('yourself', 'when'), 5.196775920219096e-06),
 (('youth', 'had'), 5.196775920219096e-06),
 (('youth', 'is'), 5.196775920219096e-06),
 (('youth', 'on'), 5.196775920219096e-06),
 (('youthful', 'expression'), 5.196775920219096e-06),
 (('youthful', 'simplicity'), 5.196775920219096e-06),
 (('zeal', '!--'), 5.196775920219096e-06),
 (('zeal', 'and'), 5.196775920219096e-06),
 (('zeal', 'in'), 5.196775920219096e-06),
 (('zeal', 'of'), 5.196775920219096e-06),
 (('zigzags', 'of'), 5.196775920219096e-06)]

In [35]:
# apply a filter (on a new finder) to remove low-frequency bigrams:
#   apply_freq_filter(2) drops every bigram that occurs fewer than 2 times
finder2 = BigramCollocationFinder.from_words(emmawords)
finder2.apply_freq_filter(2)
scored2 = finder2.score_ngrams(bigram_measures.raw_freq)
# the lowest remaining scores are now 2/192427 instead of 1/192427
scored2[-20:]


[(('your', 'little'), 1.0393551840438192e-05),
 (('your', 'manners'), 1.0393551840438192e-05),
 (('your', 'meaning'), 1.0393551840438192e-05),
 (('your', 'name'), 1.0393551840438192e-05),
 (('your', 'not'), 1.0393551840438192e-05),
 (('your', 'only'), 1.0393551840438192e-05),
 (('your', 'picture'), 1.0393551840438192e-05),
 (('your', 'real'), 1.0393551840438192e-05),
 (('your', 'resolution'), 1.0393551840438192e-05),
 (('your', 'sensations'), 1.0393551840438192e-05),
 (('your', 'sister'), 1.0393551840438192e-05),
 (('your', 'taste'), 1.0393551840438192e-05),
 (('your', 'thoughts'), 1.0393551840438192e-05),
 (('your', 'tippet'), 1.0393551840438192e-05),
 (('your', 'way'), 1.0393551840438192e-05),
 (('yours', ','), 1.0393551840438192e-05),
 (('yourself', '."'), 1.0393551840438192e-05),
 (('yourself', '.--'), 1.0393551840438192e-05),
 (('yourself', 'for'), 1.0393551840438192e-05),
 (('yourself', 'very'), 1.0393551840438192e-05)]

In [36]:
### pointwise mutual information
# score the bigrams by PMI instead of raw frequency, using a fresh, unfiltered finder
finder3 = BigramCollocationFinder.from_words(emmawords)
scored3 = finder3.score_ngrams(bigram_measures.pmi)
# without a frequency filter the top of the list is dominated by one-off pairs:
#   every score shown equals log2(192427) ~= 17.55, the PMI of a bigram whose two
#   words each occur exactly once in the text
scored3[:20]

[(('----', 'mum'), 17.553951716544702),
 (('1816', ']'), 17.553951716544702),
 (('26th', 'ult'), 17.553951716544702),
 (('_______', 'regiment'), 17.553951716544702),
 (('_a_', '_source_'), 17.553951716544702),
 (('_amor_', '_patriae_'), 17.553951716544702),
 (('_and_', '_misery_'), 17.553951716544702),
 (('_any_', '_thing_'), 17.553951716544702),
 (('_be_', '_a_'), 17.553951716544702),
 (('_caro_', '_sposo_'), 17.553951716544702),
 (('_dissolved_', '_it_'), 17.553951716544702),
 (('_great_', '_way_'), 17.553951716544702),
 (('_most_', '_precious_'), 17.553951716544702),
 (('_precious_', '_treasures_'), 17.553951716544702),
 (('_repentance_', '_and_'), 17.553951716544702),
 (('_robin_', '_adair_'), 17.553951716544702),
 (('_with_', '_time_'), 17.553951716544702),
 (('adequate', 'restoratives'), 17.553951716544702),
 (('austen', '1816'), 17.553951716544702),
 (('base', 'aspersion'), 17.553951716544702)]

In [37]:
# to get good results, should first apply frequency filter:
#   keep only the bigrams that occur at least 5 times before ranking them by PMI
# NOTE: the filter is applied to finder3 itself (no new finder is created), so after
#   this cell finder3 no longer contains the rarer bigrams scored in the previous cell
finder3.apply_freq_filter(5)
scored4 = finder3.score_ngrams(bigram_measures.pmi)
scored4[:20]

[(('brunswick', 'square'), 13.968989215823544),
 (('sore', 'throat'), 13.5539517165447),
 (('mill', 'farm'), 13.246523191352455),
 (('william', 'larkins'), 13.094520097907404),
 (('baked', 'apples'), 13.062098620215025),
 (('e', '.,'), 12.83148569207361),
 (('box', 'hill'), 12.740883143292704),
 (('sixteen', 'miles'), 12.618491968739413),
 (('maple', 'grove'), 12.599755406157826),
 (('hair', 'cut'), 12.06852488937446),
 (('south', 'end'), 11.968989215823546),
 (('colonel', 'campbell'), 11.440953783187224),
 (('protest', 'against'), 11.352317855375052),
 (('robert', 'martin'), 11.098757090793873),
 (('vast', 'deal'), 10.767355354653896),
 (('five', 'couple'), 10.702202675128644),
 (('ready', 'wit'), 10.635088479270106),
 (('musical', 'society'), 10.483562388653304),
 (('infinitely', 'superior'), 10.185881840012618),
 (('donwell', 'abbey'), 10.15497056348493)]