In [1]:
# Week 3:  Bigram Frequencies and Mutual Information
# This file has small examples that are meant to be run individually
#   in the Python interpreter or jupyter notebook cells

In [2]:
## Part 1: word frequency distribution

# Getting started to process a text example
import nltk

# get the tokens of the book Emma from the Gutenberg corpus,
# and reduce the tokens to lowercase.

from nltk.corpus import gutenberg
# List the files available in the Gutenberg corpus (the value is not echoed,
# because it is not the last expression of the cell).
gutenberg.fileids()
# Load Emma as a sequence of tokens, then build a lowercased copy of it.
emma = gutenberg.words(fileids='austen-emma.txt')
emmawords = [token.lower() for token in emma]

In [3]:
# Show some of the words. Only the last expression of a cell is echoed, so the
# token count computed on the first line does not appear in the output; just
# the first 30 tokens do.
len(emmawords)
emmawords[ :30]

['[',
 'emma',
 'by',
 'jane',
 'austen',
 '1816',
 ']',
 'volume',
 'i',
 'chapter',
 'i',
 'emma',
 'woodhouse',
 ',',
 'handsome',
 ',',
 'clever',
 ',',
 'and',
 'rich',
 ',',
 'with',
 'a',
 'comfortable',
 'home',
 'and',
 'happy',
 'disposition',
 ',',
 'seemed']

In [4]:
# Create a frequency distribution of the lowercased Emma tokens: ndist maps
# each distinct token to the number of times it occurs.
from nltk import FreqDist
ndist = FreqDist(emmawords)

In [5]:
# Show the top 30 tokens by frequency as (token, count) pairs, highest count
# first. Punctuation marks are tokens too, which is why they appear in the list.
nitems = ndist.most_common(30)
nitems

[(',', 11454),
 ('.', 6928),
 ('to', 5239),
 ('the', 5201),
 ('and', 4896),
 ('of', 4291),
 ('i', 3178),
 ('a', 3129),
 ('it', 2528),
 ('her', 2469),
 ('was', 2398),
 ('she', 2340),
 (';', 2199),
 ('in', 2188),
 ('not', 2140),
 ('"', 2004),
 ('you', 1980),
 ('be', 1975),
 ('that', 1806),
 ('he', 1806),
 ('had', 1624),
 ('but', 1441),
 ('as', 1436),
 ('--', 1382),
 ('for', 1347),
 ('have', 1320),
 ('is', 1240),
 ('with', 1217),
 ('very', 1202),
 ('mr', 1153)]

In [6]:
## Part 2: Define Python functions  


def doublesum (x, y):
    """Return twice the sum of two numbers.

    x and y may each be an int or a float.
    """
    return 2 * (x + y)

# Try the function out. The first call's result is discarded: it is neither
# assigned nor the last expression of the cell.
doublesum(3, 5)
# Keep the second result in num and end the cell with it so that it is displayed.
num = doublesum(3.4, 2)
num


10.8

In [7]:
def searchstring (substring, wordlist):
    """Return the words of wordlist that contain substring.

    Matches are returned in their original order, and a word that occurs
    several times in wordlist appears once per occurrence in the result.
    """
    return [candidate for candidate in wordlist if substring in candidate]

# Every Emma token that contains 'zz'; a token that occurs several times in
# the text is listed once per occurrence.
searchstring('zz', emmawords)

['drizzle',
 'puzzled',
 'puzzles',
 'puzzling',
 'puzzled',
 'puzzle',
 'puzzle',
 'puzzled']

In [8]:
# Tuple unpacking: a single statement binds three names, position by position.
name, phone, location = 'Zack', '22-15', 'Room 159'
# Show each value on its own line.
print(name, phone, location, sep='\n')

Zack
22-15
Room 159


In [9]:
# Part 3: Bigrams and Bigram frequency distribution

# Pair every token with the one that follows it; list() materialises the
# result so that it can be sliced below.
emmabigrams = list(nltk.bigrams(emmawords))
# Only the final expression is displayed. The first 20 tokens give rise to
# exactly the 19 bigrams shown.
emmawords[:20]
emmabigrams[:19]

[('[', 'emma'),
 ('emma', 'by'),
 ('by', 'jane'),
 ('jane', 'austen'),
 ('austen', '1816'),
 ('1816', ']'),
 (']', 'volume'),
 ('volume', 'i'),
 ('i', 'chapter'),
 ('chapter', 'i'),
 ('i', 'emma'),
 ('emma', 'woodhouse'),
 ('woodhouse', ','),
 (',', 'handsome'),
 ('handsome', ','),
 (',', 'clever'),
 ('clever', ','),
 (',', 'and'),
 ('and', 'rich')]

In [10]:
# Setup for bigrams and bigram measures.
# Import the two collocation names used by the cells below explicitly rather
# than with `import *`, so that it is clear where BigramCollocationFinder and
# BigramAssocMeasures come from and nothing else leaks into the namespace.
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
# bigram_measures provides the scoring functions (raw_freq, pmi) used later.
bigram_measures = BigramAssocMeasures()

In [11]:
# Create the bigram finder from the Emma tokens and score every bigram by its
# relative frequency (raw_freq). score_ngrams returns (bigram, score) pairs.
finder = BigramCollocationFinder.from_words(emmawords)
scored = finder.score_ngrams(bigram_measures.raw_freq)

In [12]:
# scored is a list of (bigram, score) pairs.
# The intermediate expressions below are only echoed when they are run one at
# a time in the interpreter; executed together as a single cell, just the last
# one (the final 20 entries of scored) is displayed.
type(scored)
first = scored[0]
type(first)
first
scored[:20]
scored[-20:]

[(('yourself', 'harriet'), 5.196775920219096e-06),
 (('yourself', 'how'), 5.196775920219096e-06),
 (('yourself', 'mistaken'), 5.196775920219096e-06),
 (('yourself', 'one'), 5.196775920219096e-06),
 (('yourself', 'only'), 5.196775920219096e-06),
 (('yourself', 'out'), 5.196775920219096e-06),
 (('yourself', 'particularly'), 5.196775920219096e-06),
 (('yourself', 'this'), 5.196775920219096e-06),
 (('yourself', 'too'), 5.196775920219096e-06),
 (('yourself', 'when'), 5.196775920219096e-06),
 (('youth', 'had'), 5.196775920219096e-06),
 (('youth', 'is'), 5.196775920219096e-06),
 (('youth', 'on'), 5.196775920219096e-06),
 (('youthful', 'expression'), 5.196775920219096e-06),
 (('youthful', 'simplicity'), 5.196775920219096e-06),
 (('zeal', '!--'), 5.196775920219096e-06),
 (('zeal', 'and'), 5.196775920219096e-06),
 (('zeal', 'in'), 5.196775920219096e-06),
 (('zeal', 'of'), 5.196775920219096e-06),
 (('zigzags', 'of'), 5.196775920219096e-06)]

In [13]:
# Apply a frequency filter (on a new finder): apply_freq_filter(2) drops the
# bigrams that occur fewer than 2 times. Note that it filters bigrams, not
# individual words.
finder2 = BigramCollocationFinder.from_words(emmawords)
finder2.apply_freq_filter(2)
scored2 = finder2.score_ngrams(bigram_measures.raw_freq)
# The tail of the ranking now holds the least frequent bigrams that survived.
scored2[-20:]

[(('your', 'little'), 1.0393551840438192e-05),
 (('your', 'manners'), 1.0393551840438192e-05),
 (('your', 'meaning'), 1.0393551840438192e-05),
 (('your', 'name'), 1.0393551840438192e-05),
 (('your', 'not'), 1.0393551840438192e-05),
 (('your', 'only'), 1.0393551840438192e-05),
 (('your', 'picture'), 1.0393551840438192e-05),
 (('your', 'real'), 1.0393551840438192e-05),
 (('your', 'resolution'), 1.0393551840438192e-05),
 (('your', 'sensations'), 1.0393551840438192e-05),
 (('your', 'sister'), 1.0393551840438192e-05),
 (('your', 'taste'), 1.0393551840438192e-05),
 (('your', 'thoughts'), 1.0393551840438192e-05),
 (('your', 'tippet'), 1.0393551840438192e-05),
 (('your', 'way'), 1.0393551840438192e-05),
 (('yours', ','), 1.0393551840438192e-05),
 (('yourself', '."'), 1.0393551840438192e-05),
 (('yourself', '.--'), 1.0393551840438192e-05),
 (('yourself', 'for'), 1.0393551840438192e-05),
 (('yourself', 'very'), 1.0393551840438192e-05)]

In [14]:
### pointwise mutual information
# Score the bigrams by PMI on a fresh, unfiltered finder. As the output shows,
# the top of this ranking is taken up by bigrams that all tie on the same score.
finder3 = BigramCollocationFinder.from_words(emmawords)
scored3 = finder3.score_ngrams(bigram_measures.pmi)
scored3[:20]

[(('----', 'mum'), 17.553951716544702),
 (('1816', ']'), 17.553951716544702),
 (('26th', 'ult'), 17.553951716544702),
 (('_______', 'regiment'), 17.553951716544702),
 (('_a_', '_source_'), 17.553951716544702),
 (('_amor_', '_patriae_'), 17.553951716544702),
 (('_and_', '_misery_'), 17.553951716544702),
 (('_any_', '_thing_'), 17.553951716544702),
 (('_be_', '_a_'), 17.553951716544702),
 (('_caro_', '_sposo_'), 17.553951716544702),
 (('_dissolved_', '_it_'), 17.553951716544702),
 (('_great_', '_way_'), 17.553951716544702),
 (('_most_', '_precious_'), 17.553951716544702),
 (('_precious_', '_treasures_'), 17.553951716544702),
 (('_repentance_', '_and_'), 17.553951716544702),
 (('_robin_', '_adair_'), 17.553951716544702),
 (('_with_', '_time_'), 17.553951716544702),
 (('adequate', 'restoratives'), 17.553951716544702),
 (('austen', '1816'), 17.553951716544702),
 (('base', 'aspersion'), 17.553951716544702)]

In [15]:
# To get good PMI results, a frequency filter should be applied first: here
# only bigrams that occur at least 5 times are kept before scoring.
# Note: apply_freq_filter modifies finder3 in place, so from this cell onwards
# finder3 no longer contains the rarer bigrams.
finder3.apply_freq_filter(5)
scored4 = finder3.score_ngrams(bigram_measures.pmi)
scored4[:20]

[(('brunswick', 'square'), 13.968989215823544),
 (('sore', 'throat'), 13.553951716544704),
 (('mill', 'farm'), 13.246523191352455),
 (('william', 'larkins'), 13.094520097907408),
 (('baked', 'apples'), 13.062098620215028),
 (('e', '.,'), 12.83148569207361),
 (('box', 'hill'), 12.740883143292704),
 (('sixteen', 'miles'), 12.618491968739413),
 (('maple', 'grove'), 12.599755406157827),
 (('hair', 'cut'), 12.06852488937446),
 (('south', 'end'), 11.968989215823546),
 (('colonel', 'campbell'), 11.440953783187222),
 (('protest', 'against'), 11.35231785537505),
 (('robert', 'martin'), 11.098757090793876),
 (('vast', 'deal'), 10.767355354653896),
 (('five', 'couple'), 10.702202675128644),
 (('ready', 'wit'), 10.63508847927011),
 (('musical', 'society'), 10.483562388653304),
 (('infinitely', 'superior'), 10.18588184001262),
 (('donwell', 'abbey'), 10.15497056348493)]

In [24]:
from nltk.corpus import gutenberg
# Same preparation as for Emma, now for Hamlet: load the tokens from the
# Gutenberg corpus and lowercase them.
gutenberg.fileids()
a = gutenberg.words(fileids='shakespeare-hamlet.txt')
aw = [token.lower() for token in a]

In [17]:
# Build the list of Hamlet bigrams and show the first 20 of them.
# NOTE(review): the printed label says "Top 20 frequent bigrams", but ab[:20]
# is simply the first 20 bigrams in text order, as the output confirms.
# Ranking by frequency would need a FreqDist or a scored finder.
ab = list(nltk.bigrams(aw))
print("Top 20 frequent bigrams")
ab[:20]

Top 20 frequent bigrams


[('[', 'the'),
 ('the', 'tragedie'),
 ('tragedie', 'of'),
 ('of', 'hamlet'),
 ('hamlet', 'by'),
 ('by', 'william'),
 ('william', 'shakespeare'),
 ('shakespeare', '1599'),
 ('1599', ']'),
 (']', 'actus'),
 ('actus', 'primus'),
 ('primus', '.'),
 ('.', 'scoena'),
 ('scoena', 'prima'),
 ('prima', '.'),
 ('.', 'enter'),
 ('enter', 'barnardo'),
 ('barnardo', 'and'),
 ('and', 'francisco'),
 ('francisco', 'two')]

In [18]:
# Collocation setup for the Hamlet cells below. The two names that are used
# are imported explicitly rather than with `import *`, so that their origin is
# clear and nothing else leaks into the namespace.
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
# bigram_measures provides the scoring functions (raw_freq, pmi) used below.
bigram_measures = BigramAssocMeasures()

In [19]:
# Score the Hamlet bigrams by relative frequency. This rebinds `finder`,
# which earlier referred to the finder built from the Emma tokens.
finder = BigramCollocationFinder.from_words(aw)
x = finder.score_ngrams(bigram_measures.raw_freq)

In [20]:
# The 20 highest-scoring (i.e. most frequent) Hamlet bigrams.
x[:20]

[((',', 'and'), 0.012473233404710921),
 (('ham', '.'), 0.0090203426124197),
 (("'", 'd'), 0.005968950749464668),
 (('my', 'lord'), 0.004684154175588865),
 (('.', 'i'), 0.004041755888650964),
 ((',', 'that'), 0.0036402569593147753),
 (("'", 's'), 0.003265524625267666),
 ((',', 'i'), 0.002810492505353319),
 (('king', '.'), 0.002569593147751606),
 (('hor', '.'), 0.002542826552462527),
 ((',', 'the'), 0.0024892933618843685),
 (("'", 't'), 0.0022483940042826553),
 ((',', 'to'), 0.002114561027837259),
 ((',', 'as'), 0.001980728051391863),
 (('.', 'enter'), 0.0019539614561027837),
 (('in', 'the'), 0.0019539614561027837),
 (('lord', ','), 0.0019539614561027837),
 ((',', 'but'), 0.0017933618843683084),
 ((',', 'my'), 0.0017933618843683084),
 (('.', 'what'), 0.0017933618843683084)]

In [23]:
# Rank the Hamlet bigrams by PMI on a new finder, keeping only the bigrams
# that occur at least 3 times, and show the 20 highest-scoring ones.
f = BigramCollocationFinder.from_words(aw)
f.apply_freq_filter(3)
s = f.score_ngrams(bigram_measures.pmi)
s[:20]

[(('lords', 'attendant'), 12.45224124043082),
 (('christian', 'buriall'), 11.867278739709661),
 (('second', 'husband'), 10.45224124043082),
 (('young', 'fortinbras'), 9.903804615734776),
 (('god', 'buy'), 9.751801522289727),
 (('noise', 'within'), 9.644886318373214),
 (('mine', 'vnkle'), 9.314737716680884),
 (('am', 'glad'), 9.073729617177088),
 (('too', 'blame'), 8.867278739709661),
 (('any', 'thing'), 8.779815898459324),
 (('fathers', 'death'), 8.657825374080714),
 (('manet', 'hamlet'), 8.5453506448223),
 (('father', 'lost'), 8.545350644822298),
 (('marcellus', 'mar'), 8.498044930043942),
 (('-', 'historicall'), 8.31884211501362),
 (('an', 'act'), 8.29136137859151),
 (('dead', 'body'), 8.282316238988503),
 (('were', 'sent'), 8.266374695119485),
 (('set', 'downe'), 8.244348388789486),
 (('our', 'selues'), 8.166839021568569)]

In [144]:
# Location of the text file analysed in the remaining cells.
# NOTE(review): this is an absolute path on one particular Windows machine, so
# the cells below only run there; consider a path relative to the notebook.
filepath="C:\\Users\\shadowmane\\Desktop\\nlp_lab2.txt"

In [145]:
# Tokenise the file at filepath with the Gutenberg corpus reader.
# NOTE(review): despite its name, `bigrams` holds the file's words, not
# bigrams. Also, filepath is not a Gutenberg file id; presumably this works
# only because the reader accepts an absolute path. Confirm, or read the file
# with a reader rooted at its own directory.
bigrams = gutenberg.words(fileids=filepath)

In [146]:
# Lowercase every token so that the counts below ignore capitalisation.
bg = [token.lower() for token in bigrams]

In [147]:
# Count every bigram (pair of adjacent tokens) in the lowercased text.
# Note that this rebinds ndist, which earlier held the Emma word frequencies.
ndist = FreqDist(nltk.bigrams(bg))

In [148]:
# Numerator: the number of times the bigram ('our', 'people') occurs.
num  = ndist[('our','people')]

In [149]:
# Denominator: the number of times the token 'our' occurs in the text.
den = bg.count('our')

In [150]:
# Report the estimate of P(people | our) as an unevaluated fraction:
# count of the bigram 'our people' over count of the word 'our'.
print("The bigram predictive probability of our people is " ,num," / ", den)

The bigram predictive probability of our people is  1  /  23


In [151]:
# Numerator for the next bigram: the number of times ('our', 'journey') occurs.
num = ndist[('our','journey')]

In [152]:
# Report the estimate of P(journey | our) as an unevaluated fraction. den is
# reused unchanged: it is still the count of 'our' computed in an earlier cell.
print("The bigram predictive probability of our journey is " ,num," / ", den)

The bigram predictive probability of our journey is  4  /  23
