# Demo: a very simple distributional model

In [1]:
import nltk
# Make sure the Brown corpus data package is available locally before the
# corpus is read further down.
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\ljman\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [3]:
def compute_space(context_size, corpus):
    """Count, for every token in ``corpus``, the tokens that occur near it.

    Each position in ``corpus`` is treated as a target word. Every token found
    up to ``context_size`` positions before it, or up to ``context_size``
    positions after it, adds one to that target's count for the neighbouring
    token. Windows are clipped at the start and at the end of the corpus.

    Parameters
    ----------
    context_size : int
        Number of positions to look at on each side of the target.
    corpus : sequence
        Tokens, indexable by position and supporting ``len()``.

    Returns
    -------
    nltk.ConditionalFreqDist
        Conditions are target tokens; the distribution stored under a
        condition maps each context token to the number of times it was seen
        inside the window. A target gets a condition only once at least one
        neighbour has been counted for it.
    """
    space = nltk.ConditionalFreqDist()
    n_tokens = len(corpus)

    for position in range(n_tokens):
        target = corpus[position]

        # Clip the window so that it never reaches outside the corpus.
        left_edge = max(position - context_size, 0)
        right_edge = min(position + context_size + 1, n_tokens)

        # Preceding neighbours first, then the following ones; the target's
        # own position is part of neither range.
        for neighbours in (range(left_edge, position),
                           range(position + 1, right_edge)):
            for neighbour_index in neighbours:
                # Looking up a target that is not a condition yet creates an
                # empty FreqDist for it, whose count is then incremented.
                space[target][corpus[neighbour_index]] += 1

    return space




In [4]:
# Copy the corpus words into a plain list: compute_space looks tokens up by
# position (corpus[index]) once per target and once per neighbour.
print( "reading Brown corpus...")
brown_words = list(nltk.corpus.brown.words())


reading Brown corpus...


In [5]:
print("computing space...")
# Window of two tokens on either side of each target word.
sp = compute_space(context_size=2, corpus=brown_words)

computing space...


## 10 most frequent context words
The top context words are similar across many items: mostly function words and punctuation.
(What can we do about that?)

In [11]:
# Same top-10 query for three unrelated words, so their context profiles can
# be compared side by side.
for word in ("election", "love", "car"):
    print(f"{word}:\n", sp[word].most_common(10))

election:
 [('the', 35), ('of', 20), (',', 15), ('.', 13), ('to', 10), ('in', 8), ('``', 5), ('for', 5), ('was', 4), ('and', 4)]
love:
 [(',', 56), ('of', 47), ('and', 40), ('the', 34), ('in', 33), ('.', 26), ('to', 25), ('for', 25), ('I', 16), ('you', 14)]
car:
 [('the', 127), ('.', 64), ('a', 55), (',', 47), ('in', 29), ('and', 27), ('The', 24), ('his', 24), ('of', 23), ('was', 18)]


### 100 most frequent context words
Now we are starting to see differences between the words.


In [13]:
# The 100 most frequent context words recorded for "election".
election_contexts = sp["election"].most_common(100)
print("election:\n", election_contexts)

election:
 [('the', 35), ('of', 20), (',', 15), ('.', 13), ('to', 10), ('in', 8), ('``', 5), ('for', 5), ('was', 4), ('and', 4), ('an', 4), ('his', 3), ('I', 3), ('on', 3), ('presidential', 3), ('The', 3), ('recent', 2), ('primary', 2), ('which', 2), ('ever', 2), ('not', 2), ('campaign', 2), ('were', 2), ('judges', 2), ('last', 2), ('a', 2), ('be', 2), ('Presidential', 2), ('results', 2), ('November', 2), ('is', 2), ('close', 2), ('board', 2), ('commissioners', 2), ('that', 2), (';', 2), ('April', 2), ('year', 2), ('produced', 1), ('conducted', 1), ('registration', 1), ('laws', 1), ('city', 1), ('general', 1), ('ballot', 1), ('coolest', 1), ("Saturday's", 1), ('calmest', 1), ('During', 1), ('scheduled', 1), ('Many', 1), ('orderly', 1), ('Sheriff', 1), ('Nov.', 1), ('8', 1), ('dismissed', 1), ('investigation', 1), ('attorney', 1), ('him', 1), ('day', 1), ('told', 1), ('us', 1), ('possible', 1), ('special', 1), ('might', 1), ("fall's", 1), ('did', 1), ('program', 1), ('reforms', 1), ('ne

In [14]:
# The 100 most frequent context words recorded for "love".
love_contexts = sp["love"].most_common(100)
print("love:\n", love_contexts)

love:
 [(',', 56), ('of', 47), ('and', 40), ('the', 34), ('in', 33), ('.', 26), ('to', 25), ('for', 25), ('I', 16), ('you', 14), ('is', 14), ('with', 14), ('a', 12), ('his', 9), ("''", 8), ('him', 8), ('that', 8), ('God', 8), ('this', 8), ('it', 7), ('as', 6), ('her', 6), ('``', 6), ('was', 6), ('my', 6), (';', 6), ('we', 5), ('not', 5), ('The', 4), ('us', 4), ('know', 4), ('His', 4), ('which', 4), ('faith', 4), ('!', 4), ('true', 4), ('by', 3), ('their', 3), ('fell', 3), ('them', 3), ('--', 3), ('knowledge', 3), ('but', 3), ('those', 3), ('only', 3), ('fallen', 3), ('me', 3), ('We', 2), ('bound', 2), ("God's", 2), ('being', 2), ('through', 2), ('how', 2), ('give', 2), ("mother's", 2), ('without', 2), ('fall', 2), ('shared', 2), ('even', 2), ('Christ', 2), ('Christian', 2), ('brethren', 2), ('men', 2), ('just', 2), ('expressed', 2), ('songs', 2), ('nature', 2), ('They', 2), ('(', 2), ("I'd", 2), ('he', 2), ('force', 2), ('country', 2), ('forbidden', 2), ('on', 2), ('an', 2), ('almost',

In [15]:
# The 100 most frequent context words recorded for "car".
car_contexts = sp["car"].most_common(100)
print("car:\n", car_contexts)

car:
 [('the', 127), ('.', 64), ('a', 55), (',', 47), ('in', 29), ('and', 27), ('The', 24), ('his', 24), ('of', 23), ('was', 18), ('with', 17), ('to', 14), ('by', 10), ('is', 9), ('that', 9), ('on', 9), ("''", 9), ('police', 8), ('into', 7), ('parked', 7), ('my', 7), ('when', 6), ('which', 6), ('he', 6), ('big', 6), ('at', 6), ('for', 6), ('your', 6), ('?', 6), ('had', 6), ('as', 5), ('approaching', 5), ('it', 5), ('be', 5), ('driven', 5), ('little', 5), ('one', 5), ('you', 5), ('motor', 5), ('up', 5), ('coming', 5), ('but', 4), (';', 4), ('drive', 4), ('I', 4), ('second', 4), ('other', 3), ('their', 3), ('around', 3), ('new', 3), ('She', 3), ('take', 3), ('any', 3), ('her', 3), ('sports', 3), ('before', 3), ('left', 3), ('driving', 3), ('could', 3), ('--', 3), ('will', 3), ('reserve', 3), ('than', 3), ('It', 3), ('If', 3), ('see', 3), ('from', 3), ('here', 3), ('A', 3), ('He', 3), ('run', 2), ('wanted', 2), ('Kimmell', 2), ('passing', 2), ('were', 2), ('Mr.', 2), ('against', 2), ('sal

### some ambiguous words

In [16]:
# Words with more than one sense: their context words mix the senses together.
for word in ("bat", "bank", "bar", "leave"):
    print(f"{word}:\n", sp[word].most_common(100))

bat:
 [('a', 5), ('the', 5), ('to', 5), (',', 4), ('with', 4), ('.', 4), ('on', 3), ('and', 3), ('at', 2), ('clay', 2), ('up', 2), ('grabbed', 1), ('headed', 1), ('ball', 1), ('in', 1), ('their', 1), ('night', 1), ('four', 1), ("teammate's", 1), ('became', 1), ('times', 1), ('two', 1), ("He'll", 1), ('just', 1), ("'", 1), ('or', 1), ('plaster', 1), ('reverse', 1), ('as', 1), ('Willie', 1), ('go', 1), ('first', 1), ('managed', 1), ('muzzle', 1), ('through', 1), ('it', 1), ('nimbly', 1), ('hit', 1), ('Everyone', 1), ('of', 1), ('his', 1), ('He', 1), ('holding', 1), ('both', 1)]
bank:
 [('the', 50), ('of', 15), ('.', 15), (',', 11), ('a', 7), ('in', 5), ('The', 5), ('and', 5), ('which', 3), ('to', 3), ('from', 2), ('--', 2), ('with', 2), ('local', 2), ('south', 2), ('that', 2), ('held', 2), ('east', 2), ('by', 2), ('west', 2), ('on', 2), ('through', 2), ('take', 1), ('over', 1), ('accounts', 1), ('confidence', 1), ('customers', 1), ('convicted', 1), ('robber', 1), ('Lawrence', 1), ('Switz

In [6]:
# Context words recorded for "waffle" (an empty list means none were counted).
waffle_contexts = sp["waffle"].most_common(100)
print("waffle:\n", waffle_contexts)


waffle:
 []


In [7]:
# Context words recorded for "pancake" (an empty list means none were counted).
pancake_contexts = sp["pancake"].most_common(100)
print("pancake:\n", pancake_contexts)


pancake:
 []


In [8]:
# The 100 most frequent context words recorded for "university".
university_contexts = sp["university"].most_common(100)
print("university:\n", university_contexts)


university:
 [('the', 42), (',', 24), ('.', 21), ('of', 14), ('and', 13), ('at', 12), ('in', 10), ('or', 10), ('college', 10), ('to', 9), ('a', 9), ('that', 6), ('state', 4), ('for', 3), ('president', 3), ('The', 3), ('as', 3), ('students', 3), ('their', 3), ('true', 2), ('did', 2), ('not', 2), ('major', 2), ('Catholic', 2), ('which', 2), ('And', 2), ('owns', 2), ('In', 2), ('``', 2), ('church', 2), ('Hans', 2), ('with', 2), ('have', 2), (';', 2), ('former', 2), ('only', 2), ('?', 2), ('milieu', 2), ('--', 2), ('on', 2), ('At', 2), ('bringing', 1), ('Tulane', 1), ('where', 1), ('grade', 1), ('itself', 1), ('criticized', 1), ('irrespective', 1), ('continuing', 1), ('policy', 1), ('Cochran', 1), ('vice', 1), ('American', 1), ('succeeds', 1), ('she', 1), ('school', 1), ('But', 1), ('resembles', 1), ('its', 1), (')', 1), ('is', 1), ('whose', 1), ("''", 1), (':', 1), ('without', 1), ('gaining', 1), ('Religious', 1), ('ordered', 1), ('admit', 1), ('into', 1), ('They', 1), ('rejected', 1), ('

In [9]:
# Context words recorded for "waffle"; the printed label names the same word
# that is looked up in the space.
print("waffle:\n", sp["waffle"].most_common(100))


waffe:
 []
