# Natural Language Processing with Python – Analyzing Text with the Natural Language Toolkit

## Ch. 1 Language Processing and Python

In [5]:
#source = https://www.nltk.org/book/ch01.html
#loads all sample text of books
from nltk.draw.dispersion import dispersion_plot
import nltk
from nltk.book import *
from matplotlib.pyplot import figure
from nltk.collocations import *

In [6]:
text1.concordance("monstrous")

Displaying 11 of 11 matches:
ong the former , one was of a most monstrous size . ... This came towards us , 
ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r
ll over with a heathenish array of monstrous clubs and spears . Some were thick
d as you gazed , and wondered what monstrous cannibal and savage could ever hav
that has survived the flood ; most monstrous and most mountainous ! That Himmal
they might scout at Moby Dick as a monstrous fable , or still worse and more de
th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l
ing Scenes . In connexion with the monstrous pictures of whales , I am strongly
ere to enter upon those still more monstrous stories of them which are to be fo
ght have been rummaged out of this monstrous cabinet there is no telling . But 
of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u


In [8]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(nltk.corpus.genesis.words('english-web.txt'))
finder.nbest(bigram_measures.pmi, 10)

[('Allon', 'Bacuth'),
 ('Ashteroth', 'Karnaim'),
 ('Ben', 'Ammi'),
 ('En', 'Mishpat'),
 ('Jegar', 'Sahadutha'),
 ('Salt', 'Sea'),
 ('Whoever', 'sheds'),
 ('appoint', 'overseers'),
 ('aromatic', 'resin'),
 ('cutting', 'instrument')]

In [None]:
text1.similar("monstrous")

In [None]:
#note differences from Melville to Austen w/r/t similarity
text2.similar("monstrous") 

In [None]:
text2.common_contexts(["monstrous","very"])

In [None]:
text3.concordance("garment")

In [None]:
text3.similar("garment")

In [None]:
text3.common_contexts(["city", "garment"])

In [None]:
# can use a dispersion plot to determine the location of the word in the text
# text4 = Inaugural Address Corpus
figure(num=None, figsize=(10, 8), dpi=100, facecolor='w', edgecolor='k')
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America", "liberty"])

In [None]:
text3.generate() #generates random text in the style of the given text

In [None]:
len(text3)

In [None]:
#returns the individual words used, since duplicates are collapsed in a set
sorted(set(text3)) 

In [None]:
len(set(text3)) #number of unique words present

In [None]:
#percentage of unique words-> "lexical richness"
len(set(text3))/ len(text3) 

In [None]:
text3.count("smote")

In [None]:
(text4.count('a')/ len(text4)) * 100 #percent of words that are 'a'

In [None]:
(text5.count('lol')/ len(text5)) * 100

In [None]:
text5.concordance("lol")

In [None]:
def lexical_diversity(text):
    """Return the ratio of distinct tokens to total tokens in ``text``.

    Args:
        text: A sized iterable of hashable tokens (e.g. a list of words).

    Returns:
        ``len(set(text)) / len(text)`` as a float, or ``0.0`` when ``text``
        is empty (an empty text has no vocabulary, and dividing by its
        length would raise ``ZeroDivisionError``).
    """
    total = len(text)
    if total == 0:
        return 0.0
    return len(set(text)) / total

def percentage(count, total):
    """Express ``count`` as a percentage of ``total``.

    The division is done first and the result then scaled by 100.
    """
    fraction = count / total
    return fraction * 100

In [None]:
lexical_diversity(text3)

In [None]:
lexical_diversity(text5)

In [None]:
percentage(text4.count('a'), len(text4))

In [None]:
sent3 = ['My', 'name', 'is', 'Jo', 'jo', '.']

In [None]:
sent3

In [None]:
sorted(sent3) #sorts alphabetically with symbols then upper case first then lowercase

In [None]:
sent4 + sent1

In [None]:
# Demonstrate list.append on a copy. sent1 is the shared list brought in by
# `from nltk.book import *`; appending to it in place would grow it on every
# re-run of this cell and change what later cells that read sent1 display.
sent1_extended = list(sent1)
sent1_extended.append("Some")
sent1_extended

In [None]:
sent = ['word1', 'word2', 'word3', 'word4', 'word5',
        'word6', 'word7', 'word8', 'word9', 'word10']
sent[0]

In [None]:
sent[0] = 'First'
sent[9] = 'Last'
len(sent)

In [None]:
sent[1:9] = ['Second', 'Third']
sent

In [None]:
'Monty . Python'.split(' . ')

In [None]:
saying = ['After', 'all', 'is', 'said', 'and', 'done',
         'more', 'is', 'said', 'than', 'done']
tokens = set(saying)
tokens = sorted(tokens)
tokens

In [None]:
tokens[-2:]

In [None]:
#3.1 Frequency Distributions
fdist1 = FreqDist(text1)
print(fdist1)
fdist1.most_common(10)

In [None]:
fdist1['whale']

In [None]:
fdist1.plot(25, cumulative = True)

In [None]:
#hapaxes are words that occur only once
fdist1.hapaxes()

In [None]:
listHap = fdist1.hapaxes()
len(listHap)

In [None]:
V = set(text2) #removes duplicates
long_words = [w for w in V if (len(w)) > 15]
sorted(long_words)

In [None]:
fdist5 = FreqDist(text5)
sorted([w for w in set(text5) if len(w) > 7 and fdist5[w] > 7])

In [None]:
#collocation = sequence of words that occur together unusually often, e.g. red wine
#resistant to substitution e.g. red wine not equal to maroon wine
list(bigrams(['more', 'is', 'said', 'than', 'done'])) #bigram = extracts adjacent word pairs, bigrams produces generator object that list iterates through

In [None]:
[len(w) for w in text1]

In [None]:
fdist = FreqDist(len(w) for w in text1)
print(fdist)
fdist #where key is the length and the corresponding value is the frequency of that length

In [None]:
fdist.most_common()

In [None]:
fdist.max()

In [None]:
fdist[3]+=1

In [None]:
sorted(w for w in set(text1) if w.endswith('ableness'))

In [None]:
sorted(w for w in set(text7) if '-' in w and 'index' in w)

In [None]:
sorted(w for w in set(text3) if w.istitle() and len(w) >10)

In [None]:
sorted(t for t in set(text2) if 'cie' in t or 'cei' in t)

In [None]:
len(text1)

In [None]:
len(set(word.lower() for word in text1)) #convert all to lower to consider e.g. "This" and "this" duplicates

In [None]:
len(set(word.lower() for word in text1 if word.isalpha()))

In [None]:
l = [sent1, sent2, sent3, sent4, sent5, sent6, sent7, sent8, sent9]
for i in l:
    print(sorted(set(i)))

## Ch. 2 Accessing Text Corpora and Lexical Resources

In [None]:
import nltk
nltk.corpus.gutenberg.fileids()

In [None]:
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
len(emma)

In [None]:
len(set(emma))

In [None]:
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.concordance("surprize")

In [None]:
from nltk.corpus import gutenberg
gutenberg.fileids()

In [None]:
emma = gutenberg.words('austen-emma.txt')

In [None]:
# For every file in the Gutenberg corpus, print three rounded ratios followed
# by the file id:
#   1. length of the raw text divided by the number of word tokens
#   2. number of word tokens divided by the number of sentences
#   3. number of word tokens divided by the number of distinct lower-cased tokens
for f in gutenberg.fileids():
    num_chars = len(gutenberg.raw(f))
    num_words = len(gutenberg.words(f))
    num_sents = len(gutenberg.sents(f))
    # lower-case before de-duplicating so e.g. "This" and "this" count once
    num_vocab = len(set(w.lower() for w in gutenberg.words(f)))
    print(round(num_chars / num_words), round(num_words/ num_sents), round(num_words/ num_vocab), f)

In [None]:
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
macbeth_sentences

In [None]:
macbeth_sentences[1116]

In [None]:
longest_len = max(len(s) for s in macbeth_sentences)
[s for s in macbeth_sentences if len(s) == longest_len]

In [None]:
from nltk.corpus import webtext
for f in webtext.fileids():
    print(f, webtext.raw(f)[:65], "....")

In [None]:
from nltk.corpus import brown
brown.categories()

In [None]:
gov_text = brown.words(categories = 'government')

In [None]:
brown.words(fileids = ['cg22'])

In [None]:
fdist = nltk.FreqDist(w.lower() for w in gov_text)
modals = ['who', 'what', 'where', 'when', 'why']
for m in modals:
    print(m + ':', fdist[m], end = '; ')

In [None]:
#conditional frequency distribution to iterate through all genres
cfd = nltk.ConditionalFreqDist((genre, word) 
                               for genre in brown.categories()
                               for word in brown.words(categories = genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions = genres, samples = modals) #generates table on frequency of modal by genre

In [None]:
from nltk.corpus import reuters
reuters.fileids()

In [None]:
reuters.categories()

In [None]:
reuters.categories('training/9865')

In [None]:
reuters.categories(['training/9865', 'training/9880'])

In [None]:
reuters.fileids('barley')

In [None]:
reuters.fileids(['barley', 'corn'])

In [None]:
reuters.words('training/9865')[:14]

In [None]:
reuters.words(categories = ['barley', 'corn'])

In [None]:
from nltk.corpus import inaugural
inaugural.fileids()

In [None]:
#extract the text year by slicing the first 4 elements
dates = [f[:4] for f in inaugural.fileids()]
dates

In [None]:
from matplotlib.pyplot import figure
#look at conditional frequency distributions over time
cfd = nltk.ConditionalFreqDist((target, fileid[:4]) 
                               for fileid in inaugural.fileids()
                               for w in inaugural.words(fileid)
                               for target in ['america', 'citizen', 'democracy']
                               if w.lower().startswith(target))
figure(num=None, figsize=(10, 8), dpi=100, facecolor='w', edgecolor='k')
cfd.plot()

In [None]:
from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch','Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist((lang, len(word)) 
                               for lang in languages
                               for word in udhr.words(lang + '-Latin1'))
figure(num=None, figsize=(10, 8), dpi=100, facecolor='w', edgecolor='k')
cfd.plot(cumulative = True)

In [None]:
import os
from nltk.corpus import PlaintextCorpusReader

# Directory holding the local plain-text files to load as a corpus.
# The original absolute path only exists on one machine, so it can be
# overridden with the NLP_CORPUS_ROOT environment variable; when the variable
# is unset the original location is still used.
corpus_root = os.environ.get(
    'NLP_CORPUS_ROOT',
    '/Users/laurashi/Desktop/2021_Spring_Cal/Extracurricular/BANG/NLP_BANG/dict',
)
# ".*" matches every file under corpus_root
wordlists = PlaintextCorpusReader(corpus_root, ".*")
wordlists.fileids()

In [None]:
wordlists.words('test1.txt')

In [None]:
wordlists.sents()

In [None]:
wordlists.sents(fileids = 'test1.txt')[2]

In [None]:
#conditional frequency distribution based on the category (genre of text)
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist( (genre, word) #produces pairs of the genre and word
                                for genre in brown.categories() #outer loop for each genre
                                for word in brown.words(categories = genre))#inner loop for each word in the genre
genre_word = [(genre, word) 
              for genre in ['news', 'romance']
              for word in brown.words(categories = genre)]
len(genre_word)

In [None]:
genre_word[:4] #front of the set list will have news

In [None]:
genre_word[-4:] #back of set list will have romance

In [None]:
cfd = nltk.ConditionalFreqDist(genre_word)
cfd

In [None]:
cfd.conditions()

In [None]:
print('news: ', cfd['news'])
print('romance: ', cfd['romance'])

In [None]:
cfd['romance'].most_common(20)

In [None]:
cfd['romance']['could'] #index by category then word will give frequency

### Plotting and Tabulating Distributions

In [None]:
from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                               for fileid in inaugural.fileids()
                               for w in inaugural.words(fileid)
                               for target in ['america', 'citizen']
                               if w.lower().startswith(target))

In [None]:
from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch',
             'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist((lang, len(word))
                               for lang in languages
                               for word in udhr.words(lang + '-Latin1'))
cfd.tabulate(conditions=['English', 'German_Deutsch'], samples = range(10), cumulative= True)

In [None]:
from nltk.corpus import brown
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
cfd = nltk.ConditionalFreqDist((genre, word)
            for genre in ['news', 'romance']
            for word in brown.words(categories = genre))
cfd.tabulate(samples = days)
cfd.plot(samples = days)

In [None]:
def generate_model(cfdist, word, num=15):
    """Print a chain of words by repeatedly following the most frequent successor.

    Starting from ``word``, prints the current word (space-terminated, no
    newline) and then replaces it with ``cfdist[word].max()``.

    Args:
        cfdist: Mapping from a word to a frequency distribution of the words
            that follow it; each distribution must provide ``max()`` and be
            falsy when empty.
        word: The word to start from.
        num: Maximum number of words to print.

    Returns:
        None. Output goes to standard output.
    """
    for _ in range(num):
        print(word, end=' ')
        followers = cfdist[word]
        if not followers:
            # Dead end: this word was never observed with a successor, so
            # max() on its empty distribution would raise. Stop generating.
            break
        word = followers.max()

text = nltk.corpus.genesis.words('english-kjv.txt')
# Feed the bigram generator straight into the distribution instead of binding
# it to a variable named `bigrams`: that name is also the bigrams() function
# made available by `from nltk.book import *`, and rebinding it to a generator
# makes any later or re-run call of bigrams(...) fail.
cfd = nltk.ConditionalFreqDist(nltk.bigrams(text)) # [_bigram-condition]

In [None]:
cfd['living']

In [None]:
generate_model(cfd, 'living')

In [None]:
def lexical_diversity(text_in):
    """Return the fraction of tokens in ``text_in`` that are distinct.

    Args:
        text_in: A sized iterable of hashable tokens.

    Returns:
        Vocabulary size divided by token count as a float, or ``0.0`` for an
        empty input (avoids ``ZeroDivisionError``).
    """
    word_count = len(text_in)
    if word_count == 0:
        return 0.0
    vocab_size = len(set(text_in))  # a set keeps each distinct token once
    return vocab_size / word_count

In [None]:
from nltk.corpus import genesis
kjv = genesis.words('english-kjv.txt')
lexical_diversity(kjv)

In [None]:
def plural(word):
    """Return a rule-based English plural of a lower-case noun.

    Rules, applied in order:
      * empty string -> returned unchanged
      * vowel + 'y' (e.g. 'day') -> add 's'
      * other words ending in 'y' (e.g. 'fairy') -> replace 'y' with 'ies'
      * ending in 's', 'x', 'sh' or 'ch' -> add 'es'
      * ending in 'man' -> replace with 'men'
      * anything else -> add 's'

    Args:
        word: The singular noun.

    Returns:
        The pluralised string.
    """
    if not word:
        return word
    if word.endswith('y'):
        # 'y' after a vowel just takes 's' ("boy" -> "boys"); only a
        # consonant + 'y' changes to 'ies'.
        if len(word) > 1 and word[-2] in 'aeiou':
            return word + 's'
        return word[:-1] + 'ies'
    if word.endswith(('s', 'x', 'sh', 'ch')):
        return word + 'es'
    if word.endswith('man'):
        return word[:-3] + 'men'
    return word + 's'

In [None]:
plural('fairy')

In [None]:
plural('woman')

In [None]:
plural('fan')

In [None]:
def unusual_words(text):
    """Return the sorted alphabetic tokens of ``text`` missing from NLTK's word list.

    Tokens are lower-cased and non-alphabetic tokens are dropped before the
    comparison; the reference vocabulary is the lower-cased
    ``nltk.corpus.words`` list.
    """
    seen = {token.lower() for token in text if token.isalpha()}
    known = {entry.lower() for entry in nltk.corpus.words.words()}
    return sorted(seen.difference(known))

In [None]:
unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt'))

In [None]:
unusual_words(nltk.corpus.nps_chat.words())

In [None]:
#find fraction of words that are not in the stopwords list
#stopwords = high frequency words such as helper verbs, 'the', 'to', 'also', 'so'
from nltk.corpus import stopwords
stopwords.words('english')
def content_fraction(text):
    """Return the fraction of tokens in ``text`` that are not English stopwords.

    Args:
        text: A sized iterable of string tokens.

    Returns:
        Number of tokens whose lower-cased form is absent from NLTK's English
        stopword list, divided by ``len(text)``; ``0.0`` for an empty text.
    """
    total = len(text)
    if total == 0:
        return 0.0
    # A set gives constant-time membership tests; the stopword list would
    # otherwise be scanned linearly once per token.
    stopword_set = set(nltk.corpus.stopwords.words('english'))
    content_count = sum(1 for w in text if w.lower() not in stopword_set)
    return content_count / total

In [None]:
content_fraction(nltk.corpus.reuters.words())

In [None]:
#solve the word puzzle problem of finding all the words given the number of letters
english_vocab = set(w.lower() for w in nltk.corpus.words.words())
puzzle_letters = nltk.FreqDist('loyrnci')
must_have = 'i'
[w for w in english_vocab if len(w) >=5 and must_have in w and nltk.FreqDist(w) <= puzzle_letters]

In [None]:
'ironically' in english_vocab

In [None]:
names = nltk.corpus.names
names.fileids()
male = names.words('male.txt')
female = names.words('female.txt')
# Names that appear in both files, in the order they occur in the male file.
# Membership is checked against a set so the female list is not rescanned
# for every male name.
female_names = set(female)
[w for w in male if w in female_names]

In [None]:
cfd = nltk.ConditionalFreqDist( (file_id, name[-1]) #last letter of the name
                              for file_id in names.fileids()
                              for name in names.words(file_id))
figure(num=None, figsize=(10, 8), dpi=100, facecolor='w', edgecolor='k')
cfd.plot()

In [None]:
from nltk.corpus import swadesh
swadesh.fileids()

In [None]:
swadesh.words('en')

In [None]:
french2english = swadesh.entries(['fr', 'en'])
french2english

In [None]:
translate = dict(french2english)
translate['jeter']

In [None]:
translate['je']

In [None]:
languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']

# Build the aligned entries once; calling swadesh.entries() inside the loop
# rebuilt the same table on every iteration.
swadesh_entries = swadesh.entries(languages)
for entry in swadesh_entries[44:48]:
    print(entry)

In [None]:
from nltk.corpus import toolbox
toolbox.entries('rotokas.dic')

In [None]:
from nltk.corpus import wordnet as wn
wn.synsets('motorcar')

In [None]:
wn.synset('car.n.01').lemma_names() #lemmas are synonymous words

In [None]:
wn.synset('car.n.01').definition()

In [None]:
wn.synset('car.n.01').examples()

In [None]:
wn.synset('car.n.01').lemmas()

In [None]:
wn.synsets('car') #multiple different meanings for car

In [None]:
for synset in wn.synsets('car'):
    print(synset.lemma_names())

In [None]:
wn.lemmas('car')

In [None]:
for synset in wn.synsets('dish'):
    print(synset.lemma_names(), synset.definition())

In [None]:
wn.lemmas('dish')

In [None]:
wn.synsets('dish')

In [None]:
motorcar = wn.synset('car.n.01')
types = motorcar.hyponyms()
types[0]

In [None]:
sorted(lemma.name() for synset in types for lemma in synset.lemmas())

In [None]:
motorcar.hypernyms()

In [None]:
paths = motorcar.hypernym_paths()
len(paths)

In [None]:
[synset.name() for synset in paths[0]]

In [None]:
[synset.name() for synset in paths[1]]

In [None]:
#most general hypernym ("root" hypernym)
motorcar.root_hypernyms()

In [None]:
wn.synset('tree.n.01').part_meronyms()

In [None]:
wn.synset('tree.n.01').substance_meronyms()

In [None]:
wn.synset('tree.n.01').member_holonyms()

In [None]:
for s in wn.synsets('mint', wn.NOUN):
    print(s.name() + ": "+ s.definition())

In [None]:
# testing testing  (scratch text kept as a comment: two bare words are a SyntaxError in a code cell and stop Run All)