<a href="https://colab.research.google.com/github/jnrahul92/nltk_book/blob/main/C6_NLTK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
def gender_features(word):
  return {"first_letter":word[0],
          "last_letter":word[-1],
          "length":len(word)}

In [16]:
gender_features("Shrek")

{'first_letter': 'S', 'last_letter': 'k', 'length': 5}

In [3]:
from nltk.corpus import names

In [6]:
import nltk
nltk.download('names')

labeled_names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.


In [7]:
import random
random.shuffle(labeled_names)

In [17]:
featuresets = [(gender_features(n), gender) for n, gender in labeled_names]

In [18]:
train_set, test_set = featuresets[500:], featuresets[:500]

In [19]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [20]:
classifier.classify(gender_features('Neo'))

'male'

In [25]:
classifier.classify(gender_features('Natasha'))

'female'

In [26]:
classifier.show_most_informative_features(10)

Most Informative Features
             last_letter = 'a'            female : male   =     36.0 : 1.0
             last_letter = 'k'              male : female =     31.3 : 1.0
             last_letter = 'f'              male : female =     15.7 : 1.0
             last_letter = 'p'              male : female =     12.4 : 1.0
             last_letter = 'v'              male : female =     11.1 : 1.0
             last_letter = 'd'              male : female =      9.8 : 1.0
             last_letter = 'o'              male : female =      8.6 : 1.0
             last_letter = 'm'              male : female =      8.4 : 1.0
             last_letter = 'r'              male : female =      6.8 : 1.0
             last_letter = 'g'              male : female =      5.4 : 1.0


In [28]:
def gender_features2(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features

In [29]:
gender_features2("John")

{'count(a)': 0,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 0,
 'count(e)': 0,
 'count(f)': 0,
 'count(g)': 0,
 'count(h)': 1,
 'count(i)': 0,
 'count(j)': 1,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 0,
 'count(n)': 1,
 'count(o)': 1,
 'count(p)': 0,
 'count(q)': 0,
 'count(r)': 0,
 'count(s)': 0,
 'count(t)': 0,
 'count(u)': 0,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 0,
 'count(z)': 0,
 'first_letter': 'j',
 'has(a)': False,
 'has(b)': False,
 'has(c)': False,
 'has(d)': False,
 'has(e)': False,
 'has(f)': False,
 'has(g)': False,
 'has(h)': True,
 'has(i)': False,
 'has(j)': True,
 'has(k)': False,
 'has(l)': False,
 'has(m)': False,
 'has(n)': True,
 'has(o)': True,
 'has(p)': False,
 'has(q)': False,
 'has(r)': False,
 'has(s)': False,
 'has(t)': False,
 'has(u)': False,
 'has(v)': False,
 'has(w)': False,
 'has(x)': False,
 'has(y)': False,
 'has(z)': False,
 'last_letter': 'n'}

In [30]:
featuresets = [(gender_features2(n), gender) for n, gender in labeled_names]

In [31]:
train_set, test_set = featuresets[500:], featuresets[:500]

In [32]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [34]:
print(nltk.classify.accuracy(classifier, test_set))

0.764


In [36]:
train_names = labeled_names[1500:]
dev_names = labeled_names[500:1500]
test_names = labeled_names[:500]

In [37]:
train_set = [(gender_features(n), gender) for n, gender in train_names]
dev_set = [(gender_features(n), gender) for n, gender in dev_names]
test_set = [(gender_features(n), gender) for n, gender in test_names]

In [38]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [39]:
print(nltk.classify.accuracy(classifier, dev_set))

0.769


In [40]:
errors = []
for (name, tag) in dev_names:
  guess = classifier.classify(gender_features(name))
  if guess != tag:
    errors.append((tag, guess, name))

In [41]:
for (tag, guess, name) in sorted(errors):
  print("Correct : {}, Guess : {}, Name : {}".format(tag, guess, name))

Correct : female, Guess : male, Name : Adelind
Correct : female, Guess : male, Name : Agnes
Correct : female, Guess : male, Name : Alexis
Correct : female, Guess : male, Name : Alis
Correct : female, Guess : male, Name : Alison
Correct : female, Guess : male, Name : Ambur
Correct : female, Guess : male, Name : Austin
Correct : female, Guess : male, Name : Barb
Correct : female, Guess : male, Name : Beatrix
Correct : female, Guess : male, Name : Beau
Correct : female, Guess : male, Name : Beryl
Correct : female, Guess : male, Name : Bess
Correct : female, Guess : male, Name : Betsey
Correct : female, Guess : male, Name : Blanch
Correct : female, Guess : male, Name : Bridgett
Correct : female, Guess : male, Name : Brier
Correct : female, Guess : male, Name : Brigit
Correct : female, Guess : male, Name : Brit
Correct : female, Guess : male, Name : Brook
Correct : female, Guess : male, Name : Charis
Correct : female, Guess : male, Name : Cher
Correct : female, Guess : male, Name : Cleo
Cor

In [42]:
def gender_features(word):
  return {"suffix1":word[-1:],
          "suffix2":word[-2:]}

In [43]:
train_set = [(gender_features(n), gender) for n, gender in train_names]
dev_set = [(gender_features(n), gender) for n, gender in dev_names]
test_set = [(gender_features(n), gender) for n, gender in test_names]

In [44]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [45]:
print(nltk.classify.accuracy(classifier, dev_set))

0.78


In [47]:
from nltk.corpus import movie_reviews

In [48]:
nltk.download("movie_reviews")

documents = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() 
for fileid in movie_reviews.fileids(category)]

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [49]:
random.shuffle(documents)

In [50]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [51]:
print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

{'contains(,)': True, 'contains(the)': True, 'contains(.)': True, 'contains(a)': True, 'contains(and)': True, 'contains(of)': True, 'contains(to)': True, "contains(')": True, 'contains(is)': True, 'contains(in)': True, 'contains(s)': True, 'contains(")': True, 'contains(it)': True, 'contains(that)': True, 'contains(-)': True, 'contains())': True, 'contains(()': True, 'contains(as)': True, 'contains(with)': True, 'contains(for)': True, 'contains(his)': True, 'contains(this)': True, 'contains(film)': False, 'contains(i)': False, 'contains(he)': True, 'contains(but)': True, 'contains(on)': True, 'contains(are)': True, 'contains(t)': False, 'contains(by)': True, 'contains(be)': True, 'contains(one)': True, 'contains(movie)': True, 'contains(an)': True, 'contains(who)': True, 'contains(not)': True, 'contains(you)': True, 'contains(from)': True, 'contains(at)': False, 'contains(was)': False, 'contains(have)': True, 'contains(they)': True, 'contains(has)': True, 'contains(her)': False, 'conta

In [52]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [53]:
print(nltk.classify.accuracy(classifier, test_set))

0.76


In [54]:
classifier.show_most_informative_features(5)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     11.0 : 1.0
         contains(mulan) = True              pos : neg    =      8.3 : 1.0
        contains(seagal) = True              neg : pos    =      7.4 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.7 : 1.0
         contains(damon) = True              pos : neg    =      6.1 : 1.0


In [55]:
from nltk.corpus import brown
nltk.download("brown")

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [56]:
suffix_fdist = nltk.FreqDist()
for word in brown.words():
  word = word.lower()
  suffix_fdist[word[-1:]] += 1
  suffix_fdist[word[-2:]] += 1
  suffix_fdist[word[-3:]] += 1

In [57]:
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]

In [58]:
print(common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [59]:
def pos_features(word):
  features = {}
  for suffix in common_suffixes:
    features["endswith({})".format(suffix)] = word.lower().endswith(suffix)
  return features

In [60]:
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]

In [61]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

In [62]:
classifier = nltk.DecisionTreeClassifier.train(train_set)

In [63]:
nltk.classify.accuracy(classifier , test_set)

0.6270512182993535

In [64]:
classifier.classify(pos_features('cats'))

'NNS'

In [65]:
print(classifier.pseudocode(depth = 4))

if endswith(the) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: return '.'
      if endswith(.) == True: return '.'
    if endswith(s) == True: 
      if endswith(is) == False: return 'PP$'
      if endswith(is) == True: return 'BEZ'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'



In [66]:
def pos_features(sentence, i):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [67]:
pos_features(brown.sents()[0], 8)

{'prev-word': 'an', 'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion'}

In [68]:
tagged_sents = brown.tagged_sents(categories='news')

In [69]:
featuresets = []
for sents in tagged_sents:
  untagged_sent = nltk.tag.untag(sents)
  for i, (word,tag) in enumerate(sents):
    featuresets.append((pos_features(untagged_sent, i), tag) )

In [70]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

In [71]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [72]:
nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

In [73]:
def pos_features(sentence, i, history):
     features = {"suffix(1)": sentence[i][-1:],
                 "suffix(2)": sentence[i][-2:],
                 "suffix(3)": sentence[i][-3:]}
     if i == 0:
         features["prev-word"] = "<START>"
         features["prev-tag"] = "<START>"
     else:
         features["prev-word"] = sentence[i-1]
         features["prev-tag"] = history[i-1]
     return features

class ConsecutivePosTagger(nltk.TaggerI):

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

In [74]:
tagged_sents = brown.tagged_sents(categories='news')

In [79]:
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]

In [80]:
tagger = ConsecutivePosTagger(train_sents)

In [81]:
print(tagger.evaluate(test_sents))

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  """Entry point for launching an IPython kernel.


0.7980528511821975


In [84]:
nltk.download('treebank')
nltk.download('punkt')
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in sents:
  tokens.extend(sent)
  offset += len(sent)
  boundaries.add(offset - 1)

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [85]:
def punct_features(tokens, i):
  return {'next-word-capitalized': tokens[i+1][0].isupper(),
          'prev-word': tokens[i-1].lower(),
          'punct': tokens[i],
          'prev-word-is-one-char': len(tokens[i-1]) == 1}

In [86]:
featuresets = [(punct_features(tokens, i), (i in boundaries))
for i in range(1, len(tokens)-1)
if tokens[i] in '.?!']

In [87]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.9461279461279462

In [89]:
def segment_sentences(words):
    start = 0
    sents = []
    for i, word in enumerate(words):
        if word in '.?!' and classifier.classify(punct_features(words, i)) == True:
            sents.append(words[start:i+1])
            start = i+1
    if start < len(words):
        sents.append(words[start:])
    return sents

In [91]:
nltk.download('nps_chat')

posts = nltk.corpus.nps_chat.xml_posts()[:10000]

[nltk_data] Downloading package nps_chat to /root/nltk_data...
[nltk_data]   Unzipping corpora/nps_chat.zip.


In [93]:
def dialogue_act_features(post):
  features = {}
  for word in nltk.word_tokenize(post):
    features['contains({})'.format(word.lower())] = True
  return features

In [94]:
featuresets = [(dialogue_act_features(post.text), post.get('class')) for post in posts]

In [98]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.667

In [99]:
def rte_features(rtepair):
    extractor = nltk.RTEFeatureExtractor(rtepair)
    features = {}
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    return features

In [101]:
nltk.download('rte')

rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]

[nltk_data] Downloading package rte to /root/nltk_data...
[nltk_data]   Unzipping corpora/rte.zip.


In [102]:
extractor = nltk.RTEFeatureExtractor(rtepair)

In [103]:
print(extractor.text_words)

{'fledgling', 'Parviz', 'Iran', 'operation', 'at', 'four', 'central', 'fight', 'Soviet', 'China', 'association', 'meeting', 'Davudi', 'former', 'SCO', 'republics', 'Shanghai', 'together', 'Co', 'Organisation', 'binds', 'Asia', 'that', 'terrorism.', 'representing', 'was', 'Russia'}


In [104]:
print(extractor.hyp_words)

{'China', 'member', 'SCO.'}


In [105]:
print(extractor.overlap('word'))

set()


In [106]:
print(extractor.overlap('ne'))

{'China'}


In [111]:
print(extractor.hyp_extra('word'))

{'member'}


In [112]:
import math
def entropy(labels):
    freqdist = nltk.FreqDist(labels)
    probs = [freqdist.freq(l) for l in freqdist]
    return -sum(p * math.log(p,2) for p in probs)

In [113]:
print(entropy(['male', 'male', 'male', 'male'])) 

-0.0


In [115]:
print(entropy(['male', 'female', 'female', 'male']))

1.0
