In [137]:
import os
import re

import nltk
import nltk.corpus
from nltk import ne_chunk
from nltk.corpus import brown
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, blankline_tokenize
from nltk.probability import FreqDist
from nltk.util import bigrams, trigrams, ngrams
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer

In [3]:
# List what is inside NLTK's 'corpora' data directory
# (the output shows unpacked folders next to their .zip archives).
corpora_root = nltk.data.find('corpora')
print(os.listdir(corpora_root))

['abc', 'abc.zip', 'alpino', 'alpino.zip', 'biocreative_ppi', 'biocreative_ppi.zip', 'brown', 'brown.zip', 'brown_tei', 'brown_tei.zip', 'cess_cat', 'cess_cat.zip', 'cess_esp', 'cess_esp.zip', 'chat80', 'chat80.zip', 'city_database', 'city_database.zip', 'cmudict', 'cmudict.zip', 'comparative_sentences', 'comparative_sentences.zip', 'comtrans.zip', 'conll2000', 'conll2000.zip', 'conll2002', 'conll2002.zip', 'conll2007.zip', 'crubadan', 'crubadan.zip', 'dependency_treebank', 'dependency_treebank.zip', 'dolch', 'dolch.zip', 'europarl_raw', 'europarl_raw.zip', 'floresta', 'floresta.zip', 'framenet_v15', 'framenet_v15.zip', 'framenet_v17', 'framenet_v17.zip', 'gazetteers', 'gazetteers.zip', 'genesis', 'genesis.zip', 'gutenberg', 'gutenberg.zip', 'ieer', 'ieer.zip', 'inaugural', 'inaugural.zip', 'indian', 'indian.zip', 'jeita.zip', 'kimmo', 'kimmo.zip', 'knbc.zip', 'lin_thesaurus', 'lin_thesaurus.zip', 'machado.zip', 'mac_morpho', 'mac_morpho.zip', 'masc_tagged.zip', 'movie_reviews', 'movie

### Accessing inbuilt NLTK corpora

In [4]:
# Preview the word tokens of the Brown corpus.
# `brown` is imported with the other dependencies in the first cell, so this
# cell no longer carries its own import.
brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [5]:
# List the file ids (one per text) available in the Gutenberg corpus.
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [149]:
# Load the word tokens of Hamlet from the Gutenberg corpus;
# the bare name on the last line displays a short preview of them.
hamlet = nltk.corpus.gutenberg.words('shakespeare-hamlet.txt')
hamlet

['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', ...]

In [8]:
# Show the first 400 tokens on one line, each followed by a single space.
print(''.join(f'{token} ' for token in hamlet[:400]), end='')

[ The Tragedie of Hamlet by William Shakespeare 1599 ] Actus Primus . Scoena Prima . Enter Barnardo and Francisco two Centinels . Barnardo . Who ' s there ? Fran . Nay answer me : Stand & vnfold your selfe Bar . Long liue the King Fran . Barnardo ? Bar . He Fran . You come most carefully vpon your houre Bar . ' Tis now strook twelue , get thee to bed Francisco Fran . For this releefe much thankes : ' Tis bitter cold , And I am sicke at heart Barn . Haue you had quiet Guard ? Fran . Not a Mouse stirring Barn . Well , goodnight . If you do meet Horatio and Marcellus , the Riuals of my Watch , bid them make hast . Enter Horatio and Marcellus . Fran . I thinke I heare them . Stand : who ' s there ? Hor . Friends to this ground Mar . And Leige - men to the Dane Fran . Giue you good night Mar . O farwel honest Soldier , who hath relieu ' d you ? Fra . Barnardo ha ' s my place : giue you goodnight . Exit Fran . Mar . Holla Barnardo Bar . Say , what is Horatio there ? Hor . A peece of him Bar 

### Converting sample text to tokens

In [63]:
# Sample passage (three paragraphs separated by blank lines) used by the
# word- and paragraph-tokenization examples that follow.
text="""This is a GitHub repository which contains course on deep NLP by the University of Oxford in the form of lecture slides and videos. This course is focused on recent advances in analysing and generating speech and text using recurrent neural networks. 

You will be introduced with mathematical definitions of the relevant machine learning models and derive their associated optimisation algorithms. 

The course covers a range of applications of neural networks in NLP including analysing latent dimensions in text, transcribing speech to text, translating between languages, and answering questions."""

In [155]:
# Split the sample text into tokens and report how many were produced.
text_tokenize = word_tokenize(text)
len(text_tokenize)

96

### Frequency Distribution of words

In [65]:
# Count how often each token occurs, ignoring case
# (tokens are lower-cased before they are counted).
fdist = FreqDist(token.lower() for token in text_tokenize)

In [66]:
# Number of distinct lower-cased tokens in the frequency distribution.
len(fdist)

64

In [67]:
# The ten most frequent tokens; punctuation tokens such as '.' and ',' are
# counted too because nothing has filtered them out yet.
fdist.most_common(10)

[('of', 5),
 ('and', 5),
 ('the', 4),
 ('in', 4),
 ('.', 4),
 ('course', 3),
 ('text', 3),
 (',', 3),
 ('this', 2),
 ('is', 2)]

### Paragraph tokenizer

In [68]:
# Split the sample text into paragraphs at its blank lines.
blank_tokenize = blankline_tokenize(text)

In [70]:
# First paragraph of the sample text.
blank_tokenize[0]

'This is a GitHub repository which contains course on deep NLP by the University of Oxford in the form of lecture slides and videos. This course is focused on recent advances in analysing and generating speech and text using recurrent neural networks.'

### Bigrams, trigrams and ngrams

In [130]:
# Opening sentence of the sample text, kept on its own as a short example
# for the n-gram and POS-tagging cells.
text1 = 'This is a GitHub repository which contains course on deep NLP by the University of Oxford in the form of lecture slides and videos.'

In [131]:
# Build every pair of adjacent tokens in the sentence and preview the first ten.
text1_bigrams = list(bigrams(word_tokenize(text1)))
text1_bigrams[:10]

[('This', 'is'),
 ('is', 'a'),
 ('a', 'GitHub'),
 ('GitHub', 'repository'),
 ('repository', 'which'),
 ('which', 'contains'),
 ('contains', 'course'),
 ('course', 'on'),
 ('on', 'deep'),
 ('deep', 'NLP')]

### Stemming and lemmatization

In [102]:
# One instance of each stemmer, plus the WordNet lemmatizer, so that the next
# cell can compare their output on the same word list.
pstem = PorterStemmer()
lcstem = LancasterStemmer()
sbstem = SnowballStemmer('english')  # Snowball needs the language named explicitly
lemma = WordNetLemmatizer()

In [108]:
# Run the same word list through each stemmer and the lemmatizer and print
# the results section by section, with a blank line between sections.
words = ['having', 'visit', 'visiting', 'visitation', 'give', 'gave', 'given', 'glue', 'corpora']

# (section title, function applied to each word), in display order.
normalizers = [
    ('Porter Stemmer', pstem.stem),
    ('Lancaster Stemmer', lcstem.stem),
    ('Snowball Stemmer', sbstem.stem),
    # lemmatize() is called without a part-of-speech argument, so forms such as
    # 'having' or 'visiting' come back unchanged in the output.
    ('WordNet Lemmatizer', lemma.lemmatize),
]

for position, (title, normalize) in enumerate(normalizers):
    if position:
        print('')  # blank separator before every section except the first
    print(title)
    for word in words:
        print(word + ' : ' + normalize(word))

Porter Stemmer
having : have
visit : visit
visiting : visit
visitation : visit
give : give
gave : gave
given : given
glue : glue
corpora : corpora

Lancaster Stemmer
having : hav
visit : visit
visiting : visit
visitation : visit
give : giv
gave : gav
given : giv
glue : glu
corpora : corpor

Snowball Stemmer
having : have
visit : visit
visiting : visit
visitation : visit
give : give
gave : gave
given : given
glue : glue
corpora : corpora

WordNet Lemmatizer
having : having
visit : visit
visiting : visiting
visitation : visitation
give : give
gave : gave
given : given
glue : glue
corpora : corpus


### Stop words

In [110]:
# Display NLTK's list of English stop words.
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

### Removing punctuation

In [120]:
# Characters to strip from tokens: '-', '_', '?', '!', ',', ';', ':', '.'
# and the digits 0-9 (so numbers are removed along with punctuation).
punctuation = re.compile(
    r'[-_?!,;:0-9.]'
)

In [126]:
# Strip the characters matched by `punctuation` from every token and keep only
# the tokens that are still non-empty afterwards (a token made up entirely of
# matched characters, such as '.', disappears).
# The inner generator performs the substitution once per token; the earlier
# form ran the same regex substitution twice (once to filter, once to keep).
text_wo_punctuation = [
    stripped
    for stripped in (punctuation.sub("", token) for token in text_tokenize)
    if stripped
]
len(text_wo_punctuation)

89

### Part of Speech (POS) tagging

In [153]:
text1_tokens = word_tokenize(text1)

# First tag every token on its own, i.e. as a one-element "sentence" ...
for token in text1_tokens:
    isolated_tag = nltk.pos_tag([token])
    print(isolated_tag)

# ... then tag the whole sentence in one call. Comparing the two outputs shows
# tags that differ once the surrounding words are available (e.g. 'contains').
print(nltk.pos_tag(text1_tokens))

[('This', 'DT')]
[('is', 'VBZ')]
[('a', 'DT')]
[('GitHub', 'NN')]
[('repository', 'NN')]
[('which', 'WDT')]
[('contains', 'NNS')]
[('course', 'NN')]
[('on', 'IN')]
[('deep', 'NN')]
[('NLP', 'NN')]
[('by', 'IN')]
[('the', 'DT')]
[('University', 'NNP')]
[('of', 'IN')]
[('Oxford', 'NN')]
[('in', 'IN')]
[('the', 'DT')]
[('form', 'NN')]
[('of', 'IN')]
[('lecture', 'NN')]
[('slides', 'NNS')]
[('and', 'CC')]
[('videos', 'NNS')]
[('.', '.')]
[('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('GitHub', 'NNP'), ('repository', 'NN'), ('which', 'WDT'), ('contains', 'VBZ'), ('course', 'NN'), ('on', 'IN'), ('deep', 'JJ'), ('NLP', 'NNP'), ('by', 'IN'), ('the', 'DT'), ('University', 'NNP'), ('of', 'IN'), ('Oxford', 'NNP'), ('in', 'IN'), ('the', 'DT'), ('form', 'NN'), ('of', 'IN'), ('lecture', 'NN'), ('slides', 'NNS'), ('and', 'CC'), ('videos', 'NNS'), ('.', '.')]


### Named Entity Recognition (NER)

In [141]:
# Prepare the input for named-entity chunking: tokenize the example sentence,
# then POS-tag the tokens (the tagged tokens are what gets passed to ne_chunk).
NE_sent = 'The US President stays in the White House'
NE_tokens = word_tokenize(NE_sent)
NE_tags = nltk.pos_tag(NE_tokens)

In [152]:
# Group the POS-tagged tokens into named-entity chunks and print the resulting
# tree; recognised entities appear as labelled subtrees in the output.
NE_NER = ne_chunk(NE_tags)
print(NE_NER)

(S
  The/DT
  (ORGANIZATION US/NNP)
  President/NNP
  stays/VBZ
  in/IN
  the/DT
  (FACILITY White/NNP House/NNP))
