In [1]:
"""Natural Language Processing (NLP) is the ability of a computer program to understand human language.
This notebook demonstrates the following text-mining operations using the NLTK package in Python:

(1) Tokenizing
(2) Stop Words
(3) Stemming
(4) Part of Speech (POS) tagging
(5) Chunking
(6) Chinking
(7) Named Entity Recognition
(8) Lemmatizing
(9) Wordnet """
import nltk
import ssl

# Skip HTTPS certificate verification.
# NOTE(review): this replaces ssl's default HTTPS context factory for the whole
# kernel, not just for one request -- presumably a workaround so that the NLTK
# data download succeeds on machines with broken root certificates; confirm
# before reusing this outside a tutorial.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context


In [None]:
# Fetch only the corpora and models this notebook uses. Calling
# nltk.download() with no arguments starts the interactive downloader, which
# stalls a "Restart & Run All". Both the legacy and the newer resource names
# are listed so the cells below work across NLTK releases.
NLTK_RESOURCES = [
    "punkt",                            # sentence / word tokenizer models
    "punkt_tab",
    "stopwords",                        # stop-word lists
    "state_union",                      # State of the Union speeches
    "averaged_perceptron_tagger",       # POS tagger
    "averaged_perceptron_tagger_eng",
    "maxent_ne_chunker",                # named-entity chunker
    "maxent_ne_chunker_tab",
    "words",                            # word list used by the NE chunker
    "wordnet",                          # WordNet (lemmatizer, synsets)
    "omw-1.4",
]
for resource in NLTK_RESOURCES:
    nltk.download(resource, quiet=True)

In [3]:
""" (1) Tokenizing words and sentences (Lexical Analysis)"""

"""Tokenization is a process of breaking up a given text into words or sentences. Eg: Word Tokenizer and Sentence Tokenizer."""

from nltk.tokenize import sent_tokenize, word_tokenize

sample_text = "Hello Mr. Bob, how are you doing today? why are you playing outside? Hope you come back soon. Icecream smells good."

# Run the same text through both tokenizers: first split into sentences,
# then into individual word / punctuation tokens.
for tokenizer in (sent_tokenize, word_tokenize):
    print(tokenizer(sample_text))


['Hello Mr. Bob, how are you doing today?', 'why are you playing outside?', 'Hope you come back soon.', 'Icecream smells good.']
['Hello', 'Mr.', 'Bob', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'why', 'are', 'you', 'playing', 'outside', '?', 'Hope', 'you', 'come', 'back', 'soon', '.', 'Icecream', 'smells', 'good', '.']


In [None]:
""" (2) Stop words are the most commonly used words which are removed/ignored in Natural Language Processing"""

from nltk.corpus import stopwords
sample_text = "This is just an example to show stop words removals."
stop_words = set(stopwords.words('english'))
# Sorted so the listing is identical from run to run (set order is not stable).
print(sorted(stop_words))
words = word_tokenize(sample_text)

# The stop-word list is lower-case, so compare case-insensitively; otherwise a
# capitalised stop word such as the sentence-initial "This" is never removed.

# Variant 1: explicit loop.
filtered_sentence = []
for w in words:
    if w.lower() not in stop_words:
        filtered_sentence.append(w)

print(filtered_sentence)

# Variant 2: the equivalent list comprehension.
filtered_sentence = [w for w in words if w.lower() not in stop_words]
print(filtered_sentence)


In [None]:
""" (3) Stemming is the process of reducing words to their root form, even if that root has no dictionary meaning. """

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
ps = PorterStemmer()


def _print_stems(tokens):
    """Print the Porter stem of every token, one stem per line."""
    for token in tokens:
        print(ps.stem(token))


# Made-up inflections of one word.
sample_text = ["python", "pythoner", "pythoning","pythoned","pythonly"]
_print_stems(sample_text)

# The same idea applied to running text: tokenize first, then stem each token.
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
words = word_tokenize(new_text)
_print_stems(words)



In [None]:
""" (4) POS Tagging is assigning part of speech tags to words"""

from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

# Two State of the Union speeches: the 2005 text trains the sentence
# tokenizer, the 2006 text is the material that gets split and tagged.
train_text, sample_text = (
    state_union.raw(fileid) for fileid in ("2005-GWBush.txt", "2006-GWBush.txt")
)

""" PunktSentenceTokenizer is an sentence boundary detection algorithm that must be trained to be used. NLTK includes a pre-trained version of the PunktSentenceTokenizer. """

# Train on one speech, then split the other into sentences and show them.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)
print(tokenized)

def process_content():
    """Print the part-of-speech tags of every sentence in ``tokenized``.

    Reads the module-level ``tokenized`` sequence, word-tokenizes each
    sentence and prints the result of ``nltk.pos_tag`` for it.  Any exception
    ends the loop and its message is printed instead of being raised.
    Returns ``None``.
    """
    try:
        for sentence in tokenized:
            print(nltk.pos_tag(nltk.word_tokenize(sentence)))
    except Exception as err:
        print(err)


# process_content() prints its own results and returns None, so call it
# directly; wrapping the call in print() only appends a stray "None".
process_content()


In [None]:
""" (5) Chunking is the process of grouping words into chunks: phrases of one or more words that contain a noun. """

from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer


# 2005 speech -> training text for the sentence tokenizer,
# 2006 speech -> text that will be chunked.
train_text, sample_text = map(
    state_union.raw, ("2005-GWBush.txt", "2006-GWBush.txt")
)

# Train the Punkt tokenizer, then split the sample speech into sentences.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    """Chunk every sentence in ``tokenized`` and draw the resulting tree.

    Each sentence from the module-level ``tokenized`` sequence is
    word-tokenized, POS-tagged and parsed with a regexp grammar whose single
    ``Chunk`` rule matches optional ``RB*`` tags, optional ``VB*`` tags, one
    or more ``NNP`` tags and an optional trailing ``NN``.  ``draw()`` is then
    called on each parsed tree.

    Any exception ends the loop and its message is printed instead of being
    raised.  Returns ``None``.
    """
    try:
        # The grammar is identical for every sentence, so compile the parser
        # once up front instead of once per loop iteration.
        chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        chunkParser = nltk.RegexpParser(chunkGram)

        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunked = chunkParser.parse(tagged)
            chunked.draw()

    except Exception as e:
        print(str(e))


# process_content() draws each tree itself and returns None, so call it
# directly; wrapping the call in print() only appends a stray "None".
process_content()

In [None]:

""" Chinking is the process of removing a sequence of tokens from a chunk. The removed sequence is called a chink. """

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer



# Training speech (2005) and the speech to be processed (2006).
train_text, sample_text = (
    state_union.raw(fileid) for fileid in ("2005-GWBush.txt", "2006-GWBush.txt")
)

""" PunktSentenceTokenizer is an sentence boundary detection algorithm that must be trained to be used. NLTK includes a pre-trained version of the PunktSentenceTokenizer. """

# Fit the sentence tokenizer on the training speech and apply it to the sample.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    """Chunk, then chink, every sentence in ``tokenized`` and draw the tree.

    Each sentence from the module-level ``tokenized`` sequence is
    word-tokenized, POS-tagged and parsed with a two-rule regexp grammar:
    ``{<.*>+}`` first chunks every token, then ``}<VB.?|IN|DT|TO>+{`` chinks
    (removes from the chunk) runs of ``VB*``, ``IN``, ``DT`` and ``TO`` tags.
    ``draw()`` is then called on each parsed tree.

    Any exception ends the loop and its message is printed instead of being
    raised.  Returns ``None``.
    """
    try:
        # The grammar is identical for every sentence, so compile the parser
        # once up front instead of once per loop iteration.
        chunkGram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""
        chunkParser = nltk.RegexpParser(chunkGram)

        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunked = chunkParser.parse(tagged)
            chunked.draw()

    except Exception as e:
        print(str(e))


# process_content() draws each tree itself and returns None, so call it
# directly; wrapping the call in print() only appends a stray "None".
process_content()

In [None]:
""" Named Entity Recognition is the process of extracting "entities" such as people, places, locations, monetary figures and more. """

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

""" PunktSentenceTokenizer is an sentence boundary detection algorithm that must be trained to be used. NLTK includes a pre-trained version of the PunktSentenceTokenizer. """

# 2005 speech trains the sentence tokenizer; 2006 speech is the text whose
# named entities are extracted.
train_text, sample_text = map(
    state_union.raw, ("2005-GWBush.txt", "2006-GWBush.txt")
)

# Train the tokenizer, then cut the sample speech into sentences.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    """Run named-entity chunking on every sentence in ``tokenized``.

    Each sentence from the module-level ``tokenized`` sequence is
    word-tokenized and POS-tagged, passed to ``nltk.ne_chunk`` with
    ``binary=True``, and ``draw()`` is called on the returned tree.

    Any exception ends the loop and its message is printed instead of being
    raised.  Returns ``None``.
    """
    try:
        for sentence in tokenized:
            tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
            nltk.ne_chunk(tagged_tokens, binary=True).draw()
    except Exception as err:
        print(err)


# process_content() draws each tree itself and returns None, so call it
# directly; wrapping the call in print() only appends a stray "None".
process_content()

In [None]:
""" Lemmatisation is the process of reducing words to their lemma (dictionary form). It takes into account the meaning of the word in the sentence."""

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# One tuple per lemmatize() call: (word,) relies on the lemmatizer's default
# part of speech, (word, pos) passes the part of speech explicitly.
lemma_queries = [
    ("cats",),
    ("cacti",),
    ("geese",),
    ("rocks",),
    ("python",),
    ("better", "a"),
    ("best", "a"),
    ("run",),
    ("run", "v"),
]
for query in lemma_queries:
    print(lemmatizer.lemmatize(*query))


In [None]:

""" WordNet is a lexical database of the English language, created at Princeton University, and is included in the NLTK corpora.
It can be used to find meanings of words, synonyms, antonyms and more."""

from nltk.corpus import wordnet

# All synsets (groups of synonymous senses) that WordNet has for "program".
syns = wordnet.synsets("program")
print(syns)

# Look more closely at the first synset.
first_syn = syns[0]
print(first_syn)
print(first_syn.name())
print(first_syn.lemmas())
# Lemma.name is a method, like Synset.name above; without the call
# parentheses this printed the bound-method repr instead of the lemma's name.
print(first_syn.lemmas()[0].name())
print(first_syn.definition())
print(first_syn.examples())
