# Most Common Text Processing Tasks in NLP

## Install modules

In [1]:
%%bash 
pip3 -qqq install nltk
pip3 -qqq install spacy
pip3 -qqq install scikit-learn

## Import modules & dataset

In [2]:
import nltk
nltk.download('punkt')
import spacy
from sklearn.datasets import fetch_20newsgroups

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zoumanakeita/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
news_data = fetch_20newsgroups(subset='all')
articles = news_data.data

In [4]:
len(articles)

18846

In [5]:
print(articles[0])

From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!




# Processing Tasks

## Tokenization

In [7]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [8]:
first_article = """ 
I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!
"""

word_tokens = word_tokenize(first_article)
sentence_tokens = sent_tokenize(first_article)

In [9]:
print(word_tokens)

['I', 'am', 'sure', 'some', 'bashers', 'of', 'Pens', 'fans', 'are', 'pretty', 'confused', 'about', 'the', 'lack', 'of', 'any', 'kind', 'of', 'posts', 'about', 'the', 'recent', 'Pens', 'massacre', 'of', 'the', 'Devils', '.', 'Actually', ',', 'I', 'am', 'bit', 'puzzled', 'too', 'and', 'a', 'bit', 'relieved', '.', 'However', ',', 'I', 'am', 'going', 'to', 'put', 'an', 'end', 'to', 'non-PIttsburghers', "'", 'relief', 'with', 'a', 'bit', 'of', 'praise', 'for', 'the', 'Pens', '.', 'Man', ',', 'they', 'are', 'killing', 'those', 'Devils', 'worse', 'than', 'I', 'thought', '.', 'Jagr', 'just', 'showed', 'you', 'why', 'he', 'is', 'much', 'better', 'than', 'his', 'regular', 'season', 'stats', '.', 'He', 'is', 'also', 'a', 'lot', 'fo', 'fun', 'to', 'watch', 'in', 'the', 'playoffs', '.', 'Bowman', 'should', 'let', 'JAgr', 'have', 'a', 'lot', 'of', 'fun', 'in', 'the', 'next', 'couple', 'of', 'games', 'since', 'the', 'Pens', 'are', 'going', 'to', 'beat', 'the', 'pulp', 'out', 'of', 'Jersey', 'anyway',

In [10]:
print(sentence_tokens)

[' \nI am sure some bashers of Pens fans are pretty confused about the lack\nof any kind of posts about the recent Pens massacre of the Devils.', 'Actually,\nI am  bit puzzled too and a bit relieved.', "However, I am going to put an end\nto non-PIttsburghers' relief with a bit of praise for the Pens.", 'Man, they\nare killing those Devils worse than I thought.', 'Jagr just showed you why\nhe is much better than his regular season stats.', 'He is also a lot\nfo fun to watch in the playoffs.', 'Bowman should let JAgr have a lot of\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway.', 'I was very disappointed not to see the Islanders lose the final\nregular season game.', 'PENS RULE!!', '!']


In [11]:
for sent in sentence_tokens:
    print(sent)
    print("\n")

 
I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils.


Actually,
I am  bit puzzled too and a bit relieved.


However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens.


Man, they
are killing those Devils worse than I thought.


Jagr just showed you why
he is much better than his regular season stats.


He is also a lot
fo fun to watch in the playoffs.


Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway.


I was very disappointed not to see the Islanders lose the final
regular season game.


PENS RULE!!


!




## Stop Words Removal

In [12]:
from nltk.corpus import stopwords

In [15]:
# Acquire the stop words
english_stw = stopwords.words("english")

In [16]:
non_stop_words = [word for word in word_tokens if word not in english_stw]
print(non_stop_words)

['I', 'sure', 'bashers', 'Pens', 'fans', 'pretty', 'confused', 'lack', 'kind', 'posts', 'recent', 'Pens', 'massacre', 'Devils', '.', 'Actually', ',', 'I', 'bit', 'puzzled', 'bit', 'relieved', '.', 'However', ',', 'I', 'going', 'put', 'end', 'non-PIttsburghers', "'", 'relief', 'bit', 'praise', 'Pens', '.', 'Man', ',', 'killing', 'Devils', 'worse', 'I', 'thought', '.', 'Jagr', 'showed', 'much', 'better', 'regular', 'season', 'stats', '.', 'He', 'also', 'lot', 'fo', 'fun', 'watch', 'playoffs', '.', 'Bowman', 'let', 'JAgr', 'lot', 'fun', 'next', 'couple', 'games', 'since', 'Pens', 'going', 'beat', 'pulp', 'Jersey', 'anyway', '.', 'I', 'disappointed', 'see', 'Islanders', 'lose', 'final', 'regular', 'season', 'game', '.', 'PENS', 'RULE', '!', '!', '!']


## Remove Punctuations

In [17]:
import string

without_punct = list(filter(lambda word: word not in string.punctuation, non_stop_words))

print(without_punct)

['I', 'sure', 'bashers', 'Pens', 'fans', 'pretty', 'confused', 'lack', 'kind', 'posts', 'recent', 'Pens', 'massacre', 'Devils', 'Actually', 'I', 'bit', 'puzzled', 'bit', 'relieved', 'However', 'I', 'going', 'put', 'end', 'non-PIttsburghers', 'relief', 'bit', 'praise', 'Pens', 'Man', 'killing', 'Devils', 'worse', 'I', 'thought', 'Jagr', 'showed', 'much', 'better', 'regular', 'season', 'stats', 'He', 'also', 'lot', 'fo', 'fun', 'watch', 'playoffs', 'Bowman', 'let', 'JAgr', 'lot', 'fun', 'next', 'couple', 'games', 'since', 'Pens', 'going', 'beat', 'pulp', 'Jersey', 'anyway', 'I', 'disappointed', 'see', 'Islanders', 'lose', 'final', 'regular', 'season', 'game', 'PENS', 'RULE']


## Stemming & Lemmatization

In [18]:
sample_text = """This thing really confuses. 
                 But you confuse me more than what is written here.  
                 So stay away from explaining things you do not understand. 
              """

from nltk.stem import WordNetLemmatizer
# Download wordnet
nltk.download('wordnet')
nltk.download('omw-1.4')

# Instanciate Lemmatizer
my_lemmatizer = WordNetLemmatizer()


from nltk.stem.porter import PorterStemmer

# Create instance of stemmer
my_stemmer = PorterStemmer()


def stem_words(sentence, model=my_stemmer):
  
    for word in sentence.split():
        stem = model.stem(word)
        print("Word: {} ---> : {}".format(word, stem))

def lemmatize_words(sentence, model = my_lemmatizer):

    for word in sentence.split():
        lemma = model.lemmatize(word)
        print("Word: {} ---> : {}".format(word, lemma))

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zoumanakeita/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/zoumanakeita/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [19]:
stem_words(sample_text, model=my_stemmer)

Word: This ---> : thi
Word: thing ---> : thing
Word: really ---> : realli
Word: confuses. ---> : confuses.
Word: But ---> : but
Word: you ---> : you
Word: confuse ---> : confus
Word: me ---> : me
Word: more ---> : more
Word: than ---> : than
Word: what ---> : what
Word: is ---> : is
Word: written ---> : written
Word: here. ---> : here.
Word: So ---> : so
Word: stay ---> : stay
Word: away ---> : away
Word: from ---> : from
Word: explaining ---> : explain
Word: things ---> : thing
Word: you ---> : you
Word: do ---> : do
Word: not ---> : not
Word: understand. ---> : understand.


In [20]:
lemmatize_words(sample_text, model = my_lemmatizer)

Word: This ---> : This
Word: thing ---> : thing
Word: really ---> : really
Word: confuses. ---> : confuses.
Word: But ---> : But
Word: you ---> : you
Word: confuse ---> : confuse
Word: me ---> : me
Word: more ---> : more
Word: than ---> : than
Word: what ---> : what
Word: is ---> : is
Word: written ---> : written
Word: here. ---> : here.
Word: So ---> : So
Word: stay ---> : stay
Word: away ---> : away
Word: from ---> : from
Word: explaining ---> : explaining
Word: things ---> : thing
Word: you ---> : you
Word: do ---> : do
Word: not ---> : not
Word: understand. ---> : understand.


## Part of Speech Tagging

In [21]:
from nltk.tag import pos_tag
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/zoumanakeita/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [22]:
tagged_tokens = pos_tag(without_punct)
print(tagged_tokens)

[('I', 'PRP'), ('sure', 'VBP'), ('bashers', 'NNS'), ('Pens', 'NNPS'), ('fans', 'NNS'), ('pretty', 'RB'), ('confused', 'JJ'), ('lack', 'NN'), ('kind', 'NN'), ('posts', 'VBZ'), ('recent', 'JJ'), ('Pens', 'NNP'), ('massacre', 'NN'), ('Devils', 'NNP'), ('Actually', 'NNP'), ('I', 'PRP'), ('bit', 'VBP'), ('puzzled', 'JJ'), ('bit', 'NN'), ('relieved', 'VBD'), ('However', 'RB'), ('I', 'PRP'), ('going', 'VBG'), ('put', 'JJ'), ('end', 'VB'), ('non-PIttsburghers', 'NNS'), ('relief', 'JJ'), ('bit', 'NN'), ('praise', 'NN'), ('Pens', 'NNP'), ('Man', 'NNP'), ('killing', 'VBG'), ('Devils', 'NNP'), ('worse', 'NN'), ('I', 'PRP'), ('thought', 'VBD'), ('Jagr', 'NNP'), ('showed', 'VBD'), ('much', 'JJ'), ('better', 'JJR'), ('regular', 'JJ'), ('season', 'NN'), ('stats', 'NNS'), ('He', 'PRP'), ('also', 'RB'), ('lot', 'VBD'), ('fo', 'JJ'), ('fun', 'NN'), ('watch', 'NN'), ('playoffs', 'NNS'), ('Bowman', 'NNP'), ('let', 'VBP'), ('JAgr', 'NNP'), ('lot', 'NN'), ('fun', 'NN'), ('next', 'JJ'), ('couple', 'NN'), ('ga