# Basic text cleaning and analysis 

Author: Dr. Hickman  

```
#DEPENDENCIES

#FROM COMMAND LINE
conda install -c anaconda nltk

#FROM INSIDE PYTHON RUN
import nltk; 
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download([
    "names",
    "stopwords",
    "state_union",
    "twitter_samples",
    "movie_reviews",
    "averaged_perceptron_tagger",
    "vader_lexicon",
    "punkt",
])
```

In [19]:
# # UN-COMMENT AND RUN THIS CELL THE FIRST TIME YOU USE THE NOTEBOOK
# import nltk; 
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download([
#     "names",
#     "stopwords",
#     "state_union",
#     "twitter_samples",
#     "movie_reviews",
#     "averaged_perceptron_tagger",
#     "vader_lexicon",
#     "punkt",])

### NLTK examples

In [20]:
### IMPORT PACKAGES 
import nltk
from nltk.corpus import wordnet

In [21]:
# Sentence segmentation: split a paragraph into its individual sentences.
print(
    "-----------------------------",
    "EXAMPLE: SENTENCE SEGMENTATION",
    "-----------------------------",
    sep="\n",
)
text = "The universe is Great! I won the lottery today and I'm very happy. What is the best way to break sentences into chunks?"
print(text)
# The result is a list with one string per sentence (three for this text).
sentences = nltk.tokenize.sent_tokenize(text)
print(sentences)


-----------------------------
EXAMPLE: SENTENCE SEGMENTATION
-----------------------------
The universe is Great! I won the lottery today and I'm very happy. What is the best way to break sentences into chunks?
['The universe is Great!', "I won the lottery today and I'm very happy.", 'What is the best way to break sentences into chunks?']


In [22]:
# Sentiment scoring with NLTK's SentimentIntensityAnalyzer.
# NOTE: the banner text is reproduced exactly as originally written so that
# it keeps matching previously recorded output.
print(
    "-----------------------------",
    "EXAMPLE: VADAR SENTIMENT COMPUTE",
    "-----------------------------",
    sep="\n",
)
from nltk.sentiment import SentimentIntensityAnalyzer

#INITIALIZE
sia = SentimentIntensityAnalyzer()

#STRING-1 AND STRING-2: score each sentence as a whole
for text in (
    "I love NLTK, its a great library",
    "I hate NLTK, its a terrible library",
):
    score = sia.polarity_scores(text)
    print('TEXT:', text)
    print("SCORE:", score)

#WORDS: score each whitespace-separated word of the last sentence on its own
print("---WORDS---")
for word in text.split():
    word_score = sia.polarity_scores(word)
    print(word, word_score)

-----------------------------
EXAMPLE: VADAR SENTIMENT COMPUTE
-----------------------------
TEXT: I love NLTK, its a great library
SCORE: {'neg': 0.0, 'neu': 0.265, 'pos': 0.735, 'compound': 0.8519}
TEXT: I hate NLTK, its a terrible library
SCORE: {'neg': 0.694, 'neu': 0.306, 'pos': 0.0, 'compound': -0.7783}


In [23]:
# Word tokenization: break a multi-line string into word and punctuation tokens.
print(
    "\n-----------------------------",
    "EXAMPLE: WORD TOKENIZE",
    "-----------------------------",
    sep="\n",
)
from nltk.tokenize import word_tokenize

# NOTE(review): the "..." that starts the second line of the sample looks like
# a REPL continuation prompt pasted along with the text; it is part of the
# string and comes out as its own token. Confirm whether that is intended.
s = '''Good muffins cost $3.88\nin New York.  Please buy me
... two of them.\n\nThanks.'''
tokens = word_tokenize(s)
print(tokens)


-----------------------------
EXAMPLE: WORD TOKENIZE
-----------------------------
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', '...', 'two', 'of', 'them', '.', 'Thanks', '.']


In [24]:
# Lemmatization: reduce words to their dictionary form with WordNet.
# NOTE: the banner text is reproduced exactly as originally written.
print(
    "\n-----------------------------",
    "EXAMPLE: LEMITIZATION",
    "-----------------------------",
    sep="\n",
)
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Each entry pairs a word with the extra keyword arguments passed to
# lemmatize(); pos="a" marks the word as an adjective.
examples = (
    ("rocks", {}),
    ("corpora", {}),
    ("better", {"pos": "a"}),
)
for word, options in examples:
    print(word, "-->", lemmatizer.lemmatize(word, **options))



-----------------------------
EXAMPLE: LEMITIZATION
-----------------------------
rocks --> rock
corpora --> corpus
better --> good


In [25]:
# Stemming: strip suffixes with the Porter stemmer.
print("\n-----------------------------")
print("EXAMPLE: STEMMING")
print("-----------------------------")
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Stem exactly the word shown in each label, so the "rocking" and "rocked"
# rows exercise those inflections instead of re-stemming "rocks".
for word in ("rocks", "rocking", "rocked", "corpora", "better"):
    print(word, "-->", stemmer.stem(word))




-----------------------------
EXAMPLE: STEMMING
-----------------------------
rocks --> rock
rocking --> rock
rocked --> rock
corpora --> corpora
better --> better


In [26]:
# Show the English stop-word list that ships with NLTK's stopwords corpus.
print(
    "\n-----------------------------",
    "EXAMPLE: STOPWORDS",
    "-----------------------------",
    sep="\n",
)
from nltk.corpus import stopwords

english_stopwords = stopwords.words('english')
print(english_stopwords)



-----------------------------
EXAMPLE: STOPWORDS
-----------------------------
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'ot

In [27]:
# Searching text: show every occurrence of a word together with its context.
print("\n-----------------------------")
print("EXAMPLE: SEARCHING TEXT")
print("-----------------------------")
# Imported here so the cell runs without first executing the tokenize cell.
from nltk.tokenize import word_tokenize

text = "line-1: Starbucks has the best coffee \n line-2: This is more text"
text = word_tokenize(text)   # str -> list of tokens
text = nltk.Text(text)       # wrap the tokens to get the search helpers
# concordance() prints the matches itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
text.concordance("has")


-----------------------------
EXAMPLE: SEARCHING TEXT
-----------------------------
Displaying 1 of 1 matches:
line-1 : Starbucks has the best coffee line-2 : This is more
None


In [28]:

# WordNet lookup: collect synonyms, antonyms and hypernyms of one word, then
# report the shortest word among the original and its synonyms.
print(
    "\n-----------------------------",
    "EXAMPLE: SYNONYMS,HYPERNYMS, AND ANTONYMS",
    "-----------------------------",
    sep="\n",
)
original_word = 'testing'
print(original_word)
syns = wordnet.synsets(original_word)
print(syns)

synonyms, antonyms, hypernyms = [], [], []
for synset in syns:
    # A synset name looks like "test.v.01"; keep only the word part.
    hypernyms.extend(
        parent.name().split('.')[0].lower() for parent in synset.hypernyms()
    )
    for lemma in synset.lemmas():
        synonyms.append(lemma.name().lower())
        opposites = lemma.antonyms()
        if opposites:
            # Only the first antonym of each lemma is recorded.
            antonyms.append(opposites[0].name().lower())

# De-duplicate and sort each collection for display.
for collected in (synonyms, antonyms, hypernyms):
    print(sorted(set(collected)))

# NOTE(review): the original word is appended to `hypernyms` after printing and
# the list is not read again in this cell; confirm this is intended.
hypernyms.append(original_word)

# min() returns the first shortest candidate, so the original word wins ties,
# followed by synonyms in alphabetical order.
shortest_word = min([original_word] + sorted(set(synonyms)), key=len)

print("SHORTEST:", shortest_word)


-----------------------------
EXAMPLE: SYNONYMS,HYPERNYMS, AND ANTONYMS
-----------------------------
testing
[Synset('testing.n.01'), Synset('testing.n.02'), Synset('examination.n.05'), Synset('test.v.01'), Synset('screen.v.01'), Synset('quiz.v.01'), Synset('test.v.04'), Synset('test.v.05'), Synset('test.v.06'), Synset('test.v.07')]
['essay', 'examination', 'examine', 'prove', 'quiz', 'screen', 'test', 'testing', 'try', 'try_out']
[]
['be', 'check', 'determine', 'evaluate', 'examination', 'examine', 'experiment', 'investigation', 'score', 'take']
SHORTEST: try


In [29]:

# Text cleaning: drop non-printable characters, then drop English stop words
# and lowercase what remains, keeping basic punctuation attached to the
# preceding word.
print("\n-----------------------------")
print("EXAMPLE: FILTERING CHARACTERS AND STOPWORDS")
print("-----------------------------")

import string 
import nltk 

# WORDS TO REMOVE
# Fetch the list once; the set gives fast membership tests in the loop below.
stop_word_list = nltk.corpus.stopwords.words('english')
stop_words = set(stop_word_list)
print("STOPWORDS = \n", stop_word_list)

# CHAR TO KEEP 
print("\nPRINTABLE = \n", string.printable)
printable_chars = set(string.printable)

text="""
“My dear Professor, surely a sensible person like
yourself can call him by his name? All this You-
Know-Who’ nonsense — for eleven years I have been
trying to persuade people to call him by his proper
name: Voldemort” said Dumbledore, Professor McGonagall flinched, but
Dumbledore, who was unsticking two lemon drops,
seemed not to notice. “It all gets so confusing if we
keep saying You-Know-Who.’ I have never seen any
reason to be frightened of saying Voldemort’s name.”
"""

# FILTER OUT UNWANTED CHAR
# Anything outside string.printable (here: curly quotes and the long dash)
# is removed.
text = "".join(character for character in text if character in printable_chars)
print(text)

# FILTER OUT UNWANTED WORDS
punctuation = {".", ",", "!", "?", ":", ";"}
kept = []
for word in nltk.tokenize.word_tokenize(text):
    lowered = word.lower()
    # The stop-word list is all lowercase, so compare the lowercased token;
    # otherwise capitalised stop words such as "I" or "My" would be kept.
    if lowered in stop_words:
        continue
    if word in punctuation and kept:
        # Attach punctuation to the previous word (no space before it).
        kept[-1] += word
    else:
        kept.append(lowered)
# Every kept piece is followed by a single space, including the last one.
text = "".join(piece + " " for piece in kept)
print(text)



-----------------------------
EXAMPLE: FILTERING CHARACTERS AND STOPWORDS
-----------------------------
STOPWORDS = 
 ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'bo

### Part-of-speech tagging

In [30]:
# Part-of-speech tagging: tokenize a string, then label every token with its tag.
# NOTE: the banner text is reproduced exactly as originally written.
import nltk

print(
    "\n-----------------------------",
    "EXAMPLE: PARTS OF SPEECH TAGGIN (POS)",
    "-----------------------------",
    sep="\n",
)
sentence = "line-1: Starbucks has the best coffee \n line-2: This is more text"
text = nltk.word_tokenize(sentence)
print("Tokenized text:\n", text)

# pos_tag yields one (token, tag) pair per input token.
tagged = nltk.pos_tag(text)
print("Tagged text:\n", tagged)
# The tagged list is printed a second time, this time without a label.
print(tagged)


-----------------------------
EXAMPLE: PARTS OF SPEECH TAGGIN (POS)
-----------------------------
Tokenized text:
 ['line-1', ':', 'Starbucks', 'has', 'the', 'best', 'coffee', 'line-2', ':', 'This', 'is', 'more', 'text']
Tagged text:
 [('line-1', 'NN'), (':', ':'), ('Starbucks', 'NNP'), ('has', 'VBZ'), ('the', 'DT'), ('best', 'JJS'), ('coffee', 'NN'), ('line-2', 'NN'), (':', ':'), ('This', 'DT'), ('is', 'VBZ'), ('more', 'JJR'), ('text', 'JJ')]
[('line-1', 'NN'), (':', ':'), ('Starbucks', 'NNP'), ('has', 'VBZ'), ('the', 'DT'), ('best', 'JJS'), ('coffee', 'NN'), ('line-2', 'NN'), (':', ':'), ('This', 'DT'), ('is', 'VBZ'), ('more', 'JJR'), ('text', 'JJ')]


### Named entity recognition

```
conda install -c conda-forge spacy
python -m spacy download en_core_web_sm
```

In [31]:

#IMPORT: SPACY FOR NAMED ENTITY RECOGNITION (NER)
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

#INITIALIZE NER ENGINE (the small English pipeline installed above)
nlp = en_core_web_sm.load()

text="""
“My dear Professor, surely a sensible person like
yourself can call him by his name? All this You-
Know-Who’ nonsense — for eleven years I have been
trying to persuade people to call him by his proper
name: Voldemort” said Dumbledore, Professor McGonagall flinched, but
Dumbledore, who was unsticking two lemon drops,
seemed not to notice. “It all gets so confusing if we
keep saying You-Know-Who.’ I have never seen any
reason to be frightened of saying Voldemort’s name.”
"""

# RUN NER ON TEXT
doc = nlp(text)
entities = doc.ents
print("doc.ents", entities)

# One line per detected entity: its lowercased text and its label.
for entity in entities:
    print(entity.text.lower(), entity.label_)


doc.ents (eleven years, Dumbledore, McGonagall, two, You-Know-Who, Voldemort)
eleven years DATE
dumbledore ORG
mcgonagall PERSON
two CARDINAL
you-know-who ORG
voldemort PERSON


### Some Python string operations

In [32]:

# Case conversion: print the same sentence in upper case, then in lower case.
print(
    "-----------------------------",
    "EXAMPLE: CHANGE CASE",
    "-----------------------------",
    sep="\n",
)
text = "The universe is Great! I won a lottery."
for convert in (str.upper, str.lower):
    print(convert(text))


-----------------------------
EXAMPLE: CHANGE CASE
-----------------------------
THE UNIVERSE IS GREAT! I WON A LOTTERY.
the universe is great! i won a lottery.


In [33]:
# Keep only printable characters: drop every character of `s` that is not
# listed in string.printable.
print(
    "\n-----------------------------",
    "EXAMPLE: KEEP ONLY PRINTABLE CHAR",
    "-----------------------------",
    sep="\n",
)
s = "some\x00string. with\x15 funny characters"
import string 
print(s)
print("printable = ", string.printable)

# A set makes the per-character membership test cheap.
printable = set(string.printable)
tmp = "".join(symbol for symbol in s if symbol in printable)
print(tmp)


-----------------------------
EXAMPLE: KEEP ONLY PRINTABLE CHAR
-----------------------------
some string. with funny characters
printable =  0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	

somestring. with funny characters


In [34]:

# Remove new lines: break a string into a list of its lines, without the
# newline characters themselves.
print(
    "\n-----------------------------",
    "EXAMPLE: REMOVE NEW LINES",
    "-----------------------------",
    sep="\n",
)
str1 = "\n line-1: Starbucks has the best coffee \n line-2: This is more text"
print(str1)

# The leading "\n" produces an empty first entry in the resulting list.
newstr = str1.splitlines()
print(newstr)



-----------------------------
EXAMPLE: REMOVE NEW LINES
-----------------------------

 line-1: Starbucks has the best coffee 
 line-2: This is more text
['', ' line-1: Starbucks has the best coffee ', ' line-2: This is more text']
