In [1]:
! pip install spacy==3.4.3
! pip install textblob==0.17.1
! pip install nltk==3.6.5
! pip install pyenchant==3.2.2
! pip install pyspellchecker==0.7.0


Collecting spacy==3.4.3
  Downloading spacy-3.4.3-cp39-cp39-macosx_10_9_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.8-py3-none-any.whl (17 kB)
Collecting pathy>=0.3.5
  Downloading pathy-0.10.0-py3-none-any.whl (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.9/48.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.9-cp39-cp39-macosx_10_9_x86_64.whl (18 kB)
Collecting requests<3.0.0,>=2.13.0
  Using cached requests-2.28.1-py3-none-any.whl (62 kB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.8-cp39-cp39-macosx_10_9_x86_64.whl (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.9/107.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typer<0.8.0,>=0.3.0
  Downloading

In [6]:
! python -m spacy download en_core_web_sm


2023-03-31 21:01:33.864456: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 2.1 MB/s eta 0:00:01
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 3.5.0
    Uninstalling en-core-web-sm-3.5.0:
      Successfully uninstalled en-core-web-sm-3.5.0
Successfully installed en-core-web-sm-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Sentence segmentation

In [4]:
import spacy

# Load the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")

# Run the pipeline over a short sample text.
doc = nlp("Hi!. I like NLP. Do you??")

# `doc.sents` yields the sentences spaCy detected; print one per line.
print(*doc.sents, sep="\n")


Hi!.
I like NLP.
Do you??


In [5]:
from nltk.tokenize import sent_tokenize

# Split a short two-sentence text into a list of sentence strings.
sentences = sent_tokenize(
    "I like it. Did you like it too?"
)
print(sentences)

['I like it.', 'Did you like it too?']


# Word tokenization

In [6]:
from textblob import TextBlob

text = "Hi! I like NLP. Do you?? Do you live in the U.K.?"

# `.words` gives the word tokens of the blob; as the printed result shows,
# punctuation marks are not kept as tokens.
blob = TextBlob(text)
tokens = blob.words
print(tokens)

['Hi', 'I', 'like', 'NLP', 'Do', 'you', 'Do', 'you', 'live', 'in', 'the', 'U.K']


In [7]:
from nltk.tokenize import word_tokenize

text = "Hi! I like NLP. Do you?? Do you live in the U.K.?"

# NLTK returns punctuation marks as separate tokens (see the printed list).
tokens = word_tokenize(text)
print(tokens)

['Hi', '!', 'I', 'like', 'NLP', '.', 'Do', 'you', '?', '?', 'Do', 'you', 'live', 'in', 'the', 'U.K.', '?']


In [8]:
import spacy

# spaCy offers many pre-trained models that you can choose from
nlp = spacy.load("en_core_web_sm")

text = "Hi! I like NLP. Do you?? Do you live in the U.K.?"
doc = nlp(text)

# Iterating over a Doc yields its tokens one by one.
print(list(doc))

[Hi, !, I, like, NLP, ., Do, you, ?, ?, Do, you, live, in, the, U.K., ?]


# Part-of-speech (POS) tagging

In [15]:
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

# Tokenize first: the tagger works on a list of tokens, not on raw text.
tokens = word_tokenize("Can you please buy me an Arizona Ice Tea? It's $0.57.")

# Pair every token with its part-of-speech tag.
pos = pos_tag(tokens)
print(pos)

[('Can', 'MD'), ('you', 'PRP'), ('please', 'VB'), ('buy', 'VB'), ('me', 'PRP'), ('an', 'DT'), ('Arizona', 'NNP'), ('Ice', 'NNP'), ('Tea', 'NNP'), ('?', '.'), ('It', 'PRP'), ("'s", 'VBZ'), ('$', '$'), ('0.57', 'CD'), ('.', '.')]


# N-grams

In [9]:
from textblob import TextBlob

text = "natural language processing"

# Bigrams: each pair of adjacent words in the text.
blob = TextBlob(text)
blob.ngrams(n=2)

[WordList(['natural', 'language']), WordList(['language', 'processing'])]

# Punctuation removal

In [10]:
import re


def remove_punctuation(text):
    r"""Return `text` with every punctuation character removed.

    A character is dropped when it is neither a word character (``\w``) nor
    whitespace (``\s``). The underscore is listed explicitly because ``\w``
    matches it, so ``[^\w\s]`` on its own would leave underscores in place.
    """
    # .sub substitutes all matches with empty string below
    return re.sub(r'[^\w\s]|_', '', text)


text = "Hi. I like NLP, do you?"

punc_cleaned = remove_punctuation(text)
print(punc_cleaned)

Hi I like NLP do you


# URL removal

In [11]:
import re

text = """
    Check it out on https://google.com or www.google.com for more information. Reach out to abc@xyz.com for inquiries.
"""

# Matches either an http(s):// link or a bare www. address, each running up
# to the next whitespace character.
url_pattern = re.compile(r"https?://\S+|www\.\S+")

# Delete every match. The e-mail address matches neither alternative, so it
# is left untouched.
url_cleaned = url_pattern.sub("", text)
print(url_cleaned)


    Check it out on  or  for more information. Reach out to abc@xyz.com for inquiries.



# Emoji removal 

In [12]:
import re

# Characters treated as emoji and stripped from text:
#   U+10000-U+10FFFF  every supplementary-plane character (most emoji are here)
#   U+2600-U+27BF     Miscellaneous Symbols and Dingbats (e.g. U+2764 heavy heart)
#   U+FE0F, U+200D    variation selector-16 and zero-width joiner, which glue
#                     emoji sequences together and would otherwise be left behind
# No re.UNICODE flag is needed: it is already the default for str patterns.
EMOJI_PATTERN = re.compile(r'[\U00010000-\U0010ffff\u2600-\u27bf\ufe0f\u200d]')


def remove_emoji(text):
    """Return `text` with every character matched by EMOJI_PATTERN removed."""
    return EMOJI_PATTERN.sub('', text)


text = "What does 😲 emoji mean?"
emoji_cleaned = remove_emoji(text)
print(emoji_cleaned)
# >> What does  emoji mean?

What does  emoji mean?


# Spelling correction

In [13]:
from spellchecker import SpellChecker

spell = SpellChecker()

# Words to check (all lowercase)
candidates = ['mispell', 'craazy', 'craaaazy']

# Find the words that might be misspelled; the result is a set
misspelled = spell.unknown(candidates)

# Walk the original list rather than the set so the report always comes out
# in the same order: iteration order of a set of strings can differ between
# interpreter runs.
for word in candidates:
    if word not in misspelled:
        continue
    # Get the one `most likely` answer (None when no correction is found)
    print(f"{word} -> {spell.correction(word)}")


mispell -> misspell
craazy -> crazy
craaaazy -> None


In [14]:
from textblob import TextBlob

# Two variants of the same sentence; the second has one more "u" in "suuuree".
samples = (
    "Are yu suuree about your decisiion?",
    "Are yu suuuree about your decisiion?",
)

for data in samples:
    # `correct()` returns the text with TextBlob's spelling fixes applied.
    output = TextBlob(data).correct()
    print(output)


Are you sure about your decision?
Are you suture about your decision?


In [20]:
# if you get errors, try "brew install enchant"
# Don't have homebrew? Visit https://brew.sh/

from enchant.checker import SpellChecker

# Spell checker using the "en_US" dictionary
chkr = SpellChecker("en_US")

# Text to scan for spelling errors
chkr.set_text("This is sme sample txt with erors.")

# Iterating the checker visits each spelling error it finds in the text
for err in chkr:
    suggestions = chkr.suggest(err.word)
    if not suggestions:
        # Nothing to propose for this word; move on to the next error
        continue
    # Report the first (top likely) suggestion
    print("ERROR:", err.word, "Correction:", suggestions[0])

ERROR: sme Correction: same
ERROR: txt Correction: text
ERROR: erors Correction: errors


# Stopwords removal

In [21]:
# Get token from text using word tokenizers
# described in the previous section

import nltk
from nltk import word_tokenize
# The NLTK data used here only needs to be downloaded once, e.g.:
# nltk.download('stopwords')   # stopword lists
# nltk.download('punkt')       # tokenizer models used by word_tokenize
from nltk.corpus import stopwords

sw = stopwords.words('english')
# Set copy of the stopword list: each membership test below is a hash lookup
# instead of a scan over the whole list.
sw_lookup = set(sw)

text = "Hi I like NLP, do you?"

tokens = word_tokenize(text)

# Compare in lowercase so capitalized stopwords (e.g. "I") are removed too,
# while the surviving tokens keep their original case.
stop_cleaned = [
    w for w in tokens if w.lower() not in sw_lookup
]
# instead, you can also lowercase the text before tokenizing,
# unless retaining case is required for your application

print(stop_cleaned)


['Hi', 'like', 'NLP', ',', '?']


# Lowercasing

In [22]:
text = "NATURAL LANGUAGE PROCESSING"

# str.lower returns a new lowercase string; `text` itself is left unchanged.
lower_cleaned = str.lower(text)
print(lower_cleaned)

natural language processing


In [43]:
from nltk.stem import PorterStemmer

tokens = ["cars", "car", "fabric", "fabrication", "computation", "computer"]

st = PorterStemmer()

# Stem every token and join the stems into one space-separated string.
stemmed = " ".join(map(st.stem, tokens))
print(stemmed)

car car fabric fabric comput comput


# Lemmatization

In [44]:
from textblob import Word

tokens = ["fabric", "fabrication", "car", "cars", "computation", "computer"]

# Lemmatize each token with TextBlob's defaults and re-join with spaces.
lemmatized = " ".join(Word(token).lemmatize() for token in tokens)
print(lemmatized)

fabric fabrication car car computation computer


In [45]:
import spacy

nlp = spacy.load('en_core_web_sm')

# Create a Doc object
doc = nlp('the bats saw the cats')

# Collect each token's lemma and join them with single spaces
lemmatized = " ".join(token.lemma_ for token in doc)

print(lemmatized)


the bat see the cat


In [46]:
import nltk
from nltk.stem import WordNetLemmatizer

# Make sure the WordNet data is available locally.
nltk.download('wordnet')

tokens = ["cats", "bats", "computer", "compute"]

wnl = WordNetLemmatizer()

# Lemmatize every token with the default settings and join with spaces.
lemmatized = " ".join(map(wnl.lemmatize, tokens))
print(lemmatized)


cat bat computer compute


[nltk_data] Downloading package wordnet to /Users/jsingh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Example scenario

In [47]:
import re
from nltk.corpus import stopwords
from nltk import word_tokenize

text = "Hi all! I saw there was a big snake at https://xyz.he.com. Come check out the big python snake video!!!!"

stop_words = stopwords.words("english")
# Set copy of the stopword list so each membership test is a hash lookup.
stop_word_lookup = set(stop_words)

# Step 1: strip URLs (http(s):// links and bare www. addresses).
url_cleaned = re.sub(r"https?://\S+|www\.\S+", "", text)

# Step 2: replace every run of characters that are neither ASCII letters nor
# whitespace with a single space, then lowercase. The character class has no
# "+" inside the brackets: there it would be a literal plus sign and would
# exempt "+" characters from removal.
cleaned = re.sub(
    r"[^a-zA-Z\s]+", " ", url_cleaned
).lower()

# Step 3: tokenize the cleaned text.
tokens = word_tokenize(cleaned)

# Step 4: drop stopwords (the text is already lowercase at this point).
stop_removed = [
    word
    for word in tokens
    if word not in stop_word_lookup
]

print(stop_removed)
# >> ['hi', 'saw', 'big', 'snake', 'come', 'check', 'big', 'python', 'snake', 'video']

['hi', 'saw', 'big', 'snake', 'come', 'check', 'big', 'python', 'snake', 'video']
