# NLP with TextBlob

### Loading Libraries

In [4]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Sys
import sys

# Warnings
import warnings

# Path
from pathlib import Path

# TextBlob & NLTK
import nltk
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer

# Scikit-Learn
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
%matplotlib inline

In [8]:
np.random.seed(42)

sns.set_style('white')

warnings.filterwarnings('ignore')

In [9]:
# NLTK resources
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/joaquinromero/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Loading BBC Data

In [11]:
path = Path('..', 'data', 'bbc')

files = sorted(list(path.glob('**/*.txt')))

doc_list = []

for i, file in enumerate(files):
    topic = file.parts[-2]
    article = file.read_text(encoding='latin1').split('\n')
    heading = article[0].strip()
    body = ' '.join([l.strip() for l in article[1:]]).strip()
    doc_list.append([topic, heading, body])

In [12]:
docs = pd.DataFrame(doc_list, columns=['topic', 'heading', 'body'])
docs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   topic    0 non-null      object
 1   heading  0 non-null      object
 2   body     0 non-null      object
dtypes: object(3)
memory usage: 132.0+ bytes


### Introduction to TextBlob

#### Selecting Random Article

In [14]:
article = docs.sample(1).squeeze()

In [15]:
print(f'Topic:\t{article.topic.capitalize()}\n\n{article.heading}\n')
print(article.body.strip())

In [16]:
parsed_body = TextBlob(article.body)

### Tokenization

In [17]:
parsed_body.words

### Sentence Boundary Detection

In [18]:
parsed_body.sentences

### Stemming

In [19]:
stemmer = SnowballStemmer('english')

# Stem each word.
[(word, stemmer.stem(word)) for i, word in enumerate(parsed_body.words) 
 if word.lower() != stemmer.stem(parsed_body.words[i])]

### Lemmatization

In [20]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/joaquinromero/nltk_data...


True

In [22]:
[(word, word.lemmatize()) for i, word in enumerate(parsed_body.words) 
 if word != parsed_body.words[i].lemmatize()]

In [23]:
[(word, word.lemmatize(pos='v')) for i, word in enumerate(parsed_body.words) 
 if word != parsed_body.words[i].lemmatize(pos='v')]

### Sentiment & Polarity

In [25]:
parsed_body.sentiment

In [26]:
parsed_body.sentiment_assessments

### Combining Textblob Lemmatization with `CountVectorizer`

In [27]:
def lemmatizer(text):
    words = TextBlob(text.lower()).words
    return [word.lemmatize() for word in words]

In [28]:
vectorizer = CountVectorizer(analyzer=lemmatizer, decode_error='replace')