Unstructured text data, like the contents of a book or a tweet, is both one of the most interesting sources of features and one of the most complex to handle. 
In this chapter, we will cover strategies for transforming text into information-rich features and use some out-of-the-box features (termed embeddings) that have become increasingly ubiquitous in tasks that involve natural language processing (NLP).

In [1]:
import nltk
import spacy
# Fetch the NLTK data packages the later cells rely on:
#   'punkt'                      -> word_tokenize / sent_tokenize
#   'averaged_perceptron_tagger' -> pos_tag (was missing, so tagging cells
#                                   would fail with a resource LookupError)
#   'stopwords'                  -> stopwords.words('english')
# NOTE(review): recent NLTK releases look these up as 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' -- confirm against the installed version.
# nltk.download() reports failure by printing an error and returning False
# rather than raising, so check the output before running the cells below.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Error loading punkt: <urlopen error [Errno -2] Name or
[nltk_data]     service not known>
[nltk_data] Error loading stopwords: <urlopen error [Errno -2] Name or
[nltk_data]     service not known>


False

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
import re
import sys
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import pos_tag
import unicodedata
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import linear_kernel
from transformers import pipeline


In [3]:
# Cleaning Text
# Raw titles carrying stray leading/trailing whitespace and periods.
text_data = [
    "     Interrobang. By Aishwarya Henriette ",
    "Parking And Going. By Karl Gautier",
    "Today Is The night. By Jarek Prakash            ",
]
# Trim the whitespace surrounding every entry.
strip_whitespace = list(map(str.strip, text_data))
strip_whitespace
# Delete every period from the trimmed entries.
remove_periods = [title.replace(".", "") for title in strip_whitespace]
remove_periods
def capitalizer(s: str) -> str:
    """Return ``s`` converted to upper case via ``str.upper``."""
    uppercased = s.upper()
    return uppercased

# Upper-case every cleaned title (the resulting list is not stored).
list(map(capitalizer, remove_periods))

def replace_letters(s: str) -> str:
    """Return ``s`` with each ASCII letter (a-z, A-Z) replaced by ``'X'``.

    Every other character (digits, spaces, punctuation, non-ASCII letters)
    is left untouched.
    """
    ascii_letter = re.compile(r'[a-zA-Z]')
    return ascii_letter.sub('X', s)

# Mask the letters of every cleaned title; as the cell's final expression
# this list is what the notebook displays.
list(map(replace_letters, remove_periods))

['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX']

In [None]:
# Parsing and Cleaning HTML
html = "<div class='full_name'><span style='font-weight:bold'>Masego</span> Azra</div>"
# Name the parser explicitly. Without one, Beautiful Soup picks whichever
# parser happens to be installed and emits a warning, so the parse tree can
# differ between machines. "html.parser" ships with the standard library.
soup = BeautifulSoup(html, "html.parser")
# Text content of the <div class="full_name"> element, with markup stripped.
soup.find("div", {"class": "full_name"}).text

In [None]:
# Removing Punctuation
text_data = ['Hi!!!! I. Love. This. Song....','10000% Agree!!!! #LoveIT','Right?!?!']
# Translation table mapping every Unicode code point whose general category
# starts with 'P' (punctuation) to None; str.translate deletes characters
# that map to None.
# range() excludes its stop value, so "+ 1" is required for the scan to
# cover the last valid code point (sys.maxunicode) as well.
punctuation = dict.fromkeys(
    (
        code_point
        for code_point in range(sys.maxunicode + 1)
        if unicodedata.category(chr(code_point)).startswith('P')
    ),
    None,
)

# Strip all punctuation characters from every string.
[s.translate(punctuation) for s in text_data]

In [None]:
# Tokenizing Text
# `string` is reused by the part-of-speech tagging cell further down.
string = "The science of today is the technology of tomorrow"
# Word-level tokens (the result is not stored; only a cell's last
# expression is displayed).
word_tokenize(string)

# Sentence-level tokens of the same text.
sent_tokenize(string)

In [None]:
# Removing StopWords
tokenized_words = ['i','am','going','to','go','to','the','store','and','park']
stop_words = stopwords.words('english')
# `stop_words` is a list, so each `in` test scans it linearly. Build a set
# once so every membership check in the filter below is a hash lookup.
stop_word_set = set(stop_words)
# Keep only the tokens that are not English stop words.
[w for w in tokenized_words if w not in stop_word_set]

In [None]:
# Stemming Words
tokenized_words = ['i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting']
porter = PorterStemmer()
# Reduce every token to its Porter stem.
list(map(porter.stem, tokenized_words))

In [None]:
# Tagging parts of speech
# Tokenize `string` (defined in the tokenizing cell) and tag each token;
# the filter cell below unpacks each entry as a (word, tag) pair.
text_tagged = pos_tag(word_tokenize(string))
text_tagged

In [None]:
# Filter Words
# Keep only the words whose tag is one of the four noun tags.
[word for word, tag in text_tagged if tag in ('NN', 'NNS', 'NNP', 'NNPS')]

In [None]:
tweets = [
    "I am eating a burrito for breakfast",
    "Political science is an amazing field",
    "San Francisco is an awesome city",
]

# One list of part-of-speech tags per tweet; the words themselves are
# dropped because only the tags are encoded below. Uses the directly
# imported `pos_tag`, matching the tagging cell earlier in the notebook.
tagged_tweets = [
    [tag for _word, tag in pos_tag(word_tokenize(tweet))]
    for tweet in tweets
]

# Multi-label one-hot encoding: one row per tweet, one column per tag.
one_hot_multi = MultiLabelBinarizer()
one_hot_multi.fit_transform(tagged_tweets)
# The tag that each encoded column stands for.
one_hot_multi.classes_

In [None]:
# Performing named-entity recognition
nlp = spacy.load('en_core_web_sm')
doc = nlp("Elon Musk offered to buy Twitter using $21B of his own money.")
# All entities found in the sentence, then one "text, label" line per entity.
print(doc.ents)
for ent in doc.ents:
    print(f"{ent.text}, {ent.label_}")

In [4]:
# Encoding text as a bag of words
# `text_data` is reused by the n-gram, tf-idf and similarity cells below.
text_data = np.array(['I love Brazil. Brazil!','Sweden is best', 'Germany beats both'])
# Create the bag of words feature matrix
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)
# Show feature matrix
bag_of_words.toarray()

# Vocabulary terms learned by the vectorizer (shown as the cell output).
count.get_feature_names_out()

array(['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love',
       'sweden'], dtype=object)

In [5]:
# Count unigrams and bigrams with English stop words removed, but restrict
# the vocabulary to the single term 'brazil'.
count_2gram = CountVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    vocabulary=['brazil'],
)
bag = count_2gram.fit_transform(text_data)
# View feature matrix
bag.toarray()

# View the vocabulary; because it was fixed above, it holds only 'brazil'
# and no 2-grams appear.
count_2gram.vocabulary_

{'brazil': 0}

In [7]:
# Weighting word importance
# `tfidf` and `feature_matrix` are reused by the similarity-search cell.
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)
# Dense view of the weighted matrix (not displayed; only the last
# expression of a cell is shown).
feature_matrix.toarray()
# Mapping from each term to its column index in the feature matrix.
tfidf.vocabulary_

{'love': 6,
 'brazil': 3,
 'sweden': 7,
 'is': 5,
 'best': 1,
 'germany': 4,
 'beats': 0,
 'both': 2}

In [10]:
# Using Text Vectors to Calculate Text Similarity in a Search Query
query = 'Brazil'
# Encode the query with the vectorizer that was fitted on text_data.
vector = tfidf.transform([query])
# Linear kernel (dot product) between the query and every document row,
# flattened to one score per document.
cosine_similarities = linear_kernel(vector, feature_matrix).flatten()
# Document indices from highest to lowest score, keeping at most nine of
# them (the same selection as the slice [:-10:-1]).
related_doc_indicies = cosine_similarities.argsort()[::-1][:9]
print([(text_data[idx], cosine_similarities[idx]) for idx in related_doc_indicies])

[(np.str_('I love Brazil. Brazil!'), np.float64(0.8944271909999159)), (np.str_('Germany beats both'), np.float64(0.0)), (np.str_('Sweden is best'), np.float64(0.0))]


In [None]:
# Sentiment Analysis Classifier
# Name the checkpoint explicitly instead of relying on the task's implicit
# default, so results stay reproducible and the "no model was supplied"
# warning goes away.
# NOTE(review): this is the checkpoint the pipeline reports as its default
# for 'sentiment-analysis' -- confirm against the installed transformers
# version.
classifier = pipeline(
    'sentiment-analysis',
    model='distilbert/distilbert-base-uncased-finetuned-sst-2-english',
)
# Each call returns the predicted label and score for the given sentence.
sentiment_1 = classifier("I hate machine learning! It's the absolute worst.")
sentiment_2 = classifier("Machine learning is the absolute bees knees I love it so much!")
print(sentiment_1, sentiment_2)