<a href="https://colab.research.google.com/github/hblacksmith/Clustering/blob/main/WordTokenization_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [56]:
#Construct a DataFrame of bag-of-words vectors
import pandas as pd

sentences = "Thomas Jefferson began building Monticello at the age of 26.\n"
sentences += "Construction was done mostly by local masons and carpenters.\n"
sentences += "He moved into the South Pavillion in 1770.\n"
sentences += "Turning Monticello into a neoclassical masterpiece was Jefferson's obsession.\n"

corpus = {}

for i, sent in enumerate(sentences.split('\n')):
  corpus['sent{}'.format(i)] = dict ((tok, 1) for tok in sent.split())

df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T
df[df.columns[:10]]
#print(df)


Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,the,age,of,26.
sent0,1,1,1,1,1,1,1,1,1,1
sent1,0,0,0,0,0,0,0,0,0,0
sent2,0,0,0,0,0,0,1,0,0,0
sent3,0,0,0,0,1,0,0,0,0,0
sent4,0,0,0,0,0,0,0,0,0,0


In [57]:
# Dot Product - measuring bag of words overlap
df = df.T
df.sent0.dot(df.sent1)


0

In [58]:
df.sent0.dot(df.sent2)

1

In [59]:
df.sent0.dot(df.sent3)

1

In [60]:
# Here is a way to find the word that is shared by sent0 and sent3, the word that gave you that last Dot Product of 1
[(k, v) for (k ,v) in (df.sent0 & df.sent3).items() if v]

[('Monticello', 1)]

In [61]:
#Tokenize the Monticello sentence with a regular expression
import re

sentence = "Thomas Jefferson began building Monticello at the age of 26."

tokens = re.split(r'[-\s.,;!?]+', sentence)
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '']

In [73]:
#Simpler Regular Expression
import re

sentence = "Thomas Jefferson began building Monticello at the age of 26."

pattern = re.compile(r"([-\s\s+.,;!?])+")
tokens = pattern.split(sentence)
[x for x in tokens if x and x not in '- \t\n.,;!?']

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

In [63]:
#NLTK RegexpTokenizer
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+|$[0-9.]+|\S+')
tokenizer.tokenize(sentence)
# this tokenizer is better than the one previous because it ignores whitespace tokens, and separates 
#sentence-ending 
#trailing punctuation from tokens that do not contain any other punctuation characters

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '.']

In [64]:
#Treebank Word Tokenizer for the NLTK package. Separates phrase-terminating punctutation from adjacent
# tokens and retains decimal numbers containing a period as a single token
from nltk.tokenize import TreebankWordTokenizer
sentence = "Monticello wasn't designated as UNCESCO World Heritage Site until 1987."
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(sentence)

['Monticello',
 'was',
 "n't",
 'designated',
 'as',
 'UNCESCO',
 'World',
 'Heritage',
 'Site',
 'until',
 '1987',
 '.']

In [84]:
#Extending Vocabularly with n-grams

# 1gram tokenizer
import re

sentence = "Thomas Jefferson began building Monticello at the age of 26."

pattern = re.compile(r'([- \s.,;!?])+')
tokens = pattern.split(sentence)
[x for x in tokens if x and x not in '- \t\n.,;!?']

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

In [85]:
#2gram tokenizer
from nltk.util import ngrams
list(ngrams(tokens, 2))

[('Thomas', ' '),
 (' ', 'Jefferson'),
 ('Jefferson', ' '),
 (' ', 'began'),
 ('began', ' '),
 (' ', 'building'),
 ('building', ' '),
 (' ', 'Monticello'),
 ('Monticello', ' '),
 (' ', 'at'),
 ('at', ' '),
 (' ', 'the'),
 ('the', ' '),
 (' ', 'age'),
 ('age', ' '),
 (' ', 'of'),
 ('of', ' '),
 (' ', '26'),
 ('26', '.'),
 ('.', '')]

In [71]:
list(ngrams(tokens, 3))

[('Thomas', ' ', 'Jefferson'),
 (' ', 'Jefferson', ' '),
 ('Jefferson', ' ', 'began'),
 (' ', 'began', ' '),
 ('began', ' ', 'building'),
 (' ', 'building', ' '),
 ('building', ' ', 'Monticello'),
 (' ', 'Monticello', ' '),
 ('Monticello', ' ', 'at'),
 (' ', 'at', ' '),
 ('at', ' ', 'the'),
 (' ', 'the', ' '),
 ('the', ' ', 'age'),
 (' ', 'age', ' '),
 ('age', ' ', 'of'),
 (' ', 'of', ' '),
 ('of', ' ', '26'),
 (' ', '26', '.'),
 ('26', '.', '')]

In [87]:
two_grams = list(ngrams(tokens, 2))
[" ". join(x) for x in two_grams]

['Thomas ',
 ' Jefferson',
 'Jefferson ',
 ' began',
 'began ',
 ' building',
 'building ',
 ' Monticello',
 'Monticello ',
 ' at',
 'at ',
 ' the',
 'the ',
 ' age',
 'age ',
 ' of',
 'of ',
 ' 26',
 '26.',
 '.']

In [88]:
#NLTK List of Stopwords
import nltk
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
len(stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


179

In [90]:
stop_words[:7]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours']

In [93]:
[sw for sw in stop_words if len(sw) == 1]

['i', 'a', 's', 't', 'd', 'm', 'o', 'y']

In [94]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words
len(sklearn_stop_words)

318

In [96]:
#Case Folding
tokens = ['House', ' Visitor', 'Centre']
normalized_tokens = [x.lower() for x in tokens]
print(normalized_tokens) 

['house', ' visitor', 'centre']


In [104]:
#Stemming - remove small meaning differencens of pluralisation or possives endings of words, or even various verb forms e.g. "house" (singular), "houses" (plural)
#Stemming can reduce precision score as it may return more irrelevant items along with the relevant ones, creating a higher false-positives rate
def stem(phrase):
  return ' '.join([re.findall('^(.*ss|.*?)(s)?$', word) [0][0].strip("'") for word in phrase.lower().split()])
stem('houses')

'house'

In [106]:
stem("Doctor House's calls")

'doctor house call'

In [110]:
#Porter Stemmer
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
' '.join([stemmer.stem(w).strip("'") for w in "dish washer's washed dishes".split()])' '.join() 

'dish washer wash dish'

In [114]:
from nltk.classify.rte_classify import lemmatize
#Lemmatization - potentially more accurate way to normalise a word than stemming or case normalisation
# it takes into account a word's meaning. A lemmatizer uses a knowledge of base word synonynms and word
# endings to ensure that word that mean similar things are consolidated into a single token
 
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'good'

In [115]:
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize("better", pos="a")

'good'

In [None]:
# Stemmers - faster to compute and require less-complex code and datasets. Can make more errors
# Lemmatizer - do better job retaining as much of information content as possible nased on how the 
# word was used withing the test and its itended meaning. 
# If you can, avoid using stemming or lemmatization unless using limited text that has usages