## 1. Tokenization using NLTK

In [1]:
import nltk
import numpy as np

In [2]:
text_1 = """Societal goals are difficult to determine, but need to be reflected in the code, since 
they are the reason regulations for buildings and facilities exist. Society expects a 
certain performance from buildings and facilities, and demands local codes and their 
enforcement to provide that protection. Such goals need to match what policy makers 
expect. These goals will vary among communities because of specific needs and 
concerns, such as the preservation of an historic part of a community, or perhaps a 
business that employs a majority of the town’s work force. The model codes have 
been relied on by policy makers to reflect these goals, but the model codes have 
focused on protecting life and property of individual buildings to minimize life loss and 
property protection to “acceptable levels.” Desired goals are not always achieved by 
the adoption of model codes. Variations in community social objectives are reflected 
by local amendments. In the performance-based code, objectives, functional 
statements and performance requirements are general in nature and use terms such 
as “reasonable,” “adequate” or “acceptable.”  In the current prescriptive code, there is 
only one value deemed “reasonable;” thus, communities must amend the code to 
reflect their local needs. Justifying amendments is often difficult in a prescriptive code 
environment since there is a single solution versus understanding outcomes tolerated 
by society in events such as earthquakes. Much of the structural provisions in the 
prescriptive building code are somewhat performance-oriented and easily 
accommodate a variety of design approaches and unique building features. In the 
performance code, an environment is being created where “reasonable” is qualified 
by what level of damage is tolerable to a community, based on the type of events 
expected, and use and importance of the building impacted. It is hoped this code will 
create a framework policy makers can use to clearly reflect what society expects in 
the built environment.   """

# Remove line breaks. Nothing is inserted in their place, so words on
# adjacent lines stay separated only because the lines of the literal above
# appear to end with a trailing space.
text_1 = text_1.replace('\n',"")

#### 1.1 Default Sentence Tokenizer

In [3]:
# Default sentence tokenizer: alias nltk.sent_tokenize and apply it to text_1.
default_st = nltk.sent_tokenize
text_1_sentences = default_st(text_1)
# Bare expression so the notebook displays the resulting list of sentences.
text_1_sentences

['Societal goals are difficult to determine, but need to be reflected in the code, since they are the reason regulations for buildings and facilities exist.',
 'Society expects a certain performance from buildings and facilities, and demands local codes and their enforcement to provide that protection.',
 'Such goals need to match what policy makers expect.',
 'These goals will vary among communities because of specific needs and concerns, such as the preservation of an historic part of a community, or perhaps a business that employs a majority of the town’s work force.',
 'The model codes have been relied on by policy makers to reflect these goals, but the model codes have focused on protecting life and property of individual buildings to minimize life loss and property protection to “acceptable levels.” Desired goals are not always achieved by the adoption of model codes.',
 'Variations in community social objectives are reflected by local amendments.',
 'In the performance-based cod

#### 1.2 Default Word Tokenizer

In [4]:
text_2 = """Performance requirements are detailed statements 
that break down the functional statements into measurable terms. This is where the 
link is made to acceptable methods such as the International Building Code. 
"""
# Remove line breaks, then strip leading/trailing whitespace from the result.
text_2 = text_2.replace('\n',"").strip()
# Bare expression so the notebook displays the cleaned string.
text_2

'Performance requirements are detailed statements that break down the functional statements into measurable terms. This is where the link is made to acceptable methods such as the International Building Code.'

In [5]:
# Default word tokenizer: alias nltk.word_tokenize and apply it to text_2.
default_wt = nltk.word_tokenize
words = default_wt(text_2)
# Wrapped in np.array only for a compact display of the token list.
np.array(words)

array(['Performance', 'requirements', 'are', 'detailed', 'statements',
       'that', 'break', 'down', 'the', 'functional', 'statements', 'into',
       'measurable', 'terms', '.', 'This', 'is', 'where', 'the', 'link',
       'is', 'made', 'to', 'acceptable', 'methods', 'such', 'as', 'the',
       'International', 'Building', 'Code', '.'], dtype='<U13')

#### 1.3 TreebankWordTokenizer

In [6]:
# Treebank word tokenizer applied to the whole two-sentence text at once.
treebank_wt = nltk.TreebankWordTokenizer()
words = treebank_wt.tokenize(text_2)
# In the output below only the final period is split off; 'terms.' stays a
# single token because the text was not split into sentences first.
np.array(words)

array(['Performance', 'requirements', 'are', 'detailed', 'statements',
       'that', 'break', 'down', 'the', 'functional', 'statements', 'into',
       'measurable', 'terms.', 'This', 'is', 'where', 'the', 'link', 'is',
       'made', 'to', 'acceptable', 'methods', 'such', 'as', 'the',
       'International', 'Building', 'Code', '.'], dtype='<U13')

#### 1.4 ToktokTokenizer

In [7]:
from nltk.tokenize import ToktokTokenizer
# Toktok word tokenizer applied to the same two-sentence text.
tokenizer = ToktokTokenizer()
words = tokenizer.tokenize(text_2)
# In the output below only the final period is split off; 'terms.' stays a
# single token.
np.array(words)

array(['Performance', 'requirements', 'are', 'detailed', 'statements',
       'that', 'break', 'down', 'the', 'functional', 'statements', 'into',
       'measurable', 'terms.', 'This', 'is', 'where', 'the', 'link', 'is',
       'made', 'to', 'acceptable', 'methods', 'such', 'as', 'the',
       'International', 'Building', 'Code', '.'], dtype='<U13')

## 2. Tokenizer Function using NLTK

In [8]:
def tokenize_text_nltk_1(text):
    """Tokenize ``text`` into sentences, then each sentence into words.

    Returns a list with one entry per sentence; each entry is the list of
    word tokens that ``nltk.word_tokenize`` produces for that sentence.
    """
    # map applies the word tokenizer to every sentence in order; list()
    # materializes the result as a list of token lists.
    return list(map(nltk.word_tokenize, nltk.sent_tokenize(text)))

In [9]:
# dtype=object keeps each sentence's token list as a single array element
# (the two sentences have different token counts, see output below).
np.array(tokenize_text_nltk_1(text_2), dtype = object)

array([list(['Performance', 'requirements', 'are', 'detailed', 'statements', 'that', 'break', 'down', 'the', 'functional', 'statements', 'into', 'measurable', 'terms', '.']),
       list(['This', 'is', 'where', 'the', 'link', 'is', 'made', 'to', 'acceptable', 'methods', 'such', 'as', 'the', 'International', 'Building', 'Code', '.'])],
      dtype=object)

In [10]:
# List-comprehension variant of tokenize_text_nltk_1, using module-level
# aliases for the NLTK sentence and word tokenizers.
default_st = nltk.sent_tokenize
default_wt = nltk.word_tokenize

def tokenize_text_nltk_2(text):
    """Return one list of word tokens per sentence found in ``text``.

    Sentences come from ``default_st`` and words from ``default_wt``.
    """
    return [default_wt(sent) for sent in default_st(text)]

In [11]:
# Run the comprehension-based tokenizer on the two-sentence sample text.
sents = tokenize_text_nltk_2(text_2)
# dtype=object keeps each sentence's token list as a single array element.
np.array(sents, dtype= object)

array([list(['Performance', 'requirements', 'are', 'detailed', 'statements', 'that', 'break', 'down', 'the', 'functional', 'statements', 'into', 'measurable', 'terms', '.']),
       list(['This', 'is', 'where', 'the', 'link', 'is', 'made', 'to', 'acceptable', 'methods', 'such', 'as', 'the', 'International', 'Building', 'Code', '.'])],
      dtype=object)

## 3. Tokenizer Function using spaCy

In [12]:
import spacy
# Load the small English spaCy pipeline once; the cells and functions below
# reuse this module-level `nlp` object. The model package has to be installed
# separately (e.g. `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm')

#### Sentence Tokenization

In [13]:
# Run the spaCy pipeline on text_2; the result exposes the sentences via
# `.sents`, which the next cell uses.
text_spacy = nlp(text_2)

In [14]:
# Collect the sentences into a list so they can be displayed here and reused
# by the following cells.
sents = list(text_spacy.sents)
sents

[Performance requirements are detailed statements that break down the functional statements into measurable terms.,
 This is where the link is made to acceptable methods such as the International Building Code.]

#### Word Tokenization

In [15]:
# Nested comprehension: outer level walks the sentences, inner level collects
# the text of every token in that sentence.
sent_words = [[word.text for word in sent] for sent in sents]
# dtype=object keeps each sentence's token list as a single array element.
np.array(sent_words, dtype=object)

array([list(['Performance', 'requirements', 'are', 'detailed', 'statements', 'that', 'break', 'down', 'the', 'functional', 'statements', 'into', 'measurable', 'terms', '.']),
       list(['This', 'is', 'where', 'the', 'link', 'is', 'made', 'to', 'acceptable', 'methods', 'such', 'as', 'the', 'International', 'Building', 'Code', '.'])],
      dtype=object)

In [16]:
# Same tokens as the nested comprehension in the previous cell, written as
# explicit loops and printed one sentence at a time.
for sent in sents:
    # NOTE: rebinds the global `sent_words` on every pass, replacing the
    # nested list bound in the previous cell with a flat per-sentence list.
    sent_words = []
    for word in sent:
        sent_words.append(word.text)
    print(sent_words)

['Performance', 'requirements', 'are', 'detailed', 'statements', 'that', 'break', 'down', 'the', 'functional', 'statements', 'into', 'measurable', 'terms', '.']
['This', 'is', 'where', 'the', 'link', 'is', 'made', 'to', 'acceptable', 'methods', 'such', 'as', 'the', 'International', 'Building', 'Code', '.']


#### Function

In [17]:
def tokenize_text_spacy_1(text):
    """Split ``text`` into sentences with spaCy and return the words of each.

    Args:
        text: Raw text to run through the module-level ``nlp`` pipeline.

    Returns:
        A NumPy object array built from one list of token strings per
        sentence.
    """
    doc = nlp(text)
    # Iterate the sentence spans directly: wrapping them in an object array
    # first adds nothing and lets NumPy unpack the spans as nested sequences.
    sent_words = [[token.text for token in sent] for sent in doc.sents]
    # NOTE(review): np.array gives a 1-D array of lists when sentences differ
    # in length; equal-length sentences presumably yield a 2-D array of
    # strings instead -- confirm whether callers depend on the shape.
    return np.array(sent_words, dtype=object)

In [18]:
# Try the nested-comprehension version on the two-sentence sample text.
tokenize_text_spacy_1(text_2)

array([list(['Performance', 'requirements', 'are', 'detailed', 'statements', 'that', 'break', 'down', 'the', 'functional', 'statements', 'into', 'measurable', 'terms', '.']),
       list(['This', 'is', 'where', 'the', 'link', 'is', 'made', 'to', 'acceptable', 'methods', 'such', 'as', 'the', 'International', 'Building', 'Code', '.'])],
      dtype=object)

In [19]:
# for loop over sentences with an inner list comprehension
def tokenize_text_spacy_2(text):
    """Return the token strings of ``text`` grouped by sentence.

    The text is run through the module-level ``nlp`` pipeline; the result is
    a NumPy object array built from one list of token strings per sentence.
    """
    doc = nlp(text)
    tokens_by_sentence = []
    for sentence in doc.sents:
        # One list of token texts per sentence.
        tokens_by_sentence.append([token.text for token in sentence])
    return np.array(tokens_by_sentence, dtype=object)

In [20]:
# Try the loop-plus-comprehension version on the two-sentence sample text.
tokenize_text_spacy_2(text_2)

array([list(['Performance', 'requirements', 'are', 'detailed', 'statements', 'that', 'break', 'down', 'the', 'functional', 'statements', 'into', 'measurable', 'terms', '.']),
       list(['This', 'is', 'where', 'the', 'link', 'is', 'made', 'to', 'acceptable', 'methods', 'such', 'as', 'the', 'International', 'Building', 'Code', '.'])],
      dtype=object)

In [21]:
# explicit nested for loops (no list comprehension)
def tokenize_text_spacy_3(text):
    """Return the token strings of ``text`` grouped by sentence.

    Uses two explicit loops: the outer one walks the sentences found by the
    module-level ``nlp`` pipeline, the inner one collects each token's text.
    The result is a NumPy object array built from the per-sentence lists.
    """
    doc = nlp(text)
    tokens_by_sentence = []
    for sentence in doc.sents:
        # Open a fresh bucket for this sentence, then fill it token by token.
        tokens_by_sentence.append([])
        for token in sentence:
            tokens_by_sentence[-1].append(token.text)
    return np.array(tokens_by_sentence, dtype=object)

In [22]:
# Try the nested-loop version on the two-sentence sample text.
tokenize_text_spacy_3(text_2)

array([list(['Performance', 'requirements', 'are', 'detailed', 'statements', 'that', 'break', 'down', 'the', 'functional', 'statements', 'into', 'measurable', 'terms', '.']),
       list(['This', 'is', 'where', 'the', 'link', 'is', 'made', 'to', 'acceptable', 'methods', 'such', 'as', 'the', 'International', 'Building', 'Code', '.'])],
      dtype=object)