# TextTiling on a mixture of Wikipedia paragraphs

* Test with a mixture of Wikipedia paragraphs on different topics: Life, Death, Purpose, Technology, Science

In [57]:
# import libraries
import nltk
from nltk.corpus import brown
from nltk.tokenize import TextTilingTokenizer 
import numpy as np
import time

* Note:
    * `np-tl` (noun, proper, title): tags a `proper noun` that appears in a `title` header

In [45]:
# Sample corpus: five Wikipedia excerpts, one per topic. Each literal is split
# per sentence (adjacent string literals are concatenated by Python), and every
# excerpt except the last ends with a blank-line paragraph separator ("\n\n").

# Topic 1: life
text_01 = (
    "Life is a characteristic that distinguishes physical entities that have biological processes, such as signaling and self-sustaining processes, from those that do not, either because such functions have ceased (they have died), or because they never had such functions and are classified as inanimate. "
    "Various forms of life exist, such as plants, animals, fungi, protists, archaea, and bacteria. "
    "Biology is the science concerned with the study of life.\n\n"
)

# Topic 2: death
text_02 = (
    "Death is the permanent cessation of all biological functions that sustain a living organism.[1] "
    "The remains of a living organism begin to decompose shortly after death.[2] "
    "It is an inevitable process eventually occurring in all living organisms. "
    "As of the early 21st century, over 150,000 humans die each day.[3][4]. "
    "Many cultures and religions have the idea of an afterlife, and also hold the idea of judgement and reward for good deeds or punishment for sin.\n\n"
)

# Topic 3: intention / folk psychology
text_03 = (
    "Folk psychology explains human behavior on the basis of mental states, including beliefs, desires, and intentions.[2][3] "
    "Mental mechanisms, including intention, explain behavior in that individuals are seen as actors who have desires and who attempt to achieve goals that are directed by beliefs.[4] "
    "Thus, an intentional action is a function to accomplish a desired goal and is based on the belief that the course of action will satisfy a desire.\n\n"
)

# Topic 4: technology
text_04 = (
    "Technology is the sum of techniques, skills, methods, and processes used in the production of goods or services or in the accomplishment of objectives, such as scientific investigation. "
    "Technology can be the knowledge of techniques, processes, and the like, or it can be embedded in machines to allow for operation without detailed knowledge of their workings. "
    "Systems (e.g. machines) applying technology by taking an input, changing it according to the system's use, and then producing an outcome are referred to as technology systems or technological systems.\n\n"
)

# Topic 5: history of science (last excerpt, so no trailing separator)
text_05 = (
    "The earliest roots of science can be traced to Ancient Egypt and Mesopotamia in around 3500 to 3000 BCE.[5][6] "
    "Their contributions to mathematics, astronomy, and medicine entered and shaped Greek natural philosophy of classical antiquity, whereby formal attempts were made to provide explanations of events in the physical world based on natural causes.[5][6] "
    "After the fall of the Western Roman Empire, knowledge of Greek conceptions of the world deteriorated in Western Europe during the early centuries (400 to 1000 CE) of the Middle Ages[7] but was preserved in the Muslim world during the Islamic Golden Age.[8] "
    "The recovery and assimilation of Greek works and Islamic inquiries into Western Europe from the 10th to 13th century revived natural philosophy,[7][9] which was later transformed by the Scientific Revolution that began in the 16th century[10] as new ideas and discoveries departed from previous Greek conceptions and traditions."
)

# Full document handed to the tokenizer: the five excerpts in order.
text = "".join([text_01, text_02, text_03, text_04, text_05])

In [46]:
# Display the concatenated sample text. print() renders the "\n\n" separators
# embedded in the excerpts as blank lines between paragraphs.
print(text)

Life is a characteristic that distinguishes physical entities that have biological processes, such as signaling and self-sustaining processes, from those that do not, either because such functions have ceased (they have died), or because they never had such functions and are classified as inanimate. Various forms of life exist, such as plants, animals, fungi, protists, archaea, and bacteria. Biology is the science concerned with the study of life.

Death is the permanent cessation of all biological functions that sustain a living organism.[1] The remains of a living organism begin to decompose shortly after death.[2] It is an inevitable process eventually occurring in all living organisms. As of the early 21st century, over 150,000 humans die each day.[3][4]. Many cultures and religions have the idea of an afterlife, and also hold the idea of judgement and reward for good deeds or punishment for sin.

Folk psychology explains human behavior on the basis of mental states, including beli

In [47]:
# TextTiling hyper-parameters and the tokenizer used for plain segmentation.
NB_WORDS_IN_PSEUDOSENT = 20   # w: number of words per pseudo-sentence
NB_PSEUDOSENT_IN_BLOCKS = 10  # k: number of pseudo-sentences per block
BLOCK_COMPARISON = 0          # similarity-method id: block comparison
DEMO_MODE = False             # off: tokenize() yields only the text segments

tt = TextTilingTokenizer(
    w=NB_WORDS_IN_PSEUDOSENT,
    k=NB_PSEUDOSENT_IN_BLOCKS,
    similarity_method=BLOCK_COMPARISON,
    demo_mode=DEMO_MODE,
)

In [48]:
# Segment the full text with the non-demo tokenizer; the result is iterated
# segment by segment when it is displayed.
Segmented_text = tt.tokenize(text)

In [49]:
# Display the topic segments produced by TextTiling.
# `Segmented_text` is reused from the preceding cell (tt.tokenize(text)) rather
# than tokenizing the same, unchanged text a second time.
for segment_no, segment in enumerate(Segmented_text, start=1):
    # Numbered separator line (1-based) printed ahead of each segment.
    print("\n", segment_no, "==================")
    print(segment)


Life is a characteristic that distinguishes physical entities that have biological processes, such as signaling and self-sustaining processes, from those that do not, either because such functions have ceased (they have died), or because they never had such functions and are classified as inanimate. Various forms of life exist, such as plants, animals, fungi, protists, archaea, and bacteria. Biology is the science concerned with the study of life.



Death is the permanent cessation of all biological functions that sustain a living organism.[1] The remains of a living organism begin to decompose shortly after death.[2] It is an inevitable process eventually occurring in all living organisms. As of the early 21st century, over 150,000 humans die each day.[3][4]. Many cultures and religions have the idea of an afterlife, and also hold the idea of judgement and reward for good deeds or punishment for sin.

Folk psychology explains human behavior on the basis of mental states, including b

In [50]:
# Second tokenizer with the same w / k / similarity settings but demo mode on,
# so that tokenize() also returns the intermediate scores and the boundaries.
DEMO_MODE = True
tt_demo = TextTilingTokenizer(
    w=NB_WORDS_IN_PSEUDOSENT,
    k=NB_PSEUDOSENT_IN_BLOCKS,
    similarity_method=BLOCK_COMPARISON,
    demo_mode=DEMO_MODE,
)

In [51]:
# Run the demo-mode tokenizer on the text and time the call.
# time.perf_counter() is used instead of time.time(): it is monotonic and has
# the highest available resolution, which matters for a call this short.
tic = time.perf_counter()
# In demo mode tokenize() returns four values; only `boundaries` is used below.
# NOTE(review): a, b, c are presumably the gap, smoothed and depth scores —
# confirm against the NLTK TextTilingTokenizer documentation.
a, b, c, boundaries = tt_demo.tokenize(text)
toc = time.perf_counter()
print('(textTiling duration)', toc - tic, 'sec')

(textTiling duration) 0.038736820220947266 sec


In [52]:
# Display the boundary flags returned by the demo-mode tokenizer.
# NOTE(review): presumably one flag per gap between consecutive
# pseudo-sentences, with 1 marking a topic boundary — confirm against NLTK docs.
print(boundaries)

[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]


# Explain

* The text is first split into same-size pseudo-sentences of `w` words each; `k` consecutive pseudo-sentences are then grouped into a block.
> Example: a text sample of `n`=1,000 words yields `n`/`w` = 50 pseudo-sentences, i.e. `5` (`n`/(`k`\*`w`)) blocks of `k`=10 pseudo-sentences \* `w`=20 words

In [53]:
# Display example pseudo-sentences: the first two and the last one.
# The text is split once and the result reused, instead of re-running the
# split for every print statement.
# NOTE(review): this calls a private NLTK helper on the raw text. tokenize()
# may preprocess the text before splitting, so the count printed here can
# differ from the number of gaps reported in `boundaries` — confirm.
pseudo_sentences = tt._divide_to_tokensequences(text)
print("Nb of pseudo-sentences:", len(pseudo_sentences))

# Each pseudo-sentence exposes its (word, character offset) pairs.
for pseudo_sentence in pseudo_sentences[:2]:
    print("\n", pseudo_sentence.wrdindex_list)
    print("Nb of words (w):", len(pseudo_sentence.wrdindex_list))

print("\n....\n")

# The last pseudo-sentence holds the leftover words, so it can be shorter
# than w.
last_pseudo_sentence = pseudo_sentences[-1]
print(last_pseudo_sentence.wrdindex_list)
print("Nb of words (w):", len(last_pseudo_sentence.wrdindex_list))

Nb of pseudo-sentences: 24

 [('Life', 0), ('is', 5), ('a', 8), ('characteristic', 10), ('that', 25), ('distinguishes', 30), ('physical', 44), ('entities', 53), ('that', 62), ('have', 67), ('biological', 72), ('processes', 83), ('such', 94), ('as', 99), ('signaling', 102), ('and', 112), ('self', 116), ('sustaining', 121), ('processes', 132), ('from', 143)]
Nb of words (w): 20

 [('those', 148), ('that', 154), ('do', 159), ('not', 162), ('either', 167), ('because', 174), ('such', 182), ('functions', 187), ('have', 197), ('ceased', 202), ('they', 210), ('have', 215), ('died', 220), ('or', 227), ('because', 230), ('they', 238), ('never', 243), ('had', 249), ('such', 253), ('functions', 258)]
Nb of words (w): 20

....

[('conceptions', 2844), ('and', 2856), ('traditions', 2860)]
Nb of words (w): 3


In [54]:
# Display the pseudo-sentences that make up the first block, i.e. the first
# NB_PSEUDOSENT_IN_BLOCKS pseudo-sentences of the text.
first_block = tt._divide_to_tokensequences(text)[:NB_PSEUDOSENT_IN_BLOCKS]
# A plain loop replaces the list comprehension that was used only for its
# print() side effect and left a "[None, None, ...]" value as the cell output.
for ix, pseudo_sentence in enumerate(first_block):
    print('\n', ix, pseudo_sentence.wrdindex_list)


 0 [('Life', 0), ('is', 5), ('a', 8), ('characteristic', 10), ('that', 25), ('distinguishes', 30), ('physical', 44), ('entities', 53), ('that', 62), ('have', 67), ('biological', 72), ('processes', 83), ('such', 94), ('as', 99), ('signaling', 102), ('and', 112), ('self', 116), ('sustaining', 121), ('processes', 132), ('from', 143)]

 1 [('those', 148), ('that', 154), ('do', 159), ('not', 162), ('either', 167), ('because', 174), ('such', 182), ('functions', 187), ('have', 197), ('ceased', 202), ('they', 210), ('have', 215), ('died', 220), ('or', 227), ('because', 230), ('they', 238), ('never', 243), ('had', 249), ('such', 253), ('functions', 258)]

 2 [('and', 268), ('are', 272), ('classified', 276), ('as', 287), ('inanimate', 290), ('Various', 301), ('forms', 309), ('of', 315), ('life', 318), ('exist', 323), ('such', 330), ('as', 335), ('plants', 338), ('animals', 346), ('fungi', 355), ('protists', 362), ('archaea', 372), ('and', 381), ('bacteria', 385), ('Biology', 395)]

 3 [('is', 4

[None, None, None, None, None, None, None, None, None, None]

In [55]:
# Display the pseudo-sentences that make up the first block, i.e. the first
# NB_PSEUDOSENT_IN_BLOCKS pseudo-sentences of the text.
# NOTE(review): this cell repeats the previous cell's display verbatim;
# consider removing one of the two.
first_block = tt._divide_to_tokensequences(text)[:NB_PSEUDOSENT_IN_BLOCKS]
# A plain loop is used so the cell produces only the printed lines and no
# "[None, None, ...]" output value.
for ix, pseudo_sentence in enumerate(first_block):
    print('\n', ix, pseudo_sentence.wrdindex_list)


 0 [('Life', 0), ('is', 5), ('a', 8), ('characteristic', 10), ('that', 25), ('distinguishes', 30), ('physical', 44), ('entities', 53), ('that', 62), ('have', 67), ('biological', 72), ('processes', 83), ('such', 94), ('as', 99), ('signaling', 102), ('and', 112), ('self', 116), ('sustaining', 121), ('processes', 132), ('from', 143)]

 1 [('those', 148), ('that', 154), ('do', 159), ('not', 162), ('either', 167), ('because', 174), ('such', 182), ('functions', 187), ('have', 197), ('ceased', 202), ('they', 210), ('have', 215), ('died', 220), ('or', 227), ('because', 230), ('they', 238), ('never', 243), ('had', 249), ('such', 253), ('functions', 258)]

 2 [('and', 268), ('are', 272), ('classified', 276), ('as', 287), ('inanimate', 290), ('Various', 301), ('forms', 309), ('of', 315), ('life', 318), ('exist', 323), ('such', 330), ('as', 335), ('plants', 338), ('animals', 346), ('fungi', 355), ('protists', 362), ('archaea', 372), ('and', 381), ('bacteria', 385), ('Biology', 395)]

 3 [('is', 4

[None, None, None, None, None, None, None, None, None, None]

## References
https://www.nltk.org/book/ch05.html    
    see 2.7 Unsimplified Tags  
https://www.cl.cam.ac.uk/teaching/1011/L104/lec10-2x2.pdf  