In [1]:
import spacy

In [2]:
# Load the `en_core_web_sm` pipeline; the resulting `nlp` object is called on text in the cells that follow.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Sample text made of three short sentences, processed into `doc`.
doc = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [4]:
# Iterate over the sentences detected in `doc` and print each one.
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [5]:
# Collect the sentences into a list so they can be accessed by position.
# NOTE(review): the capitalised name `List` reads like a type (cf. typing.List);
# a lowercase name such as `sentences` would be clearer. Renaming requires
# updating the cell that indexes it as well.
List = list(doc.sents)

In [6]:
# First sentence of the sample text (displayed as the cell's output).
List[0]

This is the first sentence.

In [7]:
# A quote that contains a semicolon, used below to explore where sentence boundaries fall.
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." - Peter drucker')

In [8]:
# Show the raw text of the processed document.
doc.text

'"Management is doing the right things; leadership is doing the right things." - Peter drucker'

In [10]:
# Print each detected sentence; the extra print('\n') adds blank lines
# so the boundaries between sentences are easy to see.
for sent in doc.sents:
    print(sent)
    print('\n')

"Management is doing the right things; leadership is doing the right things."


- Peter drucker




In [14]:
# Add a segmentation rule
def set_custom_boundaries(doc):
    """Force a sentence break after every semicolon in ``doc``.

    Each token whose text is exactly ';' causes the token right after it
    to be flagged with ``is_sent_start = True``. The last token is never
    examined, because no token follows it that could be flagged.

    Args:
        doc: the token sequence to annotate; its tokens are modified in place.

    Returns:
        The same ``doc`` object that was passed in.
    """
    all_but_last = doc[:-1]
    for tok in all_but_last:
        if tok.text != ';':
            continue
        # tok.i is the token's index, so tok.i + 1 addresses its successor.
        successor = doc[tok.i + 1]
        successor.is_sent_start = True
    return doc

In [15]:
# Register the custom rule in the pipeline ahead of the parser (before='parser').
# NOTE(review): passing a bare function to add_pipe matches the spaCy v2 API;
# spaCy v3 expects a registered component name — confirm the installed version.
nlp.add_pipe(set_custom_boundaries, before='parser')
# Display the component order to confirm the insertion point.
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [16]:
# Process the same quote again, this time through the pipeline that includes set_custom_boundaries.
doc4 = nlp(u'"Management is doing the right things; leadership is doing the right things." - Peter drucker')

In [17]:
# With the custom rule in place, the text after the semicolon is printed as its own sentence.
for sent in doc4.sents:
    print(sent)

"Management is doing the right things;
leadership is doing the right things."
- Peter drucker


In [12]:
# NOTE(review): this cell's execution count (12) is lower than that of the cell
# defining the function (14), so the saved output was probably produced by an
# earlier version of set_custom_boundaries — TODO re-run from a fresh kernel or
# remove this cell.
set_custom_boundaries(doc)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18


In [13]:
# Everything except the last token — the same slice set_custom_boundaries loops over.
doc[:-1]

"Management is doing the right things; leadership is doing the right things." - Peter

In [18]:
# Change the segmentation rules: replace the default sentence splitting with a custom strategy

In [19]:
# Load the pipeline again so `nlp` no longer carries the set_custom_boundaries component added above.
nlp = spacy.load('en_core_web_sm')

In [20]:
# Sample text: two sentences on one line, then a blank line (\n\n), then a third sentence.
mystring = u"This is a sentence. This is another.\n\nThis is a third sentence."

In [21]:
# Print the string so the line breaks are rendered.
print(mystring)

This is a sentence. This is another.

This is a third sentence.


In [22]:
# Default behavior: process the text with the unmodified pipeline.
doc = nlp(mystring)

In [23]:
# The default segmentation reports three sentences, splitting the first line in two.
for sent in doc.sents:
    print(sent)

This is a sentence.
This is another.


This is a third sentence.


In [24]:
from spacy.pipeline import SentenceSegmenter

In [25]:
def split_on_newlines(doc):
    """Yield consecutive slices of ``doc``, breaking after newline tokens.

    A token counts as a newline token when its text starts with '\n'.
    The break is made at the token that follows it, so the newline token
    stays at the end of the slice it closes. Whatever remains after the
    last break is always yielded as the final slice.

    Args:
        doc: the token sequence to split; tokens expose ``text`` and ``i``.

    Yields:
        Slices of ``doc`` that together cover every token, in order.
    """
    span_begin = 0
    pending_break = False

    for tok in doc:
        if not pending_break:
            # Remember a newline token; the split happens on the next token.
            pending_break = tok.text.startswith('\n')
            continue
        # The previous token was a newline: close the current slice here.
        yield doc[span_begin:tok.i]
        span_begin = tok.i  # tok.i is the token index and becomes the new start
        pending_break = False

    yield doc[span_begin:]

In [27]:
# Build a sentence-segmentation component that delegates to split_on_newlines via `strategy`.
# NOTE(review): SentenceSegmenter belongs to the spaCy v2 API and is not
# available in v3 — confirm the installed version.
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)

In [28]:
# Add the segmenter to the pipeline (no before/after position is given).
nlp.add_pipe(sbd)

In [29]:
# Re-process the sample text with the newline-based segmenter installed.
doc = nlp(mystring)

In [30]:
# Sentences are now split only at the blank line, so the first two sentences stay together.
for sentence in doc.sents:
    print(sentence)

This is a sentence. This is another.


This is a third sentence.
