In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [3]:
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [4]:
sents = list(doc.sents)
sents[0]

This is the first sentence.

## Adding new rules

In [5]:
doc = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

In [6]:
doc

"Management is doing things right; leadership is doing the right things." -Peter Drucker

In [7]:
for sent in doc.sents:
    print(sent)

"Management is doing things right; leadership is doing the right things."
-Peter Drucker


## 1.  Add a segmentation rules.
## 2.  Change segmentation rules.

## Add a segmentation rules

In [8]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [9]:
for token in doc:
    print(f"{token.i:<{5}} {token}")

0     "
1     Management
2     is
3     doing
4     things
5     right
6     ;
7     leadership
8     is
9     doing
10    the
11    right
12    things
13    .
14    "
15    -Peter
16    Drucker


In [10]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

In [11]:
from spacy.language import Language

@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    for token in doc[:-1]:         # Exclude last token (i+1)
        if token.text == ';':
            doc[token.i+1].is_sent_start = True # after ';' separate sentence
    return doc

In [12]:
nlp.add_pipe("set_custom_boundaries", before='parser')

<function __main__.set_custom_boundaries(doc)>

In [13]:
nlp.pipe_names

['tok2vec',
 'tagger',
 'set_custom_boundaries',
 'parser',
 'ner',
 'attribute_ruler',
 'lemmatizer']

In [14]:
doc4 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

In [15]:
for sent in doc4.sents:
    print(sent)

"Management is doing things right;
leadership is doing the right things."
-Peter Drucker


## Change segmentation rules

In [24]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [25]:
from spacy.language import Language

@Language.component("split_on_newlines")
def split_on_newlines(doc):
    for i, token in enumerate(doc[:-2]):
        # Define sentence start if pipe + titlecase token
        if token.text == '\n':
            doc[i + 1].is_sent_start = True
        else:
            # Explicitly set sentence start to False otherwise, to tell
            # the parser to leave those tokens alone
            doc[i + 1].is_sent_start = False
    return doc

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("split_on_newlines", before="parser")  # Insert before the parser

<function __main__.split_on_newlines(doc)>

In [26]:
mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."
doc = nlp(mystring)
for sent in doc.sents:
    print(sent.text)

This is a sentence. This is another.

This is a 

third sentence.
