<a href="https://colab.research.google.com/github/kunjkinger/machine-learning-projects/blob/nlp/sentence_segmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import spacy

In [2]:
# Load the `en_core_web_sm` pipeline; later cells call `nlp(...)` on the sample text.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Baseline input: three plain sentences, each ended by a period.
doc = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [4]:
# Print every sentence the pipeline detected, one per line.
for sentence in doc.sents:
  print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [5]:
# A quotation with an internal semicolon and a trailing attribution,
# used to see where the default pipeline places sentence boundaries.
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [6]:
# Display the raw text held by the Doc before looking at its sentence splits.
doc.text

'"Management is doing the right things; leadership is doing the right things." -Peter Drucker'

In [7]:
# Print each sentence followed by extra blank lines so the boundaries stand out.
for sentence in doc.sents:
  print(sentence)
  print('\n')

"Management is doing the right things; leadership is doing the right things."


-Peter


Drucker




In [8]:
# Add a segmentation rule: start a new sentence right after every semicolon.

In [9]:
def set_custom_boundaries(doc):
  """Mark the token that follows each ';' as the start of a new sentence.

  The final token is never inspected because nothing follows it. The `doc`
  passed in is modified in place and then returned.
  """
  for current in doc[:-1]:
    if current.text != ';':
      continue
    # `current.i` is used as an index into `doc`, so `+ 1` addresses the next token.
    follower = doc[current.i + 1]
    follower.is_sent_start = True
  return doc

In [10]:
# Make this cell safe to re-run: adding a component under a name that is
# already in the pipeline raises an error. Removing the earlier registration
# first also means the latest definition of the function is the one installed.
if 'set_custom_boundaries' in nlp.pipe_names:
  nlp.remove_pipe('set_custom_boundaries')

# Insert the rule ahead of the parser, which assigns the remaining sentence boundaries.
nlp.add_pipe(set_custom_boundaries,before='parser')

nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [11]:
# Parse the same quotation again now that the semicolon rule is in the pipeline.
doc4= nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [13]:
# Print the sentences found with the semicolon rule active.
for sentence in doc4.sents:
  print(sentence)

"Management is doing the right things;
leadership is doing the right things."
-Peter
Drucker


# Change segmentation rules

In [17]:
# Reload the model, rebinding `nlp` to a newly loaded pipeline; presumably this is
# meant to discard the semicolon component added earlier (TODO confirm).
nlp = spacy.load('en_core_web_sm')

In [21]:
# Sample text whose line breaks do not line up with its sentence ends:
# a blank line follows the second sentence and a break falls inside the third.
mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."

In [22]:
# Printing shows the escaped newline characters as actual line breaks.
print(mystring)

This is a sentence. This is another.

This is a 
third sentence.


In [23]:
# Parse the sample with the reloaded pipeline, before any custom segmenter is added.
doc = nlp(mystring)

In [24]:
# Print the sentences produced before the newline-based segmenter is installed.
for sentence in doc.sents:
  print(sentence)

This is a sentence.
This is another.


This is a 
third sentence.


In [25]:
from spacy.pipeline import SentenceSegmenter

In [26]:
def split_on_newline(doc):
  """Yield consecutive slices of `doc`, cutting after tokens that begin with a newline.

  A token whose text starts with a newline character closes the current
  slice; the token right after it opens the next one. That opening token is
  not itself checked for a newline. Whatever remains after the loop is
  yielded as the final slice, so at least one slice is always produced.
  """
  slice_begin = 0
  break_pending = False

  for token in doc:
    if not break_pending:
      # Only arm the break here; the cut happens once the next token is seen.
      break_pending = token.text.startswith('\n')
      continue
    yield doc[slice_begin:token.i]
    slice_begin = token.i
    break_pending = False

  # Emit the trailing tokens (this is the whole input if no cut was made).
  yield doc[slice_begin:]

In [27]:
# Wrap the newline generator in a SentenceSegmenter so it can be added to the pipeline.
sbd = SentenceSegmenter(nlp.vocab,strategy=split_on_newline)

In [28]:
# Add the newline segmenter to the reloaded pipeline; no position argument is given.
# NOTE(review): re-running this cell without reloading `nlp` likely fails because
# the component is already registered — confirm.
nlp.add_pipe(sbd)

In [29]:
# Parse the same sample again so the newly added segmenter is applied.
doc = nlp(mystring)

In [30]:
# Print the sentences produced by the newline-based segmenter.
for sentence in doc.sents:
  print(sentence)

This is a sentence. This is another.


This is a 

third sentence.
