In [9]:
# Some from Spacy tutorial, some from https://realpython.com/natural-language-processing-spacy-python/
file = open('Corpus_of_Paras.txt', 'r')
text= file.read() 
text=text[0:10000]

In [10]:
#This is how you can convert a text file into a processed Doc object.
#nlp now refers to the language model instance
# a convenient nomenclature would be using the suffix _text for Unicode string objects.
# and the suffix _doc for spaCy’s language model objects
import spacy
nlp = spacy.load("en_core_web_sm") #disable=["tagger", "parser"])

**OPTIONAL PREPROCESSING**
spaCy allows you to customize tokenization by updating the tokenizer property on the nlp object:
you can create a preprocessing function that takes text as input and applies the following operations:
Lowercases the text
Lemmatizes each token
Removes punctuation symbols
Removes stop words

In [4]:
import re
>>> from spacy.tokenizer import Tokenizer
>>> custom_nlp = spacy.load('en_core_web_sm')

>>> prefix_re = spacy.util.compile_prefix_regex(custom_nlp.Defaults.prefixes)
>>> suffix_re = spacy.util.compile_suffix_regex(custom_nlp.Defaults.suffixes)
>>> infix_re = re.compile(r'''[-~]''')
>>> def customize_tokenizer(nlp):
...     # Adds support to use `-` as the delimiter for tokenization
...     return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
...                      suffix_search=suffix_re.search,
...                      infix_finditer=infix_re.finditer,
...                      token_match=None
...                      )

>>> custom_nlp.tokenizer = customize_tokenizer(custom_nlp)

In [None]:
>>> custom_tokenizer_doc = custom_nlp(text)
>>> print([token.text for token in custom_tokenizer_doc])

**1st STEPS**

In [None]:
doc = nlp(text)
print([t.text for t in doc])

In [None]:
# It has already been tokenised. 
# See instances of tokens
print(doc[-1].text)   
print(doc[0].text)         
print(doc[1].text)  

In [None]:
print(doc[1:20].text)      

In [None]:
# See tokens for the given doc
print ([token.text for token in doc])

# Or similarly you can print tokens by iterating on the Doc object:

#for token in doc:
#    print (token, token.idx) 

#Note how spaCy preserves the starting index of the tokens. 
#It’s useful for in-place word replacement. 

**ALTERNATIVE OPTIONAL PREPROCESSING ** AT THIS POINT I DON'T UNDERSTAND HOW THE OUTPUT GETS SET AS PART OF THE MODEL. 

In [None]:
>>> def is_token_allowed(token):
...     '''
...         Only allow valid tokens which are not stop words
...         and punctuation symbols.
...     '''
...     if (not token or not token.string.strip() or
...         token.is_stop or token.is_punct):
...         return False
...     return True
...
>>> def preprocess_token(token):
...     # Reduce token to its lowercase lemma form
...     return token.lemma_.strip().lower()
...
>>> complete_filtered_tokens = [preprocess_token(token)
...     for token in doc if is_token_allowed(token)]
>>> complete_filtered_tokens
#Note that the complete_filtered_tokens does not contain any stop word or punctuation symbols
#symbols and consists of lemmatized lowercase tokens.

**BASIC PREPROCESSING** only appears to be useful when doing some stats on word types etc. 
This seems to be done by creating a new list of words, rather than by changing the Spacy object. 

In [18]:
#I then jog on past stop words and lemmatization
# in effect .....
# Remove stop words and punctuation symbols
words = [token.text for token in doc
          if not token.is_stop and not token.is_punct]

**WORD FRQUENCIES ETC**

In [21]:
from collections import Counter
word_freq = Counter(words)
# 5 commonly occurring words with their frequencies
common_words = word_freq.most_common(5)
print (common_words)

[('\n', 81), ('\n\n', 72), ('security', 23), ('Nuclear', 17), ('Security', 13)]


In [None]:
# Unique words
unique_words = [word for (word, freq) in word_freq.items() if freq == 1]
print (unique_words)

**NOUN CHUNKS**
spaCy has the property noun_chunks on Doc object. You can use it to extract noun phrases:Shallow parsing, or chunking, is the process of extracting phrases from unstructured text. Chunking groups adjacent tokens into phrases on the basis of their POS tags. There are some standard well-known chunks such as noun phrases, verb phrases, and prepositional phrases.

Noun Phrase Detection: A noun phrase is a phrase that has a noun as its head. It could also include other kinds of words, such as adjectives, ordinals, determiners. Noun phrases are useful for explaining the context of the sentence. They help you infer what is being talked about in the sentence.I avoided the next bit about verb phrases

In [None]:
noun_chunks = list(doc.noun_chunks)
print(noun_chunks[2].text)  
print (noun_chunks)

**FIND SENTENCES**

In [None]:
sentences = list(doc.sents)
#assert len(sentences) == 3

print (len(sentences)
print(sentences[100].text) 

for sentence in sentences:
    print ('-' + str(sentence))

In [None]:
#OPTIONAL ALTERNATIVE FOR SENTENCES
#Here’s an example, where an ellipsis(...) is used as the delimiter. 
#These sentences are still obtained via the sents attribute, a
def set_custom_boundaries(doc):
     # Adds support to use `...` as the delimiter for sentence detection
     for token in doc[:-1]:
         if token.text == '...':
             doc[token.i+1].is_sent_start = True
     return doc

# Load a new model instance
custom_nlp = spacy.load('en_core_web_sm')
custom_nlp.add_pipe(set_custom_boundaries, before='parser')
custom_ellipsis_doc = custom_nlp(text)
custom_ellipsis_sentences = list(custom_ellipsis_doc.sents)
for sentence in custom_ellipsis_sentences:
    print("--",sentence)

**OPTIONAL REFERRING TO PARTICULAR PARTS OF THE TEXT**

In [None]:
#use hash values for any string, building up a vocabulary
screening_hash = nlp.vocab.strings["screening"]  
screening_text = nlp.vocab.strings[screening_hash]  
print(screening_hash, screening_text)
print(doc[2].orth, screening_hash)  #presumably only comes back with the same numbers if it is the same word
print(doc[2].text, screening_text)   #presumably only comes back with the same words if it is the same word

necessary_hash = doc.vocab.strings.add("necessary")  
necessary_text = doc.vocab.strings[necessary_hash]  
print(necessary_hash, necessary_text)

**NAMED ENTITY RECOGNITION**

In [13]:
#Named Entity Recognition (NER) is the process of locating named entities in unstructured text and then classifying them into pre-defined categories
#such as person names, organizations, locations, monetary values, percentages, time expressions, and so on.
#You can use NER to know more about the meaning of your text.
#For example, you could use it to populate tags for a set of documents in order to improve the keyword search. 
#You could also use it to categorize customer support tickets into relevant categories.
#spaCy has the property ents on Doc objects. You can use it to extract named entities:
#recognise and update named entities i.e. add an entity
from spacy.tokens import Span

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

ONR GUIDE 64 73 ORG
Unique Document ID 161 179 PERSON
CNS 200 203 ORG
March 2017 244 254 DATE
March 2020 272 282 DATE
David Pascoe 300 312 PERSON
2 588 589 CARDINAL
2 592 593 CARDINAL
PURPOSE 596 603 ORG
2 716 717 CARDINAL
3 720 721 CARDINAL
2 826 827 CARDINAL
4 830 831 CARDINAL
IAEA 850 854 ORG
2 921 922 CARDINAL
5 925 926 CARDINAL
3 1023 1024 CARDINAL
6 1027 1028 CARDINAL
INSPECTORS 1041 1051 ORG
3 1150 1151 CARDINAL
SUPPLY 1158 1164 PERSON
4 1244 1245 CARDINAL
SUPPLY CHAIN MANAGEMENT SYSTEMS 1252 1283 ORG
5 1354 1355 CARDINAL
9 1358 1359 CARDINAL
8 1490 1491 CARDINAL
ABBREVIATIONS 1511 1524 ORG
9 1608 1609 CARDINAL
Office for Nuclear Regulation 1654 1683 ORG
2017 1685 1689 DATE
www.onr.org.uk 1735 1749 ORG
Revision 3 1841 1851 FAC
1 1859 1860 CARDINAL
9 1864 1865 CARDINAL
1 1906 1907 CARDINAL
1.2 1949 1952 CARDINAL
2.1 1955 1958 CARDINAL
3.1 1961 1964 CARDINAL
3.2 1967 1970 CARDINAL
The Office for Nuclear Regulation 1973 2006 ORG
ONR 2008 2011 ORG
Security Assessment 
 2038 2059 ORG

In [14]:
#Adding specific entities
doc1 = nlp("FB is hiring a new VP of global policy")
doc1.ents = [Span(doc, 0, 1, label="ORG")]
for ent in doc1.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

FB 0 2 ORG


In [None]:
doc_ent = nlp("When Sebastian Thrun started working on self-driving cars at Google "
              "in 2007, few people outside of the company took him seriously.")
displacy.render(doc_ent, style="ent")

**DEPENDENCY PARSING** OPTIONAL

In [None]:
#there is then a section on dependency parsing , and navigating the consqeuent tree for it. 
#I avoid the detail of this but then show a dependency print out here
#view a dependency parser
# i seem to have to stop this for it to print out

#syntactic dependencies
#doc = nlp("When Sebastian Thrun started working on self-driving cars at Google " "in 2007, few people outside of the company took him seriously.")
dep_labels = []
for token in doc:
    while token.head != token:
        dep_labels.append(token.dep_)
        token = token.head
print(dep_labels)

doc_dep = nlp("This is a sentence.")
displacy.render(doc_dep, style="dep")

**SIMILARITY BETWEEN WORDS** OPTIONAL This would not be my go-to package for this

In [None]:
#For the best results, this example should use the the en_vectors_web_lg model -as the small model doesnt have all the stuff
doc = nlp("Apple and banana are similar. Pasta and hippo aren't.")

apple = doc[0]
banana = doc[2]
pasta = doc[6]
hippo = doc[8]

print("apple <-> banana", apple.similarity(banana))
print("pasta <-> hippo", pasta.similarity(hippo))
print(apple.has_vector, banana.has_vector, pasta.has_vector, hippo.has_vector)

**SAVING MODEL**

In [None]:
#If you’ve been modifying the pipeline, vocabulary, vectors and entities
#or made updates to the model, you’ll eventually want to save your progress – for example, everything that’s in your nlp object.
# simple and efficient serialisation
from spacy.tokens import Doc
from spacy.vocab import Vocab
doc.to_disk("Example saving of spacy nlp object.bin")
new_doc = Doc(Vocab()).from_disk("Example saving of spacy nlp object.bin")
print (new_doc[1:1000])

In [None]:
#export to Numpy arrays #not checked
from spacy.attrs import ORTH, LIKE_URL

doc = nlp("Check out https://spacy.io")
for token in doc:
    print(token.text, token.orth, token.like_url)

attr_ids = [ORTH, LIKE_URL]
doc_array = doc.to_array(attr_ids)
print(doc_array.shape)
print(len(doc), len(attr_ids))

assert doc[0].orth == doc_array[0, 0]
assert doc[1].orth == doc_array[1, 0]
assert doc[0].like_url == doc_array[0, 1]

assert list(doc_array[:, 1]) == [t.like_url for t in doc]
print(list(doc_array[:, 1]))

**STREAM PROCESSING *** This takes up too much memory at moment

In [None]:
# Import list from CSV (could also do from txt)
import os
directory = '/Users/lawrence/'
Corpus_CSV=[]
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        with open(os.path.join(directory, filename)) as f:
            content=f.read()
            Corpus_CSV.append(content)
            print ('\n\nFile',filename)
            f.close()

In [None]:
#Minibatched stream processing
# .pipe streams input, and produces streaming output
iter_texts = (Corpus_CSV[i % 3] for i in range(5))
for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50)):
    print (i,doc) # i added this in to show what can be done
    assert doc.is_parsed
    if i == 7:
        break

**SEEING WHAT IT KNOWS ABOUT TOKENS** optional

In [None]:
#spaCy provides various attributes for the Token class:
for token in doc:
     print (token, token.idx, token.text_with_ws, token.is_space, token.is_stop)
#In this example, some of the commonly required attributes are accessed:
#text_with_ws prints token text with trailing space (if present).
#is_space detects if the token is a space or not.
#is_stop detects if the token is a stop word or not.

In [None]:
#get POS tags and flags
Particular_token=doc[100]
print (Particular_token)
print("Fine-grained POS tag", Particular_token.pos_, Particular_token.pos)
print("Coarse-grained POS tag", Particular_token.tag_, Particular_token.tag)
print("Word shape", Particular_token.shape_, Particular_token.shape)
print("Alphabetic characters?", Particular_token.is_alpha)
print("Punctuation mark?", Particular_token.is_punct)
print("Digit?", Particular_token.is_digit)
print("Like a number?", Particular_token.like_num)
print("Like an email address?", Particular_token.like_email)

**extract entities (such as phone numbers) from an unstructured text**

Rule-based matching is one of the steps in extracting information from unstructured text. It’s used to identify and extract tokens and phrases according to patterns (such as lowercase) and grammatical features (such as part of speech).

Rule-based matching can use regular expressions to extract entities (such as phone numbers) from an unstructured text. It’s different from extracting text using regular expressions only in the sense that regular expressions don’t consider the lexical and grammatical attributes of the text.

With rule-based matching, you can extract a first name and a last name, which are always proper nouns:
There is also an example of extracting phone numbers which I avoided

In [16]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
def extract_full_name(nlp_doc):
    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]
    matcher.add('FULL_NAME', None, pattern)
    matches = matcher(nlp_doc)
    for match_id, start, end in matches:
        span = nlp_doc[start:end]
        return span.text
extract_full_name(doc)

'ONR GUIDE'