## Get keywords from NER

In [2]:
import pandas as pd
# Read the per-section export of the sponsor guidance; the first CSV column
# becomes the index, leaving the section_title and raw_text columns.
df = pd.read_csv(
    "../input/2020-07-16_Tier-2-5-sponsor-guidance_Jul-2020_v1.0_section.csv",
    index_col=0,
)
df

Unnamed: 0,section_title,raw_text
0,Contents,\nTier 2 and 5: Guidance for Sponsors - Adden...
1,About this guidance,This guidance is for organisations who want to...
2,Contacts,If you think that the guidance has factual err...
3,Clearance and publication,Below is information on when this version of t...
4,Glossary,Phrase \nWe or Us \nYou or Your \nMeaning \nTh...
...,...,...
245,How to apply for a licence,"To apply, you must complete the online applica..."
246,If your application is approved,If your application for a sponsor licence is a...
247,Assigning a CoS for RQF level 3 to 5 jobs,"If your application is approved, you will not ..."
248,If your application for a licence is refused,If we are not satisfied that you can offer gen...


In [3]:
import re

def first_level_cleaning(doc):
    """
    Normalise a raw document into a single-line, ASCII-only string.

    Parameters
    ----------
    doc : any
        Raw text. Non-string values are converted with ``str``. ``None`` and
        float NaN are treated as missing text.

    Returns
    -------
    str
        The text with line breaks/tabs turned into spaces, non-ASCII runs
        replaced by a space, repeated spaces collapsed and leading/trailing
        whitespace removed. Missing input yields an empty string.
    """
    # Missing values would otherwise become the literal words "None" / "nan".
    # NaN is the only float that is not equal to itself.
    if doc is None or (isinstance(doc, float) and doc != doc):
        return ""

    text = str(doc)                               # in case a numeric type exists
    # Line breaks and tabs become a space (not an empty string) so that words
    # on either side of a break are not glued together ("foo\nbar" -> "foo bar").
    text = re.sub(r"[\r\t\n]+", " ", text)
    text = re.sub(r"[^\x00-\x7F]+", " ", text)    # replace runs of non-ASCII chars
    text = re.sub(r" +", " ", text)               # collapse duplicate spaces
    return text.strip()                           # strip leading/trailing spaces

def first_level_pdf_cleaning(doc):
    """
    Normalise raw text extracted from the PDF into a single-line, ASCII-only string.

    Parameters
    ----------
    doc : any
        Raw text. Non-string values are converted with ``str``. ``None`` and
        float NaN are treated as missing text.

    Returns
    -------
    str
        The text with line breaks/tabs turned into spaces, non-ASCII runs
        replaced by a space, repeated spaces collapsed and leading/trailing
        whitespace removed. Missing input yields an empty string.
    """
    # Missing values would otherwise become the literal words "None" / "nan".
    # NaN is the only float that is not equal to itself.
    if doc is None or (isinstance(doc, float) and doc != doc):
        return ""

    text = str(doc)                               # in case a numeric type exists
    # Line breaks and tabs become a space (not an empty string) so that words
    # on either side of a break are not glued together ("foo\nbar" -> "foo bar").
    text = re.sub(r"[\r\t\n]+", " ", text)
    text = re.sub(r"[^\x00-\x7F]+", " ", text)    # replace runs of non-ASCII chars
    text = re.sub(r" +", " ", text)               # collapse duplicate spaces
    # Page-footer removal is currently disabled (@Eyzo):
    #text = re.sub(r"(Page)\s\d+\s\w+\s\d+\s(Tiers 2 and 5: guidance for sponsors - version 07\/20)", "", text)
    return text.strip()                           # strip leading/trailing spaces

In [4]:
"""
apply first level cleaning to the PDF raw_texts
"""
# Overwrite the raw_text column with its cleaned version, one section at a time.
df["raw_text"] = df["raw_text"].apply(first_level_pdf_cleaning)

In [9]:
"""
get phrases/keywords for a given document (raw_text)
"""
import spacy

# Load a spaCy model; which one to use depends on language, scale, etc.
nlp = spacy.load("en_core_web_sm")

# Take the text of the row labelled 1 from the scraped dataframe and run the
# generic first-level cleaning over it (if needed) before inspecting it.
raw_text = df.raw_text[1]
doc = first_level_cleaning(raw_text)
print(doc)

This guidance is for organisations who want to apply for a sponsor licence to sponsor migrants under Tier 2 and/or Tier 5 of the points-based system. It tells you what we expect if you are a licence holder, the processes you must follow when sponsoring a migrant and how to meet all of the duties and responsibilities associated with being a licensed sponsor. The guidance is subject to change and you should check the dates to make sure you have the latest version. A new points-based immigration system will come into effect from 1 January 2021. The future system will apply to both European Economic Area (EEA) nationals and non-EEA nationals. You should refer to Annex 9 of this guidance if you intend to apply for a licence to sponsor workers under the new system. Separate guidance exists on GOV.UK for UK education providers who wish to apply for and hold a licence to sponsor international students to come to the UK under Tier 4 to study. You can find the appendices mentioned in this guidan

In [10]:
# Sanity check: at this point `doc` is still the cleaned plain string.
type(doc)

str

In [11]:
# Run the loaded spaCy pipeline over the cleaned text.
# NOTE(review): this rebinds `doc` from the cleaned str to the pipeline result,
# so re-running this cell on its own feeds the previous result back into nlp —
# presumably not intended; confirm before renaming, since later cells read `doc`.
doc = nlp(doc)
print(doc)  # printing the result shows the document text

This guidance is for organisations who want to apply for a sponsor licence to sponsor migrants under Tier 2 and/or Tier 5 of the points-based system. It tells you what we expect if you are a licence holder, the processes you must follow when sponsoring a migrant and how to meet all of the duties and responsibilities associated with being a licensed sponsor. The guidance is subject to change and you should check the dates to make sure you have the latest version. A new points-based immigration system will come into effect from 1 January 2021. The future system will apply to both European Economic Area (EEA) nationals and non-EEA nationals. You should refer to Annex 9 of this guidance if you intend to apply for a licence to sponsor workers under the new system. Separate guidance exists on GOV.UK for UK education providers who wish to apply for and hold a licence to sponsor international students to come to the UK under Tier 4 to study. You can find the appendices mentioned in this guidan

In [12]:
# After the pipeline call `doc` is no longer a str but a spaCy Doc object.
type(doc)

spacy.tokens.doc.Doc

In [20]:
"""
list of tokens in the document
"""
# Token-level view: keep only the tokens whose entity type is set, so a
# multi-token entity appears as one (text, entity type) row per token.
[
    (tok.text, tok.ent_type_)
    for tok in doc
    if tok.ent_type != 0
]

[('2', 'CARDINAL'),
 ('5', 'CARDINAL'),
 ('1', 'DATE'),
 ('January', 'DATE'),
 ('2021', 'DATE'),
 ('European', 'NORP'),
 ('Economic', 'LOC'),
 ('Area', 'LOC'),
 ('EEA', 'ORG'),
 ('Annex', 'ORG'),
 ('9', 'ORG'),
 ('UK', 'GPE'),
 ('UK', 'GPE')]

In [22]:
"""
list of named entities in the document
"""
# Span-level view: one (text, label) pair per named entity, with multi-token
# entities kept together.
[(span.text, span.label_) for span in doc.ents]

[('2', 'CARDINAL'),
 ('5', 'CARDINAL'),
 ('1 January 2021', 'DATE'),
 ('European', 'NORP'),
 ('Economic Area', 'LOC'),
 ('EEA', 'ORG'),
 ('Annex 9', 'ORG'),
 ('UK', 'GPE'),
 ('UK', 'GPE')]

In [34]:
"""
list of named entities excluding DATE and CARDINAL
"""
from collections import Counter

# Collect entity texts, skipping the DATE and CARDINAL labels, then show the
# list itself followed by how often each distinct text occurs.
list_ner = [
    span.text
    for span in doc.ents
    if span.label_ not in ("DATE", "CARDINAL")
]
print(list_ner)
Counter(list_ner)

['European', 'Economic Area', 'EEA', 'Annex 9', 'UK', 'UK']


Counter({'European': 1, 'Economic Area': 1, 'EEA': 1, 'Annex 9': 1, 'UK': 2})

## Create a NER pipeline


Source (Prodigy support forum): https://support.prodi.gy/t/custom-ner-model/1376/6

If you do **ner = nlp.get_pipe("ner")** you’ll get an instance that’s a subclass of the **spacy.syntax.nn_parser.Parser** class.  
Once the model is loaded, the **ner.model** attribute gives you an instance of **spacy.syntax._parser_model.ParserModel**.  

This class has an attribute **unseen_classes** that is a set of class IDs.  
If you add the class ID to this, you should **prevent the class from being predicted**


In [41]:
"""
TODO : exclude classes to be predicted
"""
import spacy

# Load a spaCy model; which one to use depends on language, scale, etc.
nlp = spacy.load("en_core_web_sm")

# Grab the NER component and map each of its move names to its position,
# which is used below as the class ID.
ner = nlp.get_pipe("ner")
class_ids = {move: idx for idx, move in enumerate(ner.move_names)}
"""
> https://spacy.io/usage/training
O is a token outside an entity
U an single entity unit
B the beginning of an entity
I a token inside an entity
L the last token of an entity
"""
print(class_ids)

# Register a class as "unseen" on the model so that it should not be predicted.
# NOTE(review): only "U-DATE" (single-token DATE) is added here; the B-/I-/L-DATE
# moves are left alone, so multi-token dates are presumably still predicted —
# confirm against the TODO at the top of this cell.
ner.model.unseen_classes.add(class_ids["U-DATE"])

# Run the full pipeline on the section text, then pass the result through the
# NER component once more.
# NOTE(review): nlp already contains this component; verify the second pass is needed.
doc = ner(nlp(df.raw_text[1]))

# One aligned "text  label" line per entity.
for ent in doc.ents:
    print("{:30s} {:30s}".format(ent.text, ent.label_))

{'B-ORG': 0, 'B-DATE': 1, 'B-PERSON': 2, 'B-GPE': 3, 'B-MONEY': 4, 'B-CARDINAL': 5, 'B-NORP': 6, 'B-PERCENT': 7, 'B-WORK_OF_ART': 8, 'B-LOC': 9, 'B-TIME': 10, 'B-QUANTITY': 11, 'B-FAC': 12, 'B-EVENT': 13, 'B-ORDINAL': 14, 'B-PRODUCT': 15, 'B-LAW': 16, 'B-LANGUAGE': 17, 'I-ORG': 18, 'I-DATE': 19, 'I-PERSON': 20, 'I-GPE': 21, 'I-MONEY': 22, 'I-CARDINAL': 23, 'I-NORP': 24, 'I-PERCENT': 25, 'I-WORK_OF_ART': 26, 'I-LOC': 27, 'I-TIME': 28, 'I-QUANTITY': 29, 'I-FAC': 30, 'I-EVENT': 31, 'I-ORDINAL': 32, 'I-PRODUCT': 33, 'I-LAW': 34, 'I-LANGUAGE': 35, 'L-ORG': 36, 'L-DATE': 37, 'L-PERSON': 38, 'L-GPE': 39, 'L-MONEY': 40, 'L-CARDINAL': 41, 'L-NORP': 42, 'L-PERCENT': 43, 'L-WORK_OF_ART': 44, 'L-LOC': 45, 'L-TIME': 46, 'L-QUANTITY': 47, 'L-FAC': 48, 'L-EVENT': 49, 'L-ORDINAL': 50, 'L-PRODUCT': 51, 'L-LAW': 52, 'L-LANGUAGE': 53, 'U-ORG': 54, 'U-DATE': 55, 'U-PERSON': 56, 'U-GPE': 57, 'U-MONEY': 58, 'U-CARDINAL': 59, 'U-NORP': 60, 'U-PERCENT': 61, 'U-WORK_OF_ART': 62, 'U-LOC': 63, 'U-TIME': 64, 'U-Q