## Get phrases (keywords) from pytextrank

In [1]:
import pandas as pd
df = pd.read_csv("../input/2020-07-16_Tier-2-5-sponsor-guidance_Jul-2020_v1.0_section.csv", index_col=0); df

Unnamed: 0,section_title,raw_text
0,Contents,\nTier 2 and 5: Guidance for Sponsors - Adden...
1,About this guidance,This guidance is for organisations who want to...
2,Contacts,If you think that the guidance has factual err...
3,Clearance and publication,Below is information on when this version of t...
4,Glossary,Phrase \nWe or Us \nYou or Your \nMeaning \nTh...
...,...,...
245,How to apply for a licence,"To apply, you must complete the online applica..."
246,If your application is approved,If your application for a sponsor licence is a...
247,Assigning a CoS for RQF level 3 to 5 jobs,"If your application is approved, you will not ..."
248,If your application for a licence is refused,If we are not satisfied that you can offer gen...


In [2]:
import re

def first_level_cleaning(doc):
    """
    Cleans the document from unneecessary chars/words, etc.
    """
    doc = str(doc)                               # in case numeric type exsisted 
    doc = re.sub(r"[\r\t\n]+", "", doc)          # find & replace \t and \n with empty string
    doc = re.sub(r"[^\x00-\x7F]+", " ", doc)     # remove non-ascii chars
    doc = re.sub(r" +", " ", doc)                # remove dublicate spaces
    doc = doc.strip()                            # strip leading/trailing spaces
    return doc

def first_level_pdf_cleaning(doc):
    """
    Cleans the document from unneecessary chars/words, etc.
    """
    doc = str(doc)                               # in case numeric type exsisted 
    doc = re.sub(r"[\r\t\n]+", "", doc)          # find & replace \t and \n with empty string
    doc = re.sub(r"[^\x00-\x7F]+", " ", doc)     # remove non-ascii chars
    doc = re.sub(r" +", " ", doc)                # remove dublicate spaces
    doc = doc.strip()                            # strip leading/trailing spaces
    doc = re.sub(r"(Page)\s\d+\s\w+\s\d+\s(Tiers 2 and 5: guidance for sponsors - version 07\/20)", "", doc) # @Eyzo
    return doc

In [3]:
import spacy
import pytextrank

def get_phrases_from_pytextrank(doc, max_phrases=10):
    """
    doc : string of sentences in a document
    max_keywords : max keywords to be stored (10 is default)
    returns a dict containing top 10 ranks,counts,keywords
    """
    doc = nlp(doc)
    
    phrases_cols = ["ranks", "counts", "phrases", "chunks"]
    phrases_dict = {key : [] for key in phrases_cols}
    
    for j, p in enumerate(doc._.phrases):
        phrases_dict["ranks"].append(p.rank)
        phrases_dict["counts"].append(p.count)
        phrases_dict["phrases"].append(p.text)
        phrases_dict["chunks"].append(p.chunks)
        if j > max_phrases:
            break
    return phrases_dict

In [4]:
"""
apply first level cleaning to the PDF raw_text
"""
df.raw_text = df.raw_text.apply(lambda x:first_level_pdf_cleaning(x))

In [5]:
"""
get phrases/keywords for a fiven document (raw_text)
"""
nlp = spacy.load("en_core_web_sm")                            # load a spaCy model, depending on language, scale, etc.

# add PyTextRank to the spaCy pipeline
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

raw_text = df.raw_text[1]                                      # raw text from scraped dataframe
doc = first_level_cleaning (raw_text)                          # first level cleaning if needed
dict_phrases = get_phrases_from_pytextrank(doc, max_phrases=100) # return dict of phrases(including keywords)

print(dict_phrases)

{'ranks': [0.10688218348585257, 0.10239409026785604, 0.09204076600255537, 0.08402752341721098, 0.0689115697354489, 0.06606036945854027, 0.06538156619962618, 0.06323012781469459, 0.06251550500874872, 0.05698173521445014, 0.05656627988897946, 0.05439960440715311, 0.05411153777054652, 0.05375026489164127, 0.05231611624145761, 0.051836034095991426, 0.05130378942403131, 0.05042550744840741, 0.05036891020838295, 0.05031281015052348, 0.05000592152019122, 0.04935676217106468, 0.04886967205204002, 0.04885013073241558, 0.04712963647793938, 0.04687474617424535, 0.046672052765148186, 0.04530242964220477, 0.04382656891980708, 0.04336664470835566, 0.037678579908970354, 0.03609464747651431, 0.035996100356353845, 0.0303014232338582, 0.0299988000719952, 0.025394868833931415, 0.018566529548590362, 0.0], 'counts': [1, 1, 1, 1, 1, 2, 1, 3, 3, 1, 1, 1, 4, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 15], 'phrases': ['sponsorship policy guidance', 'separate guidance', 'non-eea nat

In [6]:
df_phrases = pd.DataFrame(dict_phrases) ; df_phrases

Unnamed: 0,ranks,counts,phrases,chunks
0,0.106882,1,sponsorship policy guidance,"[(sponsorship, policy, guidance)]"
1,0.102394,1,separate guidance,"[(Separate, guidance)]"
2,0.092041,1,non-eea nationals,"[(non, -, EEA, nationals)]"
3,0.084028,1,uk education providers,"[(UK, education, providers)]"
4,0.068912,1,economic area,"[(Economic, Area)]"
5,0.06606,2,gov.uk,"[(GOV.UK), (GOV.UK)]"
6,0.065382,1,international students,"[(international, students)]"
7,0.06323,3,tier,"[(Tier), (Tier), (and/or, Tier)]"
8,0.062516,3,uk,"[(UK), (UK), (the, UK)]"
9,0.056982,1,a new points-based immigration system,"[(A, new, points, -, based, immigration, system)]"


In [7]:
df_phrases[df_phrases.phrases=="who"]

Unnamed: 0,ranks,counts,phrases,chunks
37,0.0,15,who,"[(who), (It), (you), (what), (we), (you), (you..."


In [8]:
df_phrases.drop(df_phrases[df_phrases.phrases == "who"].index, inplace=True); df_phrases

Unnamed: 0,ranks,counts,phrases,chunks
0,0.106882,1,sponsorship policy guidance,"[(sponsorship, policy, guidance)]"
1,0.102394,1,separate guidance,"[(Separate, guidance)]"
2,0.092041,1,non-eea nationals,"[(non, -, EEA, nationals)]"
3,0.084028,1,uk education providers,"[(UK, education, providers)]"
4,0.068912,1,economic area,"[(Economic, Area)]"
5,0.06606,2,gov.uk,"[(GOV.UK), (GOV.UK)]"
6,0.065382,1,international students,"[(international, students)]"
7,0.06323,3,tier,"[(Tier), (Tier), (and/or, Tier)]"
8,0.062516,3,uk,"[(UK), (UK), (the, UK)]"
9,0.056982,1,a new points-based immigration system,"[(A, new, points, -, based, immigration, system)]"


In [9]:
sum(df_phrases.ranks)

1.9781018057257238