# Labels
We take the dataframe scrape_df of all html documents.
We change the entries in scrape_df['text_segmented'] to databases. Then colums for labels (hard-coded keywords, automatically found phrases and referential keywords) are added and the sections are labelled accordingly.

In [1]:
!pip install https://github.com/elyase/geotext/archive/master.zip

import pandas as pd
from web_scraping_lib import *
from text_wrangling_utils import *
from keywords import *
from geotext import GeoText
from flashtext import KeywordProcessor
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
from scraping_lib import scrape_documents
import pickle
from collections import Counter
from pathlib import Path
pd.options.mode.chained_assignment = None 
import spacy
import pytextrank

Collecting https://github.com/elyase/geotext/archive/master.zip
  Using cached https://github.com/elyase/geotext/archive/master.zip
Building wheels for collected packages: geotext
  Building wheel for geotext (setup.py): started
  Building wheel for geotext (setup.py): finished with status 'done'
  Created wheel for geotext: filename=geotext-0.3.0-py2.py3-none-any.whl size=2272459 sha256=a192b46bb42a08ae08ac4b1371736c02299163d476d003a5f95b28f89730d9f6
  Stored in directory: C:\Users\hannah\AppData\Local\Temp\pip-ephem-wheel-cache-41jlw8eh\wheels\e7\a5\8c\3285a988c40b743f5f24ec003e42cad17d1ea762873b559b31
Successfully built geotext


In [2]:
def build_segments_df(segments,merge_section_headers = False):
    """
    Build a pandas.DataFrame from a list of text segments.
    segments : list of tuples
        each touple has the struncture (label,string),
        where label can be either "text","header","subheader","table". All other labels are treated as "text"
    """
    
    
    section_cntr = 0
    subsection_cntr = 0

    segments_dict = {
        "section" : [],
        "section title" : [],
        "subsection" : [],
        "subsection title" : [],
        "string" : []}
    
    section_title = ""
    subsection_title = ""
    
    was_header = False
    for (ii,segment) in enumerate(segments):

        if segment[0] == "section":
            section_cntr = section_cntr + 1
            subsection_cntr = 0
            section_title = segment[1]
            was_header = True
        elif segment[0] == "subsection":
            subsection_cntr = subsection_cntr + 1
            subsection_title = segment[1]
            was_header = True
        else:
            was_header = False
        
        
        if merge_section_headers and not was_header and ii > 0:
            segments_dict["string"][-1] += "\n" + segment[1]
        else:
            segments_dict["section"].append(section_cntr)
            segments_dict["section title"].append(section_title)
            segments_dict["subsection title"].append(subsection_title)
            segments_dict["subsection"].append(subsection_cntr)
            segments_dict["string"].append(segment[1])

    segments_df = pd.DataFrame(segments_dict)
    return segments_df

Path name in a way that also works for Windows

In [3]:
data_folder = Path("../../data cleaning/")
file_to_open = data_folder / "immigration_rules_scrape.pickle"
with open(file_to_open,'rb') as f: #
    scrape_df = pickle.load(f)

In [4]:
scrape_df["text_segmented"] = scrape_df["text_segmented"].map(lambda x : build_segments_df(x,merge_section_headers = True))

In [5]:
segm_df = [scrape_df.loc[i,"text_segmented"] for i in range(len(scrape_df))]

Adding keyword columns

In [6]:
emptylists =[ ([[]] * len(segm_df[i])) for i in range(len(scrape_df))]
for i in range(len(scrape_df)):
    segm_df[i]['keywords'] = emptylists[i]
    segm_df[i]['references'] = emptylists[i]
    segm_df[i]['phrases'] = emptylists[i]

In [7]:
for i in range(len(scrape_df)):
    scrape_df['text_segmented'][i]=segm_df[i]

In [8]:
keyword_processor_ref = KeywordProcessor(case_sensitive=False)
keyword_processor_all = KeywordProcessor(case_sensitive=False)

### Keyword lists and dictionaries

In [9]:
keyword_processor_ref.add_keywords_from_list(reference_list)
keyword_processor_all.add_keywords_from_list(keyword_list)
keyword_processor_all.add_keywords_from_dict(keyword_dict)

In [10]:
# multiple keywords for one term have to be added here
keyword_processor_all.add_keyword('Tier 1 and 2', ['Tier 1', 'Tier 2'])  
keyword_processor_all.add_keyword('Tier 1 and 4', ['Tier 1', 'Tier 4']) 
keyword_processor_all.add_keyword('Tier 1 and 5', ['Tier 1', 'Tier 5']) 
keyword_processor_all.add_keyword('Tier 2 and 4', ['Tier 2', 'Tier 4'])  
keyword_processor_all.add_keyword('Tier 2 and 5', ['Tier 2', 'Tier 5'])
keyword_processor_all.add_keyword('Tier 4 and 5', ['Tier 4', 'Tier 5']) 
keyword_processor_all.add_keyword('Tier 1 or 2', ['Tier 1', 'Tier 2'])  
keyword_processor_all.add_keyword('Tier 1 or 4', ['Tier 1', 'Tier 4']) 
keyword_processor_all.add_keyword('Tier 1 or 5', ['Tier 1', 'Tier 5']) 
keyword_processor_all.add_keyword('Tier 2 or 4', ['Tier 2', 'Tier 4'])  
keyword_processor_all.add_keyword('Tier 2 or 5', ['Tier 2', 'Tier 5']) 
keyword_processor_all.add_keyword('Tier 4 or 5', ['Tier 4', 'Tier 5']) 
keyword_processor_all.add_keyword('Tier 1, 2 and 4', ['Tier 1', 'Tier 2', 'Tier 4'])
keyword_processor_all.add_keyword('Tier 1, 2 and 5', ['Tier 1', 'Tier 2', 'Tier 5'])
keyword_processor_all.add_keyword('Tier 1, 4 and 5', ['Tier 1', 'Tier 4', 'Tier 5'])
keyword_processor_all.add_keyword('Tier 2, 4 and 5', ['Tier 2', 'Tier 4', 'Tier 5'])
keyword_processor_all.add_keyword('Tier 1, 4 or 5', ['Tier 1', 'Tier 4', 'Tier 5'])
keyword_processor_all.add_keyword('Tier 2, 4 or 5', ['Tier 2', 'Tier 4', 'Tier 5'])
keyword_processor_all.add_keyword('Tier 1, 2, 4 and 5', ['Tier 1', 'Tier 2', 'Tier 4', 'Tier 5'])
keyword_processor_all.add_keyword('Tier 1, 2, 4 or 5', ['Tier 1', 'Tier 2', 'Tier 4', 'Tier 5'])

True

Automatic extraction of keywords ("phrases") with PyTextRank

In [11]:
def get_phrases_from_pytextrank(doc, max_phrases=10):
    """
    doc : string of sentences in a document
    max_keywords : max keywords to be stored (10 is default)
    returns a dict containing top 10 ranks,counts,keywords
    """
    doc = nlp(doc)
    phrases, counts = [], []
    for j, p in enumerate(doc._.phrases):
        counts.append(p.count)
        phrases.append(p.text)
        if j > max_phrases:
            break
    return phrases, counts

# load a spaCy model, depending on language, scale, etc.
nlp = spacy.load("en_core_web_sm")
# add PyTextRank to the spaCy pipeline
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

The "keywords", "references", and "phrases" columns are updated with all the keywords found

In [12]:
for i in range(len(scrape_df)):
    doctitle=scrape_df['title'][i]
    for j in range(len(segm_df[i])):       
        sectitle=segm_df[i]['section title'][j]
        subsectitle=segm_df[i]['subsection title'][j]
        text=doctitle + " " + sectitle + " "+ subsectitle + " " +segm_df[i]['string'][j]
        #The "Region" column is updated with all names of countries except "United Kingdom"
        countrylist=list(Counter(GeoText(text).countries).items())
        count_minus_uk = [t for t in countrylist if (t[0] != 'United Kingdom')] 
        keywords_all=list(Counter(keyword_processor_all.extract_keywords(text)).items())
        segm_df[i]['keywords'][j]=keywords_all+count_minus_uk
        keywords_ref=list(Counter(keyword_processor_ref.extract_keywords(text)).items())
        segm_df[i]['references'][j]=keywords_ref
        phrases, counts = get_phrases_from_pytextrank(text, max_phrases=100)
        segm_df[i]['phrases'][j]=list(zip(phrases, counts))

### Example 1: Introduction
from https://www.gov.uk/guidance/immigration-rules/immigration-rules-introduction

In [13]:
ex1 = scrape_df['text_segmented'][1]
ex1

Unnamed: 0,section,section title,subsection,subsection title,string,keywords,references,phrases
0,0,,0,,Immigration Rules: introduction\nThis contains...,[],"[(the Immigration Acts, 1)]","[(entry clearance officers, 1), (immigration r..."
1,1,Implementation and transitional provisions,0,,Implementation and transitional provisions\n\n...,[],[],"[(transitional provisions, 1), (immigration ru..."
2,2,Application,0,,Application\n\n5.Save where expressly indicate...,"[(EEA, 1)]",[],"[(european economic area, 1), (immigration rul..."
3,3,Interpretation,0,,Interpretation\n\n6.In these Rules the followi...,"[(EEA, 11), (visa, 5), (partner, 17), (Student...","[(the Immigration Acts, 1), (the 1993 Act, 1),...","[(uk degree level qualifications, 1), (such le..."


### Example 2: Appendix J
from https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-j-codes-of-practice-for-skilled-work

In [14]:
ex2 = scrape_df['text_segmented'][39]
ex2

Unnamed: 0,section,section title,subsection,subsection title,string,keywords,references,phrases
0,0,,0,,Immigration Rules Appendix J: codes of practic...,"[(Tier 2, 1), (Tier 5, 1), (Tier 1, 1)]","[(Appendix J, 2)]","[(immigration rules appendix j, 1), (work perm..."
1,1,Introduction,0,,Introduction\n\n\nThis Appendix sets out the s...,"[(salary, 1)]","[(Appendix J, 1)]","[(immigration rules appendix j, 1), (creative ..."
2,1,Introduction,1,Tables,Tables\n\n3. Table 1 sets out PhD-level occupa...,"[(Tier 2 (General), 3), (Tier 2, 3), (SOC, 4)]","[(Appendix J, 1)]","[(regulated qualifications framework level, 2)..."
3,1,Introduction,2,Occupation codes,Occupation codes\n\n11. The Standard Occupatio...,"[(SOC, 9), (sponsor, 1)]","[(Appendix J, 1), (Appendix A, 1)]","[(related job titles, 1), (jobs, 3), (skilled ..."
4,1,Introduction,3,Appropriate salary rates,Appropriate salary rates\n\n14. Where these Ru...,"[(salary, 2), (SOC, 2), (Tier 2 (General), 2),...","[(Appendix J, 1), (Appendix A, 2), (Appendix K...","[(appropriate salary rates, 1), (skilled work ..."
5,2,Table 1 - Occupational skilled to PhD-level,0,Appropriate salary rates,Table 1 - Occupational skilled to PhD-level\n ...,"[(salary, 8), (SOC, 2), (evidence, 6), ( paren...","[(Appendix J, 1)]","[(• scientist, 1), (• designs, 1), (• chemist,..."
6,3,Table 2 - Occupations skilled to Regulated Qua...,0,Appropriate salary rates,Table 2 - Occupations skilled to Regulated Qua...,"[(salary, 73), (SOC, 1), (Charity, 1), (UK reg...","[(Appendix J, 1)]","[(• web design consultant •, 1), (• data centr..."
7,4,Table 2A - Occupations skilled to Regulated Qu...,0,Appropriate salary rates,Table 2A - Occupations skilled to Regulated Qu...,"[(Tier 2 (General), 2), (salary, 2), (SOC, 2)]","[(Appendix J, 1)]","[(other electronic equipment, 1), (• tests, 1)..."
8,5,Table 3 - Occupations skilled to Regulated Qua...,0,Appropriate salary rates,Table 3 - Occupations skilled to Regulated Qua...,"[(salary, 30), (SOC, 1), (family, 2)]","[(Appendix J, 1)]",[(• delivery office manager • office manager •...
9,6,Table 4 - Occupations which are ineligible for...,0,Appropriate salary rates,Table 4 - Occupations which are ineligible for...,"[(Tier 2 (General), 3), (Tier 2, 2), (salary, ...","[(Appendix J, 1), (Appendix K, 1)]","[(• electronics engineer • field engineer, 1),..."


Pickle resulting dataframe

In [15]:
with open('labeled_immigration_rules.pickle','wb') as f:
    pickle.dump(scrape_df,f, pickle.HIGHEST_PROTOCOL)