# Labels
We take the dataframe df of all html documents.
We change the entries in df['text_segmented'] to databases. Then colums for different labels (immigration status, region, nationality etc.) are added and the sections are labelled accordingly.

In [1]:
#!pip install https://github.com/elyase/geotext/archive/master.zip

import pandas as pd
from web_scraping_lib import *
from text_wrangling_utils import *
from keywords import *
from geotext import GeoText
from flashtext import KeywordProcessor
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
from scraping_lib import scrape_documents
import pickle
from collections import Counter
from pathlib import Path
pd.options.mode.chained_assignment = None 

In [2]:
def build_segments_df(segments,merge_section_headers = False):
    """
    Build a pandas.DataFrame from a list of text segments.
    segments : list of tuples
        each touple has the struncture (label,string),
        where label can be either "text","header","subheader","table". All other labels are treated as "text"
    """
    
    
    section_cntr = 0
    subsection_cntr = 0

    segments_dict = {
        "section" : [],
        "section title" : [],
        "subsection" : [],
        "subsection title" : [],
        "string" : []}
    
    section_title = ""
    subsection_title = ""
    
    was_header = False
    for (ii,segment) in enumerate(segments):

        if segment[0] == "section":
            section_cntr = section_cntr + 1
            subsection_cntr = 0
            section_title = segment[1]
            was_header = True
        elif segment[0] == "subsection":
            subsection_cntr = subsection_cntr + 1
            subsection_title = segment[1]
            was_header = True
        else:
            was_header = False
        
        
        if merge_section_headers and not was_header and ii > 0:
            segments_dict["string"][-1] += "\n" + segment[1]
        else:
            segments_dict["section"].append(section_cntr)
            segments_dict["section title"].append(section_title)
            segments_dict["subsection title"].append(subsection_title)
            segments_dict["subsection"].append(subsection_cntr)
            segments_dict["string"].append(segment[1])

    segments_df = pd.DataFrame(segments_dict)
    return segments_df

Path name in a way that also works for Windows

In [3]:
#data_folder = Path("../data cleaning/")
#file_to_open = data_folder / "immigration_rules_scrape.pickle"
file_to_open = "immigration_rules_scrape.pickle"
with open(file_to_open,'rb') as f: #
    scrape_df = pickle.load(f)

In [4]:
scrape_df["text_segmented"] = scrape_df["text_segmented"].map(lambda x : build_segments_df(x,merge_section_headers = True))
scrape_df.head()

Unnamed: 0,URL,media type,title,text_dump,text_segmented,hyperlinks_dump,timestamp
0,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules: Index,\nImmigration Rules: Index\nThe rules are divi...,section section title subsection subsectio...,[https://www.gov.uk/guidance/immigration-rules...,2020-08-25T11:04:54+00:00
1,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules: introduction,\nImmigration Rules: introduction\nThis contai...,section secti...,[],2020-08-25T11:04:56+00:00
2,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules part 1: leave to enter or st...,\nImmigration Rules part 1: leave to enter or ...,section ...,[],2020-08-25T11:04:56+00:00
3,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules part 2: transitional provisi...,\nImmigration Rules part 2: transitional provi...,section ...,[],2020-08-25T11:04:58+00:00
4,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules part 3: students,\nImmigration Rules part 3: students\nPersons ...,section ...,[],2020-08-25T11:04:59+00:00


In [5]:
segm_df = [scrape_df.loc[i,"text_segmented"] for i in range(len(scrape_df))]

Adding keyword columns

In [9]:
visa =[ ([[]] * len(segm_df[i])) for i in range(len(scrape_df))]
for i in range(len(scrape_df)):
    segm_df[i]['immigration status'] = visa[i]
region =[ ([[]] * len(segm_df[i])) for i in range(len(scrape_df))]
for i in range(len(scrape_df)):
    segm_df[i]['region'] = region[i]
nationality =[ ([[]] * len(segm_df[i])) for i in range(len(scrape_df))]
for i in range(len(scrape_df)):
    segm_df[i]['nationality'] = nationality[i]
company =[ ([[]] * len(segm_df[i])) for i in range(len(scrape_df))]
for i in range(len(scrape_df)):
    segm_df[i]['company'] = company[i]
compbody =[ ([[]] * len(segm_df[i])) for i in range(len(scrape_df))]
for i in range(len(scrape_df)):
    segm_df[i]['company body'] = compbody[i]
regbody =[ ([[]] * len(segm_df[i])) for i in range(len(scrape_df))]
for i in range(len(scrape_df)):
    segm_df[i]['regulatory body'] = regbody[i]
soc =[ ([[]] * len(segm_df[i])) for i in range(len(scrape_df))]
for i in range(len(scrape_df)):
    segm_df[i]['SOC'] = soc[i]
documents =[ ([[]] * len(segm_df[i])) for i in range(len(scrape_df))]
for i in range(len(scrape_df)):
    segm_df[i]['documents'] = documents[i]
relation =[ ([[]] * len(segm_df[i])) for i in range(len(scrape_df))]
for i in range(len(scrape_df)):
    segm_df[i]['relation'] = relation[i]
general =[ ([[]] * len(segm_df[i])) for i in range(len(scrape_df))]
for i in range(len(scrape_df)):
    segm_df[i]['general'] = general[i]
# ------ added from here --------
phrases =[ ([[]] * len(segm_df[i])) for i in range(len(scrape_df))]
for i in range(len(scrape_df)):
    segm_df[i]['phrases'] = phrases[i]
counts =[ ([[]] * len(segm_df[i])) for i in range(len(scrape_df))]
for i in range(len(scrape_df)):
    segm_df[i]['counts'] = counts[i]    
# ------ added to here --------

In [10]:
for i in range(len(scrape_df)):
    scrape_df['text_segmented'][i]=segm_df[i]

In [11]:
keyword_processor_reg = KeywordProcessor(case_sensitive=False)
keyword_processor_nat = KeywordProcessor(case_sensitive=False)
keyword_processor_visa = KeywordProcessor(case_sensitive=False)
keyword_processor_comp = KeywordProcessor(case_sensitive=False)
keyword_processor_compbody = KeywordProcessor(case_sensitive=False)
keyword_processor_regbody = KeywordProcessor(case_sensitive=False)
keyword_processor_soc = KeywordProcessor(case_sensitive=True) # case sensitive, no confusion of "SOC" and "soc."
keyword_processor_docs = KeywordProcessor(case_sensitive=False)
keyword_processor_rel = KeywordProcessor(case_sensitive=False)
keyword_processor_gen = KeywordProcessor(case_sensitive=False)

### Keyword lists and dictionaries

In [12]:
keyword_processor_visa.add_keywords_from_list(visa_list)
keyword_processor_visa.add_keywords_from_dict(visa_dict)
keyword_processor_reg.add_keywords_from_list(region_list)
keyword_processor_reg.add_keywords_from_dict(region_dict)
keyword_processor_nat.add_keywords_from_list(nationality_list)
keyword_processor_comp.add_keywords_from_list(company_list)
keyword_processor_comp.add_keywords_from_dict(company_dict)
keyword_processor_compbody.add_keywords_from_list(compbody_list)
keyword_processor_compbody.add_keywords_from_dict(compbody_dict)
keyword_processor_regbody.add_keywords_from_list(regbody_list)
keyword_processor_soc.add_keywords_from_dict(soc_dict)
keyword_processor_docs.add_keywords_from_list(docs_list)
keyword_processor_docs.add_keywords_from_dict(docs_dict)
keyword_processor_rel.add_keywords_from_list(relation_list)
keyword_processor_gen.add_keywords_from_list(general_list)

In [13]:
# multiple keywords for one term have to be added here
keyword_processor_visa.add_keyword('Tier 1 and 2', ['Tier 1', 'Tier 2'])  
keyword_processor_visa.add_keyword('Tier 1 and 4', ['Tier 1', 'Tier 4']) 
keyword_processor_visa.add_keyword('Tier 1 and 5', ['Tier 1', 'Tier 5']) 
keyword_processor_visa.add_keyword('Tier 2 and 4', ['Tier 2', 'Tier 4'])  
keyword_processor_visa.add_keyword('Tier 2 and 5', ['Tier 2', 'Tier 5'])
keyword_processor_visa.add_keyword('Tier 4 and 5', ['Tier 4', 'Tier 5']) 
keyword_processor_visa.add_keyword('Tier 1 or 2', ['Tier 1', 'Tier 2'])  
keyword_processor_visa.add_keyword('Tier 1 or 4', ['Tier 1', 'Tier 4']) 
keyword_processor_visa.add_keyword('Tier 1 or 5', ['Tier 1', 'Tier 5']) 
keyword_processor_visa.add_keyword('Tier 2 or 4', ['Tier 2', 'Tier 4'])  
keyword_processor_visa.add_keyword('Tier 2 or 5', ['Tier 2', 'Tier 5']) 
keyword_processor_visa.add_keyword('Tier 4 or 5', ['Tier 4', 'Tier 5']) 
keyword_processor_visa.add_keyword('Tier 1, 2 and 4', ['Tier 1', 'Tier 2', 'Tier 4'])
keyword_processor_visa.add_keyword('Tier 1, 2 and 5', ['Tier 1', 'Tier 2', 'Tier 5'])
keyword_processor_visa.add_keyword('Tier 1, 4 and 5', ['Tier 1', 'Tier 4', 'Tier 5'])
keyword_processor_visa.add_keyword('Tier 2, 4 and 5', ['Tier 2', 'Tier 4', 'Tier 5'])
keyword_processor_visa.add_keyword('Tier 1, 2 or 4', ['Tier 1', 'Tier 2', 'Tier 4'])
#keyword_processor_visa.add_keyword('Tier 1, 2 or 5', ['Tier 1', 'Tier 2', 'Tier 5'])
#keyword_processor_visa.add_keyword('Tier 1, 4 or 5', ['Tier 1', 'Tier 4', 'Tier 5'])
#keyword_processor_visa.add_keyword('Tier 2, 4 or 5', ['Tier 2', 'Tier 4', 'Tier 5'])
#keyword_processor_visa.add_keyword('Tier 1, 2, 4 and 5', ['Tier 1', 'Tier 2', 'Tier 4', 'Tier 5'])
#keyword_processor_visa.add_keyword('Tier 1, 2, 4 or 5', ['Tier 1', 'Tier 2', 'Tier 4', 'Tier 5'])

True

In [16]:
# ------ added from here --------
import spacy
import pytextrank

def get_phrases_from_pytextrank(doc, max_phrases=10):
    """
    doc : string of sentences in a document
    max_keywords : max keywords to be stored (10 is default)
    returns a dict containing top 10 ranks,counts,keywords
    """
    doc = nlp(doc)
    phrases, counts = [], []
    for j, p in enumerate(doc._.phrases):
        counts.append(p.count)
        phrases.append(p.text)
        if j > max_phrases:
            break
    return phrases, counts

# load a spaCy model, depending on language, scale, etc.
nlp = spacy.load("en_core_web_sm")
# add PyTextRank to the spaCy pipeline
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
# ------ added to here --------

for i in range(len(scrape_df)):
    for j in range(len(segm_df[i])):
        doc=scrape_df['text_segmented'][i]['string'][j]
        #The "Region" and "Nationality" columns are updated with all names of countries except "United Kingdom"
        countrylist=list(Counter(GeoText(segm_df[i]['string'][j]).countries).items())
        count_minus_uk = [t for t in countrylist if (t[0] != 'United Kingdom')] 
        keywords_visa=list(Counter(keyword_processor_visa.extract_keywords(doc)).items())
        segm_df[i]['immigration status'][j]=keywords_visa
        keywords_reg=list(Counter(keyword_processor_reg.extract_keywords(doc)).items())
        segm_df[i]['region'][j]=keywords_reg+count_minus_uk
        keywords_nat=list(Counter(keyword_processor_nat.extract_keywords(doc)).items())
        segm_df[i]['nationality'][j]=keywords_nat+count_minus_uk
        keywords_comp=list(Counter(keyword_processor_comp.extract_keywords(doc)).items())
        segm_df[i]['company'][j]=keywords_comp 
        keywords_compbody=list(Counter(keyword_processor_compbody.extract_keywords(doc)).items())
        segm_df[i]['company body'][j]=keywords_compbody
        keywords_regbody=list(Counter(keyword_processor_regbody.extract_keywords(doc)).items())
        segm_df[i]['regulatory body'][j]=keywords_regbody  
        keywords_soc=list(Counter(keyword_processor_soc.extract_keywords(doc)).items())
        segm_df[i]['SOC'][j]=keywords_soc
        keywords_rel=list(Counter(keyword_processor_rel.extract_keywords(doc)).items())
        segm_df[i]['relation'][j]=keywords_rel
        keywords_docs=list(Counter(keyword_processor_docs.extract_keywords(doc)).items())
        segm_df[i]['documents'][j]=keywords_docs
        # ------ added from here --------
        phrases, counts = get_phrases_from_pytextrank(doc, max_phrases=100)
        segm_df[i]['phrases'][j]=phrases
        segm_df[i]['counts'][j]=counts
        # ------ added to here --------

### Example 1: Introduction
from https://www.gov.uk/guidance/immigration-rules/immigration-rules-introduction

In [17]:
ex1=scrape_df['text_segmented'][1]
ex1

Unnamed: 0,section,section title,subsection,subsection title,string,immigration status,region,nationality,company,company body,regulatory body,SOC,documents,relation,general,phrases,counts
0,0,,0,,Immigration Rules: introduction\nThis contains...,[],[],[],[],[],[],[],[],[],[],"[entry clearance officers, rules, immigration ...","[1, 4, 1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, ..."
1,1,Implementation and transitional provisions,0,,Implementation and transitional provisions\n\n...,[],[],[],[],[],[],[],[],[],[],"[transitional provisions, 4.these rules, entry...","[1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 4, 2, 1, 1, 2, 1]"
2,2,Application,0,,Application\n\n5.Save where expressly indicate...,[],"[(EEA, 1)]",[],[],[],[],[],[],[],[],"[european economic area, rules, virtue, regula...","[1, 4, 1, 2, 2, 1, 2, 1, 2, 2]"
3,3,Interpretation,0,,Interpretation\n\n6.In these Rules the followi...,"[(Student, 16), (Tier 4, 12), (International G...","[(EEA, 6), (UK regions, 62), (non-EEA, 2), (Br...","[(refugee, 6), (China, 2), (Ireland, 1), (Cypr...","[(Charity, 2)]",[],"[(General Medical Council, 1)]","[(SOC, 1)]","[(evidence, 1), (letter, 2)]","[(partner, 17), (child, 16), (relationship, 4)...",[],"[uk degree level qualifications, such leave, m...","[1, 4, 61, 1, 15, 2, 20, 3, 3, 1, 1, 1, 1, 1, ..."


### Example 2: Appendix J
from https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-j-codes-of-practice-for-skilled-work

In [18]:
ex2 = scrape_df['text_segmented'][39]
ex2

Unnamed: 0,section,section title,subsection,subsection title,string,immigration status,region,nationality,company,company body,regulatory body,SOC,documents,relation,general,phrases,counts
0,0,,0,,Immigration Rules Appendix J: codes of practic...,"[(Tier 2, 1), (Tier 5, 1), (Tier 1, 1)]",[],[],[],[],[],[],[],[],[],"[immigration rules appendix j, work permit hol...","[1, 1, 1, 2, 1, 2, 1, 1, 1, 2]"
1,1,Introduction,0,,Introduction\n\n\nThis Appendix sets out the s...,[],[],[],[],[],[],[],[],[],[],"[creative sector jobs, appropriate salary rate...","[1, 1, 1, 1, 1, 4, 1, 1, 4, 1, 1]"
2,1,Introduction,1,Tables,Tables\n\n3. Table 1 sets out PhD-level occupa...,"[(Tier 2 (General), 3), (Tier 2, 3)]",[],[],[],[],[],[],[],[],[],"[regulated qualifications framework level, cre...","[2, 1, 5, 2, 2, 1, 1, 3, 3, 2, 1, 2, 3, 3, 1, ..."
3,1,Introduction,2,Occupation codes,Occupation codes\n\n11. The Standard Occupatio...,[],[],[],[],[],[],"[(SOC, 1)]",[],[],[],"[related job titles, jobs, sponsorship checkin...","[1, 3, 2, 1, 3, 1, 1, 2, 1, 1, 2, 1, 3, 2, 1, ..."
4,1,Introduction,3,Appropriate salary rates,Appropriate salary rates\n\n14. Where these Ru...,"[(Tier 2 (General), 2), (Tier 2, 2), (Tier 2 (...",[],[],[],[],[],[],[],[],[],"[appropriate salary rates, rates, national min...","[1, 4, 1, 3, 1, 1, 4, 2, 3, 5, 1, 1, 1, 1, 1, ..."
5,2,Table 1 - Occupational skilled to PhD-level,0,Appropriate salary rates,Table 1 - Occupational skilled to PhD-level\n ...,[],[],[],[],[],[],[],"[(evidence, 6)]","[( parent, 1)]",[],"[• scientist, • designs, • chemist, • compiles...","[1, 1, 1, 1, 1, 1, 25, 1, 1, 1, 1, 1, 1, 1, 1,..."
6,3,Table 2 - Occupations skilled to Regulated Qua...,0,Appropriate salary rates,Table 2 - Occupations skilled to Regulated Qua...,"[(Student, 10), (Tier 2 (General), 4)]","[(UK regions, 2), (Ireland, 2)]","[(Ireland, 2)]","[(Charity, 1)]",[],[],[],"[(evidence, 4)]","[(family, 5), (relationship, 2), (partner, 1)]",[],"[• web design consultant •, • data centre mana...","[1, 1, 1, 1, 1, 1, 252, 1, 1, 1, 1, 1, 2, 1, 1..."
7,4,Table 2A - Occupations skilled to Regulated Qu...,0,Appropriate salary rates,Table 2A - Occupations skilled to Regulated Qu...,"[(Tier 2 (General), 1)]",[],[],[],[],[],[],[],[],[],"[other electronic equipment, • tests, • connec...","[1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, ..."
8,5,Table 3 - Occupations skilled to Regulated Qua...,0,Appropriate salary rates,Table 3 - Occupations skilled to Regulated Qua...,[],[],[],[],[],[],[],[],"[(family, 2)]",[],[• delivery office manager • office manager • ...,"[1, 1, 1, 1, 83, 1, 1, 1, 1, 1, 3, 2, 1, 3, 1,..."
9,6,Table 4 - Occupations which are ineligible for...,0,Appropriate salary rates,Table 4 - Occupations which are ineligible for...,"[(Tier 2 (General), 2), (Tier 2, 1), (Student,...","[(UK regions, 5)]",[],"[(Hotel and/or Catering Establishment, 1)]",[],"[(General Dental Council, 1)]",[],"[(evidence, 1), (letter, 2)]","[(family, 1)]",[],"[• electronics engineer • field engineer, • jo...","[1, 1, 1, 1, 344, 1, 1, 1, 1, 1, 4, 3, 1, 1, 6..."


If we want to create an excel file for each document:

In [None]:
#for ii in range(len(scrape_df)):
#    scrape_df['text_segmented'][ii].to_excel("./out_xlsx/document "+ str(ii) + ".xlsx")

Pickle

In [None]:
#with open('labeled_immigration_rules.pickle','wb') as f:
#    pickle.dump(scrape_df,f, pickle.HIGHEST_PROTOCOL)