# Labels
Here we take the dataframes of scraped pdfs and htmls and create a new dataframe for each of them.
We create a new dataframe for each html document that has additional columns for general labels ("keywords"),
labels for the needed paperwork ("needed_records"), and
labels for references to other documents ("references" and "reference urls").

In [1]:
###------- TODO: change ------------
# pip freeze?:
#!pip install https://github.com/elyase/geotext/archive/master.zip
from keywords import * 
###-------------------------------

import pandas as pd
import pickle
import os, re
from geotext import GeoText
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
from collections import Counter
from pathlib import Path
pd.options.mode.chained_assignment = None 
import spacy

In [2]:
# For reproduceability use the same path as where the nationbetter 
# package stored the data!
# output_path = '~/S2DS/Nation.Better/nationbetter_data'
# output_path = os.path.expanduser(output_path)
output_path =  os.path.abspath(os.path.join('..','..','..','nationbetter_data'))

formatted_html_dfs = os.path.join(output_path,'formatted_html_dfs')
raw_html_dicts = os.path.join(output_path,'raw_html_dicts')
html_df_pickles = os.listdir(formatted_html_dfs)
html_raw_dict_pickles = os.listdir(raw_html_dicts)
raw_pdf_dicts = os.path.join(output_path,'raw_pdf_dicts')
pdf_raw_dict_pickles = os.listdir(raw_pdf_dicts)
formatted_pdf_path = os.path.join(output_path,'formatted_pdf_dfs')
formatted_pdf_pkls = os.listdir(formatted_pdf_path)

In [3]:
#load all pdfs
pdfs =[]
for file in formatted_pdf_pkls:
    df = pickle.load(open(os.path.join(formatted_pdf_path,file),'rb'))
    pdfs.append(df)
    df[df['page_no']==1]

for df in pdfs:
    title_idx =df[(df['text_type']=='title')&(df['page_no']==1)].index[0]
    has_sub = df[(df['text_type']=='subtitle')&(df['page_no']==1)].index
    if title_idx != 0:
        df.drop([title_idx-1],inplace = True)
    if list(has_sub):
        if has_sub[0]>title_idx:
            new_title= df.iloc[title_idx].text + df.iloc[has_sub[0]].text
            df.at[0,'text']= new_title

In [4]:
pdf_urls = [pdfs[i]['url'][1] for i in range(len(pdfs))]

In [5]:
for df in pdfs:
    title_index = df[df['text_type']=='title'].index.min()
    df.drop(df[~(df.index>=title_index)].index,inplace=True)

In [6]:
# This part of the code fixes a bug dropping the "empty" subsection counter columns in 
# pdfs[0] = Tier 2 Policy Guidance. There are however more issues and this fix should
# be implemented on a deeper leve
keylist = []
for df in pdfs:
    keylist.append(df.keys())
keylist
#dropping empty (sub)title columns
for df_no in range(len(pdfs)):
    find_sec = re.compile(r'\w*title$')
    find=list(filter(None,[find_sec.search(key) for key in keylist[df_no]]))
    old_keys = [key.group() for key in find]
    for key in old_keys:
        if len(pdfs[df_no][key].unique())==1:
            pdfs[df_no].drop(key,axis =1,inplace=True)
#rebuild keylist for new columns
keylist = []
for df in pdfs:
    keylist.append(df.keys())

In [7]:
#renaming the columns in the dataframe with the convention of html 
for df_no in range(len(pdfs)):
    find_sec = re.compile(r'\w*title$')
    find=list(filter(None,[find_sec.search(key) for key in keylist[df_no]]))
    old_keys = [key.group() for key in find]
    #if the dataframe of pdfs[0] is fixed and keys still need to be dropped
    # insert here some code that renames missing keys and columns
    # ['title','subsubsection'] -> ['title','subtitle']
    keep_keys = keylist[df_no].drop(old_keys)
    find_sec = re.compile(r'(title)$')
    final_keys = [find_sec.sub(r'section',key) for key in keylist[df_no]]
    rename_keys = pd.Index(final_keys).drop(keep_keys)
    rename_dict = dict(zip(old_keys,rename_keys))
    pdfs[df_no].rename(columns = rename_dict,inplace = True)

In [8]:
#write function that can be iterated over to obtain section title columns
#for single df
# df_no = 1
# Sorting the keys into agg_keys, which will generate the column names
# and dropping unwanted keys
keylist = []
for df in pdfs:
    keylist.append(df.keys())

def get_title_col_pdf(df,sec_col='text'):
    '''
    Taking a column (of section counters) and check weither a section
    transition occurs. If yes store in column [(sub)*section]_title
    the tite (assuming it happpends) on the first line of the section
    '''
    #Checking if the (sub)section title changes if yes return true
    is_sec_title = (df[sec_col]-df[sec_col].shift().bfill()).astype(bool)
    colname = sec_col + ' title'  
    return is_sec_title, colname


for num, df in enumerate(pdfs):
    #get the keys
    keylist = df.keys()
    find_sec = re.compile(r'\w*section$')
    find=list(filter(None,[find_sec.search(key) for key in keylist]))
    agg_keys = [key.group() for key in find]
    sec_keys = agg_keys + ['page_no','text','url']
    #reduce the dataframe
    df = df[sec_keys]

    #call function
    for key in agg_keys:
        is_sec_title, colname = get_title_col_pdf(df,key)
        #Selects string column where is_sec_title true, returns section name
        titlecol = df[is_sec_title].text.replace('\s+', ' ', regex=True).str.strip()
        titlecol.name = colname
        df = pd.concat([df,titlecol],axis = 1)
        df[colname].fillna(method='ffill',inplace=True)

    #Generate dict for aggregation of text in (sub)*sections
    dex = df.keys().drop(agg_keys+['text']) 
    agg_fn = ['first' for no_dex in range(len(dex))]
    agg_arg = dict(zip(dex,agg_fn))
    agg_arg.update({'text':' '.join})
    df.groupby(agg_keys,as_index = False).agg(agg_arg)
   # df.rename(columns={'text':'string'},inplace=True)
    pdfs[num]=df

In [9]:
# for the html documents we first define a function 
# that builds from a list of text segments a dataframe with columns "text", "section title" etc. 
def build_segments_df(segments,merge_section_headers = False):
    """
    Build a pandas.DataFrame from a list of text segments.
    segments : list of tuples
        each touple has the struncture (label,string),
        where label can be either "text","header","subheader","table". All other labels are treated as "text"
    """
    
    
    section_cntr = 0
    subsection_cntr = 0

    segments_dict = {
        "section" : [],
        "section title" : [],
        "subsection" : [],
        "subsection title" : [],
        "text" : []}
    
    section_title = ""
    subsection_title = ""
    
    was_header = False
    for (ii,segment) in enumerate(segments):

        if segment[0] == "section":
            section_cntr = section_cntr + 1
            subsection_cntr = 0
            section_title = segment[1]
            was_header = True
        elif segment[0] == "subsection":
            subsection_cntr = subsection_cntr + 1
            subsection_title = segment[1]
            was_header = True
        else:
            was_header = False
        
        
        if merge_section_headers and not was_header and ii > 0:
            segments_dict["text"][-1] += "\n" + segment[1]
        else:
            segments_dict["section"].append(section_cntr)
            segments_dict["section title"].append(section_title)
            segments_dict["subsection title"].append(subsection_title)
            segments_dict["subsection"].append(subsection_cntr)
            segments_dict["text"].append(segment[1])

    segments_df = pd.DataFrame(segments_dict)
    return segments_df

In [10]:
# load htmls

# Unlike all the files stored in /raw_html_dicts/ the code below
# stores all the files into one dict. See:
scrape_dict0 = pickle.load(open(os.path.join(raw_html_dicts,html_raw_dict_pickles[1]),'rb'))
# import pprint
# pprint.pprint(scrape_df0,depth = 1)
scrape_dict = {}
scrape_dict = {key: [] for key in scrape_dict0.keys()}
for filename in html_raw_dict_pickles[1:]:
    single_scrape = pickle.load(open(os.path.join(raw_html_dicts,filename),'rb'))
    if single_scrape.keys() == scrape_dict0.keys():
        for key in single_scrape.keys():
            scrape_dict[key]+=[single_scrape[key]]
scrape_df = pd.DataFrame(scrape_dict)
scrape_df["text_segmented"] = scrape_df["text_segmented"].map(lambda x : build_segments_df(x,merge_section_headers = True))

In [11]:
# creating lists of dataframes that will be modified
segm_dfs_list = [scrape_df.loc[i,"text_segmented"] for i in range(len(scrape_df))]
html_list_of_dfs = []
[html_list_of_dfs.append(segm_dfs_list[real_df].copy()) for real_df in range(len(segm_dfs_list))]
pdf_list_of_dfs = pdfs

# indexing in all dataframes should start at 0
pdf_list_of_dfs = [pdfs[i].reset_index() for i in range(len(pdfs))]

In [12]:
# list of all pdf titles
pdf_titles = []
for df in pdfs:
    title_loc = df.index.min()
    pdf_titles.append(df.text.replace('\s+', ' ', regex=True).str.strip()[title_loc])

In [13]:
# the columns for the labels are added, some entries are set to empty lists

# for html
for i in range(len(scrape_df)):
    html_list_of_dfs[i]['document title'] = scrape_df['title'][i] 
    html_list_of_dfs[i]['document type'] = "html" 
    html_list_of_dfs[i]['document url'] = scrape_df['URL'][i]
    html_list_of_dfs[i]['keywords'] = [[] for _ in range(len(html_list_of_dfs[i]))]
    html_list_of_dfs[i]['needed records'] = [[] for _ in range(len(html_list_of_dfs[i]))]
    html_list_of_dfs[i]['references'] = [[] for _ in range(len(html_list_of_dfs[i]))]
    html_list_of_dfs[i]['reference urls'] = [[] for _ in range(len(html_list_of_dfs[i]))]

# for pdf
for i in range(len(pdfs)):
    pdf_list_of_dfs[i]['document title'] = pdf_titles[i]
    pdf_list_of_dfs[i]['document url'] = pdf_urls[i]
    pdf_list_of_dfs[i]['document type'] = "pdf" 
    pdf_list_of_dfs[i]['keywords'] = [[] for _ in range(len(pdf_list_of_dfs[i]))]
    pdf_list_of_dfs[i]['needed records'] = [[] for _ in range(len(pdf_list_of_dfs[i]))]
    pdf_list_of_dfs[i]['references'] = [[] for _ in range(len(pdf_list_of_dfs[i]))]
    pdf_list_of_dfs[i]['reference urls']=[[] for _ in range(len(pdf_list_of_dfs[i]))]  

### Keyword lists and dictionaries

In [14]:
# find mentioning of "Appendix XYZ and "Part XYZ"
pattern_appendix = r"[Aa]ppendix [0-9]*[A-Za-z]*[-]*\s*[A-Za-z]*\s*[(]*[A-Za-z]*\s*[A-Za-z]*[)]*"
pattern_part = r"[Pp]art [0-9]*[A-Za-z]*\s*[A-Za-z]*\s*[(]*[A-Za-z]*\s*[A-Za-z]*[)]*"

In [15]:
# the list of all document titles is updated with all expressions from pattern_appendix
# and pattern_part that appear in the list of documents
ref_in_title_list = [scrape_df['title'][i] for i in range(len(scrape_df))]

for i in range(len(scrape_df)):
    
    ref_appendix = re.findall(pattern_appendix, str(ref_in_title_list[i]) )
    ref_part = re.findall(pattern_part, str(ref_in_title_list[i]) )
    
    if ref_part!=[]:
        ref_in_title_list+= ref_part
     
    if ref_appendix!=[]:
        ref_in_title_list+= ref_appendix
        
    else: 
        continue

In [16]:
# keyword processor for references is updated with references occuring in titles
all_refs_in_titles = ref_in_title_list + pdf_titles
keyword_processor_ref.add_keywords_from_list(all_refs_in_titles)

The "keywords", "needed records" and "references" columns are updated with all the keywords found

In [17]:
# keywords for html
for i in range(len(scrape_df)):
    doctitle=scrape_df['title'][i]
    for j in range(len(segm_dfs_list[i])):       
        sectitle=html_list_of_dfs[i]['section title'][j]
        subsectitle=html_list_of_dfs[i]['subsection title'][j]
        # more weight is put to words appearing in titles
        text=3*(doctitle + " ")  + 2*(sectitle + " ")+ subsectitle + " " +html_list_of_dfs[i]['text'][j]
        #The regional keywords are updated with all names of countries except "United Kingdom"
        countrylist=list(Counter(GeoText(text).countries).items())
        count_minus_uk = [t for t in countrylist if (t[0] != 'United Kingdom')] 
        keywords_all=list(Counter(keyword_processor_all.extract_keywords(text)).items())
        html_list_of_dfs[i]['keywords'][j]=keywords_all +count_minus_uk
        keywords_rec=list(Counter(keyword_processor_rec.extract_keywords(text)).items())
        html_list_of_dfs[i]['needed records'][j]=keywords_rec
        keywords_ref = list(Counter(keyword_processor_ref.extract_keywords(text)).items())
        keywords_refminusself = [t for t in keywords_ref if (t[0] not in subsectitle) if (t[0] not in sectitle) if (t[0] not in doctitle)]
        html_list_of_dfs[i]['references'][j]=keywords_refminusself

In [18]:
# keywords for pdf

for i in range(len(pdfs)):
    doctitle = pdf_titles[i]
    for j in range(len(pdfs[i])):
        sectitle = ""
        if 'section title' in pdf_list_of_dfs[i].columns:
            sectitle = str(pdf_list_of_dfs[i]['section title'][j])
        subsectitle = ""
        if 'subsection title' in pdf_list_of_dfs[i].columns:
            subsectitle = str(pdf_list_of_dfs[i]['subsection title'][j])
        subsubsectitle = ""
        if 'subsubsection title' in pdf_list_of_dfs[i].columns:
            subsubsectitle = str(pdf_list_of_dfs[i]['subsubsection title'][j])
        subsubsubsectitle = ""
        if 'subsubsubsection title' in pdf_list_of_dfs[i].columns:
            subsubsubsectitle = str(pdf_list_of_dfs[i]['subsubsubsection title'][j])
        subsubsubsubsectitle = ""
        if 'subsubsubsubsection title' in pdf_list_of_dfs[i].columns:
            subsubsubsubsectitle = str(pdf_list_of_dfs[i]['subsubsubsubsection title'][j])
        # more weight is put to words appearing in titles
        text = (pdf_list_of_dfs[i]['text'][j]  + " " + 3*(doctitle + " ")  + 2*(sectitle + " ")
                + subsectitle + " " + subsubsectitle + " " 
                + subsubsubsectitle + " " + subsubsubsubsectitle) 
        #The regional keywords are updated with all names of countries except "United Kingdom"
        countrylist=list(Counter(GeoText(text).countries).items())
        count_minus_uk = [t for t in countrylist if (t[0] != 'United Kingdom')] 
        keywords_all=list(Counter(keyword_processor_all.extract_keywords(text)).items())
        pdf_list_of_dfs[i]['keywords'][j]=keywords_all +count_minus_uk
        keywords_rec=list(Counter(keyword_processor_rec.extract_keywords(text)).items())
        pdf_list_of_dfs[i]['needed records'][j]=keywords_rec
        keywords_ref = list(Counter(keyword_processor_ref.extract_keywords(text)).items())
        keywords_refminusself = [t for t in keywords_ref if (t[0] not in subsectitle) if (t[0] not in sectitle) if (t[0] not in doctitle)]
        pdf_list_of_dfs[i]['references'][j]=keywords_refminusself

In [19]:
# update column "reference urls" with urls corresponding to titles that appear in the "references" column

#for html documents
html_urls = [scrape_df['URL'][i] for i in range(len(scrape_df))] 
html_titles = [scrape_df['title'][i] for i in range(len(scrape_df))]
all_titles = html_titles + pdf_titles
all_urls = html_urls + pdf_urls

for i in range(len(scrape_df)):
    for j in range(len(segm_dfs_list[i])):      
        tuple_list= html_list_of_dfs[i]['references'][j]
        url_for_ref = []
        if tuple_list != []: 
            value_list = [itm[0] for itm in tuple_list]
            for ref in  value_list:
                for longtitle in all_titles:
                    if ref in longtitle:
                        idx = all_titles.index(longtitle)
                        item = all_urls[idx]
                        url_for_ref.append(item)
        html_list_of_dfs[i]['reference urls'][j] = url_for_ref

#for PDFs
for i in range(len(pdfs)):
    for j in range(len(pdfs[i])):  
        tuple_list= pdf_list_of_dfs[i]['references'][j]
        url_for_ref = []
        if tuple_list != []: 
            value_list = [itm[0] for itm in tuple_list]
            for ref in  value_list:
                for longtitle in all_titles:
                    if ref in longtitle:
                        idx = all_titles.index(longtitle)
                        item = all_urls[idx]
                        url_for_ref.append(item)
        pdf_list_of_dfs[i]['reference urls'][j] = url_for_ref

### Pickle resulting list of dataframes
We create a combined list that contains both the pdf and the html dataframes.
This listed is pickled as 'labeled_immigration_rules.pickle'.

In [20]:
# The first pdf has such a different format that we have to leave it out for now.
pdf_list_of_dfs=pdf_list_of_dfs[1:]

list_of_dataframes = pdf_list_of_dfs + html_list_of_dfs

with open('labeled_corpus.pickle','wb') as f:
    pickle.dump(list_of_dataframes,f, pickle.HIGHEST_PROTOCOL)

## Some examples

### Example 1: Appendix Armed Forces
from https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-armed-forces

In [21]:
ex1=html_list_of_dfs[2]  
ex1

Unnamed: 0,section,section title,subsection,subsection title,text,document title,document type,document url,keywords,needed records,references,reference urls
0,0,,0,,Immigration Rules Appendix Armed Forces\nRules...,Immigration Rules Appendix Armed Forces,html,https://www.gov.uk/guidance/immigration-rules/...,[],[],[],[]
1,1,Part 1 General,0,,Part 1 General,Immigration Rules Appendix Armed Forces,html,https://www.gov.uk/guidance/immigration-rules/...,[],[],"[(part 1, 3)]",[https://www.gov.uk/guidance/immigration-rules...
2,1,Part 1 General,1,Who these rules apply to,Who these rules apply to\n\n1. The rules conta...,Immigration Rules Appendix Armed Forces,html,https://www.gov.uk/guidance/immigration-rules/...,"[(Commonwealth, 1), (partner, 4), (child, 4)]",[],"[(part 1, 2), (Immigration Act 1971, 1)]",[https://www.gov.uk/guidance/immigration-rules...
3,1,Part 1 General,2,Interpretation and general provisions,Interpretation and general provisions\n\n2. In...,Immigration Rules Appendix Armed Forces,html,https://www.gov.uk/guidance/immigration-rules/...,"[(application, 8), (partner, 6), (parent, 1), ...",[],"[(part 1, 2), (part 7, 2), (Immigration Act 19...",[https://www.gov.uk/guidance/immigration-rules...
4,1,Part 1 General,3,Leave to enter,Leave to enter\n\n6. The requirements to be me...,Immigration Rules Appendix Armed Forces,html,https://www.gov.uk/guidance/immigration-rules/...,"[(entry clearance, 1), (visa, 1)]",[],"[(part 1, 2)]",[https://www.gov.uk/guidance/immigration-rules...
5,2,Part 2 - Suitability requirements,0,Leave to enter,Part 2 - Suitability requirements\n\n8. An app...,Immigration Rules Appendix Armed Forces,html,https://www.gov.uk/guidance/immigration-rules/...,"[(application, 9), (entry clearance, 8), (refu...","[(Criminal Record, 2), (report, 1)]","[(part 2, 3), (Refugee Convention, 5), (releva...",[https://www.gov.uk/guidance/immigration-rules...
6,3,Part 3 - Discharged members of HM Forces,0,Leave to enter,Part 3 - Discharged members of HM Forces,Immigration Rules Appendix Armed Forces,html,https://www.gov.uk/guidance/immigration-rules/...,[],[],"[(part 3, 3)]",[https://www.gov.uk/guidance/immigration-rules...
7,3,Part 3 - Discharged members of HM Forces,1,General eligibility requirements,General eligibility requirements\n\n11. The ge...,Immigration Rules Appendix Armed Forces,html,https://www.gov.uk/guidance/immigration-rules/...,"[(criteria, 1), (date, 1), (application, 2), (...",[],"[(part 3, 2)]",[https://www.gov.uk/guidance/immigration-rules...
8,3,Part 3 - Discharged members of HM Forces,2,Medical discharge,Medical discharge\n\n12. The medical discharge...,Immigration Rules Appendix Armed Forces,html,https://www.gov.uk/guidance/immigration-rules/...,"[(criteria, 1)]",[],"[(part 3, 2)]",[https://www.gov.uk/guidance/immigration-rules...
9,3,Part 3 - Discharged members of HM Forces,3,Indefinite leave to enter,Indefinite leave to enter\n\n13. Entry clearan...,Immigration Rules Appendix Armed Forces,html,https://www.gov.uk/guidance/immigration-rules/...,"[(entry clearance, 2), (Commonwealth, 2), (app...",[],"[(part 3, 2)]",[https://www.gov.uk/guidance/immigration-rules...


### Example 2: Part 6a
from https://www.gov.uk/guidance/immigration-rules/immigration-rules-part-6a-the-points-based-system

In [22]:
ex2 = html_list_of_dfs[45]
ex2

Unnamed: 0,section,section title,subsection,subsection title,text,document title,document type,document url,keywords,needed records,references,reference urls
0,0,,0,,Immigration Rules part 6A: the points-based sy...,Immigration Rules part 6A: the points-based sy...,html,https://www.gov.uk/guidance/immigration-rules/...,"[(points-based, 5)]",[],[],[]
1,1,245AAA.General requirements for indefinite lea...,0,,245AAA.General requirements for indefinite lea...,Immigration Rules part 6A: the points-based sy...,html,https://www.gov.uk/guidance/immigration-rules/...,"[(points-based, 3), (sponsor, 3), (date, 7), (...","[(evidence, 2), (certificate, 1)]","[(Appendix A, 1), (Appendix J, 1)]",[https://www.gov.uk/guidance/immigration-rules...
2,2,245AA.Documents not submitted with applications,0,,245AA.Documents not submitted with application...,Immigration Rules part 6A: the points-based sy...,html,https://www.gov.uk/guidance/immigration-rules/...,"[(points-based, 3), (documents, 12), (entry cl...","[(evidence, 2), (letter, 1)]",[],[]
3,3,245A. Specified documents for students previou...,0,,245A. Specified documents for students previou...,Immigration Rules part 6A: the points-based sy...,html,https://www.gov.uk/guidance/immigration-rules/...,"[(points-based, 3), (documents, 6), (applicati...","[(letter, 1)]",[],[]
4,4,Tier 1 (Exceptional Talent) Migrants,0,,Tier 1 (Exceptional Talent) Migrants,Immigration Rules part 6A: the points-based sy...,html,https://www.gov.uk/guidance/immigration-rules/...,"[(points-based, 3), (Tier 1, 3), (Tier 1 (Exce...",[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...
76,14,Tier 4 (Child) Student,2,245ZZA. Entry clearance,245ZZA. Entry clearance\nAll migrants arriving...,Immigration Rules part 6A: the points-based sy...,html,https://www.gov.uk/guidance/immigration-rules/...,"[(points-based, 3), (Tier 4, 3), (child, 6), (...","[(evidence, 1), (certificate, 2)]","[(Appendix A, 1), (Appendix C, 1)]",[https://www.gov.uk/guidance/immigration-rules...
77,14,Tier 4 (Child) Student,3,245ZZB. Period and conditions of grant,245ZZB. Period and conditions of grant\n\n(a) ...,Immigration Rules part 6A: the points-based sy...,html,https://www.gov.uk/guidance/immigration-rules/...,"[(points-based, 3), (Tier 4, 7), (child, 4), (...","[(confirmation, 9), (licence, 1)]",[],[]
78,14,Tier 4 (Child) Student,4,245ZZC. Requirements for leave to remain,245ZZC. Requirements for leave to remain\nTo q...,Immigration Rules part 6A: the points-based sy...,html,https://www.gov.uk/guidance/immigration-rules/...,"[(points-based, 3), (Tier 4, 4), (child, 6), (...","[(evidence, 1), (certificate, 2)]","[(Appendix A, 1), (Appendix C, 1)]",[https://www.gov.uk/guidance/immigration-rules...
79,14,Tier 4 (Child) Student,5,245ZZD. Period and conditions of grant,245ZZD. Period and conditions of grant\n\n(a) ...,Immigration Rules part 6A: the points-based sy...,html,https://www.gov.uk/guidance/immigration-rules/...,"[(points-based, 3), (Tier 4, 5), (child, 2), (...","[(confirmation, 9), (licence, 1)]",[],[]


In [23]:
ex2['reference urls'][78]

['https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-armed-forces',
 'https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-ar-eu',
 'https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-ar-administrative-review',
 'https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-a-attributes',
 'https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-c-maintenance-funds']

### Example 3
from the PDF "English language requirements: skilled workers"

In [24]:
ex3 = pdf_list_of_dfs[3]
ex3

Unnamed: 0,index,section,subsection,page_no,text,url,section title,subsection title,document title,document url,document type,keywords,needed records,references,reference urls
0,0,0.0,0.0,1,\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n ...,https://assets.publishing.service.gov.uk/gover...,,,English language requirements: skilled workers,https://assets.publishing.service.gov.uk/gover...,pdf,"[(language requirements, 4)]",[],[],[]
1,1,0.0,0.0,1,\nVersion 17.0 \n,https://assets.publishing.service.gov.uk/gover...,,,English language requirements: skilled workers,https://assets.publishing.service.gov.uk/gover...,pdf,"[(language requirements, 3)]",[],[],[]
2,2,1.0,0.0,1,\n \n \n \n \nContents \n,https://assets.publishing.service.gov.uk/gover...,Contents,,English language requirements: skilled workers,https://assets.publishing.service.gov.uk/gover...,pdf,"[(language requirements, 3)]",[],[],[]
3,3,1.0,0.0,2,\nContents .....................................,https://assets.publishing.service.gov.uk/gover...,Contents,,English language requirements: skilled workers,https://assets.publishing.service.gov.uk/gover...,pdf,"[(guidance, 2), (points-based, 1), (Tier 1, 2)...","[(evidence, 1), (documentation, 1), (certifica...","[(Appendix W, 3)]",[https://www.gov.uk/guidance/immigration-rules...
4,4,2.0,0.0,3,\n \nAbout this guidance \n,https://assets.publishing.service.gov.uk/gover...,About this guidance,,English language requirements: skilled workers,https://assets.publishing.service.gov.uk/gover...,pdf,"[(guidance, 3), (language requirements, 3)]",[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,79,14.0,1.0,23,"\nDoctors, dentists, nurses and midwives appl...",https://assets.publishing.service.gov.uk/gover...,Applications under Tier 2 (General),"English language requirement for doctors, dent...",English language requirements: skilled workers,https://assets.publishing.service.gov.uk/gover...,pdf,"[(nurses, 2), (midwives, 2), (Tier 2 (General)...","[(certificate, 1)]",[],[]
80,80,14.0,1.0,23,\n• 2211 – Medical Practitioners \n,https://assets.publishing.service.gov.uk/gover...,Applications under Tier 2 (General),"English language requirement for doctors, dent...",English language requirements: skilled workers,https://assets.publishing.service.gov.uk/gover...,pdf,"[(2211, 1), (language requirements, 4), (Tier ...",[],[],[]
81,81,14.0,1.0,23,• 2215 – Dental Practitioners \n• 2231 – Nur...,https://assets.publishing.service.gov.uk/gover...,Applications under Tier 2 (General),"English language requirement for doctors, dent...",English language requirements: skilled workers,https://assets.publishing.service.gov.uk/gover...,pdf,"[(2215, 1), (Dental practitioners, 1), (2231, ...",[],[],[]
82,82,15.0,0.0,23,\n \nTransitional arrangements \n,https://assets.publishing.service.gov.uk/gover...,Transitional arrangements,Transitional arrangements,English language requirements: skilled workers,https://assets.publishing.service.gov.uk/gover...,pdf,"[(language requirements, 3)]",[],[],[]


In [25]:
#A closer look at line 75:
print("Text: ", ex3['text'][75]) 
print("Document Title: ",ex3['document title'][75]) 
print("Section Title: ",ex3['section title'][75]) 
print("Subsection Title: ",ex3['subsection title'][75]) 
print("Keywords: ",ex3['keywords'][75]) 
print("References: ",ex3['references'][75])                
print("Reference URLs: ",ex3['reference urls'][75])

Text:   
There are no English language requirements for the following routes: 
 
•  Tier 1 (Investor) 
•  Tier 1 (Exceptional talent) 
•  Tier 2 (Intra-company transfer) 
•  Tier 5 
•  Global Talent (Appendix W Worker) 
 
Related content  
Contents 
Applications under Tier 2 (General)  
Transitional arrangements 
Appendix W 

Document Title:  English language requirements: skilled workers
Section Title:  Exemptions from the language requirements or no requirement
Subsection Title:  No English language requirements
Keywords:  [('language requirements', 7), ('Tier 1', 2), ('Tier 1 (Investor)', 1), ('Tier 1 (Exceptional Talent)', 1), ('Tier 2', 1), ('Company', 1), ('Tier 5', 1), ('Global talent', 1), ('Tier 2 (General)', 1), ('requirement', 2)]
References:  [('Appendix W', 2)]
Reference URLs:  ['https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-w-immigration-rules-for-workers']
