# Labels
Here we take the dataframes of scraped pdfs and htmls and create a new dataframe for each of them.
We create a new dataframe for each html document that has additional columns for general labels ("keywords"),
labels for the needed paperwork ("needed_records"), and
labels for references to other documents ("references" and "reference urls").

In [24]:
###------- TODO: change ------------
# pip freeze?:
#!pip install https://github.com/elyase/geotext/archive/master.zip
from keywords import * 
###-------------------------------

import pandas as pd
import pickle
import os, re
from geotext import GeoText
from collections import Counter
from pathlib import Path
pd.options.mode.chained_assignment = None 

In [25]:
# For reproduceability use the same path as where the nationbetter 
# package stored the data!
# output_path = '~/S2DS/Nation.Better/nationbetter_data'
# output_path = os.path.expanduser(output_path)
output_path =  os.path.abspath(os.path.join('..','data_nationbetter'))

formatted_html_path = os.path.join(output_path,'formatted_html_dfs')
raw_html_dicts = os.path.join(output_path,'raw_html_dicts')
formatted_html_pkls = os.listdir(formatted_html_path)
html_df_pickles = os.listdir(formatted_html_path)
html_raw_dict_pickles = os.listdir(raw_html_dicts)
raw_pdf_dicts = os.path.join(output_path,'raw_pdf_dicts')
pdf_raw_dict_pickles = os.listdir(raw_pdf_dicts)
formatted_pdf_path = os.path.join(output_path,'formatted_pdf_dfs')
formatted_pdf_pkls = os.listdir(formatted_pdf_path)

In [26]:
#load all pdfs
pdfs =[]
for file in formatted_pdf_pkls:
    df = pickle.load(open(os.path.join(formatted_pdf_path,file),'rb'))
    pdfs.append(df)
    df[df['page_no']==1]

for df in pdfs:
    title_idx =df[(df['text_type']=='title')&(df['page_no']==1)].index[0]
    has_sub = df[(df['text_type']=='subtitle')&(df['page_no']==1)].index
    if title_idx != 0:
        df.drop([title_idx-1],inplace = True)
    if list(has_sub):
        if has_sub[0]>title_idx:
            new_title= df.iloc[title_idx].text + df.iloc[has_sub[0]].text
            df.at[0,'text']= new_title

In [27]:
# list of pdf URLs
pdf_urls = [pdfs[i]['url'][1] for i in range(len(pdfs))]

In [28]:
for df in pdfs:
    title_index = df[df['text_type']=='title'].index.min()
    df.drop(df[~(df.index>=title_index)].index,inplace=True)

In [29]:
# This part of the code fixes a bug dropping the "empty" subsection counter columns in 
# pdfs[0] = Tier 2 Policy Guidance. There are however more issues and this fix should
# be implemented on a deeper leve
keylist = []
for df in pdfs:
    keylist.append(df.keys())
keylist
#dropping empty (sub)title columns
for df_no in range(len(pdfs)):
    find_sec = re.compile(r'\w*title$')
    find=list(filter(None,[find_sec.search(key) for key in keylist[df_no]]))
    old_keys = [key.group() for key in find]
    for key in old_keys:
        if len(pdfs[df_no][key].unique())==1:
            pdfs[df_no].drop(key,axis =1,inplace=True)
#rebuild keylist for new columns
keylist = []
for df in pdfs:
    keylist.append(df.keys())

In [30]:
#renaming the columns in the dataframe with the convention of html 
for df_no in range(len(pdfs)):
    find_sec = re.compile(r'\w*title$')
    find=list(filter(None,[find_sec.search(key) for key in keylist[df_no]]))
    old_keys = [key.group() for key in find]
    #if the dataframe of pdfs[0] is fixed and keys still need to be dropped
    # insert here some code that renames missing keys and columns
    # ['title','subsubsection'] -> ['title','subtitle']
    keep_keys = keylist[df_no].drop(old_keys)
    find_sec = re.compile(r'(title)$')
    final_keys = [find_sec.sub(r'section',key) for key in keylist[df_no]]
    rename_keys = pd.Index(final_keys).drop(keep_keys)
    rename_dict = dict(zip(old_keys,rename_keys))
    pdfs[df_no].rename(columns = rename_dict,inplace = True)

In [31]:
#write function that can be iterated over to obtain section title columns
#for single df
# df_no = 1
# Sorting the keys into agg_keys, which will generate the column names
# and dropping unwanted keys
keylist = []
for df in pdfs:
    keylist.append(df.keys())

def get_title_col_pdf(df,sec_col='text'):
    '''
    Taking a column (of section counters) and check weither a section
    transition occurs. If yes store in column [(sub)*section]_title
    the tite (assuming it happpends) on the first line of the section
    '''
    #Checking if the (sub)section title changes if yes return true
    is_sec_title = (df[sec_col]-df[sec_col].shift().bfill()).astype(bool)
    colname = sec_col + ' title'  
    return is_sec_title, colname


for num, df in enumerate(pdfs):
    #get the keys
    keylist = df.keys()
    find_sec = re.compile(r'\w*section$')
    find=list(filter(None,[find_sec.search(key) for key in keylist]))
    agg_keys = [key.group() for key in find]
    sec_keys = agg_keys + ['page_no','text','url']
    #reduce the dataframe
    df = df[sec_keys]

    #call function
    for key in agg_keys:
        is_sec_title, colname = get_title_col_pdf(df,key)
        #Selects string column where is_sec_title true, returns section name
        titlecol = df[is_sec_title].text.replace('\s+', ' ', regex=True).str.strip()
        titlecol.name = colname
        df = pd.concat([df,titlecol],axis = 1)
        df[colname].fillna(method='ffill',inplace=True)

    #Generate dict for aggregation of text in (sub)*sections
    dex = df.keys().drop(agg_keys+['text']) 
    agg_fn = ['first' for no_dex in range(len(dex))]
    agg_arg = dict(zip(dex,agg_fn))
    agg_arg.update({'text':' '.join})
    df.groupby(agg_keys,as_index = False).agg(agg_arg)
   # df.rename(columns={'text':'string'},inplace=True)
    pdfs[num]=df

In [32]:
# indexing in all dataframes should start at 0
pdfs = [pdfs[i].reset_index() for i in range(len(pdfs))]

# list of all pdf titles
pdf_titles = []
for df in pdfs:
    title_loc = df.index.min()
    pdf_titles.append(df.text.replace('\s+', ' ', regex=True).str.strip()[title_loc])

In [33]:
#load all htmls
htmls =[]
for file in formatted_html_pkls:
    df = pickle.load(open(os.path.join(formatted_html_path,file),'rb'))
    htmls.append(df)

In [34]:
def get_title_col_html(df,sec_col='string'):
    #Checking if the (sub)section title changes if yes return true
    is_sec_title = (df[sec_col]-df[sec_col].shift().bfill()).astype(bool)
    colname = sec_col + ' title'
    #Selects string column where is_sec_title true, returns section name
    df[colname]= df[is_sec_title].string
    df[colname].fillna(method='ffill',inplace = True)

def agg_subsections_html(df):
    #get columns which need to be added 
    list_to_new_keys = ['section','subsection']
    for key in list_to_new_keys:
        get_title_col_html(df,key)
    df.groupby(['section','subsection'],as_index = False).agg({'string':' '.join,'section title':'first','subsection title':'first'},inplace = True)

for df in htmls:
    agg_subsections_html(df)

In [35]:
# rename column "string" in the html dataframes as "text"
for i in range(len(htmls)):
    htmls[i].rename(columns = {'string':'text'}, inplace = True)

# html titles and URLs
html_titles = []
html_urls = []
for i in range(len(htmls)):
    raw_dict = pickle.load(open(os.path.join(raw_html_dicts,html_raw_dict_pickles[i]),'rb'))
    html_titles.append(raw_dict["title"])
    html_urls.append(raw_dict["URL"])

In [36]:
# the columns for the labels are added, some entries are set to empty lists

# for html
for i in range(len(htmls)):
    htmls[i]['document title'] = html_titles[i]    
    htmls[i]['document url'] = html_urls[i]
    htmls[i]['document type'] = "html" 
    htmls[i]['keywords'] = [[] for _ in range(len(htmls[i]))]
    htmls[i]['needed records'] = [[] for _ in range(len(htmls[i]))]
    htmls[i]['references'] = [[] for _ in range(len(htmls[i]))]
    htmls[i]['reference urls'] = [[] for _ in range(len(htmls[i]))]

# for pdf
for i in range(len(pdfs)):
    pdfs[i]['document title'] = pdf_titles[i]
    pdfs[i]['document url'] = pdf_urls[i]
    pdfs[i]['document type'] = "pdf" 
    pdfs[i]['keywords'] = [[] for _ in range(len(pdfs[i]))]
    pdfs[i]['needed records'] = [[] for _ in range(len(pdfs[i]))]
    pdfs[i]['references'] = [[] for _ in range(len(pdfs[i]))]
    pdfs[i]['reference urls']=[[] for _ in range(len(pdfs[i]))]  

### Keyword lists and dictionaries

In [37]:
# find mentioning of "Appendix XYZ and "Part XYZ"
pattern_appendix = r"[Aa]ppendix [0-9]*[A-Za-z]*[-]*\s*[A-Za-z]*\s*[(]*[A-Za-z]*\s*[A-Za-z]*[)]*"
pattern_part = r"[Pp]art [0-9]*[A-Za-z]*\s*[A-Za-z]*\s*[(]*[A-Za-z]*\s*[A-Za-z]*[)]*"

In [38]:
# the list of all document titles is updated with all expressions from pattern_appendix
# and pattern_part that appear in the list of documents

ref_in_title_list = html_titles + pdf_titles

for i in range(len(htmls + pdfs)):
    
    ref_appendix = re.findall(pattern_appendix, str(ref_in_title_list[i]) )
    ref_part = re.findall(pattern_part, str(ref_in_title_list[i]) )
    
    if ref_part!=[]:
        ref_in_title_list+= ref_part
     
    if ref_appendix!=[]:
        ref_in_title_list+= ref_appendix
        
    else: 
        continue
        
keyword_processor_ref.add_keywords_from_list(ref_in_title_list)

The "keywords", "needed records" and "references" columns are updated with all the keywords found

In [39]:
# keywords for htmls
for i in range(len(htmls)):
    doctitle=html_titles[i]
    for j in range(len(htmls[i])):       
        sectitle=str(htmls[i]['section title'][j])
        subsectitle=str(htmls[i]['subsection title'][j])
        # more weight is put to words appearing in titles
        text=3*(doctitle + " ")  + 2*(sectitle + " ")+ subsectitle + " " +htmls[i]['text'][j]
        #The regional keywords are updated with all names of countries except "United Kingdom"
        countrylist=list(Counter(GeoText(text).countries).items())
        count_minus_uk = [t for t in countrylist if (t[0] != 'United Kingdom')] 
        keywords_all=list(Counter(keyword_processor_all.extract_keywords(text)).items())
        htmls[i]['keywords'][j]=keywords_all +count_minus_uk
        keywords_rec=list(Counter(keyword_processor_rec.extract_keywords(text)).items())
        htmls[i]['needed records'][j]=keywords_rec
        keywords_ref = list(Counter(keyword_processor_ref.extract_keywords(text)).items())
        # removing self-references
        keywords_refminusself = [t for t in keywords_ref if (t[0] not in subsectitle) if (t[0] not in sectitle) if (t[0] not in doctitle)]
        htmls[i]['references'][j]=keywords_refminusself

In [40]:
# keywords for pdf

for i in range(len(pdfs)):
    doctitle = pdf_titles[i]
    for j in range(len(pdfs[i])):
        sectitle = ""
        if 'section title' in pdfs[i].columns:
            sectitle = str(pdfs[i]['section title'][j])
        subsectitle = ""
        if 'subsection title' in pdfs[i].columns:
            subsectitle = str(pdfs[i]['subsection title'][j])
        subsubsectitle = ""
        if 'subsubsection title' in pdfs[i].columns:
            subsubsectitle = str(pdfs[i]['subsubsection title'][j])
        subsubsubsectitle = ""
        if 'subsubsubsection title' in pdfs[i].columns:
            subsubsubsectitle = str(pdfs[i]['subsubsubsection title'][j])
        subsubsubsubsectitle = ""
        if 'subsubsubsubsection title' in pdfs[i].columns:
            subsubsubsubsectitle = str(pdfs[i]['subsubsubsubsection title'][j])
        # more weight is put to words appearing in titles
        text = (pdfs[i]['text'][j]  + " " + 3*(doctitle + " ")  + 2*(sectitle + " ")
                + subsectitle + " " + subsubsectitle + " " 
                + subsubsubsectitle + " " + subsubsubsubsectitle) 
        #The regional keywords are updated with all names of countries except "United Kingdom"
        countrylist=list(Counter(GeoText(text).countries).items())
        count_minus_uk = [t for t in countrylist if (t[0] != 'United Kingdom')] 
        keywords_all=list(Counter(keyword_processor_all.extract_keywords(text)).items())
        pdfs[i]['keywords'][j]=keywords_all +count_minus_uk
        keywords_rec=list(Counter(keyword_processor_rec.extract_keywords(text)).items())
        pdfs[i]['needed records'][j]=keywords_rec
        keywords_ref = list(Counter(keyword_processor_ref.extract_keywords(text)).items())
        # removing self-references
        keywords_refminusself = [t for t in keywords_ref if (t[0] not in subsectitle) if (t[0] not in sectitle) if (t[0] not in doctitle)]
        pdfs[i]['references'][j]=keywords_refminusself

In [41]:
# update column "reference urls" with urls corresponding to titles that appear in the "references" column

all_titles = html_titles + pdf_titles
all_urls = html_urls + pdf_urls

#for html documents
for i in range(len(htmls)):
    for j in range(len(htmls[i])):      
        tuple_list= htmls[i]['references'][j]
        url_for_ref = []
        if tuple_list != []: 
            value_list = [itm[0] for itm in tuple_list]
            for ref in  value_list:
                for longtitle in all_titles:
                    if ref in longtitle:
                        idx = all_titles.index(longtitle)
                        item = all_urls[idx]
                        url_for_ref.append(item)
        htmls[i]['reference urls'][j] = url_for_ref

#for PDFs
for i in range(len(pdfs)):
    for j in range(len(pdfs[i])):  
        tuple_list= pdfs[i]['references'][j]
        url_for_ref = []
        if tuple_list != []: 
            value_list = [itm[0] for itm in tuple_list]
            for ref in  value_list:
                for longtitle in all_titles:
                    if ref in longtitle:
                        idx = all_titles.index(longtitle)
                        item = all_urls[idx]
                        url_for_ref.append(item)
        pdfs[i]['reference urls'][j] = url_for_ref

### Pickle resulting list of dataframes
We create a combined list that contains both the pdf and the html dataframes.
This listed is pickled as 'labeled_immigration_rules.pickle'.

In [49]:
# The first pdf has such a different format that we have to leave it out for now.
pdfs=pdfs[1:]

list_of_dataframes = pdfs + htmls
outpath = os.path.join('..','data_nationbetter','labeled_corpus.pkl')
with open(outpath,'wb') as f:
    pickle.dump(list_of_dataframes,f, pickle.HIGHEST_PROTOCOL)

## Some examples

### Example 1: Appendix Armed Forces
from https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-armed-forces

In [43]:
ex1=htmls[2]  
ex1

Unnamed: 0,section,subsection,is table,text,section title,subsection title,document title,document url,document type,keywords,needed records,references,reference urls
0,0,0,False,Immigration Rules Appendix A: attributes\nPoin...,,,Immigration Rules Appendix 7: overseas workers...,https://www.gov.uk/guidance/immigration-rules/...,html,"[(points, 1), (Tier 1, 1), (Tier 5, 1), (point...",[],"[(Immigration Rules Appendix A: attributes, 1)]",[https://www.gov.uk/guidance/immigration-rules...
1,1,0,False,Attributes for Tier 1 (Exceptional Talent) Mig...,Attributes for Tier 1 (Exceptional Talent) Mig...,,Immigration Rules Appendix 7: overseas workers...,https://www.gov.uk/guidance/immigration-rules/...,html,"[(Tier 1, 3), (Tier 1 (Exceptional Talent), 3)]",[],[],[]
2,1,0,False,\n1. An applicant applying for indefinite leav...,Attributes for Tier 1 (Exceptional Talent) Mig...,,Immigration Rules Appendix 7: overseas workers...,https://www.gov.uk/guidance/immigration-rules/...,html,"[(Tier 1, 3), (Tier 1 (Exceptional Talent), 3)...",[],[],[]
3,1,1,False,Table 1,Attributes for Tier 1 (Exceptional Talent) Mig...,Table 1,Immigration Rules Appendix 7: overseas workers...,https://www.gov.uk/guidance/immigration-rules/...,html,"[(Tier 1, 2), (Tier 1 (Exceptional Talent), 2)]",[],[],[]
4,1,2,False,Applications for indefinite leave to remain,Attributes for Tier 1 (Exceptional Talent) Mig...,Applications for indefinite leave to remain,Immigration Rules Appendix 7: overseas workers...,https://www.gov.uk/guidance/immigration-rules/...,html,"[(Tier 1, 2), (Tier 1 (Exceptional Talent), 2)]",[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,41,1,True,Criterion Point...,Attributes for Tier 4 (Child) Students,Table 17,Immigration Rules Appendix 7: overseas workers...,https://www.gov.uk/guidance/immigration-rules/...,html,"[(Tier 4, 2), (child, 2), (points, 1)]","[(confirmation, 1)]",[],[]
149,41,2,False,Notes,Attributes for Tier 4 (Child) Students,Notes,Immigration Rules Appendix 7: overseas workers...,https://www.gov.uk/guidance/immigration-rules/...,html,"[(Tier 4, 2), (child, 2)]",[],[],[]
150,41,2,False,\n124. A Confirmation of Acceptance for Studie...,Attributes for Tier 4 (Child) Students,Notes,Immigration Rules Appendix 7: overseas workers...,https://www.gov.uk/guidance/immigration-rules/...,html,"[(Tier 4, 6), (child, 4), (Independent School,...","[(confirmation, 10), (licence, 2), (evidence, 1)]","[(Appendix H, 1)]",[https://www.gov.uk/guidance/immigration-rules...
151,42,0,False,Specified documents,Specified documents,Specified documents,Immigration Rules Appendix 7: overseas workers...,https://www.gov.uk/guidance/immigration-rules/...,html,"[(documents, 4)]",[],[],[]


### Example 2: Part 6a
from https://www.gov.uk/guidance/immigration-rules/immigration-rules-part-6a-the-points-based-system

In [44]:
ex2 = htmls[45]
ex2

Unnamed: 0,section,subsection,is table,text,section title,subsection title,document title,document url,document type,keywords,needed records,references,reference urls
0,0,0,False,Immigration Rules part 6: self-employment and ...,,,Immigration Rules part 5: working in the UK,https://www.gov.uk/guidance/immigration-rules/...,html,"[(Tier 1 (Investor), 1)]",[],[(Immigration Rules part 6: self-employment an...,[https://www.gov.uk/guidance/immigration-rules...
1,1,0,False,Person intending to establish themselves in bu...,Person intending to establish themselves in bu...,,Immigration Rules part 5: working in the UK,https://www.gov.uk/guidance/immigration-rules/...,html,[],[],[],[]
2,1,1,False,Requirements for leave to enter the United Kin...,Person intending to establish themselves in bu...,Requirements for leave to enter the United Kin...,Immigration Rules part 5: working in the UK,https://www.gov.uk/guidance/immigration-rules/...,html,[],[],[],[]
3,1,1,False,\n200. Deleted on 30 June 2008 by paragraph 17...,Person intending to establish themselves in bu...,Requirements for leave to enter the United Kin...,Immigration Rules part 5: working in the UK,https://www.gov.uk/guidance/immigration-rules/...,html,[],[],"[(Appendix F, 4)]",[https://www.gov.uk/guidance/immigration-rules...
4,1,2,False,Leave to enter the United Kingdom as a person ...,Person intending to establish themselves in bu...,Leave to enter the United Kingdom as a person ...,Immigration Rules part 5: working in the UK,https://www.gov.uk/guidance/immigration-rules/...,html,[],[],[],[]
5,1,2,False,\n204. Deleted on 30 June 2008 by paragraph 17...,Person intending to establish themselves in bu...,Leave to enter the United Kingdom as a person ...,Immigration Rules part 5: working in the UK,https://www.gov.uk/guidance/immigration-rules/...,html,[],[],"[(Appendix F, 1)]",[https://www.gov.uk/guidance/immigration-rules...
6,1,3,False,Refusal of leave to enter the United Kingdom a...,Person intending to establish themselves in bu...,Refusal of leave to enter the United Kingdom a...,Immigration Rules part 5: working in the UK,https://www.gov.uk/guidance/immigration-rules/...,html,[],[],[],[]
7,1,3,False,\n205. Deleted on 30 June 2008 by paragraph 17...,Person intending to establish themselves in bu...,Refusal of leave to enter the United Kingdom a...,Immigration Rules part 5: working in the UK,https://www.gov.uk/guidance/immigration-rules/...,html,[],[],"[(Appendix F, 1)]",[https://www.gov.uk/guidance/immigration-rules...
8,1,4,False,Requirements for an extension of stay in order...,Person intending to establish themselves in bu...,Requirements for an extension of stay in order...,Immigration Rules part 5: working in the UK,https://www.gov.uk/guidance/immigration-rules/...,html,[],[],[],[]
9,1,4,False,\nDeleted on 30 June 2008 by paragraph 17 of S...,Person intending to establish themselves in bu...,Requirements for an extension of stay in order...,Immigration Rules part 5: working in the UK,https://www.gov.uk/guidance/immigration-rules/...,html,[],[],"[(Appendix F, 10)]",[https://www.gov.uk/guidance/immigration-rules...


In [46]:
ex2['reference urls'][40]

['https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-f-archived-immigration-rules',
 'https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-fm-se-family-members-specified-evidence',
 'https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-fm-family-members']

### Example 3
from the PDF "English language requirements: skilled workers"

In [47]:
ex3 = pdfs[3]
ex3

Unnamed: 0,index,section,subsection,page_no,text,url,section title,subsection title,document title,document url,document type,keywords,needed records,references,reference urls
0,0,0.0,0.0,1,\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n ...,https://assets.publishing.service.gov.uk/gover...,,,English language requirements: skilled workers,https://assets.publishing.service.gov.uk/gover...,pdf,"[(language requirements, 4)]",[],[],[]
1,1,0.0,0.0,1,\nVersion 17.0 \n,https://assets.publishing.service.gov.uk/gover...,,,English language requirements: skilled workers,https://assets.publishing.service.gov.uk/gover...,pdf,"[(language requirements, 3)]",[],[],[]
2,2,1.0,0.0,1,\n \n \n \n \nContents \n,https://assets.publishing.service.gov.uk/gover...,Contents,,English language requirements: skilled workers,https://assets.publishing.service.gov.uk/gover...,pdf,"[(language requirements, 3)]",[],[],[]
3,3,1.0,0.0,2,\nContents .....................................,https://assets.publishing.service.gov.uk/gover...,Contents,,English language requirements: skilled workers,https://assets.publishing.service.gov.uk/gover...,pdf,"[(guidance, 2), (points-based, 1), (Tier 1, 2)...","[(evidence, 1), (documentation, 1), (certifica...","[(Appendix W, 3)]",[https://www.gov.uk/guidance/immigration-rules...
4,4,2.0,0.0,3,\n \nAbout this guidance \n,https://assets.publishing.service.gov.uk/gover...,About this guidance,,English language requirements: skilled workers,https://assets.publishing.service.gov.uk/gover...,pdf,"[(guidance, 3), (language requirements, 3)]",[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,79,14.0,1.0,23,"\nDoctors, dentists, nurses and midwives appl...",https://assets.publishing.service.gov.uk/gover...,Applications under Tier 2 (General),"English language requirement for doctors, dent...",English language requirements: skilled workers,https://assets.publishing.service.gov.uk/gover...,pdf,"[(nurses, 2), (midwives, 2), (Tier 2 (General)...","[(certificate, 1)]",[],[]
80,80,14.0,1.0,23,\n• 2211 – Medical Practitioners \n,https://assets.publishing.service.gov.uk/gover...,Applications under Tier 2 (General),"English language requirement for doctors, dent...",English language requirements: skilled workers,https://assets.publishing.service.gov.uk/gover...,pdf,"[(2211, 1), (language requirements, 4), (Tier ...",[],[],[]
81,81,14.0,1.0,23,• 2215 – Dental Practitioners \n• 2231 – Nur...,https://assets.publishing.service.gov.uk/gover...,Applications under Tier 2 (General),"English language requirement for doctors, dent...",English language requirements: skilled workers,https://assets.publishing.service.gov.uk/gover...,pdf,"[(2215, 1), (Dental practitioners, 1), (2231, ...",[],[],[]
82,82,15.0,0.0,23,\n \nTransitional arrangements \n,https://assets.publishing.service.gov.uk/gover...,Transitional arrangements,Transitional arrangements,English language requirements: skilled workers,https://assets.publishing.service.gov.uk/gover...,pdf,"[(language requirements, 3)]",[],[],[]


In [48]:
#A closer look at line 75:
print("Text: ", ex3['text'][75]) 
print("Document Title: ",ex3['document title'][75]) 
print("Section Title: ",ex3['section title'][75]) 
print("Subsection Title: ",ex3['subsection title'][75]) 
print("Keywords: ",ex3['keywords'][75]) 
print("References: ",ex3['references'][75])                
print("Reference URLs: ",ex3['reference urls'][75])

Text:   
There are no English language requirements for the following routes: 
 
•  Tier 1 (Investor) 
•  Tier 1 (Exceptional talent) 
•  Tier 2 (Intra-company transfer) 
•  Tier 5 
•  Global Talent (Appendix W Worker) 
 
Related content  
Contents 
Applications under Tier 2 (General)  
Transitional arrangements 
Appendix W 

Document Title:  English language requirements: skilled workers
Section Title:  Exemptions from the language requirements or no requirement
Subsection Title:  No English language requirements
Keywords:  [('language requirements', 7), ('Tier 1', 2), ('Tier 1 (Investor)', 1), ('Tier 1 (Exceptional Talent)', 1), ('Tier 2', 1), ('Company', 1), ('Tier 5', 1), ('Global talent', 1), ('Tier 2 (General)', 1), ('requirement', 2)]
References:  [('Appendix W', 2)]
Reference URLs:  ['https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-w-immigration-rules-for-workers']
