In [46]:
import numpy as np
import pandas as pd

import spacy
from flashtext import KeywordProcessor

import en_core_web_sm
import en_core_web_lg

from web_scraping_lib import *
from text_wrangling_utils import *




In [47]:
# documents relevant for Tier 2 visa
# GOV.UK Immigration Rules guidance pages: the index, Part 6A and Appendices A, C and B.
url_list = [
    "https://www.gov.uk/guidance/immigration-rules/immigration-rules-index",
    "https://www.gov.uk/guidance/immigration-rules/immigration-rules-part-6a-the-points-based-system",
    "https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-a-attributes",
    "https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-c-maintenance-funds",
    "https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-b-english-language"
]

# First entry of the list (the rules index), printed as a quick sanity check.
test_url = url_list[0]
print(test_url)

https://www.gov.uk/guidance/immigration-rules/immigration-rules-index


In [48]:
# Build one DataFrame from all URLs (the output below shows one row per URL).
# NOTE(review): map_df and scrape_govuk_guidance are not defined in this notebook —
# presumably they come from the star imports in the first cell; confirm which module provides them.
df = map_df(scrape_govuk_guidance,url_list)
df.head()

Unnamed: 0,URL,title,summary,text_dump,text_segmented,hyperlinks_dump,timestamp
0,https://www.gov.uk/guidance/immigration-rules/...,Immigration Rules: Index,The rules are divided into different documents...,\nImmigration Rules: Index\nThe rules are divi...,"[(text, Immigration Rules: Index), (text, The ...",[https://www.gov.uk/guidance/immigration-rules...,2020-08-27T18:18:10+00:00
1,https://www.gov.uk/guidance/immigration-rules/...,Immigration Rules part 6A: the points-based sy...,Points-based system (paragraphs 245AAA to 245Z...,\nImmigration Rules part 6A: the points-based ...,"[(text, Immigration Rules part 6A: the points-...",[],2020-08-27T18:18:11+00:00
2,https://www.gov.uk/guidance/immigration-rules/...,Immigration Rules Appendix A: attributes,Points needed for attributes for applicants in...,\nImmigration Rules Appendix A: attributes\nPo...,"[(text, Immigration Rules Appendix A: attribut...",[http://www.oanda.com],2020-08-27T18:18:12+00:00
3,https://www.gov.uk/guidance/immigration-rules/...,Immigration Rules Appendix C: maintenance (funds),Maintenance (funds),\nImmigration Rules Appendix C: maintenance (f...,"[(text, Immigration Rules Appendix C: maintena...",[],2020-08-27T18:18:14+00:00
4,https://www.gov.uk/guidance/immigration-rules/...,Immigration Rules Appendix B: English language,English Language,\nImmigration Rules Appendix B: English langua...,"[(text, Immigration Rules Appendix B: English ...",[],2020-08-27T18:18:15+00:00


In [49]:
text_list = df['text_segmented'][1]

In [50]:
text_list

[('text', 'Immigration Rules part 6A: the points-based system'),
 ('text', 'Points-based system (paragraphs 245AAA to 245ZZE).'),
 ('section', '245AAA.General requirements for indefinite leave to remain'),
 ('text',
  'The following rules apply to all requirements for indefinite leave to remain in Part 6A and Appendix A:'),
 ('text',
  '\n(a) References to a “continuous period” “lawfully in the UK” means, subject to paragraph (e), residence in the UK for an unbroken period with valid leave, and for these purposes a period shall be considered unbroken where:\n    \n(i)\tthe applicant has not been absent from the UK for more than 180 days during any 12 month period in the continuous period, except that:\n        \n(1) any absence from the UK for the purpose of assisting with a national or international humanitarian or environmental crisis overseas shall not count towards the 180 days, if the applicant provides evidence that this was the purpose of the absence(s) and that their Sponsor, i

In [51]:
# NOTE: text_list is a list of (label, text) pairs whose labels repeat ('text', 'section'),
# so dict() keeps only the last text seen for each label — a quick peek, not a lossless conversion.
dict(text_list)

{'text': '\nThe specified documents, details and requirements of care arrangements referred to in paragraph 245ZZA(f) and paragraph 245ZZC(g) are:\n    \n(i) The applicant must provide a written letter of undertaking from the intended carer confirming the care arrangement, which shows:\n        \n(1) the name, current address and contact details of the intended carer,\n(2) the address where the carer and the Tier 4 (Child) student will be living in the UK if different from the intended carer’s current address,\n(3) confirmation that the accommodation offered to the Tier 4 (Child) student is a private address, and not operated as a commercial enterprise, such as a hotel or a youth hostel,\n(4) the nature of the relationship between the Tier 4 (Child) student’s parent(s) or legal guardian and the intended carer,\n(5) that the intended carer agrees to the care arrangements for the Tier 4 (Child) student,\n(6) that the intended carer has at least £570 per month (up to a maximum of nine mon

In [52]:
""" Function to extract sections from text_segmented """

def extract_sections(list):
    """Group labelled text fragments into one row per section.

    list: iterable of (label, text) tuples. A tuple labelled 'section' opens a
          new section; any other tuple is appended to the section currently
          open. Fragments that appear before the first 'section' are dropped.

    Returns a DataFrame with one row per section: column 0 holds
    'Section <heading>', column 1 holds the heading followed by all of the
    section's text, space-separated. Newlines and tabs inside every fragment
    are replaced by single spaces. With no sections the DataFrame is empty.
    """
    rows = []
    current_row = None  # the [title, body] pair of the section being filled

    for label, fragment in list:
        cleaned = fragment.replace('\n', ' ').replace('\t', ' ')

        if label == 'section':
            current_row = ['Section ' + cleaned, ' ' + cleaned]
            rows.append(current_row)
        elif current_row is not None:
            # current_row is the same object stored in rows, so this updates it in place
            current_row[1] += ' ' + cleaned

    return pd.DataFrame(rows)

In [53]:
df_sections = extract_sections(df['text_segmented'][1])

In [54]:
# extract_sections already returns a DataFrame, so it is not wrapped again here.
# Show the body text (column label 1) of the section at row label 3,
# using a single .loc lookup instead of chained [col][row] indexing.
df_sections.loc[3, 1]

' Tier 1 (Exceptional Talent) Migrants 245B. Purpose  (a)(i) This category is now closed to new applicants.      (ii) Individuals who have entry clearance, leave to enter or leave to remain as a Tier 1 (Exceptional Talent) Migrant may apply under these Rules for indefinite leave to remain. (iii) Other applicants who wish to work in the UK as a talented or promising individual in a particular field, including those who have been endorsed by an endorsing body (but have not yet obtained leave to enter or leave to remain in this capacity) and applicants who wish to extend Tier 1 (Exceptional Talent) leave, may apply under the rules for the Global Talent category which are set out in Appendix W.    245BA. DELETED 245BB. DELETED 245BC. DELETED 245BD. DELETED 245BE. DELETED 245BF.Requirements for indefinite leave to remain To qualify for indefinite leave to remain, a Tier 1 (Exceptional Talent) Migrant must meet the requirements listed below. If the applicant meets these requirements, indefin

In [55]:
""" Function to organize the url_list documents into sections from text_segmented """

def url_docs_sections(scraped_urls):
    """Split every scraped document into sections, keyed by document title.

    Input-
    scraped_urls: a dataframe as the output of map_df(scrape_govuk_guidance,url_list);
                  it must provide the columns 'title' and 'text_segmented'.
    Output-
    dictionary whose keys are the document titles and whose values are the
    DataFrames returned by extract_sections for the matching 'text_segmented'
    entry. If two rows share a title, the later row replaces the earlier one.
    """
    # Read from the argument (not the notebook-level ``df``) and pair the two
    # columns positionally, so the result does not depend on global state or
    # on the frame having a default 0..n-1 index.
    return {
        title: extract_sections(segments)
        for title, segments in zip(scraped_urls['title'], scraped_urls['text_segmented'])
    }

In [56]:
docs_sections_dic = url_docs_sections(df)

In [57]:
list(docs_sections_dic.values())[1]

Unnamed: 0,0,1
0,Section 245AAA.General requirements for indefi...,245AAA.General requirements for indefinite le...
1,Section 245AA.Documents not submitted with app...,245AA.Documents not submitted with applicatio...
2,Section 245A. Specified documents for students...,245A. Specified documents for students previo...
3,Section Tier 1 (Exceptional Talent) Migrants,Tier 1 (Exceptional Talent) Migrants 245B. Pu...
4,Section Tier 1 (General) Migrants,Tier 1 (General) Migrants 245C.Purpose DELETE...
5,Section Tier 1 (Entrepreneur) Migrants,Tier 1 (Entrepreneur) Migrants 245D. Purpose ...
6,Section Tier 1 (Investor) Migrants,Tier 1 (Investor) Migrants 245E. Purpose This...
7,Section Tier 1 (Graduate Entrepreneur) Migrants,Tier 1 (Graduate Entrepreneur) Migrants 245F....
8,Section Tier 2 (Intra-Company Transfer) Migrants,Tier 2 (Intra-Company Transfer) Migrants 245G...
9,"Section Tier 2 (General) Migrants, Tier 2 (Min...","Tier 2 (General) Migrants, Tier 2 (Minister o..."


In [58]:
labels_docs = pd.read_csv('Labels_from_legal_corpus.csv')

In [59]:
labels_docs.head()

Unnamed: 0,Types of Visa,Keywords - Joao,Tier 2 and 5: Guidance for Sponsors -Addendum (pages 36 - 61) Keywords - Bernhard
0,Tier 1,Sponsor,authorising officer
1,Exceptional Talent,Restrictions,B-rating
2,Enterpreneur,Skill,bankruptcy
3,Investor,Pay,branch
4,Tier 2,Close relative,certificates of sponsorship


In [60]:
# Visa-type labels used as keywords below; dropna() discards the missing entries of this column.
# NOTE(review): the label source spells 'Enterpreneur' (see the output below), so the word
# 'Entrepreneur' in the rules text presumably never matches this keyword — confirm and fix the CSV.
types_of_visa = labels_docs['Types of Visa'].dropna()

In [61]:
types_of_visa

0                     Tier 1
1         Exceptional Talent
2               Enterpreneur
3                   Investor
4                     Tier 2
5     Intra-Company Transfer
6               Sportsperson
7       Minister of Religion
8                     Tier 3
9                     Tier 4
10                   Student
11             Child Student
12                    Tier 5
13          Temporary Worker
14     Youth Mobility Scheme
Name: Types of Visa, dtype: object

In [62]:
# KeywordProcessor is imported in the notebook's top import cell.
# Create the shared keyword matcher; later cells register the keywords on it
# and use it to extract them from section text.
keyword_processor = KeywordProcessor()

In [63]:
# add keywords
keyword_processor.add_keywords_from_list(list(types_of_visa))

In [64]:
keywords_found = keyword_processor.extract_keywords(df_sections.iloc[:,1][10])

In [65]:
keywords_found

['Tier 5',
 'Youth Mobility Scheme',
 'Tier 5',
 'Youth Mobility Scheme',
 'Tier 5',
 'Youth Mobility Scheme',
 'Tier 5',
 'Youth Mobility Scheme',
 'Sportsperson',
 'Tier 4']

In [66]:
""" Function to count words on a list and return a dictionary"""

def word_count_dict(word_list):
    """Count how often each item occurs in ``word_list``.

    word_list: iterable of hashable items (e.g. the keyword list returned by
               keyword_processor.extract_keywords).

    Returns a dict mapping each distinct item to its number of occurrences,
    with keys in order of first appearance. An empty input gives ``{}``.
    """
    # Single pass instead of calling list.count() once per element (O(n^2)).
    counts = {}
    for word in word_list:
        counts[word] = counts.get(word, 0) + 1
    return counts

In [67]:
""" Function to add an extra column with labels """


def column_labelling_sections(dic, processor=None):
    """Tag every section with raw counts of the keywords found in its text.

    dic: dictionary as in the output of url_docs_sections (title -> sections
         DataFrame). The section text is read from the second column
         (position 1) of each frame.
    processor: optional keyword matcher exposing ``extract_keywords(text)``;
               defaults to the notebook-level ``keyword_processor``.

    Returns the same ``dic`` object. Every non-empty frame is modified in
    place and gains a 'Tags: Types of Visa' column holding one
    {keyword: count} dict per section; empty frames are left untouched.
    """
    if processor is None:
        processor = keyword_processor

    for sections_df in dic.values():
        if len(sections_df) == 0:
            continue

        # Iterate over the column values themselves: a label lookup such as
        # column[i] only works when the frame has a default 0..n-1 index.
        sections_df['Tags: Types of Visa'] = [
            word_count_dict(processor.extract_keywords(text))
            for text in sections_df.iloc[:, 1]
        ]

    return dic

In [68]:
# NOTE(review): types_of_visa_tags is not defined in any cell shown in this notebook — presumably
# leftover kernel state or a star-imported helper; confirm, otherwise this fails on Restart & Run All.
types_of_visa_tags(docs_sections_dic)

{'Immigration Rules: Index': Empty DataFrame
 Columns: []
 Index: [],
 'Immigration Rules part 6A: the points-based system':                                                     0  \
 0   Section 245AAA.General requirements for indefi...   
 1   Section 245AA.Documents not submitted with app...   
 2   Section 245A. Specified documents for students...   
 3        Section Tier 1 (Exceptional Talent) Migrants   
 4                   Section Tier 1 (General) Migrants   
 5              Section Tier 1 (Entrepreneur) Migrants   
 6                  Section Tier 1 (Investor) Migrants   
 7     Section Tier 1 (Graduate Entrepreneur) Migrants   
 8    Section Tier 2 (Intra-Company Transfer) Migrants   
 9   Section Tier 2 (General) Migrants, Tier 2 (Min...   
 10  Section Tier 5 (Youth Mobility Scheme) Tempora...   
 11         Section Tier 5 (Temporary Worker) Migrants   
 12                   Section Tier 4 (General) Student   
 13                     Section Tier 4 (Child) Student   
 
    

In [69]:
# Show the labelled sections of the second document (position 1 in the dictionary).
# NOTE(review): types_of_visa_tags is not defined in any cell shown in this notebook — presumably
# leftover kernel state or a star-imported helper; confirm before relying on this cell.
list(types_of_visa_tags(docs_sections_dic).values())[1]

Unnamed: 0,0,1,Labels: Types of Visa
0,Section 245AAA.General requirements for indefi...,245AAA.General requirements for indefinite le...,"{'Tier 2': 0.0023228814161397287, 'Tier 1': 0...."
1,Section 245AA.Documents not submitted with app...,245AA.Documents not submitted with applicatio...,{}
2,Section 245A. Specified documents for students...,245A. Specified documents for students previo...,{}
3,Section Tier 1 (Exceptional Talent) Migrants,Tier 1 (Exceptional Talent) Migrants 245B. Pu...,"{'Tier 1': 0.018059181538397466, 'Exceptional ..."
4,Section Tier 1 (General) Migrants,Tier 1 (General) Migrants 245C.Purpose DELETE...,{'Tier 1': 0.05406722127027579}
5,Section Tier 1 (Entrepreneur) Migrants,Tier 1 (Entrepreneur) Migrants 245D. Purpose ...,"{'Tier 1': 0.009494625919778387, 'Sportsperson..."
6,Section Tier 1 (Investor) Migrants,Tier 1 (Investor) Migrants 245E. Purpose This...,"{'Tier 1': 0.007025790024405602, 'Investor': 0..."
7,Section Tier 1 (Graduate Entrepreneur) Migrants,Tier 1 (Graduate Entrepreneur) Migrants 245F....,"{'Tier 1': 0.03922071315328133, 'Tier 2': 0.03..."
8,Section Tier 2 (Intra-Company Transfer) Migrants,Tier 2 (Intra-Company Transfer) Migrants 245G...,"{'Tier 2': 0.007216526163868854, 'Intra-Compan..."
9,"Section Tier 2 (General) Migrants, Tier 2 (Min...","Tier 2 (General) Migrants, Tier 2 (Minister o...","{'Tier 2': 0.013134321140366374, 'Minister of ..."


In [70]:
""" Function to count words on a list and return a dictionary"""

def word_count_dict(word_list):
    """Count how often each item occurs in ``word_list``.

    word_list: iterable of hashable items (e.g. the keyword list returned by
               keyword_processor.extract_keywords).

    Returns a dict mapping each distinct item to its number of occurrences,
    with keys in order of first appearance. An empty input gives ``{}``.
    """
    # NOTE: this cell re-defines a helper of the same name from an earlier
    # cell; both return the same results, so one of the two can be removed.
    # Single pass instead of calling list.count() once per element (O(n^2)).
    counts = {}
    for word in word_list:
        counts[word] = counts.get(word, 0) + 1
    return counts

In [71]:
""" Function to add an extra column with labels and corresponding tf-idf values """


def column_labelling_sections_norm(dic, processor=None):
    """Tag every section with a length-normalised frequency for each keyword found.

    dic: dictionary as in the output of url_docs_sections (title -> sections
         DataFrame). The section text is read from the second column
         (position 1) of each frame.
    processor: optional keyword matcher exposing ``extract_keywords(text)``;
               defaults to the notebook-level ``keyword_processor``.

    Returns the same ``dic`` object. Every non-empty frame is modified in
    place and gains a 'Labels: Types of Visa' column holding one dict per
    section that maps keyword -> log(1 + count / number_of_words), where
    number_of_words is the whitespace-split length of the section text.
    This is a normalised term frequency only; no inverse-document-frequency
    factor is applied. Empty frames are left untouched.
    """
    if processor is None:
        processor = keyword_processor

    for sections_df in dic.values():
        if len(sections_df) == 0:
            continue

        labels = []
        # Iterate over the column values themselves: a label lookup such as
        # column[i] only works when the frame has a default 0..n-1 index.
        for text in sections_df.iloc[:, 1]:
            num_words = len(text.split())

            # Count each keyword once up front instead of re-counting the
            # whole list for every occurrence.
            counts = {}
            for keyword in processor.extract_keywords(text):
                counts[keyword] = counts.get(keyword, 0) + 1

            # num_words cannot be zero here: a keyword is only present when
            # the text contains at least one word.
            labels.append({
                keyword: np.log(1 + count / num_words)
                for keyword, count in counts.items()
            })

        sections_df['Labels: Types of Visa'] = labels

    return dic

In [72]:
column_labelling_sections_norm(docs_sections_dic)

{'Immigration Rules: Index': Empty DataFrame
 Columns: []
 Index: [],
 'Immigration Rules part 6A: the points-based system':                                                     0  \
 0   Section 245AAA.General requirements for indefi...   
 1   Section 245AA.Documents not submitted with app...   
 2   Section 245A. Specified documents for students...   
 3        Section Tier 1 (Exceptional Talent) Migrants   
 4                   Section Tier 1 (General) Migrants   
 5              Section Tier 1 (Entrepreneur) Migrants   
 6                  Section Tier 1 (Investor) Migrants   
 7     Section Tier 1 (Graduate Entrepreneur) Migrants   
 8    Section Tier 2 (Intra-Company Transfer) Migrants   
 9   Section Tier 2 (General) Migrants, Tier 2 (Min...   
 10  Section Tier 5 (Youth Mobility Scheme) Tempora...   
 11         Section Tier 5 (Temporary Worker) Migrants   
 12                   Section Tier 4 (General) Student   
 13                     Section Tier 4 (Child) Student   
 
    