In [10]:
import pandas as pd
import re


from flashtext import KeywordProcessor

import pickle

## 1. Import dataframe of the scraped Immigration Rules documents

In [11]:
with open("immigration_rules_scrape.pickle",'rb') as f:
    scrape_df = pickle.load(f)

In [12]:
scrape_df.head()

Unnamed: 0,URL,media type,title,text_dump,text_segmented,hyperlinks_dump,timestamp
0,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules: Index,\nImmigration Rules: Index\nThe rules are divi...,"[(text, Immigration Rules: Index\nThe rules ar...",[https://www.gov.uk/guidance/immigration-rules...,2020-08-25T18:14:38+00:00
1,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules: introduction,\nImmigration Rules: introduction\nThis contai...,"[(text, Immigration Rules: introduction\nThis ...",[],2020-08-25T18:14:39+00:00
2,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules part 1: leave to enter or st...,\nImmigration Rules part 1: leave to enter or ...,"[(text, Immigration Rules part 1: leave to ent...",[],2020-08-25T18:14:39+00:00
3,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules part 2: transitional provisi...,\nImmigration Rules part 2: transitional provi...,"[(text, Immigration Rules part 2: transitional...",[],2020-08-25T18:14:41+00:00
4,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules part 3: students,\nImmigration Rules part 3: students\nPersons ...,"[(text, Immigration Rules part 3: students\nPe...",[],2020-08-25T18:14:41+00:00


In [13]:
len(scrape_df)

51

In [14]:
scrape_df['title']

0                              Immigration Rules: Index
1                       Immigration Rules: introduction
2     Immigration Rules part 1: leave to enter or st...
3     Immigration Rules part 2: transitional provisi...
4                    Immigration Rules part 3: students
5             Immigration Rules part 4: work experience
6           Immigration Rules part 5: working in the UK
7     Immigration Rules part 6: self-employment and ...
8     Immigration Rules part 6A: the points-based sy...
9            Immigration Rules part 7: other categories
10             Immigration Rules part 8: family members
11        Immigration Rules part 9: grounds for refusal
12    Immigration Rules part 10: registering with th...
13                    Immigration Rules part 11: asylum
14     Immigration Rules part 11A: temporary protection
15                           Immigration Rules part 11B
16    Immigration Rules part 12: Procedure and right...
17               Immigration Rules part 13: depo

In [36]:
scrape_df['text_segmented'][0][1][1]

'                                                                                                                                                                                                                                                                                                                                             Unnamed: 0  Paragraph number  Unnamed: 2\n                                                                                                                                                                                                                                                                                                                      Introduction (Paragraphs 1 to 6C)               NaN         NaN\n                                                                                                                                                                                                                                                         

## 2. Extract keywords from the Immigration Rules document titles:

In [15]:
references_list = scrape_df['title']

In [278]:
pattern_appendix = r"[Aa]ppendix [0-9]*[A-Za-z]*[-]*\s*[A-Za-z]*\s*[(]*[A-Za-z]*\s*[A-Za-z]*[)]*"
pattern_part = r"[Pp]art [0-9]*[A-Za-z]*\s*[A-Za-z]*\s*[(]*[A-Za-z]*\s*[A-Za-z]*[)]*"

pattern_paragraph = r"[Pp]aragraph[s]* [0-9A-Za-z]*\s?\-?[a-z\s0-9A-Za-z]*"


print( re.findall(pattern_paragraph, 'Immigration Rules paragraphs 21-22') )
print( re.findall(pattern_part, 'Immigration Rules Part Appendix (EU)') )

['paragraphs 21-22']
['Part Appendix (EU)']


In [17]:
# Extract labels Appendix and Part from title list



labels_references = []

for i in range(len(references_list)):
    
    ref_appendix = re.findall(pattern_appendix, str(references_list[i]) )
    ref_part = re.findall(pattern_part, str(references_list[i]) )
    
    
    if ref_part!=[]:
        labels_references+= ref_part
     
    if ref_appendix!=[]:
        labels_references+= ref_appendix
        
    else: 
        continue

        

In [18]:
print(labels_references)
print(len(labels_references))

['part 1', 'part 2', 'part 3', 'part 4', 'part 5', 'part 6', 'part 6A', 'part 7', 'part 8', 'part 9', 'part 10', 'part 11', 'part 11A', 'part 11B', 'part 12', 'part 13', 'part 14', 'part 15', 'Appendix 2', 'Appendix 6', 'Appendix 7', 'Appendix A', 'Appendix AR', 'Appendix AR (EU)', 'Appendix Armed Forces', 'Appendix B', 'Appendix C', 'Appendix D', 'Appendix E', 'Appendix ECAA', 'Appendix EU', 'Appendix EU (Family Permit)', 'Appendix F', 'Appendix FM', 'Appendix FM-SE', 'Appendix G', 'Appendix H', 'Appendix J', 'Appendix K', 'Appendix KoLL', 'Appendix M', 'Appendix N', 'Appendix O', 'Appendix P', 'Appendix SN', 'Appendix T', 'Appendix U', 'Appendix V', 'Appendix W']
49


## 3. Extract sections into a dataframe for each document - stored in a dictionary

In [19]:
""" Function to extract sections from text_segmented """

def extract_sections(list):
    
    """ list: list of tuples (string with label, text string) """
    

    
    section_num = 0
    
    sections = []
    for key, value in list:

        value = value.replace('\n', ' ')
        value = value.replace('\t', ' ')
        
        section = key == 'section'

        if section:
            
            section_num = section_num + 1

            sections.append(['Section ' + value])
            
            sections[section_num-1].append(' ' + value)

        else:
            if section_num > 0:
                
                sections[section_num-1][1] +=  ' ' + value
    
    
    return pd.DataFrame(sections)

In [20]:
""" Function to organize the url_list documents into sections from text_segmented """

def url_docs_sections(df):
    
    
    """ 
    Input:
        scraped_urls: a dataframe as the output of map_df(scrape_govuk_guidance,url_list)
    Output:
        Dictionary with Values the dataframe of sections for each url and Key the title of the corresponding document
    """
    
    list_sec_df = []
    for row in range(len(df)):
        
         list_sec_df = list_sec_df + [extract_sections(df['text_segmented'][row])]
        
    
    
    return dict(list(zip(df['title'], list_sec_df)))

In [21]:
docs_sections_dic = url_docs_sections(scrape_df)

In [22]:
len(docs_sections_dic)

51

In [23]:
datafs = list(docs_sections_dic.values())

In [32]:
datafs[1]

Unnamed: 0,0,1,References to Immigration Rules
0,Section Implementation and transitional provis...,Implementation and transitional provisions 4...,{}
1,Section Application,Application 5.Save where expressly indicated...,{}
2,Section Interpretation,Interpretation 6.In these Rules the followin...,"{'Appendix FM': 9, 'part 8': 3, 'part 1': 5, '..."


## 4. Add labels of references to sections - stored in a column on each dataframe of the dictionary

In [24]:
""" Function to count words on a list and return a dictionary"""

def word_count_dict(word_list):
    word_count = [word_list.count(p) for p in word_list]
    return dict(list(zip(word_list,word_count)))

In [25]:
""" Function to add an extra column with labels """

# Create an object of KeywordProcessor
keyword_processor = KeywordProcessor()



def add_column_labeling_sections(dic, labels_list):
    
    """ 
    Input:
        data: dataframe with a column of strings
        list: list of keywords (strings)
        
    Output: 
        appends a column to data with the labels present in a certain entry of the column
    """
    

    
    for key, df in dic.items() :
        labels = list(labels_list)
        
        row = list(dic.keys()).index(key) 

        if row>1: 
            del labels[row-2]
            keyword_processor.add_keywords_from_list(labels) # We remove the label of the document we are labelling
            print(labels)
            print(row)
        else:
            keyword_processor.add_keywords_from_list(labels)
        
        if len(df)>0:

            
            tags = []
            for i in range(len(df)):    
                keywords_found = keyword_processor.extract_keywords(df.iloc[:,1][i])
                tags += [word_count_dict(keywords_found)]
     
            df['References to Immigration Rules'] = tags
            keyword_processor.remove_keywords_from_list(labels)
    
        
    return dic

In [26]:
docs_section_labels_ref = add_column_labeling_sections(docs_sections_dic, labels_references)

['part 2', 'part 3', 'part 4', 'part 5', 'part 6', 'part 6A', 'part 7', 'part 8', 'part 9', 'part 10', 'part 11', 'part 11A', 'part 11B', 'part 12', 'part 13', 'part 14', 'part 15', 'Appendix 2', 'Appendix 6', 'Appendix 7', 'Appendix A', 'Appendix AR', 'Appendix AR (EU)', 'Appendix Armed Forces', 'Appendix B', 'Appendix C', 'Appendix D', 'Appendix E', 'Appendix ECAA', 'Appendix EU', 'Appendix EU (Family Permit)', 'Appendix F', 'Appendix FM', 'Appendix FM-SE', 'Appendix G', 'Appendix H', 'Appendix J', 'Appendix K', 'Appendix KoLL', 'Appendix M', 'Appendix N', 'Appendix O', 'Appendix P', 'Appendix SN', 'Appendix T', 'Appendix U', 'Appendix V', 'Appendix W']
2
['part 1', 'part 3', 'part 4', 'part 5', 'part 6', 'part 6A', 'part 7', 'part 8', 'part 9', 'part 10', 'part 11', 'part 11A', 'part 11B', 'part 12', 'part 13', 'part 14', 'part 15', 'Appendix 2', 'Appendix 6', 'Appendix 7', 'Appendix A', 'Appendix AR', 'Appendix AR (EU)', 'Appendix Armed Forces', 'Appendix B', 'Appendix C', 'Appendi

['part 1', 'part 2', 'part 3', 'part 4', 'part 5', 'part 6', 'part 6A', 'part 7', 'part 8', 'part 9', 'part 10', 'part 11', 'part 11A', 'part 11B', 'part 13', 'part 14', 'part 15', 'Appendix 2', 'Appendix 6', 'Appendix 7', 'Appendix A', 'Appendix AR', 'Appendix AR (EU)', 'Appendix Armed Forces', 'Appendix B', 'Appendix C', 'Appendix D', 'Appendix E', 'Appendix ECAA', 'Appendix EU', 'Appendix EU (Family Permit)', 'Appendix F', 'Appendix FM', 'Appendix FM-SE', 'Appendix G', 'Appendix H', 'Appendix J', 'Appendix K', 'Appendix KoLL', 'Appendix M', 'Appendix N', 'Appendix O', 'Appendix P', 'Appendix SN', 'Appendix T', 'Appendix U', 'Appendix V', 'Appendix W']
16
['part 1', 'part 2', 'part 3', 'part 4', 'part 5', 'part 6', 'part 6A', 'part 7', 'part 8', 'part 9', 'part 10', 'part 11', 'part 11A', 'part 11B', 'part 12', 'part 14', 'part 15', 'Appendix 2', 'Appendix 6', 'Appendix 7', 'Appendix A', 'Appendix AR', 'Appendix AR (EU)', 'Appendix Armed Forces', 'Appendix B', 'Appendix C', 'Appendix

['part 1', 'part 2', 'part 3', 'part 4', 'part 5', 'part 6', 'part 6A', 'part 7', 'part 8', 'part 9', 'part 10', 'part 11', 'part 11A', 'part 11B', 'part 12', 'part 13', 'part 14', 'part 15', 'Appendix 2', 'Appendix 6', 'Appendix 7', 'Appendix A', 'Appendix AR', 'Appendix AR (EU)', 'Appendix Armed Forces', 'Appendix B', 'Appendix C', 'Appendix D', 'Appendix E', 'Appendix ECAA', 'Appendix EU', 'Appendix F', 'Appendix FM', 'Appendix FM-SE', 'Appendix G', 'Appendix H', 'Appendix J', 'Appendix K', 'Appendix KoLL', 'Appendix M', 'Appendix N', 'Appendix O', 'Appendix P', 'Appendix SN', 'Appendix T', 'Appendix U', 'Appendix V', 'Appendix W']
33
['part 1', 'part 2', 'part 3', 'part 4', 'part 5', 'part 6', 'part 6A', 'part 7', 'part 8', 'part 9', 'part 10', 'part 11', 'part 11A', 'part 11B', 'part 12', 'part 13', 'part 14', 'part 15', 'Appendix 2', 'Appendix 6', 'Appendix 7', 'Appendix A', 'Appendix AR', 'Appendix AR (EU)', 'Appendix Armed Forces', 'Appendix B', 'Appendix C', 'Appendix D', 'App

['part 1', 'part 2', 'part 3', 'part 4', 'part 5', 'part 6', 'part 6A', 'part 7', 'part 8', 'part 9', 'part 10', 'part 11', 'part 11A', 'part 11B', 'part 12', 'part 13', 'part 14', 'part 15', 'Appendix 2', 'Appendix 6', 'Appendix 7', 'Appendix A', 'Appendix AR', 'Appendix AR (EU)', 'Appendix Armed Forces', 'Appendix B', 'Appendix C', 'Appendix D', 'Appendix E', 'Appendix ECAA', 'Appendix EU', 'Appendix EU (Family Permit)', 'Appendix F', 'Appendix FM', 'Appendix FM-SE', 'Appendix G', 'Appendix H', 'Appendix J', 'Appendix K', 'Appendix KoLL', 'Appendix M', 'Appendix N', 'Appendix O', 'Appendix P', 'Appendix T', 'Appendix U', 'Appendix V', 'Appendix W']
46
['part 1', 'part 2', 'part 3', 'part 4', 'part 5', 'part 6', 'part 6A', 'part 7', 'part 8', 'part 9', 'part 10', 'part 11', 'part 11A', 'part 11B', 'part 12', 'part 13', 'part 14', 'part 15', 'Appendix 2', 'Appendix 6', 'Appendix 7', 'Appendix A', 'Appendix AR', 'Appendix AR (EU)', 'Appendix Armed Forces', 'Appendix B', 'Appendix C', 'A

In [27]:
list(docs_section_labels_ref.values())[7]

Unnamed: 0,0,1,References to Immigration Rules
0,Section Person intending to establish themselv...,Person intending to establish themselves in b...,{'Appendix F': 18}
1,Section Innovators,Innovators Requirements for leave to enter th...,{'Appendix F': 15}
2,Section Persons intending to establish themsel...,Persons intending to establish themselves in ...,{'Appendix F': 3}
3,Section Requirements for an extension of stay ...,Requirements for an extension of stay as an i...,{'Appendix F': 8}
4,"Section Writers, composers and artists","Writers, composers and artists Requirements f...",{'Appendix F': 6}


In [28]:
len(labels_references)

49

In [305]:
""" Function to add an extra column with labels """

# Create an object of KeywordProcessor
keyword_processor = KeywordProcessor()

keyword_processor.add_keyword("[Pp]aragraph[s]* [0-9A-Za-z]*[\sa-z\s0-9A-Za-z]*")


def add_column_labeling_sections(dic):
    
    """ 
    Input:
        data: dataframe with a column of strings
        list: list of keywords (strings)
        
    Output: 
        appends a column to data with the labels present in a certain entry of the column
    """
    
    
    for key, df in dic.items() :


        
        if len(df)>0:

            
            tags = []
            for i in range(len(df)):                  
                keywords_found = re.findall(r"[Pp]aragraph[s]* [0-9A-Z]+?\s?[to]*[and]*[or]*\-[0-9A-Z]*", str(df.iloc[:,1][i]))
                tags += [word_count_dict(keywords_found)]
     
            df['References to Immigration Rules'] = tags

    
        
    return dic

In [306]:
docs_section_labels_ref = add_column_labeling_sections(docs_sections_dic)

In [307]:
list(docs_section_labels_ref.values())[2]

Unnamed: 0,0,1,References to Immigration Rules
0,Section Leave to enter the United Kingdom,Leave to enter the United Kingdom 7.A person...,{}
1,Section Exercise of the power to refuse leave ...,Exercise of the power to refuse leave to ente...,{}
2,Section Suspension of leave to enter or remain...,Suspension of leave to enter or remain in the...,{}
3,Section Cancellation of leave to enter or rema...,Cancellation of leave to enter or remain in t...,{}
4,Section Requirement for persons arriving in th...,Requirement for persons arriving in the Unite...,{}
5,Section Requirement for a person not requiring...,Requirement for a person not requiring leave ...,{}
6,Section Common Travel Area,"Common Travel Area 15.The United Kingdom, th...",{}
7,Section Admission of certain British passport ...,Admission of certain British passport holders...,{}
8,Section Persons outside the United Kingdom,Persons outside the United Kingdom 17A. Wher...,{}
9,Section Returning Residents,Returning Residents 18. A person may resume ...,{}
