In [1]:
# Check if project packages are installed
! pip show selenium webdriver-manager pandas beautifulsoup4 py2neo python-dotenv

Name: selenium
Version: 3.141.0
Summary: Python bindings for Selenium
Home-page: https://github.com/SeleniumHQ/selenium/
Author: UNKNOWN
Author-email: UNKNOWN
License: Apache 2.0
Location: c:\users\joel\anaconda3\lib\site-packages
Requires: urllib3
Required-by: 
---
Name: webdriver-manager
Version: 3.2.1
Summary: Library provides the way to automatically manage drivers for different browsers
Home-page: https://github.com/SergeyPirogov/webdriver_manager
Author: Sergey Pirogov
Author-email: automationremarks@gmail.com
License: UNKNOWN
Location: c:\users\joel\anaconda3\lib\site-packages
Requires: crayons, requests, configparser
Required-by: 
---
Name: pandas
Version: 0.24.1
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: http://pandas.pydata.org
Author: None
Author-email: None
License: BSD
Location: c:\users\joel\anaconda3\lib\site-packages
Requires: pytz, python-dateutil, numpy
Required-by: statsmodels, seaborn
---
Name: beautifulsoup4
Version:

In [2]:
# install packages if not already installed
! pip install selenium webdriver-manager pandas beautifulsoup4 py2neo python-dotenv



In [2]:
# Import packages
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager 
from bs4 import BeautifulSoup, ResultSet
import pandas as pd
import re
import os
from neo4j import GraphDatabase, basic_auth
from py2neo import Graph, Node, Relationship, NodeMatcher, RelationshipMatcher

In [3]:
#URL constants
GOV_UK_HOME_URL = "https://www.gov.uk"
DOCUMENTS_URL = "https://www.gov.uk/guidance/immigration-rules"
IMMIGRATION_RULES_INDEX_URL = "https://www.gov.uk/guidance/immigration-rules/immigration-rules-index"
POINT_BASED_SYSTEM_URL = "https://www.gov.uk/guidance/immigration-rules/immigration-rules-part-6a-the-points-based-system"
APPENDIX_A_URL = "https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-a-attributes"
APPENDIX_C_URL = "https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-c-maintenance-funds"
APPENDIX_6_URL = "https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-6-academic-subjects-that-need-a-certificate"

In [4]:
%load_ext dotenv
%dotenv -o

In [5]:
DB_HOST = os.getenv("DB_HOST")
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")

In [6]:
# Establish database connection

graph = Graph(DB_HOST, auth=(DB_USER, DB_PASS)) #local
# graph = GraphDatabase.driver(uri="bolt+routing://" + DB_HOST,
#                               auth=basic_auth(user=DB_USER, password=DB_PASS),
#                               encrypted=True)

In [7]:
matcher = NodeMatcher(graph)

In [11]:
#Open documents url for scraping
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(DOCUMENTS_URL)

content = driver.page_source
soup = BeautifulSoup(content)

driver.close()

[WDM] - Current google-chrome version is 84.0.4147
[WDM] - Get LATEST driver version for 84.0.4147
[WDM] - Driver [C:\Users\Joel\.wdm\drivers\chromedriver\win32\84.0.4147.30\chromedriver.exe] found in cache
 


In [46]:
"""Creating Immigration Document Nodes"""
docs = soup.find(attrs={'class': 'section-list'})
for doc_list_item in docs.findAll('li'):
    title = doc_list_item.a.find('span', attrs={
        'class': 'subsection-title-text'}
        ).text
    summary = doc_list_item.a.find('span', attrs={
        'class': 'subsection-summary'}
        ).text
    url = GOV_UK_HOME_URL + doc_list_item.a['href']
    
    tx = graph.begin()
    tx.evaluate('''
        CREATE (doc:Document {
            title: $title, summary: $summary, url: $url
            })
        ''', parameters = {'title': title, 'summary':summary, 'url': url})
    tx.commit()

In [47]:
"""Creating Section nodes related to all document nodes"""

# Query graph for all document nodes
docs = list(matcher.match("Document").skip(1)) #Skip immigration rules index document

# Loop through docs
for doc in docs:
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.get(doc['url']) # Get url from document node, open page for scraping

    content = driver.page_source
    soup = BeautifulSoup(content)

    driver.close()

    # Create section nodes connected to each doc
    for section in soup.findAll(attrs={'class': 'js-subsection-title'}):
        section_url = doc['url'] + '#' + section['id']

        tx = graph.begin()
        tx.evaluate('''
            CREATE (sec:Section {title: $title, url: $url})
            ''', parameters = {'title': section.text, 'url': section_url})
        tx.evaluate('''
            MATCH (doc:Document), (sec:Section)
            WHERE sec.title = $title AND doc.title = $doc_title AND NOT ()-[: CONTAINS]->(sec)
            CREATE (doc)-[: CONTAINS]->(sec)
            ''', parameters = {'title': section.text, 'doc_title': doc['title']})
        tx.commit()

[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
 
[WDM] - Get LATEST driver version for 83.0.4103
[WDM] - Trying to download new driver from http://chromedriver.storage.googleapis.com/83.0.4103.39/chromedriver_win32.zip
[WDM] - Driver has been saved in cache [C:\Users\Joel\.wdm\drivers\chromedriver\win32\83.0.4103.39]
[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
[WDM] - Driver [C:\Users\Joel\.wdm\drivers\chromedriver\win32\83.0.4103.39\chromedriver.exe] found in cache
 
[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
[WDM] - Driver [C:\Users\Joel\.wdm\drivers\chromedriver\win32\83.0.4103.39\chromedriver.exe] found in cache
 
[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
[WDM] - Driver [C:\Users\Joel\.wdm\drivers\chromedriver\win32\83.0.4103.39\chromedriver.exe] found in cache
 
[W

In [48]:
"""Adding Paragraph labels to Section nodes which are also Paragraphs"""
tx = graph.begin()
tx.evaluate('''
    MATCH (s:Section), (d:Document)
    WHERE s.title =~ "^[0-9].*" AND NOT d.title CONTAINS 'Appendix' AND (d)-[:CONTAINS]-(s)
    SET s:Paragraph
    ''')
tx.commit()

In [49]:
"""Adding index property to Sections which are Paragraphs"""
sec_pars = list(matcher.match("Section").where("_: Paragraph")) # match sections which are also paragraphs

for sec_par in sec_pars:
    title = sec_par['title']
    index = title.split('.')[0]

    tx = graph.begin()
    tx.evaluate('''
        MATCH (p:Paragraph)
        WHERE p:Section AND p.title = $title
        SET p.index = $index
        ''', parameters = {'title': title, 'index': index})
    tx.commit()

In [8]:
#Open documents url for scraping
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(POINT_BASED_SYSTEM_URL)

content = driver.page_source
soup = BeautifulSoup(content)

driver.close()

[WDM] - Current google-chrome version is 84.0.4147
[WDM] - Get LATEST driver version for 84.0.4147
 
[WDM] - There is no [win32] chromedriver for browser 84.0.4147 in cache
[WDM] - Get LATEST driver version for 84.0.4147
[WDM] - Trying to download new driver from http://chromedriver.storage.googleapis.com/84.0.4147.30/chromedriver_win32.zip
[WDM] - Driver has been saved in cache [C:\Users\Joel\.wdm\drivers\chromedriver\win32\84.0.4147.30]


In [51]:
"""Creating Paragraph Nodes for Paragraphs under the Sections of the Point Based System Part 6a Document"""
for section in soup.findAll(attrs={'class': 'js-openable'}):
    for tag in section.div.contents: # Loop through tags in section
        if(tag.name == 'h3' and tag.text != 'Notes'): # If it is an h3 tag and the tag's text is not Notes
            par_title_parts = tag.text.split('.')
            title = par_title_parts[-1].strip()
            index = par_title_parts[0]

            tx = graph.begin()
            tx.evaluate('''
                CREATE (par:Paragraph {title: $title, index: $index})
                ''', parameters = {'title': title, 'index': index})
            tx.evaluate('''
                MATCH (sec:Section), (par:Paragraph), (doc: Document)
                WHERE sec.title = $sec_title AND par.index = $index AND doc.title CONTAINS 'Appendix C:' AND NOT (doc)-[:CONTAINS]->(sec)
                CREATE (sec)-[: CONTAINS]->(par) 
                ''', parameters = {'sec_title': section.h2.text, 'index': index})
            tx.commit()


In [11]:
"""Extracting the contents/texts of each paragraph into a list for further processing"""
for section in soup.findAll(attrs={'class': 'js-openable'}):
    if section.h2.text == 'Tier 4 (General) Student':
        section_tags = section.div.findAll({'p', 'ol', 'h3'}, recursive=False)
        break

In [113]:
par_245ZT_245ZV_tags = ResultSet(source=soup,result=section_tags[:10])
par_245ZW_tags = ResultSet(source=soup,result=section_tags[10:15])
par_245ZX_tags = ResultSet(source=soup,result=section_tags[15:21])
par_245ZY_tags = ResultSet(source=soup,result=section_tags[21:])

In [105]:
"""Adding rules to paragraphs 245ZT to 245ZV"""
for i, tag in enumerate(par_245ZT_245ZV_tags):
    if tag.name == 'h3' and tag.text != 'Notes':
        par_index = tag.text.split('.')[0]

    elif tag.name != 'h3':
        if tag.name == 'ol':
            for item in tag.findAll('li', recursive=False):
                
                if item.text.split()[0][0] == '(':  #if rule starts with ()
                    sub_rule_index = item.text.split()[0]
                    sub_rule = item.text
                else:
                    sub_rule += item.text

                tx = graph.begin()
                tx.evaluate('''
                    CREATE (subrule:SubRule {desc: $desc, index: $index})
                    ''', parameters = {'desc': sub_rule, 'index': sub_rule_index})
                tx.evaluate('''
                    MATCH (par:Paragraph), (rule:Rule), (subrule:SubRule)
                    WHERE par.index = $index AND rule.desc = $desc AND (par)-[:CONTAINS]->(rule) AND NOT ()-[:CONTAINS]->(subrule)
                    CREATE (rule)-[:CONTAINS]->(subrule)
                    ''', parameters = {'index': par_index, 'desc': 'Requirements:'})
                tx.commit()

        elif tag.name == 'p' and section_tags[i-1].name == 'ol':

            sub_rule = tag.text

            tx = graph.begin()
            tx.evaluate('''
                CREATE (subrule:SubRule {desc: $desc})
                ''', parameters = {'desc': sub_rule})
            tx.evaluate('''
                MATCH (par:Paragraph), (rule:Rule), (subrule:SubRule)
                WHERE par.index = $index AND rule.desc = $desc AND (par)-[:CONTAINS]->(rule) AND NOT ()-[:CONTAINS]->(subrule)
                CREATE (rule)-[:CONTAINS]->(subrule)
                ''', parameters = {'index': par_index, 'desc': 'Requirements:'})
            tx.commit()

        else: # Handles rules in p tags
            rule = tag.text
        
            tx = graph.begin()
            tx.evaluate('''
                CREATE (rule:Rule {desc: $desc})
                ''', parameters = {'desc': rule})
            tx.evaluate('''
                MATCH (par:Paragraph), (rule:Rule)
                WHERE par.index = $index and rule.desc = $desc AND NOT ()-[:CONTAINS]->(rule)
                CREATE (par)-[:CONTAINS]->(rule)
                ''', parameters = {'index': par_index, 'desc': rule})
            tx.commit()

In [106]:
"""Adding rules to paragraph 245ZW. Period and conditions of grant for entry clearance"""
for i, tag in enumerate(par_245ZW_tags):
    if tag.name == 'h3' and tag.text != 'Notes':
        par_index = tag.text.split('.')[0]

    #TODO: if tag name is table, create Table node
    #TODO: If tag name is h3 and text is Notes, create Notes node

    elif tag.name != 'h3':
        if tag.name == 'ol':
            for item in tag.findAll('li', recursive=False):
                
                rule_index = item.text.split()[0]
                rule = item.text

                if item.text.split()[0][0] != '(':  #if rule starts with ()
                    rule_index = ""
                # else:
                #     #Get rule from database and edit description with appended rule
                #     #rule += item.text

                tx = graph.begin()
                tx.evaluate('''
                    CREATE (rule:Rule {desc: $desc, index: $index})
                    ''', parameters = {'desc': rule, 'index': rule_index})
                tx.evaluate('''
                    MATCH (par:Paragraph), (rule:Rule)
                    WHERE par.index = $index AND rule.desc = $desc AND NOT ()-[:CONTAINS]->(rule)
                    CREATE (par)-[:CONTAINS]->(rule)
                    ''', parameters = {'desc': rule, 'index': par_index})
                tx.commit()

(a)
(b)
(i)
(c)
provided


In [111]:
"""Adding rules to paragraph 245ZX. Requirements for leave to remain"""
for i, tag in enumerate(par_245ZX_tags):
    if tag.name == 'h3' and tag.text != 'Notes':
        par_index = tag.text.split('.')[0]

    elif tag.name != 'h3':
        if tag.name == 'ol':
            for item in tag.findAll('li', recursive=False):
                
                if item.text.split()[0][0] == '(':  #if rule starts with ()
                    sub_rule_index = item.text.split()[0]
                    sub_rule = item.text
                else:
                    sub_rule += item.text

                tx = graph.begin()
                tx.evaluate('''
                    CREATE (subrule:SubRule {desc: $desc, index: $index})
                    ''', parameters = {'desc': sub_rule, 'index': sub_rule_index})
                tx.evaluate('''
                    MATCH (par:Paragraph), (rule:Rule), (subrule:SubRule)
                    WHERE par.index = $index AND rule.desc = $desc AND (par)-[:CONTAINS]->(rule) AND NOT ()-[:CONTAINS]->(subrule)
                    CREATE (rule)-[:CONTAINS]->(subrule)
                    ''', parameters = {'index': par_index, 'desc': 'Requirements:'})
                tx.commit()

        elif tag.name == 'p' and section_tags[i-1].name == 'ol':

            sub_rule = tag.text

            tx = graph.begin()
            tx.evaluate('''
                CREATE (subrule:SubRule {desc: $desc})
                ''', parameters = {'desc': sub_rule})
            tx.evaluate('''
                MATCH (par:Paragraph), (rule:Rule), (subrule:SubRule)
                WHERE par.index = $index AND rule.desc = $desc AND (par)-[:CONTAINS]->(rule) AND NOT ()-[:CONTAINS]->(subrule)
                CREATE (rule)-[:CONTAINS]->(subrule)
                ''', parameters = {'index': par_index, 'desc': 'Requirements:'})
            tx.commit()

        else: # Handles rules in p tags
            rule = tag.text
        
            tx = graph.begin()
            tx.evaluate('''
                CREATE (rule:Rule {desc: $desc})
                ''', parameters = {'desc': rule})
            tx.evaluate('''
                MATCH (par:Paragraph), (rule:Rule)
                WHERE par.index = $index and rule.desc = $desc AND NOT ()-[:CONTAINS]->(rule)
                CREATE (par)-[:CONTAINS]->(rule)
                ''', parameters = {'index': par_index, 'desc': rule})
            tx.commit()

In [114]:
"""Adding rules to paragraph 245ZY. Period and conditions of grant for leave to remain"""
for i, tag in enumerate(par_245ZY_tags):
    if tag.name == 'h3' and tag.text != 'Notes':
        par_index = tag.text.split('.')[0]

    #TODO: if tag name is table, create Table node
    #TODO: If tag name is h3 and text is Notes, create Notes node

    elif tag.name != 'h3':
        if tag.name == 'ol':
            for item in tag.findAll('li', recursive=False):
                
                rule_index = item.text.split()[0]
                rule = item.text

                if item.text.split()[0][0] != '(':  #if rule starts with ()
                    rule_index = ""
                # else:
                #     #Get rule from database and edit description with appended rule
                #     #rule += item.text

                tx = graph.begin()
                tx.evaluate('''
                    CREATE (rule:Rule {desc: $desc, index: $index})
                    ''', parameters = {'desc': rule, 'index': rule_index})
                tx.evaluate('''
                    MATCH (par:Paragraph), (rule:Rule)
                    WHERE par.index = $index AND rule.desc = $desc AND NOT ()-[:CONTAINS]->(rule)
                    CREATE (par)-[:CONTAINS]->(rule)
                    ''', parameters = {'desc': rule, 'index': par_index})
                tx.commit()

In [34]:
rules = list(matcher.match('Rule'))

In [1]:
#TODO: References and citations
#Subrule (c) of Rule (Requirements) of Paragraph (245ZV) cites paragraphs (10) to (14) of Document (Appendix C)
#Subrule (ca) of Rule (Requirements) of Paragraph (245ZV) cites Rule (b) of paragraph (118) of Document (Appendix A)
#Subrule (da) of Rule (Requirements) of Paragraph (245ZV) cites paragraphs (1) and (2) of Document (Appendix 6)
#Subrule (f) of Rule (Requirements) of Paragraph (245ZV) cites paragraph 245A Document (Part 6A: Point Based System)

In [8]:
#Open documents url for scraping
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(APPENDIX_A_URL)

content = driver.page_source
soup = BeautifulSoup(content)

driver.close()

[WDM] - Current google-chrome version is 84.0.4147
[WDM] - Get LATEST driver version for 84.0.4147
 
[WDM] - Driver [C:\Users\Joel\.wdm\drivers\chromedriver\win32\84.0.4147.30\chromedriver.exe] found in cache


In [21]:
"""Creating Paragraph Nodes for Paragraphs under the Section: Attributes for Tier 4 (General) Students"""
for section in soup.findAll(attrs={'class': 'js-openxable'}):
    if section.h2.text == 'Attributes for Tier 4 (General) Students':
        for tag in section.div.contents: # Loop through tags in section
            if tag.name == 'ol':
                for item in tag.findAll('li', recursive=False):
                    # print(item.text.split('.'))
                    par_title_parts = item.text.split('.')
                    title = item.text
                    index = par_title_parts[0]

                    tx = graph.begin()
                    tx.evaluate('''
                        CREATE (par:Paragraph {title: $title, index: $index})
                        ''', parameters = {'title': title, 'index': index})
                    tx.evaluate('''
                        MATCH (sec:Section), (par:Paragraph)
                        WHERE sec.title = $sec_title AND par.index = $index
                        CREATE (sec)-[: CONTAINS]->(par) 
                        ''', parameters = {'sec_title': section.h2.text, 'index': index})
                    tx.commit()
            elif tag.name == 'table': # Create table node
                tx = graph.begin()
                tx.evaluate('''
                    CREATE (tab:Table {title: 'Table 16', table: $table})
                    ''', parameters = {'table': tag.str})
                tx.evaluate('''
                    MATCH (sec:Section), (tab:Table {title: 'Table 16'})
                    WHERE sec.title = $sec_title
                    CREATE (sec)-[: CONTAINS]->(tab) 
                    ''', parameters = {'sec_title': section.h2.text})
                tx.commit()

In [24]:
#Subrule (b) of Rule (Requirements) of Paragraph (245ZV) cites paragraphs (113) to (120) (Section Attributes for Tier 4 (General) Students) of Document (Appendix A)
tx = graph.begin()
tx.evaluate('''
    MATCH (sub:SubRule {index: '(b)'}), (rule:Rule {desc: 'Requirements:'}), (par:Paragraph {index: '245ZV'}), (sec:Section {title: 'Attributes for Tier 4 (General) Students'})
    WHERE (par)-[:CONTAINS]->(rule) AND (rule)-[:CONTAINS]->(sub)
    CREATE (sub)-[:REFERENCES]->(sec) 
    ''')
tx.commit()