In [1]:
# Check if project packages are installed
! pip show selenium webdriver-manager pandas beautifulsoup4 py2neo python-dotenv

Name: selenium
Version: 3.141.0
Summary: Python bindings for Selenium
Home-page: https://github.com/SeleniumHQ/selenium/
Author: UNKNOWN
Author-email: UNKNOWN
License: Apache 2.0
Location: c:\users\joel\anaconda3\lib\site-packages
Requires: urllib3
Required-by: 
---
Name: webdriver-manager
Version: 3.2.1
Summary: Library provides the way to automatically manage drivers for different browsers
Home-page: https://github.com/SergeyPirogov/webdriver_manager
Author: Sergey Pirogov
Author-email: automationremarks@gmail.com
License: UNKNOWN
Location: c:\users\joel\anaconda3\lib\site-packages
Requires: configparser, requests, crayons
Required-by: 
---
Name: pandas
Version: 0.24.1
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: http://pandas.pydata.org
Author: None
Author-email: None
License: BSD
Location: c:\users\joel\anaconda3\lib\site-packages
Requires: pytz, numpy, python-dateutil
Required-by: statsmodels, seaborn
---
Name: beautifulsoup4
Version:

In [79]:
# install packages if not already installed
! pip install selenium webdriver-manager pandas beautifulsoup4 py2neo python-dotenv

Collecting python-dotenv
  Downloading https://files.pythonhosted.org/packages/cb/2a/07f87440444fdf2c5870a710b6770d766a1c7df9c827b0c90e807f1fb4c5/python_dotenv-0.13.0-py2.py3-none-any.whl
Installing collected packages: python-dotenv
Successfully installed python-dotenv-0.13.0


In [27]:
# Import packages
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager 
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
from neo4j import GraphDatabase, basic_auth
from py2neo import Graph, Node, Relationship, NodeMatcher, RelationshipMatcher

In [3]:
#URL constants
GOV_UK_HOME_URL = "https://www.gov.uk"
DOCUMENTS_URL = "https://www.gov.uk/guidance/immigration-rules"
IMMIGRATION_RULES_INDEX_URL = "https://www.gov.uk/guidance/immigration-rules/immigration-rules-index"
POINT_BASED_SYSTEM_URL = "https://www.gov.uk/guidance/immigration-rules/immigration-rules-part-6a-the-points-based-system"

In [42]:
%reload_ext dotenv
%dotenv -o

In [43]:
DB_HOST = os.getenv("DB_HOST")
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")

In [44]:
# Establish database connection

graph = Graph(DB_HOST, auth=(DB_USER, DB_PASS)) #local
# graph = GraphDatabase.driver(uri="bolt+routing://" + DB_HOST,
#                               auth=basic_auth(user=DB_USER, password=DB_PASS),
#                               encrypted=True)

In [45]:
matcher = NodeMatcher(graph)

In [24]:
#Open documents url for scraping
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(DOCUMENTS_URL)

content = driver.page_source
soup = BeautifulSoup(content)

driver.close()

[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
 
[WDM] - Driver [C:\Users\Joel\.wdm\drivers\chromedriver\win32\83.0.4103.39\chromedriver.exe] found in cache


In [46]:
"""Creating Immigration Document Nodes"""
docs = soup.find(attrs={'class': 'section-list'})
for doc_list_item in docs.findAll('li'):
    title = doc_list_item.a.find('span', attrs={
        'class': 'subsection-title-text'}
        ).text
    summary = doc_list_item.a.find('span', attrs={
        'class': 'subsection-summary'}
        ).text
    url = GOV_UK_HOME_URL + doc_list_item.a['href']
    
    tx = graph.begin()
    tx.evaluate('''
        CREATE (doc:Document {
            title: $title, summary: $summary, url: $url
            })
        ''', parameters = {'title': title, 'summary':summary, 'url': url})
    tx.commit()

In [47]:
"""Creating Section nodes related to all document nodes"""

# Query graph for all document nodes
docs = list(matcher.match("Document").skip(1)) #Skip immigration rules index document

# Loop through docs
for doc in docs:
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.get(doc['url']) # Get url from document node, open page for scraping

    content = driver.page_source
    soup = BeautifulSoup(content)

    driver.close()

    # Create section nodes connected to each doc
    for section in soup.findAll(attrs={'class': 'js-subsection-title'}):
        section_url = doc['url'] + '#' + section['id']

        tx = graph.begin()
        tx.evaluate('''
            CREATE (sec:Section {title: $title, url: $url})
            ''', parameters = {'title': section.text, 'url': section_url})
        tx.evaluate('''
            MATCH (doc:Document), (sec:Section)
            WHERE sec.title = $title AND doc.title = $doc_title AND NOT ()-[: CONTAINS]->(sec)
            CREATE (doc)-[: CONTAINS]->(sec)
            ''', parameters = {'title': section.text, 'doc_title': doc['title']})
        tx.commit()

[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
 
[WDM] - Get LATEST driver version for 83.0.4103
[WDM] - Trying to download new driver from http://chromedriver.storage.googleapis.com/83.0.4103.39/chromedriver_win32.zip
[WDM] - Driver has been saved in cache [C:\Users\Joel\.wdm\drivers\chromedriver\win32\83.0.4103.39]
[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
[WDM] - Driver [C:\Users\Joel\.wdm\drivers\chromedriver\win32\83.0.4103.39\chromedriver.exe] found in cache
 
[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
[WDM] - Driver [C:\Users\Joel\.wdm\drivers\chromedriver\win32\83.0.4103.39\chromedriver.exe] found in cache
 
[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
[WDM] - Driver [C:\Users\Joel\.wdm\drivers\chromedriver\win32\83.0.4103.39\chromedriver.exe] found in cache
 
[W

In [48]:
"""Adding Paragraph labels to Section nodes which are also Paragraphs"""
tx = graph.begin()
tx.evaluate('''
    MATCH (s:Section), (d:Document)
    WHERE s.title =~ "^[0-9].*" AND NOT d.title CONTAINS 'Appendix' AND (d)-[:CONTAINS]-(s)
    SET s:Paragraph
    ''')
tx.commit()

In [49]:
"""Adding index property to Sections which are Paragraphs"""
sec_pars = list(matcher.match("Section").where("_: Paragraph")) # match sections which are also paragraphs

for sec_par in sec_pars:
    title = sec_par['title']
    index = title.split('.')[0]

    tx = graph.begin()
    tx.evaluate('''
        MATCH (p:Paragraph)
        WHERE p:Section AND p.title = $title
        SET p.index = $index
        ''', parameters = {'title': title, 'index': index})
    tx.commit()

In [50]:
#Open documents url for scraping
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(POINT_BASED_SYSTEM_URL)

content = driver.page_source
soup = BeautifulSoup(content)

driver.close()

[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
[WDM] - Driver [C:\Users\Joel\.wdm\drivers\chromedriver\win32\83.0.4103.39\chromedriver.exe] found in cache
 


In [51]:
"""Creating Paragraph Nodes for Paragraphs under the Sections of the Point Based System Part 6a Document"""
for section in soup.findAll(attrs={'class': 'js-openable'}):
    for tag in section.div.contents: # Loop through tags in section
        if(tag.name == 'h3' and tag.text != 'Notes'): # If it is an h3 tag and the tag's text is not Notes
            par_title_parts = tag.text.split('.')
            title = par_title_parts[-1].strip()
            index = par_title_parts[0]

            tx = graph.begin()
            tx.evaluate('''
                CREATE (par:Paragraph {title: $title, index: $index})
                ''', parameters = {'title': title, 'index': index})
            tx.evaluate('''
                MATCH (sec:Section), (par:Paragraph), (doc: Document)
                WHERE sec.title = $sec_title AND par.index = $index AND doc.title CONTAINS 'Appendix C:' AND NOT (doc)-[:CONTAINS]->(sec)
                CREATE (sec)-[: CONTAINS]->(par) 
                ''', parameters = {'sec_title': section.h2.text, 'index': index})
            tx.commit()


In [52]:
"""Extracting the contents/texts of each paragraph into a list for further processing"""
for section in soup.findAll(attrs={'class': 'js-openable'}):
    if section.h2.text == 'Tier 4 (General) Student':
        section_tags = section.div.findAll({'p', 'ol', 'h3'}, recursive=False)
        break

In [53]:
len(section_tags)

26

In [54]:
"""Adding rules to paragraphs"""
for tag in section_tags:
    if tag.name == 'h3' and tag.text != 'Notes':
        par_index = tag.text.split('.')[0]

    elif tag.name != 'h3':
        if tag.name == 'ol':
            # TODO: Explore if previous tag is p tag and text contains requirements, list items become sub-rules of that rule

            for item in tag.findAll('li', recursive=False):
                rule = item.text

                tx = graph.begin()
                tx.evaluate('''
                    CREATE (rule:Rule {desc: $desc})
                    ''', parameters = {'desc': rule})
                tx.evaluate('''
                    MATCH (par:Paragraph), (rule:Rule)
                    WHERE par.index = $index and rule.desc = $desc AND NOT ()-[:CONTAINS]->(rule)
                    CREATE (par)-[:CONTAINS]->(rule)
                    ''', parameters = {'index': par_index, 'desc': rule})
                tx.commit()

        else:
            rule = tag.text
        
            tx = graph.begin()
            tx.evaluate('''
                CREATE (rule:Rule {desc: $desc})
                ''', parameters = {'desc': rule})
            tx.evaluate('''
                MATCH (par:Paragraph), (rule:Rule)
                WHERE par.index = $index and rule.desc = $desc AND NOT ()-[:CONTAINS]->(rule)
                CREATE (par)-[:CONTAINS]->(rule)
                ''', parameters = {'index': par_index, 'desc': rule})
            tx.commit()

In [55]:
rules = list(matcher.match('Rule'))
rules

[(_386:Rule {desc: '(ga)  If the course is at degree level or above, the grant of entry clearance the applicant is seeking must not lead to the applicant having been granted more than 5 years in the UK since the age of 18 as a Tier 4 (General) Migrant, or as a Student, to study courses at degree level or above unless:\n    \n(i) the applicant has successfully completed a course at degree level in the UK of a minimum duration of 4 academic years, and will follow a course of study at Masters degree level sponsored by a higher education provider with a track record of compliance, and the grant of entry clearance must not lead to the applicant having spent more than 6 years in the UK since the age of 18 as a Tier 4 (General) Migrant, or as a Student, studying courses at degree level or above; or\n(ii) the grant of entry clearance is to follow a course leading to the award of a PhD, and the applicant is sponsored by a higher education provider with a track record of compliance; or\n(iii) th