In [1]:
# Check if project packages are installed
! pip show selenium webdriver-manager pandas beautifulsoup4 py2neo

Name: selenium
Version: 3.141.0
Summary: Python bindings for Selenium
Home-page: https://github.com/SeleniumHQ/selenium/
Author: UNKNOWN
Author-email: UNKNOWN
License: Apache 2.0
Location: c:\users\joel\anaconda3\lib\site-packages
Requires: urllib3
Required-by: 
---
Name: webdriver-manager
Version: 3.2.1
Summary: Library provides the way to automatically manage drivers for different browsers
Home-page: https://github.com/SergeyPirogov/webdriver_manager
Author: Sergey Pirogov
Author-email: automationremarks@gmail.com
License: UNKNOWN
Location: c:\users\joel\anaconda3\lib\site-packages
Requires: configparser, requests, crayons
Required-by: 
---
Name: pandas
Version: 0.24.1
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: http://pandas.pydata.org
Author: None
Author-email: None
License: BSD
Location: c:\users\joel\anaconda3\lib\site-packages
Requires: numpy, python-dateutil, pytz
Required-by: statsmodels, seaborn
---
Name: beautifulsoup4
Version:

In [16]:
# install packages if not already installed
! pip install selenium webdriver-manager pandas beautifulsoup4 py2neo



In [1]:
# Import packages
import os
import re
from getpass import getpass

import pandas as pd
from bs4 import BeautifulSoup
from py2neo import Graph, Node, Relationship, NodeMatcher, RelationshipMatcher
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# URL constants: the GOV.UK base address and the immigration-rules pages that
# are scraped below. The longer URLs are built from the shorter ones so the
# shared prefix is written only once.
GOV_UK_HOME_URL = "https://www.gov.uk"
DOCUMENTS_URL = GOV_UK_HOME_URL + "/guidance/immigration-rules"
POINT_BASED_SYSTEM_URL = (
    DOCUMENTS_URL + "/immigration-rules-part-6a-the-points-based-system"
)

In [3]:
# Establish database connection.
# Connection settings are read from the environment so that no secret is
# stored in the notebook. The URI and user fall back to the local defaults;
# the password is prompted for when NEO4J_PASSWORD is not set.
NEO4J_URI = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
NEO4J_USER = os.environ.get('NEO4J_USER', 'neo4j')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD') or getpass('Neo4j password: ')

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [4]:
# Node matcher bound to the graph; later cells use it to fetch nodes by label
matcher = NodeMatcher(graph)

In [9]:
# Open the immigration rules documents page and parse its HTML for scraping
driver = webdriver.Chrome(ChromeDriverManager().install())
try:
    driver.get(DOCUMENTS_URL)
    content = driver.page_source
finally:
    # quit() shuts down the whole WebDriver session (close() only closes the
    # current window), and the finally clause runs it even if the page load fails
    driver.quit()

# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed
soup = BeautifulSoup(content, 'html.parser')

[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
[WDM] - Driver [C:\Users\Joel\.wdm\drivers\chromedriver\win32\83.0.4103.39\chromedriver.exe] found in cache
 


In [10]:
"""Creating Immigration Document Nodes"""
# The list of immigration documents is the element with class 'section-list'
section_list = soup.find(attrs={'class': 'section-list'})
if section_list is None:
    raise ValueError(
        "No element with class 'section-list' found; the page layout may have changed"
    )

for doc_list_item in section_list.find_all('li'):
    link = doc_list_item.a
    title = link.find('span', attrs={'class': 'subsection-title-text'}).text
    summary = link.find('span', attrs={'class': 'subsection-summary'}).text
    url = GOV_UK_HOME_URL + link['href']  # href is site-relative

    # MERGE on the url instead of CREATE: re-running this cell updates the
    # existing Document node rather than adding a duplicate, which matters
    # because later cells look documents up by title.
    tx = graph.begin()
    tx.evaluate('''
        MERGE (doc:Document {url: $url})
        SET doc.title = $title, doc.summary = $summary
        ''', parameters = {'title': title, 'summary': summary, 'url': url})
    tx.commit()

In [12]:
# Query graph for all document nodes
docs = list(matcher.match("Document").skip(1)) #Skip immigration rules index document
# NOTE(review): skip(1) assumes the index document is returned first, but no
# ordering is requested from the matcher — confirm this holds.

# One browser session is shared by every document instead of starting a new
# Chrome per page; the finally clause shuts it down even if a page fails.
driver = webdriver.Chrome(ChromeDriverManager().install())
try:
    # Loop through docs
    for doc in docs:
        driver.get(doc['url']) # Get url from document node, open page for scraping

        content = driver.page_source
        # Explicit parser so the result does not depend on installed parser libraries
        soup = BeautifulSoup(content, 'html.parser')

        # Create section nodes connected to each doc
        for section in soup.find_all(attrs={'class': 'js-subsection-title'}):
            # The section is created and linked in a single statement, so the
            # relationship always targets the node just created rather than
            # any Section in the graph that happens to share its title.
            tx = graph.begin()
            tx.evaluate('''
                MATCH (doc:Document)
                WHERE doc.title = $doc_title
                CREATE (doc)-[:CONTAINS]->(sec:Section {title: $title})
                ''', parameters = {'title': section.text, 'doc_title': doc['title']})
            tx.commit()
finally:
    driver.quit()

et LATEST driver version for 83.0.4103
INFO:WDM:Get LATEST driver version for 83.0.4103
[WDM] - Driver [C:\Users\Joel\.wdm\drivers\chromedriver\win32\83.0.4103.39\chromedriver.exe] found in cache
INFO:WDM:Driver [C:\Users\Joel\.wdm\drivers\chromedriver\win32\83.0.4103.39\chromedriver.exe] found in cache
 
[WDM] - Current google-chrome version is 83.0.4103
INFO:WDM:Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
INFO:WDM:Get LATEST driver version for 83.0.4103
[WDM] - Driver [C:\Users\Joel\.wdm\drivers\chromedriver\win32\83.0.4103.39\chromedriver.exe] found in cache
INFO:WDM:Driver [C:\Users\Joel\.wdm\drivers\chromedriver\win32\83.0.4103.39\chromedriver.exe] found in cache
 
[WDM] - Current google-chrome version is 83.0.4103
INFO:WDM:Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
INFO:WDM:Get LATEST driver version for 83.0.4103
[WDM] - Driver [C:\Users\Joel\.wdm\drivers\chromedriver\win32\83.0.4103.39

In [9]:
"""Adding Paragraph labels to Section nodes which are also Paragraphs"""
# A Section receives the Paragraph label when its title starts with a digit
# and it is connected to a Document whose title does not contain 'Appendix'.
label_paragraphs_query = '''
    MATCH (s:Section), (d:Document)
    WHERE s.title =~ "^[0-9].*" AND NOT d.title CONTAINS 'Appendix' AND (d)-[:CONTAINS]-(s)
    SET s:Paragraph
    '''

tx = graph.begin()
tx.evaluate(label_paragraphs_query)
tx.commit()

In [22]:
# Fetch every Section node that also carries the Paragraph label
sec_pars = list(matcher.match("Section").where("_: Paragraph"))

# Stores the index on each node that is both Paragraph and Section and has the given title
set_index_query = '''
        MATCH (p:Paragraph)
        WHERE p:Section AND p.title = $title
        SET p.index = $index
        '''

for paragraph_section in sec_pars:
    heading = paragraph_section['title']
    # The index is the part of the title before the first '.'
    prefix = heading.partition('.')[0]

    tx = graph.begin()
    tx.evaluate(set_index_query, parameters=dict(title=heading, index=prefix))
    tx.commit()

In [23]:
# Open the points-based system (Part 6A) page and parse its HTML for scraping
driver = webdriver.Chrome(ChromeDriverManager().install())
try:
    driver.get(POINT_BASED_SYSTEM_URL)
    content = driver.page_source
finally:
    # quit() shuts down the whole WebDriver session (close() only closes the
    # current window), and the finally clause runs it even if the page load fails
    driver.quit()

# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed
soup = BeautifulSoup(content, 'html.parser')

[WDM] - Current google-chrome version is 83.0.4103
INFO:WDM:Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
INFO:WDM:Get LATEST driver version for 83.0.4103
 
[WDM] - Get LATEST driver version for 83.0.4103
INFO:WDM:Get LATEST driver version for 83.0.4103
[WDM] - Trying to download new driver from http://chromedriver.storage.googleapis.com/83.0.4103.39/chromedriver_win32.zip
INFO:WDM:Trying to download new driver from http://chromedriver.storage.googleapis.com/83.0.4103.39/chromedriver_win32.zip
[WDM] - Driver has been saved in cache [C:\Users\Joel\.wdm\drivers\chromedriver\win32\83.0.4103.39]
INFO:WDM:Driver has been saved in cache [C:\Users\Joel\.wdm\drivers\chromedriver\win32\83.0.4103.39]


In [27]:
"""Creating Paragraph Nodes for Paragraphs under the Sections of the Point Based System Part 6a Document"""
for section in soup.find_all(attrs={'class': 'js-openable'}):
    for tag in section.div.contents: # Loop through tags in section
        # Only h3 headings other than 'Notes' are paragraph headings
        if tag.name != 'h3' or tag.text == 'Notes':
            continue

        # Split on the first '.' only: the text before it is the index and
        # everything after it is the title, so a title that itself contains
        # a full stop is kept whole instead of being cut to its last fragment.
        par_title_parts = tag.text.split('.', 1)
        title = par_title_parts[-1].strip()
        index = par_title_parts[0].strip()

        # The paragraph is created and then linked through WITH, so the
        # relationship targets exactly the node just created. Re-matching by
        # index alone would also link any other Paragraph with that index.
        # NOTE(review): the 'Appendix C:' condition presumably excludes
        # same-titled sections that belong to Appendix C — confirm.
        tx = graph.begin()
        tx.evaluate('''
            CREATE (par:Paragraph {title: $title, index: $index})
            WITH par
            MATCH (sec:Section), (doc:Document)
            WHERE sec.title = $sec_title AND doc.title CONTAINS 'Appendix C:' AND NOT (doc)-[:CONTAINS]->(sec)
            CREATE (sec)-[:CONTAINS]->(par)
            ''', parameters = {'title': title, 'index': index, 'sec_title': section.h2.text})
        tx.commit()
