In [1]:
# Check if project packages are installed
! pip show selenium webdriver-manager pandas beautifulsoup4 py2neo

Name: selenium
Version: 3.141.0
Summary: Python bindings for Selenium
Home-page: https://github.com/SeleniumHQ/selenium/
Author: UNKNOWN
Author-email: UNKNOWN
License: Apache 2.0
Location: c:\users\joel\anaconda3\lib\site-packages
Requires: urllib3
Required-by: 
---
Name: webdriver-manager
Version: 3.2.1
Summary: Library provides the way to automatically manage drivers for different browsers
Home-page: https://github.com/SergeyPirogov/webdriver_manager
Author: Sergey Pirogov
Author-email: automationremarks@gmail.com
License: UNKNOWN
Location: c:\users\joel\anaconda3\lib\site-packages
Requires: requests, configparser, crayons
Required-by: 
---
Name: pandas
Version: 0.24.1
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: http://pandas.pydata.org
Author: None
Author-email: None
License: BSD
Location: c:\users\joel\anaconda3\lib\site-packages
Requires: numpy, pytz, python-dateutil
Required-by: statsmodels, seaborn
---
Name: beautifulsoup4
Version:

In [16]:
# install packages if not already installed
! pip install selenium webdriver-manager pandas beautifulsoup4 py2neo



In [2]:
# Import packages
import os
import re
from getpass import getpass

import pandas as pd
from bs4 import BeautifulSoup
from py2neo import Graph, Node, Relationship
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
# URL constants: every page scraped below lives under the GOV.UK site root.
GOV_UK_HOME_URL = "https://www.gov.uk"
# Index page listing all Immigration Rules documents.
DOCUMENTS_URL = GOV_UK_HOME_URL + "/guidance/immigration-rules"
# Part 6A (the points-based system) is a sub-page of the index.
POINT_BASED_SYSTEM_URL = (
    DOCUMENTS_URL + "/immigration-rules-part-6a-the-points-based-system"
)

In [4]:
# Establish database connection
#
# Connection settings are read from the environment so that no credential is
# stored in the notebook. Set NEO4J_PASSWORD (and optionally NEO4J_URI /
# NEO4J_USER) before starting the kernel; if the password is not set, the
# notebook prompts for it without echoing the input.
# NOTE(review): a password was previously committed in this cell and should be rotated.
NEO4J_URI = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
NEO4J_USER = os.environ.get('NEO4J_USER', 'neo4j')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD') or getpass('Neo4j password: ')

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [8]:
# Open documents url for scraping
driver = webdriver.Chrome(ChromeDriverManager().install())
try:
    driver.get(DOCUMENTS_URL)
    content = driver.page_source
finally:
    # quit() ends the whole session, including the chromedriver process;
    # close() only closes the current window. The finally clause makes sure
    # the browser is shut down even when loading the page fails.
    driver.quit()

# Name the parser explicitly so the parse tree does not depend on which
# optional parser libraries happen to be installed.
soup = BeautifulSoup(content, 'html.parser')

[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
[WDM] - Driver [C:\Users\Joel\.wdm\drivers\chromedriver\win32\83.0.4103.39\chromedriver.exe] found in cache
 


In [9]:
"""Creating Immigration Document Nodes"""
# Locate the list of immigration documents on the index page.
docs = soup.find(attrs={'class': 'section-list'})
if docs is None:
    raise ValueError(
        "No element with class 'section-list' found in the scraped page; "
        "the page layout may have changed."
    )

# Scrape every list item before touching the database, so a scraping error
# cannot interrupt a half-finished write.
documents = []
for doc_list_item in docs.find_all('li'):
    link = doc_list_item.a
    documents.append({
        'title': link.find('span', attrs={'class': 'subsection-title-text'}).text,
        'summary': link.find('span', attrs={'class': 'subsection-summary'}).text,
        # hrefs on the page are site-relative, so prefix the site root
        'url': GOV_UK_HOME_URL + link['href'],
    })

# MERGE instead of CREATE: re-running this cell must not duplicate nodes.
# All documents are written in one transaction.
tx = graph.begin()
for document in documents:
    tx.evaluate('''
        MERGE (doc:Document {
            title: $title, summary: $summary, url: $url
            })
        ''', parameters = document)
tx.commit()

In [10]:
# Get points based system immigration rules (part 6a) web page
driver = webdriver.Chrome(ChromeDriverManager().install())
try:
    driver.get(POINT_BASED_SYSTEM_URL)
    content = driver.page_source
finally:
    # quit() ends the whole session, including the chromedriver process;
    # close() only closes the current window. The finally clause makes sure
    # the browser is shut down even when loading the page fails.
    driver.quit()

# Name the parser explicitly so the parse tree does not depend on which
# optional parser libraries happen to be installed.
soup = BeautifulSoup(content, 'html.parser')

[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
[WDM] - Driver [C:\Users\Joel\.wdm\drivers\chromedriver\win32\83.0.4103.39\chromedriver.exe] found in cache
 


In [11]:
"""Creating Section Nodes and connecting to the Point Based System Part 6a Document Node"""
# Title of the Document node that owns every section scraped from this page.
POINT_BASED_SYSTEM_DOC_TITLE = "Immigration Rules part 6A: the points-based system"

# MERGE instead of CREATE keeps the cell idempotent: re-running it neither
# duplicates Section nodes nor adds parallel CONTAINS relationships.
# The Section is merged first, so it is stored even when the Document node
# is missing (in that case only the relationship is skipped).
tx = graph.begin()
for section in soup.find_all(attrs={'class': 'js-subsection-title'}):
    tx.evaluate('''
        MERGE (sec:Section {title: $title})
        WITH sec
        MATCH (doc:Document)
        WHERE doc.title = $doc_title
        MERGE (doc)-[:CONTAINS]->(sec)
        ''', parameters = {
            'title': section.text,
            'doc_title': POINT_BASED_SYSTEM_DOC_TITLE,
        })
tx.commit()

In [132]:
"""Creating Paragraph Nodes for Paragraphs under the Tier 4 (General) Student section"""
TIER_4_SECTION_TITLE = "Tier 4 (General) Student"


def _save_paragraph(title, content):
    """Store one Paragraph node and link it to the Tier 4 (General) Student section.

    Args:
        title: text of the h3 heading that opens the paragraph.
        content: text collected between this heading and the next one.

    MERGE keeps the write idempotent when the cell is re-run. The Paragraph
    is merged first, so it is stored even if the Section node is missing.
    """
    tx = graph.begin()
    tx.evaluate('''
        MERGE (par:Paragraph {title: $title, content: $content})
        WITH par
        MATCH (sec:Section)
        WHERE sec.title = $section_title
        MERGE (sec)-[:CONTAINS]->(par)
        ''', parameters = {
            'title': title,
            'content': content,
            'section_title': TIER_4_SECTION_TITLE,
        })
    tx.commit()


# par_title is None until the first paragraph heading has been seen; it is
# initialised here so the cell does not depend on leftover kernel state.
par_title = None
par_content = ""
for section in soup.find_all(attrs={'class': 'js-openable'}):
    # Focus on the Tier 4 (General) Student section
    if section.h2 is None or section.h2.text != TIER_4_SECTION_TITLE:
        continue
    if section.div is None:
        continue
    for tag in section.div.contents:  # Loop through the section's direct children
        tag_name = getattr(tag, 'name', None)  # None for bare text nodes
        if tag_name == 'h3' and tag.text != 'Notes':
            # A new paragraph heading: store the paragraph collected so far
            # (if any) and start collecting the next one.
            if par_title is not None:
                _save_paragraph(par_title, par_content.strip())
            par_title = tag.text
            par_content = ""
        elif par_title is not None:
            # Everything else, including a 'Notes' heading and what follows
            # it, is appended to the current paragraph.
            # NOTE(review): assumes a paragraph's content is the plain text of
            # all nodes up to the next heading, and that anything before the
            # first heading can be dropped -- TODO confirm.
            par_content += tag.get_text() if tag_name else str(tag)

# Store the last paragraph in the section (no following heading triggers it).
if par_title is not None:
    _save_paragraph(par_title, par_content.strip())