In [56]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import re

In [63]:
# Open the Points-Based System immigration rules (Part 6A) page in Chrome.
# The driver path is a raw string: "\c" is not a valid escape sequence, so the
# non-raw literal only produced the right path by accident and triggers an
# invalid-escape warning on current Python versions.
driver = webdriver.Chrome(r"C:\chromedriver_win32\chromedriver.exe")
driver.get("https://www.gov.uk/guidance/immigration-rules/immigration-rules-part-6a-the-points-based-system")

In [103]:
# Accumulator lists for the scraped data.
documents = []
paragraph_headings = []
paragraphs = []
attributes = []
rules = []

# Snapshot the rendered HTML, then shut the browser down.
# quit() is used instead of close(): close() only closes the current window,
# while quit() also ends the WebDriver session so no chromedriver process is
# left running. The finally clause guarantees the shutdown even if reading the
# page source fails.
try:
    content = driver.page_source
finally:
    driver.quit()

# NOTE(review): no parser is named, so BeautifulSoup picks the "best" installed
# one and the parse tree may differ between machines - consider pinning it.
soup = BeautifulSoup(content)

In [None]:
# Collect the numbered headings: first the section titles (h2 elements carrying
# the js-subsection-title class), then every h3 heading. Only headings whose
# text starts with a digit are kept.
# The patterns are raw strings so that "\d" reaches the regex engine as written
# instead of relying on Python tolerating an invalid string escape.
for paragraph in soup.findAll('h2', attrs={'class': 'js-subsection-title'}):
    if re.search(r"^\d", paragraph.text):
        paragraph_headings.append(paragraph.text)

for paragraph in soup.findAll('h3'):
    if re.search(r"^\d", paragraph.text):
        paragraph_headings.append(paragraph.text)

# Remove deleted paragraphs (heading text ending in "DELETED") from the
# paragraph_headings list. The list is filtered in place in a single pass
# rather than calling list.remove() once per deleted heading.
deleted_paragraph_headings = [
    heading for heading in paragraph_headings if re.search(r"DELETED$", heading)
]
paragraph_headings[:] = [
    heading for heading in paragraph_headings if not re.search(r"DELETED$", heading)
]

In [None]:
# The first three section bodies are stored whole: each one contributes its
# full text as a single entry in paragraphs.
section_bodies = soup.findAll(attrs={'class': 'js-subsection-body body-content-wrapper'})
paragraphs.extend(body.text for body in section_bodies[:3])

In [None]:
p = ""  # text gathered so far for the paragraph currently being assembled
section_bodies = soup.findAll(attrs={'class': 'js-subsection-body body-content-wrapper'})
for div in section_bodies[3:]:  # work starts at the 4th section
    children = div.contents
    # A section whose first child is not an h3 heading is skipped entirely.
    if children and children[0].name != 'h3':
        continue
    for tag in children:
        if tag.name == 'h3' and tag.text != 'Notes':
            # Any h3 other than a table "Notes" heading starts a new paragraph:
            # store what has been gathered (if anything) and start afresh.
            if p:
                paragraphs.append(p)
            p = ""
        if tag.name in ('p', 'ol', 'table'):
            # Only p, ol and table elements contribute text to a paragraph.
            p += tag.text

# The loop never stores the paragraph that is still open when it ends.
paragraphs.append(p)

In [None]:
# Number of headings collected; this has to equal len(paragraphs) because the
# two lists are later combined column-wise into one DataFrame.
len(paragraph_headings)

In [None]:
# Number of paragraph texts collected; this has to equal len(paragraph_headings)
# because the two lists are later combined column-wise into one DataFrame.
len(paragraphs)

In [None]:
# Pair every heading with its paragraph text and write the pairs to a CSV file
# without a header row or an index column.
immigration_columns = {'Paragraph': paragraph_headings, 'Content': paragraphs}
df = pd.DataFrame(immigration_columns)
df.to_csv('immigration.csv', header=False, index=False, encoding='UTF8')

In [38]:
import getpass
import os

from py2neo import Graph, Node, Relationship

# Connect to the local Neo4j instance over Bolt as user "neo4j".
# The password is kept out of the notebook so it cannot leak when the file is
# shared or committed: it is read from the NEO4J_PASSWORD environment variable
# and, when that is unset or empty, requested interactively.
neo4j_password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")
graph = Graph('bolt://localhost:7687', auth=('neo4j', neo4j_password))

In [128]:
# Fetch the immigration rules index page and parse it.
# The driver path is a raw string because "\c" is not a valid escape sequence.
driver = webdriver.Chrome(r"C:\chromedriver_win32\chromedriver.exe")
try:
    driver.get("https://www.gov.uk/guidance/immigration-rules/immigration-rules-index")
    content = driver.page_source
finally:
    # Only the HTML snapshot is needed from here on; quit() ends the WebDriver
    # session so the browser and chromedriver process are not left running.
    driver.quit()

soup = BeautifulSoup(content)

In [129]:
documents = []
# Create one Document node (title + url) for every table row of the index page
# that contains a link, and record the titles in documents.
for doc in soup.findAll('tr'):
    link = doc.a
    if link is None or link.get('href') is None:
        continue  # rows without a link target cannot become a Document node
    title = link.text
    url = link['href']
    tx = graph.begin()
    # MERGE rather than CREATE: re-running this cell must not add a second
    # Document node with the same title and url, because later cells look
    # documents up by title only.
    tx.evaluate('''
        MERGE (doc:Document {title: $title, url: $url})
        ''', parameters = {'title': title, 'url': url})
    tx.commit()
    documents.append(title)

In [130]:
# Display the titles collected while creating the Document nodes.
documents

['Introduction',
 'Implementation and transitional provisions',
 'Application',
 'Interpretation',
 'Public funds clarification',
 'Part 1: General   provisions regarding leave   to enter or remain in the   United Kingdom',
 'Leave   to enter the United Kingdom',
 'Exercise of the power to refuse leave to enter the United Kingdom',
 'Suspension of leave to enter or remain in the United Kingdom',
 'Cancellation of leave to enter or remain in the United Kingdom',
 'Requirement for persons arriving in the United Kingdom or seeking entry through the Channel Tunnel to produce evidence of identity and nationality',
 'Requirement for a person not requiring leave to enter the United Kingdom to prove that he has the right   of abode',
 'Common Travel Area',
 'Admission for certain British passport holders',
 'Persons outside the United Kingdom',
 'Returning residents',
 'Non-lapsing   leave',
 'Holders of restricted travel documents and passports',
 'Leave to enter granted on arrival in the Uni

In [131]:
# Fetch the Points-Based System immigration rules (Part 6A) page and parse it.
# The driver path is a raw string because "\c" is not a valid escape sequence.
driver = webdriver.Chrome(r"C:\chromedriver_win32\chromedriver.exe")
try:
    driver.get("https://www.gov.uk/guidance/immigration-rules/immigration-rules-part-6a-the-points-based-system")
    content = driver.page_source
finally:
    # Only the HTML snapshot is needed from here on; quit() ends the WebDriver
    # session so the browser and chromedriver process are not left running.
    driver.quit()

soup = BeautifulSoup(content)

# Create a Section node for every element carrying the js-subsection-title
# class and link each one to the "Part 6A: Points-Based System" Document node.
for section in soup.findAll(attrs={'class': 'js-subsection-title'}):
    tx = graph.begin()
    # MERGE rather than CREATE keeps the cell safe to re-run: the link query
    # below matches sections by title, so duplicate Section nodes would each
    # receive their own CONTAINS relationship.
    tx.evaluate('''
        MERGE (sec:Section {title: $title})
        ''', parameters = {'title': section.text})
    tx.evaluate('''
        MATCH (doc:Document), (sec:Section) 
        WHERE sec.title = $title AND doc.title = "Part 6A: Points-Based System"
        MERGE (doc)-[: CONTAINS]->(sec)
        ''', parameters = {'title': section.text})
    tx.commit()

In [132]:
"""Creating Paragraph Nodes for Paragraphs under the Tier 4 (General) Student section"""
par_content = ""
TIER4_SECTION_TITLE = "Tier 4 (General) Student"


def _store_paragraph(title, content):
    """Write one Paragraph node and attach it to the Tier 4 (General) Student section.

    Parameters:
        title: heading text of the paragraph (text of its h3 element).
        content: concatenated text of everything that follows the heading.

    MERGE is used for both the node and the relationship so that re-running the
    cell does not duplicate either of them.
    """
    tx = graph.begin()
    tx.evaluate('''
        MERGE (par:Paragraph {title: $title, content: $content})
        ''', parameters = {'title': title, 'content': content})
    tx.evaluate('''
        MATCH (sec:Section), (par:Paragraph) 
        WHERE sec.title = $section AND par.title = $title
        MERGE (sec)-[: CONTAINS]->(par) 
        ''', parameters = {'section': TIER4_SECTION_TITLE, 'title': title})
    tx.commit()


par_title = None   # heading of the paragraph currently being collected
par_content = ""   # text collected so far for that paragraph
for section in soup.findAll(attrs={'class': 'js-openable'}):
    # Focus on the Tier 4 (General) Student section; blocks without an h2 are ignored.
    if section.h2 is None or section.h2.text != TIER4_SECTION_TITLE:
        continue
    for tag in section.div.contents:  # Loop through the children of the section body
        if tag.name == 'h3' and tag.text != 'Notes':
            # An h3 that is not a table "Notes" heading starts a new paragraph:
            # store the one collected so far, provided it has a heading and text.
            if par_title is not None and par_content != "":
                _store_paragraph(par_title, par_content)
            par_title = tag.text
            # Start the new paragraph with empty content; without this reset
            # every node would also contain the text of all earlier paragraphs.
            par_content = ""
        else:
            par_content = par_content + tag.text

# The loop never stores the paragraph that is still open when it ends.
# Nothing is written when no heading was found at all.
if par_title is not None:
    _store_paragraph(par_title, par_content)