## Web Scrape

In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
import time
import os

In [2]:
# Suffix selecting which postcode workbook to process; the same value is
# reused when the output file name is built in the final cell.
# Raw strings keep the Windows backslashes literal: "\p", "\F" and "\C" are
# invalid escape sequences in an ordinary string literal and raise a warning
# on current Python versions.
Iteration = r"\postcodesNo"

df_postcodes = pd.read_excel(r'C:\Files\Code\CouncilDatascrape' + Iteration + '.xlsx')


In [3]:
# Display the loaded postcodes to confirm the workbook was read as expected.
df_postcodes

Unnamed: 0,Postcodes
0,SE25 5AQ
1,SW17 9HW
2,SE1 7TN
3,SE8 5SQ
4,SE1 8RL
...,...
102,SR2 7SN
103,DH4 6AA
104,S35 2UX
105,LS11 9DG


In [4]:
# Download (or reuse a cached copy of) a chromedriver binary, then wrap its
# path in a Service object that each webdriver.Chrome() session is started with.
chromedriver_path = ChromeDriverManager().install()
service = Service(chromedriver_path)

In [5]:
#create function that takes a postcode string and looks up its local authority
def return_authority(postcode, timeout=5):
    """Look up the local authority for a postcode on gov.uk/find-local-council.

    A new Chrome session is started for every call using the module-level
    ``service`` object; the ``with`` block closes the browser when the
    function returns or raises.

    Parameters
    ----------
    postcode : str
        Postcode typed into the search box.
    timeout : int or float, default 5
        Seconds each explicit wait is allowed before it gives up.

    Returns
    -------
    pd.Series
        Two strings, in the order (borough, county):

        * two-tier result -> [district council, county council]
        * unitary result  -> [authority name, "N/A"]
        * anything else   -> ["Not a postcode", "Not a postcode"]

        "Anything else" covers the error page shown for an unknown postcode,
        but also a results page that did not load or parse within the timeout.

    Raises
    ------
    Exception
        Errors raised while loading the page, finding the search box or
        finding/clicking the search button are not caught and propagate
        to the caller.
    """
    #The context manager quits the driver on exit, so no explicit driver.quit() is needed
    with webdriver.Chrome(service=service) as driver:
        driver.get("https://www.gov.uk/find-local-council")
        wait = WebDriverWait(driver, timeout)

        #Locate the search box and type in the postcode
        input_element = wait.until(
            EC.presence_of_element_located((By.XPATH, '//input[@id="postcode"]')))
        input_element.send_keys(postcode)

        #Locate and click the search button
        button_element = wait.until(
            EC.presence_of_element_located(
                (By.XPATH, '//button[@class="gem-c-button govuk-button gem-c-button--bottom-margin"]')))
        button_element.click()

        #Three outcomes are handled, tried in this order:
        #  1. Two-tier result: the page has a "district-result group" div and the
        #     county council name is read from the first matching h3 heading.
        #  2. Unitary result (e.g. a London borough): the page has a
        #     "unitary-result group" div instead.
        #  3. Neither is found: treated as an invalid postcode.
        #A failure at one stage (normally a wait timing out) falls through to the next.

        #Stage 1: assume a two-tier area and read county + district council
        try:
            #First h3 with this class on the page; its text is used as the county council
            county_heading = wait.until(
                EC.presence_of_element_located((By.XPATH, '//h3[@class="gem-c-heading govuk-heading-m"]')))
            county_council = county_heading.text

            #The district div's text is the whole result paragraph; its first line
            #is the district (borough) council name.
            district_block = wait.until(
                EC.presence_of_element_located((By.XPATH, '//div[@class="district-result group"]')))
            borough_council = district_block.text.splitlines()[0]

            return pd.Series([borough_council, county_council])

        #Exception (not a bare except) so Ctrl-C / kernel interrupt still stops the scrape
        except Exception:

            #Stage 2: try the unitary (single authority) layout
            try:
                unitary_block = wait.until(
                    EC.presence_of_element_located((By.XPATH, '//div[@class="unitary-result group"]')))

                #First line reads "Your local authority is <name>."; keep only <name>
                first_line = unitary_block.text.splitlines()[0]
                london_council = first_line.split("is ", 1)[1].split(".", 1)[0]
                return pd.Series([london_council, "N/A"])

            except Exception:
                #Stage 3: no recognised result block, report the postcode as invalid
                return pd.Series(["Not a postcode", "Not a postcode"])

In [6]:
# Run the lookup once per postcode (one browser session each), then store the
# two returned values as the Borough and County columns.
authority_lookup = df_postcodes['Postcodes'].apply(return_authority)
df_postcodes[['Borough', 'County']] = authority_lookup

In [7]:
# Write the enriched table next to the input workbook, named after the same
# Iteration suffix, without the DataFrame index column.
output_path = r'C:\Files\Code\CouncilDatascrape' + Iteration + 'Authority.xlsx'
df_postcodes.to_excel(output_path, index=False)