In [None]:
import os
import re
import time
import random

from selenium import webdriver
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup

import pandas as pd

In [None]:
# Base address that every school link from the list is appended to
PREFIX = 'https://reports.ofsted.gov.uk'

# Load the scraped school list and pull out the per-school links
school_data = pd.read_csv('data/all_schools.csv')
school_links = school_data.loc[:, 'link'].tolist()

In [None]:
# Start a Chrome session through Selenium; the scraping loop loads every
# school page with this `driver`.
driver = webdriver.Chrome()

In [None]:
# Titles of timeline entries that carry no inspection information
_SKIPPED_EVENT_TITLES = ('opened', 'proposed to open', 'registration')


def _extract_ratings(page, school_id):
    """Build a one-row DataFrame with the last inspection judgements of a school.

    Parameters
    ----------
    page : BeautifulSoup
        Parsed HTML of the school's report page.
    school_id : str
        Identifier stored in the ``school_id`` column.

    Returns
    -------
    pandas.DataFrame
        A single row. Pages with subjudgements yield ``overall_info``,
        ``overall_judgement`` and one column per subjudgement (sorted by name);
        pages with only a rating scale yield ``is_short_overall`` and
        ``overall_judgement``; any other page yields just ``school_id``.

    Raises
    ------
    AttributeError
        If an expected child element is missing from the matched section.
    """
    values = [school_id]
    column_names = ['school_id']
    overall = page.find('div', class_='subjudgements__overall')
    if overall is not None:
        # The school has subjudgements
        column_names.extend(['overall_info', 'overall_judgement'])
        values.append(overall.find('p').get_text().strip())
        values.append(overall.find('strong').get_text().strip())
        # Subjudgements might differ between schools, so collect whatever is
        # present and sort by name to get a stable column order
        subjudgements = sorted(
            [item.find('p').get_text().strip(), item.find('strong').get_text().strip()]
            for item in page.find_all('div', class_='subjudgements__rates__item')
        )
        for name, judgement in subjudgements:
            column_names.append(name)
            values.append(judgement)
    elif page.find('ol', class_='rating-scale') is not None:
        # The school has only the overall judgement
        column_names.extend(['is_short_overall', 'overall_judgement'])
        values.append(True)
        values.append(page.find('li', class_='rating--selected').find('span').get_text().strip())
    return pd.DataFrame([values], columns=column_names)


def _extract_inspections(page, school_id):
    """Build a DataFrame with one row per inspection record in the page timeline.

    Parameters
    ----------
    page : BeautifulSoup
        Parsed HTML of the school's report page.
    school_id : str
        Identifier stored in the ``school_id`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``record_date``, ``link``, ``record_details``,
        ``publication_date`` and ``school_id``; empty if nothing qualifies.

    Raises
    ------
    IndexError
        If the page has no timeline, or a kept entry lacks a title, a link or
        either of its two ``<time>`` elements.
    """
    timeline = page.find_all('ol', class_='timeline')[0]
    records = []
    for event in timeline.find_all('div', class_='event'):
        # Check that it is not one of the initial entries (open/registration/etc).
        # Those entries do not contain any useful information, so skip them.
        # The title is stripped so surrounding whitespace cannot defeat the match.
        title = event.find_all('span', class_='event__title')[0].get_text().strip().lower()
        if (title in _SKIPPED_EVENT_TITLES
                or 'previously' in title
                or 'converted' in title):
            continue
        # First <time> is the inspection/record date, second the publication date
        dates = event.find_all('time')
        entry = event.find_all('a', href=True)[0]
        # Work on a parsed copy of the link so the page tree is left untouched,
        # and drop the visually hidden spans before reading the link text
        fragment = BeautifulSoup(str(entry), 'html.parser')
        for hidden in fragment.find_all('span', class_='nonvisual'):
            hidden.decompose()
        records.append([
            dates[0].get_text().strip(),
            entry['href'],  # link to the inspection pdf file
            fragment.get_text().strip(),
            dates[1].get_text().strip(),
        ])
    school_inspection_data = pd.DataFrame(
        records, columns=['record_date', 'link', 'record_details', 'publication_date'])
    school_inspection_data['school_id'] = school_id
    return school_inspection_data


# Make sure the output folders exist: otherwise every write fails and the
# failure would be misreported as a school with unexpected structure
for output_dir in ('data/ratings', 'data/inspections', 'data/exceptions'):
    os.makedirs(output_dir, exist_ok=True)

for school_link in school_links:
    driver.get(f'{PREFIX}{school_link}')
    page = BeautifulSoup(driver.page_source, 'html.parser')
    # The id is built from the last two path segments of the link
    school_link_split = school_link.split('/')
    school_id = school_link_split[-2] + '_' + school_link_split[-1]

    try:
        # Last inspection info scraping
        _extract_ratings(page, school_id).to_csv(
            f'data/ratings/{school_id}.csv', index=False)
        # Inspection data scraping
        _extract_inspections(page, school_id).to_csv(
            f'data/inspections/{school_id}.csv', index=False)
    # Catch schools with unexpected structure
    # Save their ids for future investigation
    # (Exception, not a bare except, so that interrupting the run still stops it)
    except Exception:
        with open(f'data/exceptions/{school_id}.txt', 'w') as file:
            file.write(school_id)
    # Random 5-8 second pause between page loads
    # (presumably to keep the request rate low; confirm the intended limit)
    time.sleep(random.uniform(5, 8))

In [None]:
# Combine the per-school inspection files into a single table.
# Only .csv entries are read, so stray entries in the folder (hidden files,
# checkpoint folders) cannot break the load, and names are sorted because
# os.listdir returns them in arbitrary order.
inspection_files = sorted(
    name for name in os.listdir('data/inspections/') if name.endswith('.csv')
)
inspection_frames = [
    pd.read_csv(f'data/inspections/{name}') for name in inspection_files
]
# ignore_index gives the combined table a unique 0..n-1 index instead of
# repeating each file's own row numbers
inspection_data = pd.concat(inspection_frames, ignore_index=True)

In [None]:
# Save the combined inspection table as one CSV, without the index column
inspection_data.to_csv('data/inspection_data.csv', index=False)

In [None]:
# Combine the per-school rating files into a single table.
# Only .csv entries are read, so stray entries in the folder (hidden files,
# checkpoint folders) cannot break the load, and names are sorted because
# os.listdir returns them in arbitrary order.
rating_files = sorted(
    name for name in os.listdir('data/ratings/') if name.endswith('.csv')
)
rating_frames = [
    pd.read_csv(f'data/ratings/{name}') for name in rating_files
]
# Files may have different columns; concat keeps the union of them.
# ignore_index gives the combined table a unique 0..n-1 index instead of
# repeating each file's own row numbers
rating_data = pd.concat(rating_frames, ignore_index=True)

In [None]:
# Save the combined rating table as one CSV, without the index column
rating_data.to_csv('data/rating_data.csv', index=False)