Dynamic web scraping example using selenium by Hans (h.h.sievertsen@bristol.ac.uk).

Objective: Collect class sizes and save them in a csv file.

First version: May 2013. 

This version: October 2018 (adapted to a Jupyter notebook).

Python version: 2.7

In [1]:
# load packages
# selenium is used for dynamic website scraping
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd


In [None]:
# Location of the Firefox web driver executable (geckodriver); Chrome is also possible.
# Written as a raw string so the Windows backslashes need no escaping.
webdriver_path = r'C:\Users\hhsie\Dropbox\programming and statistics\Python\webdriver\geckodriver.exe'


Check out the website. 
Go to https://www.lectio.dk/

In [None]:
# Start a Firefox session through the configured web driver and load the
# Lectio login list page.
driver = webdriver.Firefox(executable_path=webdriver_path)
driver.get("https://www.lectio.dk/lectio/login_list.aspx")


In [None]:
# Shut the browser down. quit() ends the whole WebDriver session (all windows
# plus the background driver process); close() would only close the current
# window, and the next cell rebinds `driver` to a brand-new session.
driver.quit()

In [None]:
# Start a fresh browser on the login list page and collect every anchor
# element that carries an href attribute.
driver = webdriver.Firefox(executable_path=webdriver_path)
driver.get("https://www.lectio.dk/lectio/login_list.aspx")
schools = driver.find_elements(By.XPATH, "//a[@href]")
# Print the target address of each collected link.
for school_link in schools:
    print(school_link.get_attribute('href'))
    

In [None]:
# Open the first collected link in a browser window of its own, so the
# list page in `driver` stays loaded.
schoolsite = webdriver.Firefox(executable_path=webdriver_path)
first_school_url = schools[0].get_attribute('href')
schoolsite.get(first_school_url)

In [None]:
# Follow the link labelled "Klasse" on the school page to reach the class list.
classlink = schoolsite.find_element(By.LINK_TEXT, "Klasse")
classlink.click()

In [None]:
# Keep every link whose href contains 'klasseid' (the class links) ...
allclasses = schoolsite.find_elements(By.XPATH, "//a[contains(@href, 'klasseid')]")
# ... and print where each of them points.
for class_link in allclasses:
    print(class_link.get_attribute('href'))

In [None]:
# Open the first class link in a third browser window.
classsite = webdriver.Firefox(executable_path=webdriver_path)
first_class_url = allclasses[0].get_attribute('href')
classsite.get(first_class_url)

In [None]:
# Follow the link labelled "Elever" on the class page to reach the student list.
studentlink = classsite.find_element(By.LINK_TEXT, "Elever")
studentlink.click()

In [None]:
# Read the class size and the school name from the student-list page.

# Class size: the label text minus its first 14 characters.
# NOTE(review): presumably those 14 characters are a fixed caption in front
# of the number - confirm against the live page.
classsize_element = classsite.find_element(By.XPATH, "//span[@id='s_m_Content_Content_additionalInfoLbl']")
class_size = classsize_element.text[14:]

# School name: taken from the element with id "s_m_masterleftDiv".
school_element = classsite.find_element(By.XPATH, '//*[@id="s_m_masterleftDiv"]')
# Remove line breaks. The text is deliberately NOT encoded to UTF-8 bytes
# here: calling replace()/find() on bytes with str arguments raises
# TypeError on Python 3, and text can be concatenated with class_size safely.
school_name = school_element.text.replace('\n', '')
# Cut the name at the first "  20" marker, but only if the marker exists:
# find() returns -1 on a miss, and slicing with [0:-1] would silently drop
# the last character of the name.
marker_pos = school_name.find("  20")
if marker_pos != -1:
    school_name = school_name[:marker_pos]

print("School name: " + school_name + ". Class size: " + class_size)

In [None]:
# End all three browser sessions, in the same order as before.
for browser in (schoolsite, driver, classsite):
    browser.quit()

In [None]:
# Bring everything together: walk over every link on the login list page,
# open each school's class list, visit every class, and store the school
# name and class size in data_output.csv.

# One browser per navigation level (list page, school page, class page).
# NOTE(review): presumably this keeps the link elements collected on one
# level usable while the next level is being loaded - confirm.
driver = webdriver.Firefox(executable_path=webdriver_path)
classsite = webdriver.Firefox(executable_path=webdriver_path)
schoolsite = webdriver.Firefox(executable_path=webdriver_path)

# Results are collected as plain rows and turned into a DataFrame when
# written; DataFrame.append copies the whole frame on every call and no
# longer exists in pandas 2.x.
result_columns = ['Schoolname', "Class_size"]
result_rows = []
df = pd.DataFrame(columns=result_columns)

try:
    # Open the list page and collect every anchor with an href.
    driver.get("https://www.lectio.dk/lectio/login_list.aspx")
    schools = driver.find_elements(By.XPATH, "//a[@href]")
    for s in schools:
        # Open the link in the school browser and go to its class list.
        schoolsite.get(s.get_attribute('href'))
        classlink = schoolsite.find_element(By.LINK_TEXT, "Klasse")
        classlink.click()
        # All links whose href contains 'klasseid'.
        allclasses = schoolsite.find_elements(By.XPATH, "//a[contains(@href, 'klasseid')]")
        for c in allclasses:
            # Open the class in the class browser and go to its student list.
            classsite.get(c.get_attribute('href'))
            studentlink = classsite.find_element(By.LINK_TEXT, "Elever")
            studentlink.click()
            # Class size: label text minus its first 14 characters.
            # NOTE(review): presumably a fixed caption precedes the number - confirm.
            classsize_element = classsite.find_element(By.XPATH, "//span[@id='s_m_Content_Content_additionalInfoLbl']")
            class_size = classsize_element.text[14:]
            # School name: text of the element with id "s_m_masterleftDiv",
            # without line breaks. Kept as text (encoding happens once, when
            # the CSV is written) because replace()/find() on bytes with str
            # arguments raises TypeError on Python 3.
            school_element = classsite.find_element(By.XPATH, '//*[@id="s_m_masterleftDiv"]')
            school_name = school_element.text.replace('\n', '')
            # Cut at the first "  20" marker only when it is present; find()
            # returns -1 on a miss and [0:-1] would drop the last character.
            marker_pos = school_name.find("  20")
            if marker_pos != -1:
                school_name = school_name[:marker_pos]
            result_rows.append([school_name, class_size])
            # Rewrite the CSV after every class so that an interrupted run
            # still leaves the results gathered so far on disk.
            df = pd.DataFrame(result_rows, columns=result_columns)
            df.to_csv('data_output.csv', index=False, encoding='utf-8')
finally:
    # Always end the browser sessions, even when a page lookup fails,
    # so no Firefox/driver processes are left behind.
    schoolsite.quit()
    driver.quit()
    classsite.quit()

