# Web Scaping

## Scraping a single page - Tests

In [1]:
import requests
from bs4 import BeautifulSoup
from dateutil import parser 

In [2]:
#This is just to make it so the request goes through and the Dawson website does not decline it 
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'referrer': 'https://google.com'
}

In [3]:
# URL of program page for pure and applied
url = 'https://www.dawsoncollege.qc.ca/pure-applied/'
r = requests.get(url, headers=headers)

In [4]:
# Status code is 200 means all is good and request went through
r.status_code

200

In [5]:
# Get HTML of the page
html = r.text.strip()

In [6]:
# Make html of page a BeautifulSoup 
soup = BeautifulSoup(html, 'lxml')

In [7]:
# Header Content of page
content = soup.find(class_='entry-content')
#Get program title and strip any white spaces
programTitle = content.find('h2').contents[0].strip()

In [8]:
# Get when last modified
dateModified = soup.find(class_='page-mod-date').contents[0].strip()
# Remove the 'Last Modified: ' field so just date is left
dateModified = dateModified[15:]
print(dateModified)

August 8, 2018


## Function to parse a single page

In [12]:
# Parse a single Program page at Dawson College
def parseProgramPage(url):
    
    #This is just to make it so the request goes through and the Dawson website does not decline it 
    headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'referrer': 'https://google.com'
    }
    
    # URL of program page for pure and applied
    r = requests.get(url, headers=headers)
    
    print(r.status_code)
    
    if (r.status_code == requests.codes.ok):
    
        # Get HTML of the page
        html = r.text.strip()

        # Make html of page a BeautifulSoup 
        soup = BeautifulSoup(html, 'lxml')
    
        programDateMod = getDateOfMods(soup)
    
        #Page data to be returned
        pageData = {
            'date': programDateMod,
        }
    
        return pageData
    else: 
        return '404'

#Get the date of when the page was last modified
def getDateOfMods(soup):
    # Get when last modified
    dateModified = soup.find(class_='page-mod-date').contents[0].strip()
    # Remove the 'Last Modified: ' field so just date is left
    dateModified = dateModified[15:]
    return dateModified
    

## Functions to parse all of the programs

In [13]:
# An array to hold all the programs retrieved
programs = []

### Testing

In [14]:
    #This is just to make it so the request goes through and the Dawson website does not decline it 
    headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'referrer': 'https://google.com'
    }
    
    url = 'https://www.dawsoncollege.qc.ca/programs'
    
    # URL of program page for pure and applied
    r = requests.get(url, headers=headers)
    
    # Get HTML of the page
    html = r.text.strip()
    
    # Make html of page a BeautifulSoup 
    soup = BeautifulSoup(html, 'lxml')
    
    content = soup.find(class_='entry-content')
    
    #Get all the rows
    trTags = content.find_all('tr')
    
    #Loop through each row
    for tr in trTags:
        
        #Means a program continue (not a heading)
        if(tr.find(class_='program-name')):
            #Get program url 
            programUrlEnd = tr.find(class_='program-name').find('a')['href']
            programUrl = 'https://www.dawsoncollege.qc.ca/' + programUrlEnd
            print(programUrl)
            #Get page info
            pageInfo = parseProgramPage(programUrl)
            print(pageInfo)
            #Add program type
            #If 404 means page was not found so just have name and type
            if (pageInfo == '404'):
                pageInfo = {
                
                    'programName': tr.find(class_='program-name').find('a').contents[0].strip()
                    'type' = tr.find(class_='program-type').contents[0].strip()
                }
            #print(pageInfo)
            pageInfo['type'] = tr.find(class_='program-type').contents[0].strip()
            pageInfo['programName']: tr.find(class_='program-name').find('a').contents[0].strip()
            programs.append(pageInfo)
    

https://www.dawsoncollege.qc.ca//3d/
200
{'programName': 'Program Information', 'date': 'October 12, 2018'}
https://www.dawsoncollege.qc.ca//accounting-and-management-technology/
200
{'programName': 'Program Information', 'date': 'August 8, 2018'}
https://www.dawsoncollege.qc.ca//anthropology/
200
{'programName': 'What is anthropology?', 'date': 'August 15, 2017'}
https://www.dawsoncollege.qc.ca//arts-culture/
200
{'programName': 'Arts & Culture (500.G1)', 'date': 'August 28, 2018'}
https://www.dawsoncollege.qc.ca//programs/creative-applied-arts/arts-literature-and-communication/
200
{'programName': 'University Studies', 'date': 'April 12, 2019'}
https://www.dawsoncollege.qc.ca//biology/
200


AttributeError: 'NoneType' object has no attribute 'contents'