# Web Scraping

## Scraping a single page - Tests

In [1]:
import requests
from bs4 import BeautifulSoup
from dateutil import parser 

In [2]:
# Browser-like request headers so the Dawson website does not decline the request.
# NOTE: the standard HTTP header name is spelled 'referer' (single "r" in the
# middle); a 'referrer' key would be sent as an unrelated, unrecognised header.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'referer': 'https://google.com',
}

In [3]:
# URL of the Pure and Applied program page
url = 'https://www.dawsoncollege.qc.ca/pure-applied/'
# Fetch the page; the timeout (in seconds) stops the cell from hanging forever
# if the server never answers (requests waits indefinitely by default).
r = requests.get(url, headers=headers, timeout=30)

In [4]:
# A status code of 200 means the request went through successfully
r.status_code

200

In [5]:
# Get the HTML of the page as text, with leading/trailing whitespace removed
html = r.text.strip()

In [6]:
# Parse the page's HTML into a BeautifulSoup object using the 'lxml' parser
soup = BeautifulSoup(html, 'lxml')

In [7]:
# Locate the page content container (the element with class 'entry-content')
content = soup.find(class_='entry-content')
# The program title is the first child of the container's first <h2>;
# trim any surrounding whitespace from it
titleHeading = content.find('h2')
programTitle = titleHeading.contents[0].strip()

In [8]:
# Read the first child of the 'page-mod-date' element and trim whitespace
modDateTag = soup.find(class_='page-mod-date')
dateModified = modDateTag.contents[0].strip()
# Drop the leading 'Last Modified: ' label (15 characters) so only the date is left
dateModified = dateModified[len('Last Modified: '):]
print(dateModified)

August 8, 2018


## Function to parse a single page

In [14]:
# Parse a single Program page at Dawson College
def parseProgramPage(url):
    """Fetch one program page and extract its metadata.

    Args:
        url: URL of the program page to request.

    Returns:
        A dict ``{'date': ...}`` whose value is whatever ``getDateOfMods``
        extracts from the parsed page, when the response status is 200 OK.
        For any other status code the string ``'404'`` is returned; callers
        compare against that sentinel, so it is kept as-is.

    Raises:
        requests.exceptions.RequestException: if the request fails or the
            server does not respond within the timeout.
    """
    # Browser-like headers so the Dawson website does not decline the request.
    # The HTTP header name is 'referer' (not 'referrer').
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'referer': 'https://google.com',
    }

    # Bound the wait so one unresponsive page cannot stall a whole scrape
    r = requests.get(url, headers=headers, timeout=30)

    # Anything other than 200 OK is reported with the '404' sentinel
    if r.status_code != requests.codes.ok:
        return '404'

    # Parse the page's HTML (whitespace-trimmed) with the lxml parser
    soup = BeautifulSoup(r.text.strip(), 'lxml')

    # Page data to be returned
    return {
        'date': getDateOfMods(soup),
    }

#Get the date of when the page was last modified
def getDateOfMods(soup):
    """Return the page's last-modified date text, or None if it is absent.

    Args:
        soup: Parsed page exposing ``find(class_=...)`` (a BeautifulSoup
            object where this file calls it).

    Returns:
        The whitespace-trimmed first child of the 'page-mod-date' element with
        a leading 'Last Modified:' label removed. If the text does not start
        with that label it is returned unchanged rather than being cut at a
        fixed offset. Returns None when the element is missing or has no
        children.
    """
    label = 'Last Modified:'

    modDateTag = soup.find(class_='page-mod-date')
    # Not every page is guaranteed to carry the element; report "no date"
    # instead of failing with AttributeError/IndexError.
    if modDateTag is None or not modDateTag.contents:
        return None

    dateModified = modDateTag.contents[0].strip()
    # Remove the label only when it is really there, then trim the gap after it
    if dateModified.startswith(label):
        dateModified = dateModified[len(label):].strip()
    return dateModified
    

## Functions to parse all of the programs

In [15]:
# A list to hold all the programs retrieved
programs = []

### Testing

In [None]:
    # Scrape the Dawson College programs index: for every program row, fetch the
    # program's own page via parseProgramPage() and append a record to `programs`.
    from urllib.parse import urljoin

    # Browser-like headers so the Dawson website does not decline the request.
    # The HTTP header name is 'referer' (not 'referrer').
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'referer': 'https://google.com',
    }

    # URL of the page listing all programs
    url = 'https://www.dawsoncollege.qc.ca/programs'

    # The timeout (in seconds) keeps the cell from hanging if the server stalls
    r = requests.get(url, headers=headers, timeout=30)

    # Get HTML of the page
    html = r.text.strip()

    # Make html of page a BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')

    content = soup.find(class_='entry-content')

    #Get all the rows
    trTags = content.find_all('tr')

    #Loop through each row
    for tr in trTags:

        # Rows without a 'program-name' cell are headings, not programs
        nameCell = tr.find(class_='program-name')
        if nameCell is None:
            continue

        #Get program url and name from the row's link (looked up once)
        programLink = nameCell.find('a')
        programUrlEnd = programLink['href']
        #Not an actual program
        if programUrlEnd == '/programs/general-education':
            continue
        # urljoin resolves the href against the site root, so a leading '/' in
        # the href does not produce a doubled slash in the final URL
        programUrl = urljoin('https://www.dawsoncollege.qc.ca/', programUrlEnd)
        programName = programLink.contents[0].strip()

        #Get page info
        pageInfo = parseProgramPage(programUrl)
        #If 404 means page was not found so just have name and type
        if pageInfo == '404':
            pageInfo = {
                'programName': programName,
            }

        #Add program type
        typeCell = tr.find(class_='program-type')
        if typeCell is not None:
            try:
                pageInfo['type'] = typeCell.contents[0].strip()
            #Since not a program and details are not present just continue.
            #Only the failures this line can produce are caught (empty cell,
            #or a first child that is not text), so Ctrl-C and real bugs
            #are no longer swallowed.
            except (IndexError, AttributeError, TypeError):
                continue
        # Assigned last so records from found pages keep the key order
        # date, type, programName
        pageInfo['programName'] = programName
        programs.append(pageInfo)
        print(pageInfo)
    print('Finished extracting')
    

{'date': 'October 12, 2018', 'type': 'Program', 'programName': '3D Animation & Computer Generated Imagery'}
{'date': 'August 8, 2018', 'type': 'Program', 'programName': 'Accounting and Management Technology'}
{'date': 'August 15, 2017', 'type': 'Discipline', 'programName': 'Anthropology'}
{'date': 'August 28, 2018', 'type': 'Profile', 'programName': 'Arts and Culture'}
{'date': 'April 12, 2019', 'type': 'Program', 'programName': 'Arts, Literature and Communication (ALC)'}
{'date': 'June 14, 2018', 'type': 'Discipline', 'programName': 'Biology'}
{'date': 'October 25, 2018', 'type': 'Program', 'programName': 'Biomedical Laboratory Technology'}
{'date': 'June 18, 2019', 'type': 'Discipline', 'programName': 'Chemistry'}
{'date': 'August 8, 2018', 'type': 'Profile', 'programName': 'Child Studies'}
{'date': 'August 28, 2018', 'type': 'Profile', 'programName': 'Cinema/Video/Communications'}
{'date': 'August 8, 2018', 'type': 'Program', 'programName': 'Civil Engineering Technology'}
{'date': '