# Web Scrape Demo

In [1]:
# imports

from bs4 import BeautifulSoup # For HTML parsing
import requests # Website connections
from time import sleep # To prevent overwhelming the server between connections
from collections import Counter # Keep track of our term counts
import pandas as pd # For converting results to a dataframe and bar chart plots
import json # For parsing json
%matplotlib inline

In [2]:
page_url = 'https://www.ontariocars.ca/for-sale?condition=USED&location=Markham&distance=200&first=24&after=0&sort=DISTANCE&asc=true'

In [3]:
result = requests.get(page_url)

In [4]:
# anything other than 200 is an error
result.status_code

200

In [5]:
result.headers

{'Date': 'Tue, 11 Aug 2020 18:43:36 GMT', 'Content-Type': 'text/html;charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Server': 'Jetty(9.4.z-SNAPSHOT)'}

In [6]:
type(result.content)

bytes

In [7]:
result.content

b'\n    <!doctype html>\n    <!--[if lt IE 7 ]> <html class="ie6"> <![endif]-->\n    <!--[if IE 7 ]>    <html class="ie7"> <![endif]-->\n    <!--[if IE 8 ]>    <html class="ie8"> <![endif]-->\n    <!--[if IE 9 ]>    <html class="ie9"> <![endif]-->\n    <!--[if (gt IE 9)|!(IE)]><!--> <html class=""> <!--<![endif]-->\n    <head>\n        <meta charset="utf-8">\n        <meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=0">\n        <title>Used Cars for sale in Ontario - Buy Used Vehicles | ontariocars</title>\n        <meta name="description" content="Buy your next used car with ontariocars.ca. Choose from thousands of used vehicles sold by members of the UCDA - Used Car Dealers Association of Ontario.">\n        <meta name="robots" content="noindex">\n        <link href="https://fonts.googleapis.com/css?family=Montserrat:200,300,400,600,700" rel="stylesheet">\n        <link rel="icon" type="image/png" href="/images/favicon-32x32.png" sizes="32x32" />\n    

In [8]:
soup = BeautifulSoup(result.content)

In [9]:
soup

<!DOCTYPE html>
<!--[if lt IE 7 ]> <html class="ie6"> <![endif]--><!--[if IE 7 ]>    <html class="ie7"> <![endif]--><!--[if IE 8 ]>    <html class="ie8"> <![endif]--><!--[if IE 9 ]>    <html class="ie9"> <![endif]--><!--[if (gt IE 9)|!(IE)]><!--><html class=""> <!--<![endif]-->
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1, user-scalable=0" name="viewport"/>
<title>Used Cars for sale in Ontario - Buy Used Vehicles | ontariocars</title>
<meta content="Buy your next used car with ontariocars.ca. Choose from thousands of used vehicles sold by members of the UCDA - Used Car Dealers Association of Ontario." name="description"/>
<meta content="noindex" name="robots"/>
<link href="https://fonts.googleapis.com/css?family=Montserrat:200,300,400,600,700" rel="stylesheet"/>
<link href="/images/favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>
<link href="/images/favicon-16x16.png" rel="icon" sizes="16x16" type="image/png"/>
<link href="/css/reset.c

In [10]:
def get_page(condition, location, distance):
    url = f'https://www.ontariocars.ca/for-sale?condition={condition}&location={location}&distance={distance}&first=24&after=0&sort=DISTANCE&asc=true'
    result = requests.get(url)
    soup = BeautifulSoup(result.content)
    return soup

get_page('USED', 'Toronto', '100')

<!DOCTYPE html>
<!--[if lt IE 7 ]> <html class="ie6"> <![endif]--><!--[if IE 7 ]>    <html class="ie7"> <![endif]--><!--[if IE 8 ]>    <html class="ie8"> <![endif]--><!--[if IE 9 ]>    <html class="ie9"> <![endif]--><!--[if (gt IE 9)|!(IE)]><!--><html class=""> <!--<![endif]-->
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1, user-scalable=0" name="viewport"/>
<title>Used Cars for sale in Ontario - Buy Used Vehicles | ontariocars</title>
<meta content="Buy your next used car with ontariocars.ca. Choose from thousands of used vehicles sold by members of the UCDA - Used Car Dealers Association of Ontario." name="description"/>
<meta content="noindex" name="robots"/>
<link href="https://fonts.googleapis.com/css?family=Montserrat:200,300,400,600,700" rel="stylesheet"/>
<link href="/images/favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>
<link href="/images/favicon-16x16.png" rel="icon" sizes="16x16" type="image/png"/>
<link href="/css/reset.c

In [11]:
for div in soup.find_all('div', class_= 'vehicle-item'):
    href = div.find('span', class_='vehicle-details').a['href']
    url = f'https://www.ontariocars.ca{href}'
    print(url)

https://www.ontariocars.ca/for-sale/2007-honda-fit-lx/1245bf45-a6ec-4aff-a3d6-befd80aa40c3
https://www.ontariocars.ca/for-sale/2016-mazda-cx-5-awd-4dr-auto-gs/125ff33a-b3be-4ef6-8c6c-db040e3b113e
https://www.ontariocars.ca/for-sale/2006-toyota-rav4-4dr-auto-v6-4wd-sport/10eb275d-5599-4d1f-91d1-88b94529ad25
https://www.ontariocars.ca/for-sale/2013-ford-focus-5dr-hb-se/10f351ff-9f6e-47e4-b32d-f78ad270c9e6
https://www.ontariocars.ca/for-sale/2011-ford-super-duty-f-350-srw-xl-crew-cab-long-box-gas/10f40018-416d-4604-b7c3-4a00c0565871
https://www.ontariocars.ca/for-sale/2011-dodge-grand-caravan-4dr-wgn-r-t/10fb8c3c-56f5-4d4b-b51f-110cf2edfb2c
https://www.ontariocars.ca/for-sale/2018-audi-q3-2-0-tfsi-quattro-komfort-tiptronic/108c39bf-5d4c-4030-8bac-4a8585da26c0
https://www.ontariocars.ca/for-sale/2012-mazda-mazda5-4dr-wgn-auto-gs/1136c220-75fd-4d00-8563-a2a365320d1a
https://www.ontariocars.ca/for-sale/2018-hyundai-sonata-sport/11424a82-d571-49d4-8e29-0b9b50c6f8f2
https://www.ontariocars.ca/

In [12]:
from bs4 import BeautifulSoup
import requests
from time import sleep
import random
import csv
cities=['Aurora','Barrie','Belleville','Brampton','Brantford','Brockville','Burlington','Caledon','Cambridge','Cayuga',\
        'Chatham-Kent','Cornwall','Essex','Guelph','Hamilton','Kingston','Kitchener','London','Markham','Milton',\
        'Mississauga','Newmarket','Niagara-Falls','North-York','Oakville','Oshawa','Ottawa','Pembroke','Peterborough',\
        'Richmond-Hill','Sarnia','St.-Catharines','Stratford','Sudbury','Thunder-Bay','Toronto','Vaughan','Waterloo',\
        'Whitby','Windsor']
for city in cities:
    csv_file = open(f'{city}.csv', 'w')
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['year', 'brand', 'model','location', 'price','mileage','body_type','exterior_colour',\
                    'interior_colour','door','passenger','transmission','drive_type','displacement',\
                    'cylinder','fuel'])
    print(city)
    for i in range(45):
            print(f'page:{i+1}')
            mainpage_source=requests.get(f'https://www.ontariocars.ca/for-sale?condition=USED&city={city}&first=24&after={i*24}&sort=CREATED&asc=false')
            print(f'the mainpage status:{mainpage_source.status_code}')
            soup=BeautifulSoup(mainpage_source.content, 'lxml')
            for div in soup.find_all('div', class_='vehicle-item'):
                href=div.find('span', class_='vehicle-details').a['href']
                subpage_link=f'https://www.ontariocars.ca{href}'
                subpage_source=requests.get(subpage_link)
                print((f'the subpage status:{subpage_source.status_code}'))
                subpage_soup=BeautifulSoup(subpage_source.content,'lxml')
                header=subpage_soup.find('div', class_='container-top padded').get_text()
                neck=subpage_soup.find('div', class_='details-container details-title').get_text()
                body=subpage_soup.find('div', class_='details-container features').get_text()
                #header parts
                year=header.split()[0]
                brand=header.split()[1]
                model=header.split()[2]
                location=header.split()[-3]
                #neck parts
                price=neck.split()[0]
                mileage=neck.split()[1]
                #body parts
                body_type=body.split()[4][5:]
                exterior_colour=body.split()[6][6:]
                interior_colour=body.split()[8][6:]
                door=body.split()[9][5:]
                passenger=body.split()[10][10:]
                transmission=body.split()[11][12:]
                drive_type=body.split()[13][4:]
                displacement=body.split()[14][12:]
                cylinder=body.split()[15][9:]
                fuel=body.split()[16][4:]
                csv_writer.writerow([year, brand, model,location, price, mileage,body_type,exterior_colour,\
                        interior_colour,door,passenger,transmission,drive_type,displacement,\
                        cylinder,fuel])
                waittime=random.random()*1.87775
                sleep(waittime)
    csv_file.close()

Aurora
page:1
the mainpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
page:2
the mainpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200
the subpage status:200


KeyboardInterrupt: 