In [100]:
import pandas as pd
import requests
import bs4
import re
import datetime

# Web Scraping
What is webscraping: A way to automate the extraction of data from websites.

Pros:

- Ability to access data you may not otherwise find

Cons:

- Website not designed to be used by code - originally designed with end user (human) in mind
- A scraper is written for one specific website; because every site is structured differently, it usually cannot be reused on another
- Any website update could break your code, so scraping code tends to be hacky and fragile

### Legal Gray Area

- websites will have information in the terms/conditions about what they allow to be scraped... however, they usually do not follow up on people who break these rules
- important to ask what your goals are (commercial or research)
- try to get it from another source (or API) if possible
- use a VPN

Libraries we will use: Beautifulsoup4 (bs4) and Requests

Requests - allows us to send HTTP requests in order to receive HTML docs, similar to a browser

Bs4 - allows us to parse HTML, so we can search through tags

Important to know the basics of the DOM tree

# Scrape coding bootcamp rankings from coursereport.com
Steps:
1. Send GET request to coursereport.com/best-coding-bootcamps to get the html
2. Throw html into a soup
3. Extract desired data
- rank
- name
- avg. rating
- stars
- no. of reviews
- locations (3)
- description
4. Throw into DataFrame

In [4]:
url = "https://www.coursereport.com/best-coding-bootcamps"

In [8]:
resp = requests.get(url)

In [7]:
type(requests.get(url))

requests.models.Response

In [10]:
resp.content[:10]

b'<!DOCTYPE '

## 2. Create soup

In [12]:
soup = bs4.BeautifulSoup(resp.content, "html.parser")

In [13]:
type(soup)

bs4.BeautifulSoup

In [24]:
# Walk the DOM explicitly from <body> down to the <ul id="schools"> element,
# then collect its <li> tags. Each .find() returns the first matching
# descendant, so the chain breaks with an AttributeError on None if any
# intermediate container is missing or renamed.
# The shorter soup.find("ul", id="schools") lookup used later in this
# notebook is shown to yield an equal first item.
school_list_items = (soup.body.find("div", class_="main-body")
                     .find("div", class_="longform-body container")
                     .find("div", class_="row")
                     .find("div", class_="col-md-11")
                     .find("ul", id="schools")
                     .find_all("li"))

In [25]:
type(school_list_items)

bs4.element.ResultSet

In [26]:
list(school_list_items)

[<li class="school-li" data-average="4.68" data-count="453" data-name="Flatiron School"><div class="school-header"><h3><a href="/schools/flatiron-school">1. Flatiron School</a></h3><div class="banner-container"><img alt="Established school badge" class="banner" src="https://coursereport-production-herokuapp-com.global.ssl.fastly.net/assets/established_school_badge-27e2090d700676cff023ef04712115e4.png"/><img alt="Large alumni network badge" class="banner" src="https://coursereport-production-herokuapp-com.global.ssl.fastly.net/assets/large_alumni_network_badge-ec0e8144bd182faf8b2a5cd87df36192.png"/><img alt="Transparent outcomes badge" class="banner" src="https://coursereport-production-herokuapp-com.global.ssl.fastly.net/assets/transparent_outcomes_badge-4ac3e45e179e49b810251b163d57f342.png"/></div></div><div class="school-listing__subheader"><div><p class="ratings"><span class="icon-full_star"></span><span class="icon-full_star"></span><span class="icon-full_star"></span><span class="

In [28]:
school_list_items_2 = soup.find("ul", id="schools").find_all("li")

In [29]:
school_list_items[0] == school_list_items_2[0]

True

## 3. Extracting the data

In [30]:
sample = school_list_items[0]

In [31]:
sample

<li class="school-li" data-average="4.68" data-count="453" data-name="Flatiron School"><div class="school-header"><h3><a href="/schools/flatiron-school">1. Flatiron School</a></h3><div class="banner-container"><img alt="Established school badge" class="banner" src="https://coursereport-production-herokuapp-com.global.ssl.fastly.net/assets/established_school_badge-27e2090d700676cff023ef04712115e4.png"/><img alt="Large alumni network badge" class="banner" src="https://coursereport-production-herokuapp-com.global.ssl.fastly.net/assets/large_alumni_network_badge-ec0e8144bd182faf8b2a5cd87df36192.png"/><img alt="Transparent outcomes badge" class="banner" src="https://coursereport-production-herokuapp-com.global.ssl.fastly.net/assets/transparent_outcomes_badge-4ac3e45e179e49b810251b163d57f342.png"/></div></div><div class="school-listing__subheader"><div><p class="ratings"><span class="icon-full_star"></span><span class="icon-full_star"></span><span class="icon-full_star"></span><span class="i

*Get rank and name*

In [32]:
sample.find("div").find("h3").find("a")

<a href="/schools/flatiron-school">1. Flatiron School</a>

In [33]:
sample.div.h3.a

<a href="/schools/flatiron-school">1. Flatiron School</a>

In [34]:
sample.div.h3.a.contents

['1. Flatiron School']

In [35]:
sample.div.h3.a.text

'1. Flatiron School'

*Get the Average Rating*

In [37]:
sample.find("p", class_="rating-number").text

'Avg Rating: 4.68 (453 reviews)'

*Get Stars*

In [38]:
sample.find("p", class_="ratings").find_all("span")

[<span class="icon-full_star"></span>,
 <span class="icon-full_star"></span>,
 <span class="icon-full_star"></span>,
 <span class="icon-full_star"></span>,
 <span class="icon-half_star"></span>]

*Get number of reviews*

In [39]:
sample.find("p", class_="rating-number").text

'Avg Rating: 4.68 (453 reviews)'

*Get locations*

In [45]:
sample.find("span", class_="location").find_all("a")[0].text

'Houston'

In [46]:
sample.find("span", class_="location").find_all("a")

[<a href="/cities/houston">Houston</a>,
 <a href="/cities/austin-coding-bootcamps">Austin</a>,
 <a href="/cities/washington-coding-bootcamps">Washington</a>]

In [47]:
[x.text for x in sample.find("span", class_="location").find_all("a")]

['Houston', 'Austin', 'Washington']

*Get Description*

In [50]:
sample.find("p", class_="description").find("p").text

'Flatiron School offers immersive on-campus and online programs in software engineering, data science, and cybersecurity in NYC, Brooklyn, Washington DC, Houston, Austin, Seattle, Chicago, Denver, and Online. Flatiron School’s immersive courses aim to launch students into fulfilling careers as software engineers, and data scientists through rigorous, market-aligned curricula, and the support of seasoned instructors and personal career coaches. Through test-driven labs and portfolio projects, Flatiron School teaches students to think and build like software engineers and data scientists.'

In [52]:
sample.find("p", class_="description").find_all("p")[0].text

'Flatiron School offers immersive on-campus and online programs in software engineering, data science, and cybersecurity in NYC, Brooklyn, Washington DC, Houston, Austin, Seattle, Chicago, Denver, and Online. Flatiron School’s immersive courses aim to launch students into fulfilling careers as software engineers, and data scientists through rigorous, market-aligned curricula, and the support of seasoned instructors and personal career coaches. Through test-driven labs and portfolio projects, Flatiron School teaches students to think and build like software engineers and data scientists.'

## 3.1 Create functions to extract data

In [53]:
sample.div.h3.a.text

'1. Flatiron School'

In [60]:
# create function to get the ranking
def get_ranking(list_item):
    """Return the numeric rank parsed from a school list item's heading.

    The heading link text (``list_item.div.h3.a.text``) looks like
    ``'1. Flatiron School'``; the leading run of digits is the rank.

    Parameters
    ----------
    list_item
        Object exposing ``.div.h3.a.text`` (a bs4 ``<li>`` tag in this
        notebook).

    Returns
    -------
    int
        The rank at the start of the heading text.

    Raises
    ------
    IndexError
        If the heading text does not start with a digit.
    """
    # \d+ rather than \d{1,2}: a two-digit cap would silently turn
    # rank 100 into 10.
    ranking_pattern = r"^(\d+)"
    school = list_item.div.h3.a.text
    return int(re.findall(ranking_pattern, school)[0])

In [61]:
get_ranking(sample)

1

In [64]:
# create function to get the name
def get_name(list_item):
    """Return the school name parsed from a school list item's heading.

    The heading link text (``list_item.div.h3.a.text``) looks like
    ``'1. Flatiron School'``; everything after the ``'<rank>. '`` prefix
    is the name.

    Parameters
    ----------
    list_item
        Object exposing ``.div.h3.a.text`` (a bs4 ``<li>`` tag in this
        notebook).

    Returns
    -------
    str
        The heading text with the leading rank, dot and whitespace removed.

    Raises
    ------
    IndexError
        If the heading text is not of the form ``'<digits>. <name>'``.
    """
    # \d+ rather than \d{1,2}: with a two-digit cap, a heading such as
    # '100. Some School' would not match at all.
    name_pattern = r"^\d+\.\s(.+)"
    school = list_item.div.h3.a.text
    return re.findall(name_pattern, school)[0]

In [65]:
get_name(sample)

'Flatiron School'

In [66]:
sample.find("p", class_="rating-number").text

'Avg Rating: 4.68 (453 reviews)'

In [69]:
# create function to get average rating
def get_rating(list_item):
    """Return the average rating parsed from the item's rating paragraph.

    Reads the text of the first ``<p class="rating-number">`` under
    ``list_item`` (e.g. ``'Avg Rating: 4.68 (453 reviews)'``) and returns
    the number that follows ``'Avg Rating: '``.

    Parameters
    ----------
    list_item
        Object whose ``find("p", class_="rating-number")`` returns
        something with a ``.text`` string (a bs4 ``<li>`` tag in this
        notebook).

    Returns
    -------
    float

    Raises
    ------
    IndexError
        If the text does not start with ``'Avg Rating: <number>'``.
    """
    # NOTE(review): the DataFrame outputs in this notebook show 4.68 for
    # every school. Worth confirming this paragraph is really per-school;
    # the <li> also carries a data-average attribute that could be
    # compared against.
    # Accept integer ratings ('5') and any number of decimals, not only
    # the 'd.dd' shape.
    rating_pattern = r"^Avg Rating:\s(\d+(?:\.\d+)?)"
    text = list_item.find("p", class_="rating-number").text
    return float(re.findall(rating_pattern, text)[0])

In [70]:
get_rating(sample)

4.68

In [117]:
# create function to get number of reviews
def get_reviews(list_item):
    """Return the review count parsed from the item's rating paragraph.

    Reads the text of the first ``<p class="rating-number">`` under
    ``list_item`` (e.g. ``'Avg Rating: 4.68 (453 reviews)'``) and returns
    the number in front of the trailing ``'reviews)'``.

    Parameters
    ----------
    list_item
        Object whose ``find("p", class_="rating-number")`` returns
        something with a ``.text`` string (a bs4 ``<li>`` tag in this
        notebook).

    Returns
    -------
    int

    Raises
    ------
    IndexError
        If the text does not end with ``'<number> review(s))'``.
    """
    # NOTE(review): the DataFrame outputs in this notebook show 453 for
    # every school. Worth confirming this paragraph is really per-school;
    # the <li> also carries a data-count attribute that could be compared
    # against.
    # [\d,]* keeps thousands separators inside the capture (a bare \d+
    # would return 234 for '1,234 reviews'); 's?' accepts a singular
    # '1 review'.
    review_pattern = r"(\d[\d,]*)\sreviews?\)\Z"
    text = list_item.find("p", class_="rating-number").text
    return int(re.findall(review_pattern, text)[0].replace(",", ""))

In [76]:
get_reviews(sample)

453

In [77]:
#accessing number of stars
sample.find("p", class_="ratings").find_all("span")

[<span class="icon-full_star"></span>,
 <span class="icon-full_star"></span>,
 <span class="icon-full_star"></span>,
 <span class="icon-full_star"></span>,
 <span class="icon-half_star"></span>]

In [80]:
sample.find("p", class_="ratings").find_all("span")[0]

<span class="icon-full_star"></span>

In [81]:
sample.find("p", class_="ratings").find_all("span")[0]["class"]

['icon-full_star']

In [82]:
sample.find("p", class_="ratings").find_all("span")[0]["class"][0]

'icon-full_star'

In [83]:
# Scratch lookup from a star icon's CSS class to its numeric value.
# get_stars() below builds its own local dict of the same name (with an
# additional "icon-empty_star" entry), so this global is only used by the
# print cell that follows.
stars = {"icon-full_star":1,
         "icon-half_star":0.5}

In [84]:
print(stars["icon-full_star"])
print(stars["icon-half_star"])

1
0.5


In [87]:
# create function to add up the star icons into a single number
def get_stars(list_item):
    """Return the star score shown for a school list item.

    Looks up every ``<span>`` inside the item's ``<p class="ratings">``
    and adds 1 for a full star, 0.5 for a half star and 0 for an empty
    star, keyed on the first CSS class of each span.

    Raises ``KeyError`` if a span's first class is not one of the three
    known icon classes.
    """
    icon_values = {"icon-full_star": 1,
                   "icon-half_star": 0.5,
                   "icon-empty_star": 0}
    ratings_paragraph = list_item.find("p", class_="ratings")
    total = 0
    for span in ratings_paragraph.find_all("span"):
        first_class = span["class"][0]
        total += icon_values[first_class]
    return total

In [88]:
get_stars(sample)

4.5

In [89]:
[x.text for x in sample.find("span", class_="location").find_all("a")]

['Houston', 'Austin', 'Washington']

In [90]:
# defining function for locations
def get_location(list_item):
    """Return the link texts inside the item's location span, as a list.

    Finds the first ``<span class="location">`` under ``list_item`` and
    collects the ``.text`` of each ``<a>`` it contains, in document order.
    """
    location_span = list_item.find("span", class_="location")
    names = []
    for anchor in location_span.find_all("a"):
        names.append(anchor.text)
    return names

In [94]:
"|".join(get_location(sample))

'Houston|Austin|Washington'

In [95]:
# Locations are joined with a pipe so the value stays a single field if the
# table is saved as CSV; it can always be split again later.
def get_location(list_item):
    """Return the item's location names as one ``'|'``-separated string.

    Finds the first ``<span class="location">`` under ``list_item`` and
    joins the ``.text`` of each ``<a>`` it contains, in document order.
    """
    location_span = list_item.find("span", class_="location")
    anchors = location_span.find_all("a")
    return "|".join(anchor.text for anchor in anchors)

In [96]:
get_location(sample)

'Houston|Austin|Washington'

In [97]:
sample.find("p", class_="description").find("p").text

'Flatiron School offers immersive on-campus and online programs in software engineering, data science, and cybersecurity in NYC, Brooklyn, Washington DC, Houston, Austin, Seattle, Chicago, Denver, and Online. Flatiron School’s immersive courses aim to launch students into fulfilling careers as software engineers, and data scientists through rigorous, market-aligned curricula, and the support of seasoned instructors and personal career coaches. Through test-driven labs and portfolio projects, Flatiron School teaches students to think and build like software engineers and data scientists.'

In [98]:
# create function to get the description
def get_description(list_item):
    """Return the description text of a school list item.

    The text lives in a ``<p>`` nested inside the item's
    ``<p class="description">``; the first such inner paragraph is used.
    """
    outer_paragraph = list_item.find("p", class_="description")
    inner_paragraph = outer_paragraph.find("p")
    return inner_paragraph.text

In [99]:
get_description(sample)

'Flatiron School offers immersive on-campus and online programs in software engineering, data science, and cybersecurity in NYC, Brooklyn, Washington DC, Houston, Austin, Seattle, Chicago, Denver, and Online. Flatiron School’s immersive courses aim to launch students into fulfilling careers as software engineers, and data scientists through rigorous, market-aligned curricula, and the support of seasoned instructors and personal career coaches. Through test-driven labs and portfolio projects, Flatiron School teaches students to think and build like software engineers and data scientists.'

## 3.2 Create a function that processes an entire list item

In [102]:
#make sure we can add the date of when we scraped
datetime.datetime.today().strftime("%Y-%m-%d")

'2020-08-24'

In [103]:
sample

<li class="school-li" data-average="4.68" data-count="453" data-name="Flatiron School"><div class="school-header"><h3><a href="/schools/flatiron-school">1. Flatiron School</a></h3><div class="banner-container"><img alt="Established school badge" class="banner" src="https://coursereport-production-herokuapp-com.global.ssl.fastly.net/assets/established_school_badge-27e2090d700676cff023ef04712115e4.png"/><img alt="Large alumni network badge" class="banner" src="https://coursereport-production-herokuapp-com.global.ssl.fastly.net/assets/large_alumni_network_badge-ec0e8144bd182faf8b2a5cd87df36192.png"/><img alt="Transparent outcomes badge" class="banner" src="https://coursereport-production-herokuapp-com.global.ssl.fastly.net/assets/transparent_outcomes_badge-4ac3e45e179e49b810251b163d57f342.png"/></div></div><div class="school-listing__subheader"><div><p class="ratings"><span class="icon-full_star"></span><span class="icon-full_star"></span><span class="icon-full_star"></span><span class="i

In [104]:
def create_row(list_item):
    """Build one record (a dict) describing a single school list item.

    The record is stamped with today's date (``YYYY-MM-DD``) under
    ``date_id`` and then filled by the per-field extractors defined in
    this notebook. Keys are inserted in the order the columns should
    appear when the records are turned into a DataFrame.
    """
    row = {"date_id": datetime.datetime.today().strftime("%Y-%m-%d")}
    row["rank"] = get_ranking(list_item)
    row["name"] = get_name(list_item)
    row["rating"] = get_rating(list_item)
    row["stars"] = get_stars(list_item)
    row["reviews"] = get_reviews(list_item)
    row["locations"] = get_location(list_item)
    row["description"] = get_description(list_item)
    return row

In [105]:
create_row(sample)

{'date_id': '2020-08-24',
 'rank': 1,
 'name': 'Flatiron School',
 'rating': 4.68,
 'stars': 4.5,
 'reviews': 453,
 'locations': 'Houston|Austin|Washington',
 'description': 'Flatiron School offers immersive on-campus and online programs in software engineering, data science, and cybersecurity in NYC, Brooklyn, Washington DC, Houston, Austin, Seattle, Chicago, Denver, and Online. Flatiron School’s immersive courses aim to launch students into fulfilling careers as software engineers, and data scientists through rigorous, market-aligned curricula, and the support of seasoned instructors and personal career coaches. Through test-driven labs and portfolio projects, Flatiron School teaches students to think and build like software engineers and data scientists.'}

In [107]:
[create_row(school) for school in school_list_items]

[{'date_id': '2020-08-24',
  'rank': 1,
  'name': 'Flatiron School',
  'rating': 4.68,
  'stars': 4.5,
  'reviews': 453,
  'locations': 'Houston|Austin|Washington',
  'description': 'Flatiron School offers immersive on-campus and online programs in software engineering, data science, and cybersecurity in NYC, Brooklyn, Washington DC, Houston, Austin, Seattle, Chicago, Denver, and Online. Flatiron School’s immersive courses aim to launch students into fulfilling careers as software engineers, and data scientists through rigorous, market-aligned curricula, and the support of seasoned instructors and personal career coaches. Through test-driven labs and portfolio projects, Flatiron School teaches students to think and build like software engineers and data scientists.'},
 {'date_id': '2020-08-24',
  'rank': 2,
  'name': 'Hack Reactor',
  'rating': 4.68,
  'stars': 4.5,
  'reviews': 453,
  'locations': 'Austin|Boulder|San Francisco',
  'description': 'Founded in 2012, Hack Reactor is a 12-

In [110]:
pd.DataFrame([create_row(school) for school in school_list_items])

Unnamed: 0,date_id,rank,name,rating,stars,reviews,locations,description
0,2020-08-24,1,Flatiron School,4.68,4.5,453,Houston|Austin|Washington,Flatiron School offers immersive on-campus and...
1,2020-08-24,2,Hack Reactor,4.68,4.5,453,Austin|Boulder|San Francisco,"Founded in 2012, Hack Reactor is a 12-week imm..."
2,2020-08-24,3,Codesmith,4.68,5.0,453,New York City|Los Angeles|Online,"Codesmith offers a full-time, 12-week full sta..."
3,2020-08-24,4,App Academy,4.68,4.5,453,San Francisco|New York City|Online,App Academy offers immersive web development c...
4,2020-08-24,5,Turing,4.68,4.5,453,Denver,Turing School of Software & Design is a 7-mont...
5,2020-08-24,6,Fullstack Academy,4.68,5.0,453,Jacksonville|San Luis Obispo|Baton Rouge,Fullstack Academy offers full-time and part-ti...
6,2020-08-24,7,General Assembly,4.68,4.0,453,Sydney|Singapore|Melbourne,General Assembly offers short and long courses...
7,2020-08-24,8,Tech Elevator,4.68,5.0,453,Cincinnati|Pittsburgh|Detroit,Tech Elevator is an immersive 14-week coding b...
8,2020-08-24,9,DigitalCrafts,4.68,5.0,453,Houston|Tampa|Atlanta,DigitalCrafts is an online and in-person softw...
9,2020-08-24,10,Software Guild,4.68,4.5,453,Louisville|Minneapolis|Atlanta,"The Software Guild offers immersive full-time,..."


In [111]:
# create a function that takes one bootcamp category, fetches its
# "best-<category>-bootcamps" page, creates a soup, and returns all
# ranking information in DataFrame form
bootcamps = ["coding", "data-science", "online"]
def get_rankings(bootcamp, timeout=10):
    """Scrape one Course Report ranking page into a DataFrame.

    Parameters
    ----------
    bootcamp : str
        Category slug inserted into the URL
        ``https://www.coursereport.com/best-{bootcamp}-bootcamps``
        (see ``bootcamps`` for the values used in this notebook).
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a
        timeout, ``requests.get`` can block indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per ``<li>`` under ``<ul id="schools">``, built by
        ``create_row``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    AttributeError
        If the page has no ``<ul id="schools">`` element.
    """
    url = f"https://www.coursereport.com/best-{bootcamp}-bootcamps"
    resp = requests.get(url, timeout=timeout)
    # Fail here with a clear HTTP error instead of trying to parse an
    # error page further down.
    resp.raise_for_status()
    soup = bs4.BeautifulSoup(resp.content, "html.parser")
    school_list_items = soup.find("ul", id="schools").find_all("li")
    df = pd.DataFrame([create_row(school) for school in school_list_items])
    return df

In [112]:
get_rankings("coding")

Unnamed: 0,date_id,rank,name,rating,stars,reviews,locations,description
0,2020-08-24,1,Flatiron School,4.68,4.5,453,Houston|Austin|Washington,Flatiron School offers immersive on-campus and...
1,2020-08-24,2,Hack Reactor,4.68,4.5,453,Austin|Boulder|San Francisco,"Founded in 2012, Hack Reactor is a 12-week imm..."
2,2020-08-24,3,Codesmith,4.68,5.0,453,New York City|Los Angeles|Online,"Codesmith offers a full-time, 12-week full sta..."
3,2020-08-24,4,App Academy,4.68,4.5,453,San Francisco|New York City|Online,App Academy offers immersive web development c...
4,2020-08-24,5,Turing,4.68,4.5,453,Denver,Turing School of Software & Design is a 7-mont...
5,2020-08-24,6,Fullstack Academy,4.68,5.0,453,Jacksonville|San Luis Obispo|Baton Rouge,Fullstack Academy offers full-time and part-ti...
6,2020-08-24,7,General Assembly,4.68,4.0,453,Sydney|Singapore|Melbourne,General Assembly offers short and long courses...
7,2020-08-24,8,Tech Elevator,4.68,5.0,453,Cincinnati|Pittsburgh|Detroit,Tech Elevator is an immersive 14-week coding b...
8,2020-08-24,9,DigitalCrafts,4.68,5.0,453,Houston|Tampa|Atlanta,DigitalCrafts is an online and in-person softw...
9,2020-08-24,10,Software Guild,4.68,4.5,453,Louisville|Minneapolis|Atlanta,"The Software Guild offers immersive full-time,..."


In [113]:
# One get_rankings() call (and therefore one HTTP request) per entry in
# `bootcamps`. The unpacking is positional, so the three target names must
# stay in the same order as that list: "coding", "data-science", "online".
coding, data_science, online = [get_rankings(bootcamp) for bootcamp in bootcamps]

In [114]:
coding.head()

Unnamed: 0,date_id,rank,name,rating,stars,reviews,locations,description
0,2020-08-24,1,Flatiron School,4.68,4.5,453,Houston|Austin|Washington,Flatiron School offers immersive on-campus and...
1,2020-08-24,2,Hack Reactor,4.68,4.5,453,Austin|Boulder|San Francisco,"Founded in 2012, Hack Reactor is a 12-week imm..."
2,2020-08-24,3,Codesmith,4.68,5.0,453,New York City|Los Angeles|Online,"Codesmith offers a full-time, 12-week full sta..."
3,2020-08-24,4,App Academy,4.68,4.5,453,San Francisco|New York City|Online,App Academy offers immersive web development c...
4,2020-08-24,5,Turing,4.68,4.5,453,Denver,Turing School of Software & Design is a 7-mont...


In [115]:
data_science.head()

Unnamed: 0,date_id,rank,name,rating,stars,reviews,locations,description
0,2020-08-24,1,BrainStation,4.68,4.5,453,Vancouver|Toronto|New York City,BrainStation offers full-time and part-time co...
1,2020-08-24,2,Coding Temple,4.68,5.0,453,Chicago|Dallas|Washington,"Coding Temple offers 10-week, full-stack codin..."
2,2020-08-24,3,Colaberry,4.68,4.5,453,Dallas|Online,"Colaberry offers instructor-led remote, on-cam..."
3,2020-08-24,4,Data Science Retreat,4.68,4.5,453,Berlin,"Data Science Retreat is a 3-month, full-time, ..."
4,2020-08-24,5,Divergence Academy,4.68,5.0,453,Dallas,Divergence Academy is a 12-week full-time data...


In [116]:
online.head()

Unnamed: 0,date_id,rank,name,rating,stars,reviews,locations,description
0,2020-08-24,1,AcadGild,4.68,4.0,453,Bangalore|Online,AcadGild is an online coding bootcamp offering...
1,2020-08-24,2,Actualize,4.68,5.0,453,Chicago|Online,Actualize is a 12-week software development bo...
2,2020-08-24,3,Altcademy,4.68,5.0,453,Hong Kong|Online,Altcademy (formerly Hack Pacific) is an educat...
3,2020-08-24,4,App Academy,4.68,4.5,453,San Francisco|New York City|Online,App Academy offers immersive web development c...
4,2020-08-24,5,Barcelona Code School,4.68,4.5,453,Barcelona|Online,Barcelona Code School offers a 9-week full-tim...
