In [93]:
import pandas as pd
import numpy as np
import requests
import bs4
import re
import datetime

In [2]:
url = "https://www.coursereport.com/best-coding-bootcamps"
response = requests.get(url)

In [3]:
html_bytes = response.content

In [4]:
soup = bs4.BeautifulSoup(html_bytes, "html.parser")

In [5]:
type(soup)

bs4.BeautifulSoup

In [6]:
soup.body

<body data-spy="scroll" data-target="#toc" style="position:relative;"><header><nav class="navbar navbar-default" role="navigation"><div class="container" id="nav-container" itemscope="" itemtype="http://schema.org/Organization"><div class="navbar-header"><button class="navbar-toggle collapsed" data-target="#navbar-collapse" data-toggle="collapse" type="button"><span class="sr-only">Toggle navigation</span><span class="icon-bar"></span><span class="icon-bar"></span><span class="icon-bar"></span></button><a class="navbar-brand logo-small" href="https://www.coursereport.com" itemprop="url"><img alt="Course Report" itemprop="logo" src="https://coursereport-production-herokuapp-com.global.ssl.fastly.net/assets/logo-small-756b1c93378e1fdcc7f90674634f7675.png" title="Course Report"/></a></div><div class="collapse navbar-collapse" id="navbar-collapse"><ul class="nav navbar-nav"><li class="top-level-nav" id="schools"><a class="mp-gb-header" href="/schools">Browse Schools</a><ul class="tracks"><

We want to get the following columns from coursereport.com:

- rank
- name of school
- rating
- stars
- no of reviews
- cities



In [7]:
a = (soup
 .body
 .find("div", class_="main-body")
 .find("div", class_="longform-body container")
 .find("div", class_="row")
 .find("div", class_="col-md-11")
 .find("ul", id="schools")
 .find_all("li")
)

Let's see if we could have skipped all the previous entries and go straight to the ul tag with id "schools"

In [8]:
b = (soup
 .find("ul", id="schools")
 .find_all("li")
)

In [9]:
a == b

True

In [10]:
school_list_items = (soup
                     .find("ul", id="schools")
                     .find_all("li"))

Try to get the required info from the first element

In [11]:
flatiron = school_list_items[0]

Get the rank and name

In [12]:
rank_name = flatiron.h3.a.contents[0]
rank_name

'1. Flatiron School'

In [13]:
# Extract rank and name of school individually using regex
rank_pattern = r"^\d{1,2}"
rank = int(re.findall(rank_pattern, rank_name)[0])
print(rank)

#name
name_pattern = r"^\d{1,2}\.\s(.+)$"
name = re.findall(name_pattern, rank_name)[0]
print(name)

1
Flatiron School


Get the rating

In [14]:
rating_string = flatiron.find("span", class_="longform-rating-text").contents[0]
rating_string

'Overall Rating: (4.75) '

In [15]:
# Extract the rating
rating_pattern = r"\((.+)\)"
rating = float(re.findall(rating_pattern, rating_string)[0])
rating

4.75

Get the stars

In [16]:
a_star = (flatiron
          .find("div", class_="ratings title-rating")
          .find_all("span"))[1:][0]["class"]
a_star

['icon-full_star']

In [17]:
# Extract star count

# 1. get a list of all stars
stars_list = [tag["class"][0] for tag in (flatiron
                                          .find("div", class_="ratings title-rating")
                                          .find_all("span"))[1:]]

# 2. Translate text into numerical values
# 2.1. Create a dict with translations
star_dict = {"icon-full_star": 1,
             "icon-half_star": .5,
             "icon-empty_star": 0}

# 2.2. translate stars to floats and sum list
stars = float(sum([star_dict[s] for s in stars_list]))
stars

4.5

Get no of reviews

In [18]:
reviews_string = (flatiron
                  .find_all("div", class_="ratings title-rating")[1]
                  .a
                  .contents[0])
reviews_string

'409 Reviews'

In [19]:
# Extract no of reviews
reviews_pattern = r"^\d+"
reviews = int(re.findall(reviews_pattern, reviews_string)[0])
reviews

409

Get cities

In [20]:
(flatiron
 .find("span", class_="location")
 .find_all("a"))[0].contents[0]

'Online'

How do we want to represent multiple citites.
Let's go for a string with a unique delimiter, such as "|"

In [21]:
ex = "Online|New York City|Seattle"

In [22]:
ex.split("|")

['Online', 'New York City', 'Seattle']

In [23]:
# cities_list = (flatiron
#                .find("span", class_="location")
#                .find_all("a"))
# [tag.contents[0] for tag in cities_list]

cities_list = [tag.contents[0] for tag in (flatiron
                                           .find("span", class_="location")
                                           .find_all("a"))]
cities = "|".join(cities_list)
cities

'Online|New York City|Washington|Houston|Seattle|Denver|Chicago|Atlanta|London|Austin|San Francisco|Sydney'

Last step, put all values in a dict

In [24]:
rank_item = {"rank": rank,
             "name": name,
             "rating": rating,
             "stars": stars,
             "reviews": reviews,
             "cities": cities}

rank_item

{'rank': 1,
 'name': 'Flatiron School',
 'rating': 4.75,
 'stars': 4.5,
 'reviews': 409,
 'cities': 'Online|New York City|Washington|Houston|Seattle|Denver|Chicago|Atlanta|London|Austin|San Francisco|Sydney'}

## Create some functions so we can iterate over all schools

We create function that extract the required information for each list item (li tag)


In [32]:
# Use 2nd ranked school for quick testing
hack_reactor = school_list_items[1]

In [25]:
school_list_items

[<li><div class="info-container"><a href="/schools/flatiron-school"><div class="school-image"><img alt="flatiron-school-logo" src="https://course_report_production.s3.amazonaws.com/rich/rich_files/rich_files/999/s100/flatironschool.png" title="Flatiron School Logo"/></div></a><h3><a href="/schools/flatiron-school">1. Flatiron School</a></h3><span class="banner-container"><img alt="Established school badge" class="banner" src="https://coursereport-production-herokuapp-com.global.ssl.fastly.net/assets/established_school_badge-ac91222df11d01846007fe26e683252b.png"/><img alt="Large alumni network badge" class="banner" src="https://coursereport-production-herokuapp-com.global.ssl.fastly.net/assets/large_alumni_network_badge-49b12ac08920bf6c28f1f515886e3682.png"/><img alt="Transparent outcomes badge" class="banner" src="https://coursereport-production-herokuapp-com.global.ssl.fastly.net/assets/transparent_outcomes_badge-6d4374d6ca112c0b1a054a8ffcdcbb57.png"/><div class="ratings title-rating"

Get rank and name

In [28]:
# rank_name = flatiron.h3.a.contents[0]
# rank_name

# # Extract rank and name of school individually using regex
# rank_pattern = r"^\d{1,2}"
# rank = int(re.findall(rank_pattern, rank_name)[0])
# print(rank)

# #name
# name_pattern = r"^\d{1,2}\.\s(.+)$"
# name = re.findall(name_pattern, rank_name)[0]
# print(name)

# stitch previously used approach together into functions
def get_rank(list_item):
    rank_name = list_item.h3.a.contents[0]
    rank_pattern = r"^\d{1,2}"
    return int(re.findall(rank_pattern, rank_name)[0])

def get_name(list_item):
    rank_name = list_item.h3.a.contents[0]
    name_pattern = r"^\d{1,2}\.\s(.+)$"
    return re.findall(name_pattern, rank_name)[0]


In [30]:
print(get_rank(school_list_items[1]))
print(get_name(school_list_items[1]))

2
Hack Reactor


Get rating

In [33]:
# rating_string = flatiron.find("span", class_="longform-rating-text").contents[0]
# rating_string

# # Extract the rating
# rating_pattern = r"\((.+)\)"
# rating = float(re.findall(rating_pattern, rating_string)[0])
# rating


def get_rating(list_item):
    rating_string = list_item.find("span", class_="longform-rating-text").contents[0]
    rating_pattern = r"\((.+)\)"
    return float(re.findall(rating_pattern, rating_string)[0])
    

In [34]:
get_rating(hack_reactor)

4.68

Get stars

In [35]:
# # Extract star count

# # 1. get a list of all stars
# stars_list = [tag["class"][0] for tag in (flatiron
#                                           .find("div", class_="ratings title-rating")
#                                           .find_all("span"))[1:]]

# # 2. Translate text into numerical values
# # 2.1. Create a dict with translations
# star_dict = {"icon-full_star": 1,
#              "icon-half_star": .5,
#              "icon-empty_star": 0}

# # 2.2. translate stars to floats and sum list
# stars = float(sum([star_dict[s] for s in stars_list]))
# stars

def get_stars(list_item):
    stars_list = [tag["class"][0] for tag in (list_item
                                              .find("div", class_="ratings title-rating")
                                              .find_all("span"))[1:]]
    
    star_dict = {"icon-full_star": 1,
                 "icon-half_star": .5,
                 "icon-empty_star": 0}
    
    return float(sum([star_dict[s] for s in stars_list]))
    

In [36]:
get_stars(hack_reactor)

4.5

Get reviews

In [37]:
# reviews_string = (flatiron
#                   .find_all("div", class_="ratings title-rating")[1]
#                   .a
#                   .contents[0])
# reviews_string

# # Extract no of reviews
# reviews_pattern = r"^\d+"
# reviews = int(re.findall(reviews_pattern, reviews_string)[0])
# reviews

def get_reviews(list_item):
    reviews_string = (list_item
                      .find_all("div", class_="ratings title-rating")[1]
                      .a
                      .contents[0])
    
    reviews_pattern = r"^\d+"
    return int(re.findall(reviews_pattern, reviews_string)[0])
    

In [38]:
get_reviews(hack_reactor)

250

Get cities

In [39]:
# cities_list = (flatiron
#                .find("span", class_="location")
#                .find_all("a"))
# [tag.contents[0] for tag in cities_list]

# cities_list = [tag.contents[0] for tag in (flatiron
#                                            .find("span", class_="location")
#                                            .find_all("a"))]
# cities = "|".join(cities_list)
# cities

def get_cities(list_item):
    cities_list = [tag.contents[0] for tag in (list_item
                                               .find("span", class_="location")
                                               .find_all("a"))]
    return "|".join(cities_list)

In [40]:
get_cities(hack_reactor)

'Boulder|Los Angeles|Phoenix|Online|New York City|Seattle|Denver|Austin|San Francisco'

Put them all together into a function that creates a dict

In [50]:
str(datetime.date.today())

'2019-10-28'

In [52]:
def get_record(list_item):
    return {"date_id": str(datetime.date.today()),
            "rank": get_rank(list_item),
            "name": get_name(list_item),
            "rating": get_rating(list_item),
            "stars": get_stars(list_item),
            "reviews": get_reviews(list_item),
            "cities": get_cities(list_item)}

In [53]:
get_record(hack_reactor)

{'date_id': '2019-10-28',
 'rank': 2,
 'name': 'Hack Reactor',
 'rating': 4.68,
 'stars': 4.5,
 'reviews': 250,
 'cities': 'Boulder|Los Angeles|Phoenix|Online|New York City|Seattle|Denver|Austin|San Francisco'}

In [58]:
df = pd.DataFrame([get_record(list_item) for list_item in school_list_items])

In [59]:
df

Unnamed: 0,date_id,rank,name,rating,stars,reviews,cities
0,2019-10-28,1,Flatiron School,4.75,4.5,409,Online|New York City|Washington|Houston|Seattl...
1,2019-10-28,2,Hack Reactor,4.68,4.5,250,Boulder|Los Angeles|Phoenix|Online|New York Ci...
2,2019-10-28,3,Codesmith,4.88,5.0,306,Los Angeles|Online|New York City
3,2019-10-28,4,App Academy,4.7,4.5,664,Online|New York City|San Francisco
4,2019-10-28,5,Turing,4.78,4.5,153,Denver
5,2019-10-28,6,Fullstack Academy,4.9,5.0,245,Online|New York City|Chicago
6,2019-10-28,7,General Assembly,4.32,4.5,420,San Diego|Stamford|Orlando|Boston|Los Angeles|...
7,2019-10-28,8,Tech Elevator,4.95,5.0,166,Cleveland|Cincinnati|Pittsburgh|Detroit|Philad...
8,2019-10-28,9,DigitalCrafts,4.88,5.0,190,Tampa|Houston|Atlanta
9,2019-10-28,10,Software Guild,4.68,4.5,141,Minneapolis|Louisville|Online|Atlanta


In [55]:
url2 = "https://www.coursereport.com/best-data-science-bootcamps"
response2 = requests.get(url2)
html_bytes2 = response2.content
soup2 = bs4.BeautifulSoup(html_bytes2, "html.parser")

school_list_items2 = (soup2
                      .find("ul", id="schools")
                      .find_all("li"))

pd.DataFrame([get_record(list_item) for list_item in school_list_items2])

Unnamed: 0,date_id,rank,name,rating,stars,reviews,cities
0,2019-10-28,1,BrainStation,4.64,4.5,260,Vancouver|Toronto|New York City|Online|Boston|...
1,2019-10-28,2,Coding Temple,4.89,5.0,93,Dallas|Online|Washington|Boston|Chicago
2,2019-10-28,3,Divergence Academy,5.0,5.0,19,Dallas
3,2019-10-28,4,Flatiron School,4.75,4.5,409,London|Sydney|Austin|Atlanta|Houston|Seattle|N...
4,2019-10-28,5,Galvanize,4.49,4.5,176,Boulder|Austin|Seattle|New York City|Los Angel...
5,2019-10-28,6,General Assembly,4.32,4.5,420,Sydney|Austin|Atlanta|Singapore|Melbourne|Dall...
6,2019-10-28,7,Metis,4.89,5.0,89,Seattle|New York City|Online|Chicago|San Franc...
7,2019-10-28,8,NYC Data Science Academy,4.83,5.0,270,New York City|Online
8,2019-10-28,9,Principal Analytics Prep,4.87,5.0,15,New York City
9,2019-10-28,10,Propulsion Academy,5.0,5.0,30,Zurich


In [57]:
url3 = "https://www.coursereport.com/best-online-bootcamps"
response3 = requests.get(url3)
html_bytes3 = response3.content
soup3 = bs4.BeautifulSoup(html_bytes3, "html.parser")

school_list_items3 = (soup3
                      .find("ul", id="schools")
                      .find_all("li"))

pd.DataFrame([get_record(list_item) for list_item in school_list_items3])

Unnamed: 0,date_id,rank,name,rating,stars,reviews,cities
0,2019-10-28,1,AcadGild,4.3,4.0,172,Bangalore|Online
1,2019-10-28,2,Actualize,4.88,5.0,206,Online|Chicago
2,2019-10-28,3,Altcademy,5.0,5.0,31,Hong Kong|Online
3,2019-10-28,4,Bloc,4.68,4.5,440,Online
4,2019-10-28,5,Bottega,4.56,4.5,52,Scottsdale|Salt Lake City|Online
5,2019-10-28,6,CareerFoundry,4.53,4.5,316,Berlin|Online
6,2019-10-28,7,Coding Dojo,4.38,4.5,362,Tulsa|Boise|Orange County|Silicon Valley|Dalla...
7,2019-10-28,8,Covalence,4.64,4.5,47,Online
8,2019-10-28,9,Designlab,4.68,4.5,95,Online
9,2019-10-28,10,Flatiron School,4.75,4.5,409,Sydney|Houston|Austin|New York City|London|Atl...


# Exercise 1:
Among the top coding schools, what are the top cities?

In [101]:
pd.set_option("display.max_rows", 100)

top_cities = ((df["cities"]
               .str.split("|")
               .explode()
               .reset_index(drop=True))
              .value_counts()
              .drop(labels=["Online"]))

In [102]:
top_cities

Chicago           9
New York City     8
San Francisco     8
Seattle           7
Denver            6
Dallas            5
Los Angeles       5
Atlanta           5
Toronto           4
Washington        4
Boston            4
Phoenix           4
London            4
Salt Lake City    3
Austin            3
Montreal          3
Columbus          3
Sydney            3
Berlin            3
Barcelona         3
Minneapolis       3
Orange County     3
Melbourne         3
Vancouver         3
Miami             3
Houston           3
Raleigh           2
Detroit           2
Singapore         2
Sao Paulo         2
Amsterdam         2
Madrid            2
Portland          2
Wilmington        2
Mexico City       2
Tulsa             2
Paris             2
Lisbon            2
Cleveland         2
Tokyo             1
New Orleans       1
Oslo              1
Orlando           1
Indianapolis      1
Milwaukee         1
Cali              1
Stamford          1
Bogotá            1
Brisbane          1
Victoria          1


# Exercise 2:
Insert df into mysql

In [103]:
import pymysql
from sqlalchemy import create_engine
import getpass

pw = getpass.getpass()
engine = create_engine(f"mysql+pymysql://ironhack:{pw}@localhost", echo=False)

········


Test connection

In [104]:
pd.read_sql("SELECT * FROM bank.account LIMIT 10", engine)

Unnamed: 0,account_id,district_id,frequency,date
0,1,18,POPLATEK MESICNE,950324
1,2,1,POPLATEK MESICNE,930226
2,3,5,POPLATEK MESICNE,970707
3,4,12,POPLATEK MESICNE,960221
4,5,15,POPLATEK MESICNE,970530
5,6,51,POPLATEK MESICNE,940927
6,7,60,POPLATEK MESICNE,961124
7,8,57,POPLATEK MESICNE,950921
8,9,70,POPLATEK MESICNE,930127
9,10,54,POPLATEK MESICNE,960828


In [105]:
# 1. Create database
# 2. Create a table
# 3. Insert rows into that table

In [120]:
# 1. 
create_database = """
CREATE DATABASE bootcamps;
"""

# 2.
create_table = """
CREATE TABLE bootcamps.coding (
    date_id DATE, 
    `rank` INT, 
    `name` VARCHAR(255), 
    rating FLOAT, 
    stars FLOAT, 
    reviews INT, 
    cities VARCHAR(255));
"""

In [110]:
pd.read_sql(create_database, engine)

ProgrammingError: (pymysql.err.ProgrammingError) (1007, "Can't create database 'bootcamps'; database exists")
[SQL: 
CREATE DATABASE bootcamps;
]
(Background on this error at: http://sqlalche.me/e/f405)

In [119]:
pd.read_sql(create_table, engine)

ProgrammingError: (pymysql.err.ProgrammingError) (1064, "You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'rank INT, `name` VARCHAR(255), rating FLOAT, stars FLOAT, reviews INT, cities VA' at line 1")
[SQL: 
CREATE TABLE bootcamps.coding ( date_id DATE, rank INT, `name` VARCHAR(255), rating FLOAT, stars FLOAT, reviews INT, cities VARCHAR(255));
]
(Background on this error at: http://sqlalche.me/e/f405)