In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Using the requests library

In [8]:
url = 'https://rldaggie.github.io/sample-html/'
# Always pass a timeout: without one, requests may wait on an unresponsive
# server indefinitely and the cell never finishes.
res = requests.get(url, timeout=10)

### Status Codes

In [4]:
# HTTP status code of the response (200 means the request succeeded).
res.status_code

200

### Creating a `BeautifulSoup` object

In [5]:
# Raw response body as bytes; this is the markup that gets parsed with BeautifulSoup.
res.content

b'<!DOCTYPE html>\n<html>\n  <head>\n    <meta charset="utf-8">\n    <title>The title</title>\n\n    <style media="screen">\n      tbody tr {\n        color: red;\n      }\n    </style>\n  </head>\n  <body>\n    <h1 class="foobar" id="title">This is an h1</h1>\n\n    <div>\n      <h1 class="foobar">This is yet another heading.</h1>\n\n      Something inside the div\n    </div>\n\n    <h3>Todo List</h3>\n    <ol class="todo">\n      <li class="foobar">Take out trash</li>\n      <li>Pay billz</li>\n      <li class="foobar">Feed dog</li>\n    </ol>\n\n    <h3>Completed</h3>\n    <ol class=\'done\'>\n      <li>Mow lawn</li>\n      <li class="foobar"><span>Take out compost</span></li>\n      <li><span>Create scraping lecture</span></li>\n    </ol>\n\n    <p class=\'foobar\'>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commod

# `soup.find()`

Returns either:

1. A soup object of the first match
2. `None`

In [6]:
# Parse the downloaded bytes using the lxml parser.
soup = BeautifulSoup(markup=res.content, features='lxml')

In [10]:
# First <h1> in the document, or None when the page has no <h1>.
h1 = soup.find(name='h1')

In [11]:
# Text content of the tag, without the surrounding markup.
h1.get_text()

'This is an h1'

In [None]:
# Attributes of the matched tag (for the first <h1> on this page: its class and id).
h1.attrs

In [None]:
# Fail-safe in case the heading tag is not there: find() returns None for no match.
if h1 is not None:
    print(h1.get_text())

# `soup.find_all()`

Returns a **_LIST_** of soup objects that match your query

In [19]:
# Text of every <h1> on the page; find_all() returns a list of matching tags.
[heading.get_text() for heading in soup.find_all(name='h1')]
 

['This is an h1', 'This is yet another heading.']

In [26]:
# Two sample records; the second one has no 'market' key.
instructors = [
    dict(name='Caroline', market='ATL'),
    dict(name='Heather'),
]

In [27]:
# One row per dict; a key missing from a record (Heather has no 'market') shows up as a missing value.
pd.DataFrame(instructors)

Unnamed: 0,name,market
0,Caroline,ATL
1,Heather,


# Creating a `pandas` DataFrame from a scrape

### Todo List

In [28]:
# When a page has many <ol> elements, filtering on the class narrows the
# search down to the one we want.
ol = soup.find('ol', class_='done')
ol

<ol class="done">
<li>Mow lawn</li>
<li class="foobar"><span>Take out compost</span></li>
<li><span>Create scraping lecture</span></li>
</ol>

In [31]:
# One record per <li> in the list: its text plus a constant 'foo' column.
todos = [{'task': item.text, 'foo': 'bar'} for item in ol.find_all('li')]
pd.DataFrame(todos)

Unnamed: 0,task,foo
0,Mow lawn,bar
1,Take out compost,bar
2,Create scraping lecture,bar


In [32]:
# Look the table up by its id attribute, then confirm what kind of object came back.
table = soup.find('table', id='directory')
type(table)

bs4.element.Tag

In [54]:
# Build one record per row of the directory table.
people = []
for row in table.find_all('tr')[1:]:  # [1:] skips the first row (presumably the header row; confirm against the page)
    # Look the anchor up once: it carries both the display name and the mailto: link.
    link = row.find('a')
    people.append({
        'name': link.text.strip(),
        # Strip the role too, so it is cleaned the same way as the name.
        'role': row.td.text.strip(),
        # Remove the 'mailto:' scheme outright. Replacing it with ' ' (as before)
        # left a leading space on every address.
        'email': link.attrs['href'].replace('mailto:', '').strip(),
    })
pd.DataFrame(people)

Unnamed: 0,name,role,email
0,Praveen,Student,praveen@ga.co
1,Fred,Student,fred@ga.co
2,Homer,Student,homer@ga.co
3,Kyle,Student,kyle@ga.co
4,Sam,Student,sam@ga.co
5,Javier,Student,javier@ga.co
6,Nengkuan,Student,nengkuan@ga.co
7,Kieth,Student,kieth@ga.co
8,Bola,Student,bola@ga.co
9,Steve,Student,steve@ga.co


### GA Directory

### Basketball Reference

In [63]:
url = 'https://www.basketball-reference.com/'
# A timeout keeps the cell from hanging forever on an unresponsive server.
res = requests.get(url, timeout=10)
# Fail loudly on a 4xx/5xx reply instead of parsing an error page. A bare
# `res.status_code` in the middle of a cell is never displayed and checks nothing.
res.raise_for_status()

soup = BeautifulSoup(res.content, 'lxml')

# The standings table is selected by its id attribute.
table = soup.find('table', {'id': 'confs_standings_E'})
type(table)

bs4.element.Tag

In [None]:
# Check https://www.basketball-reference.com/robots.txt before scraping: if it says "Disallow: /", don't scrape.

In [66]:
table = soup.find('table', id='confs_standings_E')

# One record per row after the first; the team link supplies both the
# abbreviation (its text) and the full name (its title attribute).
teams = []
for standings_row in table.find_all('tr')[1:]:
    team_link = standings_row.find('a')
    teams.append({
        'abbrev': team_link.text,
        'name': team_link.attrs['title'],
        'wins': standings_row.find('td', attrs={'data-stat': 'wins'}).text,
        'losses': standings_row.find('td', attrs={'data-stat': 'losses'}).text,
    })

pd.DataFrame(teams)

Unnamed: 0,abbrev,name,wins,losses
0,MIL,Milwaukee Bucks,56,17
1,TOR,Toronto Raptors,53,19
2,BOS,Boston Celtics,48,24
3,IND,Indiana Pacers,45,28
4,MIA,Miami Heat,44,29
5,PHI,Philadelphia 76ers,43,30
6,BRK,Brooklyn Nets,35,37
7,ORL,Orlando Magic,33,40
8,CHO,Charlotte Hornets,23,42
9,WAS,Washington Wizards,25,47


In [67]:
import time

# Print a counter, pausing 3 seconds after each value.
# The loop header had been commented out while its indented body was left in
# place, which makes the cell fail with an IndentationError; it is restored here.
for i in range(10):
    print(i)
    time.sleep(3)

0
1
2
3
4
5
6
7
8


KeyboardInterrupt: 

In [None]:
# Good: /about
# Bad: /admin/login