In [62]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Using the requests library

In [63]:
# Fetch the sample page used throughout the first half of the notebook.
# requests never times out unless told to, so pass an explicit timeout
# (seconds) to keep a stalled server from hanging the cell indefinitely.
url = 'https://rldaggie.github.io/sample-html/'
res = requests.get(url, timeout=10)

### Status Codes

In [64]:
# HTTP status of the response fetched above; 200 indicates the request succeeded.
res.status_code

200

### Creating a `BeautifulSoup` object

In [65]:
# Parse the raw response body with the lxml parser so the page can be searched.
soup = BeautifulSoup(res.content, 'lxml')

# `soup.find()`

Returns either:

1. A `Tag` object for the first element that matches
2. `None` if nothing matches

In [66]:
# Look up the first <h1> on the page; h1 is None if the page has no <h1>.
h1 = soup.find('h1')

In [67]:
# Inspect what kind of object find() handed back.
type(h1)

bs4.element.Tag

In [68]:
# find() may have come back empty-handed, so only print when a tag was found.
if h1 is not None:
    print(h1.text)


This is an h1


In [69]:
# Show the tag's HTML attributes as a dict, skipping the print if no tag was found.
if h1 is not None:
    print(h1.attrs)

{'class': ['foobar'], 'id': 'title'}


# `soup.find_all()`

Returns a **_LIST_** of `Tag` objects, one for each element that matches your query (an empty list if nothing matches)

In [70]:
# Note: h1 is still the single Tag assigned from soup.find('h1') earlier,
# not a find_all() result.
type(h1)

bs4.element.Tag

In [71]:
# Collect the text of every <h1> on the page. The loop variable is named
# `heading` so it does not shadow the notebook-level `h1` variable.
[heading.text for heading in soup.find_all('h1')]

['This is an h1', 'This is yet another heading.']

# Creating a `pandas` DataFrame from a scrape

### Todo List

In [72]:
# A list of dicts becomes one DataFrame row per dict. The second record has no
# 'years' key, so that cell is filled with NaN in the resulting frame.
instructors = [
    {'name': 'J', 'market': 'Austin', 'years': 4},
    {'name': 'Adi', 'market': 'DC'},
]
pd.DataFrame(instructors)

Unnamed: 0,name,market,years
0,J,Austin,4.0
1,Adi,DC,


In [73]:
# Grab the first <ol> whose class is "done" and display its markup.
ol = soup.find('ol', attrs={'class': 'done'})
ol

<ol class="done">
<li>Mow lawn</li>
<li class="foobar"><span>Take out compost</span></li>
<li><span>Create scraping lecture</span></li>
</ol>

In [74]:
# One record per <li> in the "done" list, holding the item's visible text.
todos = [{'task': item.text} for item in ol.find_all('li')]

pd.DataFrame(todos)

Unnamed: 0,task
0,Mow lawn
1,Take out compost
2,Create scraping lecture


### GA Directory

In [75]:
# Locate the directory table by its id attribute.
table = soup.find('table', attrs={'id': 'directory'})

In [76]:
# Build one record per body row of the directory table.
people = []
for tr in table.find('tbody').find_all('tr'):
    # The row's first link supplies both fields: its text is the name and
    # its href is the address once the 'mailto:' prefix is removed.
    link = tr.find('a')
    people.append({
        'name': link.text.strip(),
        'email': link.attrs['href'].replace('mailto:', ''),
        # Role is read from the first <td> in the row.
        'role': tr.find('td').text.strip(),
    })
pd.DataFrame(people)

Unnamed: 0,name,email,role
0,Praveen,praveen@ga.co,Student
1,Fred,fred@ga.co,Student
2,Homer,homer@ga.co,Student
3,Kyle,kyle@ga.co,Student
4,Sam,sam@ga.co,Student
5,Javier,javier@ga.co,Student
6,Nengkuan,nengkuan@ga.co,Student
7,Kieth,kieth@ga.co,Student
8,Bola,bola@ga.co,Student
9,Steve,steve@ga.co,Student


### Basketball Reference

In [77]:
# Fetch the Basketball Reference home page. requests never times out unless
# told to, so pass an explicit timeout (seconds) to keep a stalled server
# from hanging the cell indefinitely.
url = 'https://www.basketball-reference.com/'
res = requests.get(url, timeout=10)

In [78]:
# Check the HTTP status before parsing; 200 indicates the request succeeded.
res.status_code

200

In [79]:
# Inspect the type of the response body that gets handed to BeautifulSoup.
type(res.content)

bytes

In [80]:
# Re-bind `soup` to the newly fetched page, parsed with the lxml parser.
soup = BeautifulSoup(res.content, 'lxml')

In [81]:
# Locate the standings table by its id and check that a Tag came back.
table = soup.find('table', attrs={'id': 'confs_standings_E'})
type(table)

bs4.element.Tag

In [83]:
# Build one record per table row. The [1:] slice drops the first <tr>
# (presumably the header row; confirm against the page markup).
teams = []
for tr in table.find_all('tr')[1:]:
    # The row's first link supplies the abbreviation (text) and full name (title).
    link = tr.find('a')
    teams.append({
        'slug': link.text,
        'name': link.attrs['title'],
        # Win/loss cells are selected by their data-stat attribute and cast to int.
        'wins': int(tr.find('td', {'data-stat': 'wins'}).text),
        'losses': int(tr.find('td', {'data-stat': 'losses'}).text),
    })

pd.DataFrame(teams)


Unnamed: 0,slug,name,wins,losses
0,MIL,Milwaukee Bucks,60,22
1,TOR,Toronto Raptors,58,24
2,PHI,Philadelphia 76ers,51,31
3,BOS,Boston Celtics,49,33
4,IND,Indiana Pacers,48,34
5,BRK,Brooklyn Nets,42,40
6,ORL,Orlando Magic,42,40
7,DET,Detroit Pistons,41,41
8,CHO,Charlotte Hornets,39,43
9,MIA,Miami Heat,39,43
