In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup 

# Using the requests library

In [2]:
url = 'https://rldaggie.github.io/sample-html/'
res = requests.get(url)

### Status Codes

In [3]:
res.status_code

200

In [4]:
res.content

b'<!DOCTYPE html>\n<html>\n  <head>\n    <meta charset="utf-8">\n    <title>The title</title>\n\n    <style media="screen">\n      tbody tr {\n        color: red;\n      }\n    </style>\n  </head>\n  <body>\n    <h1 class="foobar" id="title">This is an h1</h1>\n\n    <div>\n      <h1 class="foobar">This is yet another heading.</h1>\n\n      Something inside the div\n    </div>\n\n    <h3>Todo List</h3>\n    <ol class="todo">\n      <li class="foobar">Take out trash</li>\n      <li>Pay billz</li>\n      <li class="foobar">Feed dog</li>\n    </ol>\n\n    <h3>Completed</h3>\n    <ol class=\'done\'>\n      <li>Mow lawn</li>\n      <li class="foobar"><span>Take out compost</span></li>\n      <li><span>Create scraping lecture</span></li>\n    </ol>\n\n    <p class=\'foobar\'>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commod

### Creating a `BeautifulSoup` object

In [5]:
soup = BeautifulSoup(res.content)
# soup

# `soup.find()`

Returns either:

1. A soup object of the first match
2. `None`

In [50]:
h1 = soup.find('h1', {'id':'title'})

if h1:
    print(h1.text)

This is an h1


In [51]:
h1.text

'This is an h1'

In [52]:
h1.attrs

{'class': ['foobar'], 'id': 'title'}

In [53]:
if h1:
    print(h1.text)

This is an h1


In [54]:
try:
    print(h1.text)
except:
    print('Oops! You spelled something wrong.')

This is an h1


In [56]:
list_div = []
for num in range(10,-10,-1):
    try:
        list_div.append(100/num)
        print(list_div)
    except:
        print('You cant divide by 0')

[10.0]
[10.0, 11.11111111111111]
[10.0, 11.11111111111111, 12.5]
[10.0, 11.11111111111111, 12.5, 14.285714285714286]
[10.0, 11.11111111111111, 12.5, 14.285714285714286, 16.666666666666668]
[10.0, 11.11111111111111, 12.5, 14.285714285714286, 16.666666666666668, 20.0]
[10.0, 11.11111111111111, 12.5, 14.285714285714286, 16.666666666666668, 20.0, 25.0]
[10.0, 11.11111111111111, 12.5, 14.285714285714286, 16.666666666666668, 20.0, 25.0, 33.333333333333336]
[10.0, 11.11111111111111, 12.5, 14.285714285714286, 16.666666666666668, 20.0, 25.0, 33.333333333333336, 50.0]
[10.0, 11.11111111111111, 12.5, 14.285714285714286, 16.666666666666668, 20.0, 25.0, 33.333333333333336, 50.0, 100.0]
You cant divide by 0
[10.0, 11.11111111111111, 12.5, 14.285714285714286, 16.666666666666668, 20.0, 25.0, 33.333333333333336, 50.0, 100.0, -100.0]
[10.0, 11.11111111111111, 12.5, 14.285714285714286, 16.666666666666668, 20.0, 25.0, 33.333333333333336, 50.0, 100.0, -100.0, -50.0]
[10.0, 11.11111111111111, 12.5, 14.28571

In [57]:
list_div

[10.0,
 11.11111111111111,
 12.5,
 14.285714285714286,
 16.666666666666668,
 20.0,
 25.0,
 33.333333333333336,
 50.0,
 100.0,
 -100.0,
 -50.0,
 -33.333333333333336,
 -25.0,
 -20.0,
 -16.666666666666668,
 -14.285714285714286,
 -12.5,
 -11.11111111111111]

# `soup.find_all()`

Returns a **_LIST_** of soup objects that match your query

In [58]:
h1_tags = soup.find_all('h1')

In [59]:
[tag.text for tag in h1_tags]

['This is an h1', 'This is yet another heading.']

# Creating a `pandas` DataFrame from a scrape

### Todo List

In [60]:
# people = [
#     # {'name': 'chuck', 'city': 'Dallas'},
#     # {'name': 'Jon', 'city': 'NYC'}
# ]

In [61]:
pd.DataFrame(people)

Unnamed: 0,name,city
0,chuck,Dallas
1,Jon,NYC


In [69]:
ol = soup.find('ol', {'class', 'done'})
ol

<ol class="done">
<li>Mow lawn</li>
<li class="foobar"><span>Take out compost</span></li>
<li><span>Create scraping lecture</span></li>
</ol>

In [70]:
[row.text for row in ol.find_all('li')]

['Mow lawn', 'Take out compost', 'Create scraping lecture']

In [75]:
for item in soup.find('ol', {'class','done'}).find_all('li'):
    print('You just completed: ' + item.text)

You just completed: Mow lawn
You just completed: Take out compost
You just completed: Create scraping lecture


In [77]:
todos = []

for list_item in ol.find_all('li'):
    temp_dict = {}
    temp_dict['task'] = list_item.text
    todos.append(temp_dict)

pd.DataFrame(todos)

Unnamed: 0,task
0,Mow lawn
1,Take out compost
2,Create scraping lecture


### GA Directory

In [None]:
#Steps to scrape
#Make sure it's okay to web scrape
# get Url
# request.get url
# Check status - make sure it's 200
# convert request to a soup object
# inspect on the website and start looking where on the site the info you're looking for is living
# take our soup object and subdivide by the relevant tag
# Create our empty list
# populate temp dictionary with relevant info via for loops
# append the dict to the list
# convert list into DataFrame


In [7]:
table = soup.find('table',{'id': 'directory'})

people =[]
for item in table.find('tbody').find_all('tr'):
    person = {}
    person['name'] = item.find('a').text.strip()#Name
    person['role'] = item.find('td').text#role
    person['email'] = item.find('a').attrs['href'][7:] #email
    people.append(person)

pd.DataFrame(people)

Unnamed: 0,name,role,email
0,Praveen,Student,praveen@ga.co
1,Fred,Student,fred@ga.co
2,Homer,Student,homer@ga.co
3,Kyle,Student,kyle@ga.co
4,Sam,Student,sam@ga.co
5,Javier,Student,javier@ga.co
6,Nengkuan,Student,nengkuan@ga.co
7,Kieth,Student,kieth@ga.co
8,Bola,Student,bola@ga.co
9,Steve,Student,steve@ga.co


### Basketball Reference

In [44]:
url ='https://www.basketball-reference.com/'

res = requests.get(url)

res.status_code

200

In [66]:
soup = BeautifulSoup(res.content, 'lxml')

FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?

In [46]:
# soup

In [78]:
teams = []

for conf in ['E', 'W']:
    table = soup.find('table', {'id': f'confs_standings_{conf}'}).find('tbody')
    for row in table.find_all('tr'):
        team ={}
        team['name'] = row.find('a').attrs['title']#team name
        team['wins'] =row.find('td', {'data-stat': 'wins'}).text
        team['losses'] = row.find('td', {'data-stat': 'losses'}).text#losses
        team['seed'] = row.find('span').text.strip()[1:-1]
        team['conf'] = conf #conference
        team['slug'] = row.find('a').text 
        teams.append(team)
        
df =pd.DataFrame(teams)
    

In [87]:
from time import sleep

In [88]:
full_nba_roster = []

for team in df['slug']:
    temp_url = f'https://www.basketball-reference.com/teams/{team}/2022.html'
    temp_df = pd.read_html(temp_url)[3].drop(columns=['Unnamed: 22', 'Unnamed: 17'])
    temp_df['team_name'] = team
    full_nba_roster.append(temp_df)
    sleep(3)
    print(f'{team} was scraped Successfully')

MIA was scraped Successfully
CHI was scraped Successfully
BRK was scraped Successfully
MIL was scraped Successfully
CLE was scraped Successfully
PHI was scraped Successfully
CHO was scraped Successfully
BOS was scraped Successfully
TOR was scraped Successfully
WAS was scraped Successfully
NYK was scraped Successfully
ATL was scraped Successfully
IND was scraped Successfully
DET was scraped Successfully
ORL was scraped Successfully
PHO was scraped Successfully
GSW was scraped Successfully
MEM was scraped Successfully
UTA was scraped Successfully
DAL was scraped Successfully
DEN was scraped Successfully
MIN was scraped Successfully
LAL was scraped Successfully
LAC was scraped Successfully
POR was scraped Successfully
NOP was scraped Successfully
SAC was scraped Successfully
SAS was scraped Successfully
HOU was scraped Successfully
OKC was scraped Successfully


In [91]:
full_roster = pd.concat(full_nba_roster)

In [92]:
full_roster.to_csv('full_roster.csv')

In [86]:
url = 'https://www.basketball-reference.com/teams/MIA/2022.html'
pd.read_html(url)[3].drop(columns=['Unnamed: 22', 'Unnamed: 17'])

Unnamed: 0,Rk,Unnamed: 1,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,1,Kyle Lowry,35,39,1333,14.6,0.555,0.588,0.263,2.0,...,20.2,19.6,1.9,1.5,3.4,0.121,0.8,0.2,1.0,1.0
1,2,Tyler Herro,22,38,1250,15.6,0.54,0.382,0.179,2.3,...,12.7,29.6,0.6,1.2,1.8,0.069,1.3,-1.4,-0.1,0.6
2,3,Duncan Robinson,27,44,1222,10.9,0.557,0.838,0.089,1.7,...,7.3,18.3,1.1,1.2,2.3,0.089,-0.3,-0.3,-0.6,0.4
3,4,P.J. Tucker,36,41,1161,12.5,0.615,0.484,0.155,6.3,...,12.7,11.8,1.9,1.3,3.2,0.133,0.1,1.0,1.1,0.9
4,5,Jimmy Butler,32,29,971,25.6,0.585,0.124,0.505,5.9,...,9.6,26.7,3.7,1.4,5.2,0.255,5.5,2.3,7.8,2.4
5,6,Max Strus,25,36,862,14.2,0.618,0.734,0.125,2.0,...,7.2,18.7,1.5,0.8,2.3,0.128,2.0,-0.8,1.2,0.7
6,7,Gabe Vincent,25,38,857,12.2,0.567,0.602,0.134,2.0,...,16.4,18.6,0.7,0.9,1.6,0.092,-1.1,0.4,-0.8,0.3
7,8,Caleb Martin,26,37,856,16.0,0.611,0.381,0.224,6.1,...,9.2,16.8,1.4,1.2,2.7,0.149,0.5,0.9,1.4,0.7
8,9,Bam Adebayo,24,22,723,19.6,0.581,0.003,0.466,8.7,...,15.9,25.4,1.1,1.2,2.3,0.15,0.7,1.5,2.2,0.8
9,10,Dewayne Dedmon,32,40,672,17.9,0.664,0.118,0.315,12.6,...,16.8,16.1,1.4,1.1,2.6,0.183,-0.4,0.6,0.2,0.4
