# Data Acquisition and Data Preparation

## First Scraper

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib.error import HTTPError, URLError

### Running `BeautifulSoup`

In [None]:
# Fetch the sample page and parse it. The `with` block closes the HTTP
# response as soon as its body has been read, instead of leaving the
# connection open for the lifetime of the notebook.
with urlopen('http://www.pythonscraping.com/pages/page1.html') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')
# Accessing a tag name as an attribute returns the first tag with that name.
print(bs.h1)

In [None]:
# First <div> nested under <html>, spelled with explicit find() calls
# (attribute-style access such as bs.html is shorthand for find('html')).
print(bs.find('html').find('div'))

### Handling Connection and Exceptions

In [None]:
# Try to open a host that does not exist and report which kind of failure
# occurred. HTTPError is a subclass of URLError, so a single handler can
# catch both and tell them apart afterwards.
try:
    html = urlopen('https://pythonscrapingthisurldoesnotexist.com')
except URLError as err:
    if isinstance(err, HTTPError):
        # The server answered, but with an HTTP error status.
        print(err)
    else:
        # No server could be reached at all.
        print('The server could not be found!')
else:
    print('It worked!')

In [None]:
def getTitle(url):
    """Return the first ``<h1>`` tag inside the page body, or None.

    Args:
        url: Address of the page to download and parse.

    Returns:
        The ``<h1>`` element found via ``bs.body.h1``, or ``None`` when the
        page cannot be fetched or has no ``<body>``/``<h1>``.
    """
    try:
        html = urlopen(url)
    except URLError:
        # URLError also covers HTTPError (its subclass), so both "server
        # returned an error status" and "server could not be reached"
        # result in None instead of an uncaught exception.
        return None
    try:
        # Close the response once the body has been read.
        with html:
            bs = BeautifulSoup(html.read(), 'html.parser')
        # bs.body is None when the page has no <body>; the attribute access
        # on None raises AttributeError, handled below.
        title = bs.body.h1
    except AttributeError:
        return None
    return title


title = getTitle('http://www.pythonscraping.com/pages/page1.html')
# getTitle signals failure with None; compare by identity, not equality.
if title is None:
    print('Title could not be found')
else:
    print(title)

### Exercise

Develop a parsing function that extracts the title and the paragraph as a list object from [http://www.pythonscraping.com/pages/page1.html](http://www.pythonscraping.com/pages/page1.html)

In [None]:
def getContent(url):
    """Exercise stub: return the page's title and paragraph as a list.

    Args:
        url: Address of the page to download and parse.

    Returns:
        The extracted content, or ``None`` while the exercise has not been
        completed (or when nothing could be extracted).
    """
    # Default result so the cell runs and reports "not found" before the
    # exercise is filled in, instead of failing on an undefined name.
    content = None

    # type your code here

    return content

content = getContent('http://www.pythonscraping.com/pages/page1.html')

# None is the "nothing found" signal; compare by identity, not equality.
if content is None:
    print('Content could not be found')
else:
    for i in content:
        print(i)

In [None]:

def getContent(url):
    """Return the page's title and paragraph text as a two-item list.

    Args:
        url: Address of the page to download and parse.

    Returns:
        ``[title_text, paragraph_text]`` taken from the first ``<h1>`` and
        the first ``<div>`` inside ``<body>``, or ``None`` when the page
        cannot be fetched or either element is missing.
    """
    try:
        html = urlopen(url)
    except URLError:
        # URLError also covers HTTPError (its subclass), so an unreachable
        # server is reported as None just like an HTTP error status.
        return None
    try:
        # Close the response once the body has been read.
        with html:
            bs = BeautifulSoup(html.read(), 'html.parser')
        title = bs.body.h1
        body = bs.body.div
        # Take each element's text explicitly. Extending the list with the
        # tag itself would iterate over its child nodes, and a missing tag
        # (None) would raise TypeError rather than the AttributeError
        # handled below.
        content = [title.get_text(), body.get_text()]
    except AttributeError:
        # <body>, <h1> or <div> was not present in the document.
        return None
    return content

content = getContent('http://www.pythonscraping.com/pages/page1.html')

# getContent signals failure with None; compare by identity, not equality.
if content is None:
    print('Content could not be found')
else:
    for i in content:
        print(i)

## Advanced Parser

Create a parser that scrapes the page located at [https://www.pythonscraping.com/pages/warandpeace.html](https://www.pythonscraping.com/pages/warandpeace.html)

In [None]:
# Download and parse the page; the `with` block closes the HTTP response
# once its body has been read.
with urlopen('https://www.pythonscraping.com/pages/warandpeace.html') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

# Every <span class="green"> on the page. find_all is the current name of
# the method; findAll is a legacy alias kept for backward compatibility.
namelist = bs.find_all('span', {'class': 'green'})
for name in namelist:
    print(name.get_text())

### Dealing with Children and Other Descendants

In [None]:
# Download and parse the page; the `with` block closes the HTTP response
# once its body has been read.
with urlopen('https://www.pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

# .children yields only the direct children of the giftList table.
for child in bs.find('table', {'id':'giftList'}).children:
    print(child)

### Dealing with Siblings

In [None]:
# Download and parse the page; the `with` block closes the HTTP response
# once its body has been read.
with urlopen('https://www.pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

# Start from the table's first <tr> and walk over everything that follows
# it at the same level; the first row itself is not included.
for sibling in bs.find('table', {'id':'giftList'}).tr.next_siblings:
    print(sibling)

### Dealing with Parents

In [None]:
# Download and parse the page; the `with` block closes the HTTP response
# once its body has been read.
with urlopen('https://www.pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

# Locate the image by its src, step up to the tag that encloses it, move
# back to that tag's preceding sibling, and print the sibling's text.
print(bs.find('img',
              {'src' : '../img/gifts/img1.jpg'})
              .parent.previous_sibling.get_text())

In [None]:
# Download and parse the page; the `with` block closes the HTTP response
# once its body has been read.
with urlopen('https://www.pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

# Print the text of every <tr class="gift"> row in the giftList table.
# find_all is the current name of the method; findAll is a legacy alias.
for item in bs.find('table', {'id' : 'giftList'}).find_all('tr', {'class' : 'gift'}):
    print(item.get_text())

### Exercise

Extract the `Item Title` information from the [https://www.pythonscraping.com/pages/page3.html](https://www.pythonscraping.com/pages/page3.html) page and store it in a list object. You can build on the previous example.

In [None]:
## Change or update this script
# Uses the `bs` object parsed from page3.html in an earlier cell.
# find_all is the current name of the method; findAll is a legacy alias.
for item in bs.find('table', {'id' : 'giftList'}).find_all('tr', {'class' : 'gift'}):
    print(item.get_text())

In [None]:

# Collect the text of the first cell of every gift row into a list, as the
# exercise asks (the first <td> is taken to be the "Item Title" column).
# Uses the `bs` object parsed from page3.html in an earlier cell.
item_titles = []
for item in bs.find('table', {'id' : 'giftList'}).find_all('tr', {'class' : 'gift'}):
    # Take the first <td> itself rather than iterating over its child
    # nodes, so the whole cell text is captured in one entry.
    first_cell = item.find('td')
    if first_cell is not None:
        item_titles.append(first_cell.get_text().strip())

for gift in item_titles:
    print(gift)