# Build a webscraper

In [1]:
# install beautifulesoup
!python3 -m pip install beautifulsoup4  

You should consider upgrading via the 'pip install --upgrade pip' command.


In [2]:
import requests

url = "https://www.apple.com/newsroom/2021/01/apple-reports-first-quarter-results/"

# requests waits forever by default; a timeout keeps the cell from hanging
# if the server never answers
res = requests.get(url, timeout=30)
# fail here on a 4xx/5xx answer instead of parsing an error page further down
res.raise_for_status()

In [3]:
# peek at the start of the raw bytes only; the whole page is far too long
# to be useful as cell output
res.content[:500]

In [4]:
# turn the raw response bytes into a str that holds the page's HTML
page_content = str(res.content, encoding="utf-8")

In [5]:
from bs4 import BeautifulSoup

# BeautifulSoup parses the HTML string (here with Python's built-in
# "html.parser") into an object we can search for elements
soup = BeautifulSoup(markup=page_content, features="html.parser")


### Find the elements

The common ways to identify an element:

- by tag 
  - For example, `<header>`, `<body>`, `<div>`...
    
    Then you can narrow down the search by adding the attributes, common ones for example:
    
    - by class
      - an element can have `class` attribute which you can use to search
  
    - by id
      - an element can have `id` attribute which you can use to search

In [6]:
# .find returns the first element that matches, here the <article> tag
article = soup.find("article")

In [8]:
# .find_all returns a list with every matching tag;
# index 0 picks the first <div> inside the article
article.find_all("div")[0]

In [9]:
# narrow the search: the tag name plus the value of its class attribute
summary = article.find("div", class_="summary text component")

In [10]:
# display the element matched in the previous cell
summary

<div class="summary text component">
<div class="component-content">
<p class="summary-copy">
            

				
      				 Revenue up 21 percent and EPS up 35 percent to new all-time records<br/><br/>
iPhone, Wearables, and Services set new revenue records
               
               </p>
</div>
</div>

In [11]:
# the first <div class="pagebody-copy"> holds the article's opening paragraph
article.find_all("div", class_="pagebody-copy")[0]

<div class="pagebody-copy">Cupertino, California — January 27, 2021 — Apple today announced financial results for its fiscal 2021 first quarter ended December 26, 2020. The Company posted all-time record revenue of $111.4 billion, up 21 percent year over year, and quarterly earnings per diluted share of $1.68, up 35 percent. International sales accounted for 64 percent of the quarter’s revenue.
</div>

## bytes vs string

In [44]:
# Python displays bytes using ASCII characters; anything that is not ASCII
# shows up as \x.. escape sequences.
# This cell is just to show how bytes differ from a string.
# The timeout keeps the cell from hanging if the server never answers.
res = requests.get("https://www.moi.gov.tw/", timeout=30)
res.content  # this will be bytes

In [45]:
# decoded as UTF-8, the same data now shows readable Chinese characters
res.content.decode(encoding="utf-8")

## Scrape the content for our project

For our project, we are going to scrape the files from our [website](https://learnwithshin.github.io/docs/files/).

In [14]:
# get contents from our website
url = "https://learnwithshin.github.io/docs/files/"

# the timeout keeps the cell from hanging if the server never answers
res = requests.get(url, timeout=30)
# fail here on a 4xx/5xx answer instead of parsing an error page further down
res.raise_for_status()

In [15]:
# decode the raw response bytes into the page's HTML text
page_content = res.content.decode(encoding="utf-8")

In [16]:
from bs4 import BeautifulSoup

# parse the HTML text of the files page so its elements can be searched
soup = BeautifulSoup(markup=page_content, features="html.parser")


In [17]:
# grab the first <article> element of the files page
article = soup.find(name="article")

In [18]:
# collect every <a> tag in the article; each one carries a URL in its href
anchors = article.find_all(name="a")

In [19]:
# display every anchor found in the article to see all the file links
anchors

[<a href="../assets/countries_03_08_21.csv"><code>countries_03_08_21.csv</code></a>,
 <a href="../assets/country_vaccination_ts_03_08_21.csv"><code>country_vaccination_ts_03_08_21.csv</code></a>]

In [20]:
# first link -> countries file, second link -> vaccination time series
country_tag, data_tag = anchors[0], anchors[1]

In [21]:
# an element's attributes are read like dictionary keys;
# "href" holds the link target, which is a relative path here
coutry_file_url = country_tag["href"]
coutry_file_url

'../assets/countries_03_08_21.csv'

In [22]:
from urllib.parse import urljoin

# construct the full url for the file link by resolving the relative href
# against the page it was scraped from (`url`). Unlike str.replace("../", ...)
# this handles any relative form and leaves an already-absolute URL
# untouched, so the cell is safe to re-run.
coutry_file_url = urljoin(url, coutry_file_url)

In [23]:
# now get the content for the file
# the timeout keeps the cell from hanging if the server never answers
res = requests.get(coutry_file_url, timeout=30)
# stop here on a 4xx/5xx answer so an error page is never saved as the CSV
res.raise_for_status()

In [41]:
# res.content holds raw bytes, so the file is opened in
# binary write mode ("wb" stands for write bytes)
with open("country_vaccinations.csv", mode="wb") as csv_file:
    csv_file.write(res.content)

## Refactor the code

Refactor the code we went over by putting it into functions. After that, we are ready to move them into an actual Python module.

In [1]:
from urllib.parse import urljoin

import requests

from bs4 import BeautifulSoup


def get_content(url, timeout=30):
    """Download a page and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.
            Without a timeout, requests can wait indefinitely.

    Returns:
        The response body as a ``str``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    res = requests.get(url, timeout=timeout)
    # Surface HTTP errors here rather than handing an error page to the caller.
    res.raise_for_status()
    return res.content.decode("utf-8")


def save_download_file(content, file_name):
    """Write downloaded content to ``file_name`` and print a confirmation.

    Args:
        content: The payload to store. Either ``bytes`` (for example
            ``response.content``) or ``str`` (for example the value returned
            by ``get_content``); a ``str`` is encoded as UTF-8 first.
        file_name: Path of the file to write. An existing file is overwritten.
    """
    if isinstance(content, str):
        # The file is opened in binary mode, which only accepts bytes.
        content = content.encode("utf-8")
    with open(file_name, "wb") as f:
        f.write(content)
    print("File saved!")


def get_data_source_from_lws(file_index, save=False, save_file_name="file.csv"):
    """Download one of the data files linked on the learnwithshin files page.

    The files page is fetched and parsed, the links inside its ``<article>``
    element are collected, the link at position ``file_index`` is resolved to
    an absolute URL, and the file behind it is downloaded.

    Args:
        file_index: Position of the wanted link among the article's ``<a>``
            tags (0-based; negative values count from the end).
        save: When true, also write the downloaded bytes to ``save_file_name``.
        save_file_name: Destination path used when ``save`` is true.

    Returns:
        The raw bytes of the downloaded file.

    Raises:
        ValueError: If the files page has no ``<article>`` element.
        IndexError: If ``file_index`` does not match any link on the page.
        requests.HTTPError: If the file download answers with a 4xx/5xx status.
    """
    url = "https://learnwithshin.github.io/docs/files/"
    soup = BeautifulSoup(get_content(url), "html.parser")

    article = soup.find("article")
    if article is None:
        raise ValueError("no <article> element found at {}".format(url))

    anchors = article.find_all("a")
    try:
        target_anchor = anchors[file_index]
    except IndexError:
        raise IndexError(
            "file_index {} is out of range: {} link(s) found at {}".format(
                file_index, len(anchors), url
            )
        ) from None

    # Resolve the href against the page URL. This works for any relative form
    # ("../x", "./x", "x") as well as for links that are already absolute.
    file_url = urljoin(url, target_anchor["href"])

    res = requests.get(file_url, timeout=30)
    # Never return or save an HTTP error page as if it were the data file.
    res.raise_for_status()
    if save:
        save_download_file(res.content, save_file_name)
    return res.content

In [1]:
# download the first listed file and keep a copy on disk as countries.csv
get_data_source_from_lws(0, save=True, save_file_name="countries.csv")