## Web Scraping

In [0]:
#@title Basic Imports
import requests
from bs4 import BeautifulSoup

In [0]:
!pip install -U lxml

Requirement already up-to-date: lxml in /usr/local/lib/python3.6/dist-packages (4.2.5)


## Quick Start

In [0]:
html_doc="""
<html><head><title>Google's MLCC</title></head>
<body>
<p class="name"><b>Study Jams</b></p>

<p class="course">ML & AI Course
<a href="http://example.com/s1" class="s" id="link1">Python</a>,
<a href="http://example.com/s2" class="s" id="link2">Tensorflow</a> and
<a href="http://example.com/s2" class="s" id="link3">ML Algorithms</a></p>

<p class="course">...</p>
"""

## Parsing a Page with BeautifulSoup

In [0]:
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify())

<html>
 <head>
  <title>
   Google's MLCC
  </title>
 </head>
 <body>
  <p class="name">
   <b>
    Study Jams
   </b>
  </p>
  <p class="course">
   ML &amp; AI Course
   <a class="s" href="http://example.com/s1" id="link1">
    Python
   </a>
   ,
   <a class="s" href="http://example.com/s2" id="link2">
    Tensorflow
   </a>
   and
   <a class="s" href="http://example.com/s2" id="link3">
    ML Algorithms
   </a>
  </p>
  <p class="course">
   ...
  </p>
 </body>
</html>


## BeautifulSoup Datastructure

In [0]:
soup.title

<title>Google's MLCC</title>

In [0]:
soup.title.name

'title'

In [0]:
soup.title.string

"Google's MLCC"

In [0]:
soup.title.parent.name

'head'

In [0]:
soup.p

<p class="name"><b>Study Jams</b></p>

In [0]:
soup.find(id="link3")

<a class="s" href="http://example.com/s2" id="link3">ML Algorithms</a>

In [0]:
soup.find_all('a')

[<a class="s" href="http://example.com/s1" id="link1">Python</a>,
 <a class="s" href="http://example.com/s2" id="link2">Tensorflow</a>,
 <a class="s" href="http://example.com/s2" id="link3">ML Algorithms</a>]

## .strings and stripped_strings

In [0]:
for string in soup.strings:
    print(repr(string))

'\n'
"Google's MLCC"
'\n'
'\n'
'Study Jams'
'\n'
'ML & AI Course\n'
'Python'
',\n'
'Tensorflow'
' and\n'
'ML Algorithms'
'\n'
'...'
'\n'


In [0]:
#@title Remove Whitespace
for string in soup.stripped_strings:
    print(repr(string))

"Google's MLCC"
'Study Jams'
'ML & AI Course'
'Python'
','
'Tensorflow'
'and'
'ML Algorithms'
'...'


In [0]:
#@title Extract
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
a_tag = soup.a
i_tag = soup.i.extract()
a_tag

<a href="http://example.com/">I linked to </a>

In [0]:
i_tag

<i>example.com</i>

In [0]:
#@title get_text()
soup.get_text()

'I linked to '

## Load Web Page

In [0]:
#@title Load Web Page
page = requests.get("https://www.google.com/")
page

<Response [200]>

In [0]:
page.status_code

200

In [0]:
page.content

b'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en"><head><meta content="Search the world\'s information, including webpages, images, videos and more. Google has many special features to help you find exactly what you\'re looking for." name="description"><meta content="noodp" name="robots"><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/logos/doodles/2018/googles-20th-birthday-us-5142672481189888-l.png" itemprop="image"><meta content="Google\'s 20th Birthday" property="twitter:title"><meta content="Google\'s 20th Birthday! #GoogleDoodle #SearchIs20" property="twitter:description"><meta content="Google\'s 20th Birthday! #GoogleDoodle #SearchIs20" property="og:description"><meta content="summary_large_image" property="twitter:card"><meta content="@GoogleDoodles" property="twitter:site"><meta content="https://www.google.com/logos/doodles/2018/googles-20th-birthday-us-5142672481189888-2xa.gif" property="twitter:image"><meta 

In [0]:
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE doctype html>
<html itemscope="" itemtype="http://schema.org/WebPage" lang="en">
 <head>
  <meta content="Search the world's information, including webpages, images, videos and more. Google has many special features to help you find exactly what you're looking for." name="description"/>
  <meta content="noodp" name="robots"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="/logos/doodles/2018/googles-20th-birthday-us-5142672481189888-l.png" itemprop="image"/>
  <meta content="Google's 20th Birthday" property="twitter:title"/>
  <meta content="Google's 20th Birthday! #GoogleDoodle #SearchIs20" property="twitter:description"/>
  <meta content="Google's 20th Birthday! #GoogleDoodle #SearchIs20" property="og:description"/>
  <meta content="summary_large_image" property="twitter:card"/>
  <meta content="@GoogleDoodles" property="twitter:site"/>
  <meta content="https://www.google.com/logos/doodles/2018/googles-20th-birthday-us-51426724811898

## Exact a Table from WebPage

In [0]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://www.treasury.gov/resource-center/data-chart-center/interest-rates/Pages/TextView.aspx?data=yieldAll'

r = requests.get(url)
html = r.text

soup = BeautifulSoup(html)
table = soup.find('table', {"class": "t-chart"})
rows = table.find_all('tr')
data = []
for row in rows[1:]:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append([ele for ele in cols if ele])

result = pd.DataFrame(data, columns=['Date', '1 Mo', '3 Mo', '6 Mo', '1 Yr', '2 Yr', '3 Yr', '5 Yr', '7 Yr', '10 Yr', '20 Yr', '30 Yr'])

print(result)

          Date  1 Mo  3 Mo  6 Mo  1 Yr  2 Yr  3 Yr  5 Yr  7 Yr 10 Yr 20 Yr  \
0     01/02/90   N/A  7.83  7.89  7.81  7.87  7.90  7.87  7.98  7.94   N/A   
1     01/03/90   N/A  7.89  7.94  7.85  7.94  7.96  7.92  8.04  7.99   N/A   
2     01/04/90   N/A  7.84  7.90  7.82  7.92  7.93  7.91  8.02  7.98   N/A   
3     01/05/90   N/A  7.79  7.85  7.79  7.90  7.94  7.92  8.03  7.99   N/A   
4     01/08/90   N/A  7.79  7.88  7.81  7.90  7.95  7.92  8.05  8.02   N/A   
5     01/09/90   N/A  7.80  7.82  7.78  7.91  7.94  7.92  8.05  8.02   N/A   
6     01/10/90   N/A  7.75  7.78  7.77  7.91  7.95  7.92  8.00  8.03   N/A   
7     01/11/90   N/A  7.80  7.80  7.77  7.91  7.95  7.94  8.01  8.04   N/A   
8     01/12/90   N/A  7.74  7.81  7.76  7.93  7.98  7.99  8.07  8.10   N/A   
9     01/16/90   N/A  7.89  7.99  7.92  8.10  8.13  8.11  8.18  8.20   N/A   
10    01/17/90   N/A  7.97  7.97  7.91  8.09  8.11  8.11  8.17  8.19   N/A   
11    01/18/90   N/A  8.04  8.08  8.05  8.25  8.28  8.27  8.31  

## Exercises
**1. Scrap GDG Coimbatore webpage and store it in  csv**

**2. Scrap any of one wikipedia page generate table, and do data visualization**