# Web Scraping

## Installing libraries

In [2]:
!pip install requests
!pip install lxml
!pip install bs4



## Make a request to a website

In [3]:
import requests

In [4]:
# Fetch the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() turns an HTTP error status (4xx/5xx) into
# an exception instead of letting an error page flow silently into the parser.
result = requests.get("http://www.example.com", timeout=10)
result.raise_for_status()
result

<Response [200]>

In [5]:
# Show the type of the object returned by requests.get.
type(result)

requests.models.Response

In [6]:
# The response body as a str: the raw HTML of the fetched page.
result.text

'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    <

## Making The Soup

In [7]:
import bs4

In [8]:
# Parse the downloaded HTML with the lxml parser so it can be queried as a tree.
soup = bs4.BeautifulSoup(markup=result.text, features="lxml")
soup

<!DOCTYPE html>
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples

## Select items by HTML Tags

In [9]:
# select() takes a CSS selector and returns a list of every matching element.
soup.select('title')

[<title>Example Domain</title>]

In [10]:
# Take the first <title> match and extract its text content.
# get_text() is the documented Beautiful Soup method; getText() is a legacy alias.
soup.select('title')[0].get_text()

'Example Domain'

In [11]:
# Collect every <p> element on the page into a list and display it.
site_paragraphs = soup.select('p')
site_paragraphs

[<p>This domain is for use in illustrative examples in documents. You may use this
     domain in literature without prior coordination or asking for permission.</p>,
 <p><a href="https://www.iana.org/domains/example">More information...</a></p>]

## Grab a list from Wikipedia

In [12]:
# Download the Wikipedia article. The timeout prevents an indefinite hang, and
# raise_for_status() fails loudly on an HTTP error status rather than parsing an error page.
res = requests.get('https://en.wikipedia.org/wiki/Charles_Babbage', timeout=10)
res.raise_for_status()

In [14]:
# Rebind `soup` to the parsed Wikipedia article (lxml parser).
soup = bs4.BeautifulSoup(markup=res.text, features='lxml')

In [15]:
# A leading dot selects by CSS class: every element with class "toctext".
soup.select('.toctext')

[<span class="toctext">Early life</span>,
 <span class="toctext">At the University of Cambridge</span>,
 <span class="toctext">After Cambridge</span>,
 <span class="toctext">Royal Astronomical Society</span>,
 <span class="toctext">British Lagrangian School</span>,
 <span class="toctext">Academic</span>,
 <span class="toctext">"Declinarians", learned societies and the BAAS</span>,
 <span class="toctext"><i>On the Economy of Machinery and Manufactures</i></span>,
 <span class="toctext">"Babbage principle"</span>,
 <span class="toctext">Publishing</span>,
 <span class="toctext">Influence</span>,
 <span class="toctext">Natural theology</span>,
 <span class="toctext">Religious views</span>,
 <span class="toctext">Later life</span>,
 <span class="toctext">Metrology programme</span>,
 <span class="toctext">Engineer and inventor</span>,
 <span class="toctext">Cryptography</span>,
 <span class="toctext">Public nuisances</span>,
 <span class="toctext">Computing pioneer</span>,
 <span class="toc

In [17]:
# Each element of the list returned by select() is a bs4 Tag object.
type(soup.select('.toctext')[0])

bs4.element.Tag

In [21]:
# Grab the first element with class "toctext" and show its text.
# (Variable renamed from the misspelled `fist_item`.)
first_item = soup.select('.toctext')[0]
first_item.text

'Early life'

In [19]:
# Print the text of every element with class "toctext", one per line.
toc_spans = soup.select('.toctext')
for item in toc_spans:
    print(item.get_text())

Early life
At the University of Cambridge
After Cambridge
Royal Astronomical Society
British Lagrangian School
Academic
"Declinarians", learned societies and the BAAS
On the Economy of Machinery and Manufactures
"Babbage principle"
Publishing
Influence
Natural theology
Religious views
Later life
Metrology programme
Engineer and inventor
Cryptography
Public nuisances
Computing pioneer
Background on mathematical tables
Difference engine
Completed models
Analytical Engine
Ada Lovelace and Italian followers
Swedish followers
Legacy
Family
Death
Autopsy report
Memorials
In fiction and film
Publications
See also
Notes
References
External links


## Grab images from Wikipedia

In [23]:
# Parse the Wikipedia response again into `soup`.
# NOTE(review): this looks like a repeat of the earlier parse of the same `res.text`;
# confirm whether it is still needed.
soup = bs4.BeautifulSoup(markup=res.text, features='lxml')

In [25]:
# First <img> element found anywhere in the document.
soup.select('img')[0]

<img alt="Page protected with pending changes" data-file-height="512" data-file-width="512" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/b/b7/Pending-protection-shackle.svg/20px-Pending-protection-shackle.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/b/b7/Pending-protection-shackle.svg/30px-Pending-protection-shackle.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/b/b7/Pending-protection-shackle.svg/40px-Pending-protection-shackle.svg.png 2x" width="20"/>

In [26]:
# Narrow the search to elements carrying the CSS class "thumbimage".
soup.select('.thumbimage')

[<img alt="" class="thumbimage" data-file-height="1455" data-file-width="1028" decoding="async" height="311" src="//upload.wikimedia.org/wikipedia/commons/thumb/3/37/Portrait_of_Charles_Babbage_%284672397%29.jpg/220px-Portrait_of_Charles_Babbage_%284672397%29.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/3/37/Portrait_of_Charles_Babbage_%284672397%29.jpg/330px-Portrait_of_Charles_Babbage_%284672397%29.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/3/37/Portrait_of_Charles_Babbage_%284672397%29.jpg/440px-Portrait_of_Charles_Babbage_%284672397%29.jpg 2x" width="220"/>,
 <img alt="" class="thumbimage" data-file-height="631" data-file-width="451" decoding="async" height="308" src="//upload.wikimedia.org/wikipedia/commons/thumb/4/44/Charles_Babbage_by_Antoine_Claudet_c1847-51-crop.jpg/220px-Charles_Babbage_by_Antoine_Claudet_c1847-51-crop.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/4/44/Charles_Babbage_by_Antoine_Claudet_c1847-51-crop.jpg/330px-Charle

In [29]:
# Keep the third "thumbimage" match (index 2) for the steps that follow.
computer = soup.select('.thumbimage')[2]
computer

<img alt="" class="thumbimage" data-file-height="725" data-file-width="549" decoding="async" height="291" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9e/Difference_engine_plate_1853.jpg/220px-Difference_engine_plate_1853.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/9e/Difference_engine_plate_1853.jpg/330px-Difference_engine_plate_1853.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/9e/Difference_engine_plate_1853.jpg/440px-Difference_engine_plate_1853.jpg 2x" width="220"/>

### Get Source of image

In [30]:
# Tag attributes are read with dict-style indexing. The value shown here starts
# with "//" (no scheme), so a scheme such as "https:" must be prepended before fetching.
computer['src']

'//upload.wikimedia.org/wikipedia/commons/thumb/9/9e/Difference_engine_plate_1853.jpg/220px-Difference_engine_plate_1853.jpg'

### Showing the image

<img src='https://upload.wikimedia.org/wikipedia/commons/thumb/9/9e/Difference_engine_plate_1853.jpg/220px-Difference_engine_plate_1853.jpg' />

### Save the image locally

In [38]:
# Download the image bytes. Despite its name, `image_link` holds the Response object.
# The timeout prevents an indefinite hang, and raise_for_status() ensures an HTTP
# error body is never mistaken for image data and written to disk later.
image_link = requests.get('https://upload.wikimedia.org/wikipedia/commons/thumb/9/9e/Difference_engine_plate_1853.jpg/220px-Difference_engine_plate_1853.jpg', timeout=10)
image_link.raise_for_status()
image_link

<Response [200]>

In [41]:
# The raw response body as bytes (the encoded image data).
image_link.content

x94\xf8WOaL\xb4\xe1=\xad\xe9/%)\xdf\xb5\xcc\x04\x92\x7f,\xe4\x0e2\xb5\xea\xa7E\xe5P\xa6[Rz]j\\\x95\xa4\xc4\n\\\xe8&B\xf0\xda\x07\t;]X\x07\x8f\x08\x088\xfe\xa3E)\x81&\xa6\xc4YR\xaf\xfb\x96\x88\xfd9\tP\x82\xabZ$G\xa2\xed8\x0c\xa8\xb5\xe9\x07ciR@$`dc*\x1a\xea8XC\x81\xe3\xc6m6\xdf\n\x04\x92 \xe8\x98\x06\xec\x84+\xd4\xa8UK.\xbb\x12z\xe7\x98\xb1\x9dv\xe8.\xa5\x99\nH\xc8\xdc\x97\x95\x8e\x15\xce\x01\xe3\\k\xa2\xf7\xa5Qa\xdf\xed\xcd\x85Q\xa51\x16\x8b>\x9f\x958\xfc\xf6\xdc\x98\xe2\xd6U\x87\x12\x14RTBO8\xc9#\xdf\xc8\xe0\xa5N\xa6Obl{\xc6\xe0\xb9jT\xd6\xd12\x1c\x19\x90Pr\xe2\xd3\xfd\x9f~\xd3\xbc\xef\xd81\x8eU\xe0y\xd1E\xf1\x0e\xf8\x8bm\xdc\xd3\xec*\x955\x95\xad\x895\x89\xf0\xa7\xa1XD\x86\x1e$\xf6p\x9d\xd8%\xaf\xe6\xf7\x1e\xd9\x18\xca\xaen\x06\x86\x92s\x04\xfe4\x08n!%*l\xfa\xb5\x91>\xdd\x8a\xc3\x96\xbc\xc4U\x99\x84vW$:\xea\x10\xd4\x86b\xb6\x958\xa6\x95\x82\n\xb6\xa5\t\x059;\x7f\xa9\xe0Z\xb7MZ\xbfp8\xaa\xfdNTf\xe7Nn\xb1-\xf1O[\x05O\xb0\x1bp\x10BF\x0e\x1b\xdb\xb3 +8\xc6y\xd3\x1b\xa6]C\xbd\xea\x92\x13R

In [42]:
# Write the downloaded bytes to disk in binary mode. The `with` block closes
# the file even if write() raises, which the manual open/close pair did not.
with open('My_computerImage.jpg', 'wb') as f:
    f.write(image_link.content)