## Load necessary libraries

In [25]:
import requests  # pip install requests
from bs4 import BeautifulSoup as bs  # pip install beautifulsoup4

In [26]:
### only html -->  https://keithgalli.github.io/web-scraping/example.html
### html css --->  https://keithgalli.github.io/web-scraping/webpage.html

In [27]:
# Load webpage content
r = requests.get("https://keithgalli.github.io/web-scraping/example.html")

# Convert to a beautiful soup object
soup = bs(r.content)

# Print out our HTML
#print(soup)
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



## Start using Beautiful soup to Scrap

### find and find_all

In [28]:
# find() --> going to find first h2 element
first_header = soup.find("h2")
print(first_header)

# find_all() --> going to find all h2 element and return in list format
headers = soup.find_all("h2")
print(headers)

<h2>A Header</h2>
[<h2>A Header</h2>, <h2>Another header</h2>]


In [29]:
# Find multiple tag --> passing tag in list
first_header = soup.find(["h2","h1"])
print(first_header)

headers = soup.find_all(["h2","h1"])
print(headers)

<h1>HTML Webpage</h1>
[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]


In [30]:
# You can pass attribute to find/find_all function
paragraph = soup.find_all("p")
print(paragraph)

paragraph = soup.find_all("p", attrs={"id":"paragraph-id"})
print(paragraph)

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>, <p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<p id="paragraph-id"><b>Some bold text</b></p>]


In [31]:
# You can nest find/find_all calls
body = soup.find('body')
print(body)
print()
div = body.find('div')
print(div)
print()
h1 = div.find('h1')
print(h1)

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

<h1>HTML Webpage</h1>


In [32]:
### We can search specific strings in our find/find_all calls
import re

paragraphs = soup.find_all('p', string=re.compile("Some"))
print(paragraphs)

headers = soup.find_all('h2', string=re.compile("(H|h)eader"))
print(headers)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<h2>A Header</h2>, <h2>Another header</h2>]


### Select (Css selector)

In [33]:
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [34]:
# Select all the paragraph
content = soup.select('p')
print(content)

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>, <p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [35]:
# Select paragraph inside the div
content = soup.select('div p')
print(content)

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]


In [36]:
# Para tag after h2
paragraphs = soup.select('h2 ~ p')
print(paragraphs)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [37]:
# bold tag inside id - paragraph-id
bold_text = soup.select('p#paragraph-id b')
print(bold_text)

[<b>Some bold text</b>]


In [38]:
paragraphs = soup.select("body > p")
print(paragraphs)

for paragraph in paragraphs:
    print(paragraph.select("i"))

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


In [39]:
# Grab by element with specific property
soup.select('[align=middle]')

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

### Get different properties of the HTML

In [40]:
# We want value without tag using string and get_text
header = soup.find('h2')
print(header)
print(header.string)
print()

# Multiple element inside the div --> get_text()
div = soup.find('div')
print(div.prettify())
print()
print(div.get_text())

<h2>A Header</h2>
A Header

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>



HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



### Get a specific property from an element

In [41]:
link = soup.find('a')
print(link)
print()
print(link['href'])

<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>

https://keithgalli.github.io/web-scraping/webpage.html


In [42]:
paragraphs = soup.select('p#paragraph-id')
print(paragraphs)
print()
print(paragraphs[0]['id'])

[<p id="paragraph-id"><b>Some bold text</b></p>]

paragraph-id


### Code navigation

In [43]:
# Path Syntax
soup.body.div.h1.string

'HTML Webpage'

In [44]:
# Know the term: Parent, Sibling, Child
soup.body.find('div').find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

## Task

In [45]:
# Load webpage content
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html")

# Convert to a beautiful soup object
soup = bs(r.content)

# Print out our HTML
#print(soup)
print(soup.prettify())

<head>
 <title>
  Keith Galli's Page
 </title>
 <style>
  table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
 </style>
</head>
<body>
 <h1>
  Welcome to my page!
 </h1>
 <img src="./images/selfie1.jpg" width="300px"/>
 <h2>
  About me
 </h2>
 <p>
  Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
 </p>
 <p>
  Here is a link to my channel:
  <a href="https://www.youtube.com/kgmit">
   youtube.com/kgmit
  </

### Grab all of the social links from the webpage

In [46]:
links = soup.select(".socials a")
print(links)
actual_links = [link['href'] for link in links]
actual_links

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>, <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>, <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>, <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]


['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

### Scrap table from the webpage

In [69]:
# Extract Heading of the table
import pandas as pd
table = soup.select(".hockey-stats")[0]
columns = table.find('thead').find_all('th')
columns = [column.get_text() for column in columns]
columns

['S',
 'Team',
 'League',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-',
 '\xa0',
 'POST',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-']

In [70]:
# Extract value of the table
rows = []
table_rows = table.find('tbody').find_all('tr')
for row in table_rows:
    td = row.find_all('td')
    td = [str(tr.get_text()).strip() for tr in td]
    rows.append(td)
print(rows[0])

['2014-15', 'MIT (Mass. Inst. of Tech.)', 'ACHA II', '17', '3', '9', '12', '20', '', '|', '', '', '', '', '', '', '']


In [74]:
# Merge heading and corresponding value
df = pd.DataFrame(rows,columns=columns)
df.head(5)

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [75]:
# Adding some condition and display value
df.loc[df['Team'] != "Did not play"]

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17,3,9,12,20,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9,1,1,2,2,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12,5,5,10,8,0.0,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8,5,10,15,8,,|,,,,,,,


### Grab all fun facts that use word "is"

In [108]:
import re
fun_facts = soup.select('.fun-facts li')
fun_facts_with = [fact.find_all(string=re.compile("is")) for fact in fun_facts]
fun_facts_with = [fact for fact in fun_facts_with if fact]
fun_facts_with

[['Middle name is Ronald'],
 ['Dunkin Donuts coffee is better than Starbucks'],
 ['A favorite book series of mine is '],
 ['Current video game of choice is '],
 ["The band that I've seen the most times live is the "]]

### Download image

In [116]:
url = "https://keithgalli.github.io/web-scraping/"
images = soup.find_all('img')
images = [url+image['src'] for image in images]
images

['https://keithgalli.github.io/web-scraping/./images/selfie1.jpg',
 'https://keithgalli.github.io/web-scraping/images/italy/lake_como.jpg',
 'https://keithgalli.github.io/web-scraping/images/italy/pontevecchio.jpg',
 'https://keithgalli.github.io/web-scraping/images/italy/riomaggiore.jpg',
 'https://keithgalli.github.io/web-scraping/images/flag.png',
 'https://keithgalli.github.io/web-scraping/images/flag.png',
 'https://keithgalli.github.io/web-scraping/images/flag.png',
 'https://keithgalli.github.io/web-scraping/images/flag.png']

In [118]:
# Using python Download an image
'''
import requests

img_data = requests.get(image_url).content
with open("img_name.jpg",'wb') as handler:
    handler.write(img_data)

'''


'\nimport requests\n\nimg_data = requests.get(image_url).content\nwith open("img_name.jpg",\'wb\') as handler:\n    handler.write(img_data)\n\n'

In [134]:
img_data = requests.get(images[0]).content
with open("Selfie1.jpg",'wb') as handler:
    handler.write(img_data)

### Solve the mystery challenge!

### search anything from many url

In [136]:
files = soup.select('div.block a')
relative_files = [f['href'] for f in files]

url = "https://keithgalli.github.io/web-scraping/"
for f in relative_files:
    full_url = url+f
    page = requests.get(full_url)
    bs_page = bs(page.content)
    secret_word_element = bs_page.find('p', attrs={"id":"secret-word"})
    secret_word = secret_word_element.string
    print(secret_word)

Make
sure
to
smash
that
like
button
and
subscribe
!!!


In [154]:
import requests  # pip install requests
from bs4 import BeautifulSoup as bs  # pip install beautifulsoup4
# Load webpage content
r = requests.get("https://meesho.com/search?q=saree&searchType=manual&searchIdentifier=text_search")

# Convert to a beautiful soup object
soup = bs(r.content)

# Print out our HTML
#print(soup)
#print(soup.prettify())
images = soup.find('body').find_all('img')
images = [image['src'] for image in images]
images

['https://images.meesho.com/images/products/51843421/rtuxu_512.jpg',
 'https://images.meesho.com/images/products/18609123/238c0_512.jpg',
 'data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7',
 'https://images.meesho.com/images/products/43304306/cwi1l_512.jpg',
 'data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7',
 'https://images.meesho.com/images/products/37884928/njbnx_512.jpg',
 'data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7',
 'https://images.meesho.com/images/products/48494528/azzdg_512.jpg',
 'data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7',
 'https://images.meesho.com/images/products/16655856/a531b_512.jpg',
 'data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7',
 'https://images.meesho.com/images/products/11606570/26175_512.jpg',
 'data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7',
 'https://images.me