# Web scraping basics

## Introduction

In [1]:
import requests

In [3]:
# Download the first example page. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops here on an
# HTTP error instead of silently handing an error page to later cells.
response = requests.get(
    "https://raw.githubusercontent.com/codelikerod/web-scraping/master/exemple1.html",
    timeout=10,
)
response.raise_for_status()

# .content is the raw response body as bytes (hence the b'...' when printed)
content = response.content
print(content)

b'<html>\r\n  <head>\r\n      <title> Un exemple de page HTML </title>\r\n  </head>\r\n\r\n  <body>\r\n      <p>Un simple paragraphe</p>\r\n  </body>\r\n</html>'


## GET page element

In [4]:
# The BeautifulSoup class is provided by the bs4 package

In [5]:
from bs4 import BeautifulSoup

In [11]:
# Parse the bytes downloaded earlier with Python's built-in HTML parser
parser = BeautifulSoup(content, "html.parser")

# Tags are reachable as attributes: first the two top-level sections...
head, body = parser.head, parser.body

# ...then the first <title> inside head and the first <p> inside body
title, p = head.title, body.p

# .text gives the text contained in a tag; print the paragraph, then the title
print(p.text, title.text, sep="\n")

Un simple paragraphe
 Un exemple de page HTML 


## Use the find_all method

In [15]:
parser = BeautifulSoup(content, "html.parser")

# find_all returns a list of every matching tag, even when there is only one
body = parser.find_all("body")
head = parser.find_all("head")
print(body, head, sep="\n")

# Search again inside the first (index 0) match of each section
p = body[0].find_all("p")
title = head[0].find_all("title")
print(p[0].text, title[0].text, sep="\n")

[<body>
<p>Un simple paragraphe</p>
</body>]
[<head>
<title> Un exemple de page HTML </title>
</head>]
Un simple paragraphe
 Un exemple de page HTML 


## Elements related to IDs

In [23]:
# Download the page whose paragraphs carry id attributes.
# timeout avoids hanging on a stalled connection; raise_for_status() fails
# fast on an HTTP error rather than parsing an error page.
response = requests.get(
    "https://raw.githubusercontent.com/codelikerod/web-scraping/master/exemple2.html",
    timeout=10,
)
response.raise_for_status()
content = response.content
print(content)
parser = BeautifulSoup(content, "html.parser")

# find_all can filter on the id attribute; it still returns a list, so take
# the first match with [0].
first_paragraph = parser.find_all("p", id="first")[0]  # the <p id="first"> tag
print(first_paragraph.text)
second_paragraph = parser.find_all("p", id="second")[0]  # the <p id="second"> tag
print(second_paragraph.text)

b'<html>\r\n  <head>\r\n      <title> Un exemple de page HTML </title>\r\n  </head>\r\n\r\n  <body>\r\n    <div>\r\n      <p id="first">1er paragraphe</p>\r\n    </div>\r\n      <p id="second">2nd paragraphe</p>\r\n  </body>\r\n</html>'
1er paragraphe
2nd paragraphe


## Elements related to class

In [29]:
# Download the page whose paragraphs carry class attributes.
# timeout avoids hanging on a stalled connection; raise_for_status() fails
# fast on an HTTP error rather than parsing an error page.
response = requests.get(
    "https://raw.githubusercontent.com/codelikerod/web-scraping/master/exemple3.html",
    timeout=10,
)
response.raise_for_status()
content = response.content
print(content)
parser = BeautifulSoup(content, "html.parser")

# The keyword is class_ (trailing underscore) because class is reserved in
# Python. Each class is queried once and the resulting list is indexed,
# instead of repeating the same search for every paragraph.
class1_paragraphs = parser.find_all("p", class_="class1")
class2_paragraphs = parser.find_all("p", class_="class2")

first_class1_paragraph = class1_paragraphs[0]  # first <p class="class1">
print(first_class1_paragraph.text)
second_class1_paragraph = class1_paragraphs[1]  # second <p class="class1">
print(second_class1_paragraph.text)
first_class2_paragraph = class2_paragraphs[0]  # first <p class="class2">
print(first_class2_paragraph.text)

b'<html>\r\n  <head>\r\n      <title> Un exemple de page HTML </title>\r\n  </head>\r\n\r\n  <body>\r\n    <div>\r\n      <p class="class1">1er paragraphe classe 1</p>\r\n      <p class="class1">2nd paragraphe class 1</p>\r\n    </div>\r\n      <p class="class2">1er paragraphe class 2</p>\r\n      <p class="class2">2nd paragraphe class 2</p>\r\n  </body>\r\n</html>'
1er paragraphe classe 1
2nd paragraphe class 1
1er paragraphe class 2


## Elements selected with CSS

In [30]:
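# #first { color: red }   -> CSS rule targeting the element whose id is "first"
# .class1 { color: red }  -> CSS rule targeting every element with class "class1"
# select() accepts the same selector syntax (the part before the braces)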

In [39]:
# Download the page used for the CSS selector examples.
# timeout avoids hanging on a stalled connection; raise_for_status() fails
# fast on an HTTP error rather than parsing an error page.
response = requests.get(
    "https://raw.githubusercontent.com/codelikerod/web-scraping/master/exemple4.html",
    timeout=10,
)
response.raise_for_status()
content = response.content
parser = BeautifulSoup(content, "html.parser")

# select() takes a CSS selector and returns a list of matches;
# ".first-item" matches every element with the class "first-item".
first_items = parser.select(".first-item")
print(first_items[0].text)

1er paragraphe classe 1
      


In [36]:
# Text of the first element matched by the class selector ".class2"
class2_matches = parser.select(".class2")
first_class2_text = class2_matches[0].text
print(first_class2_text)

1er paragraphe class 2
      


In [37]:
# Text of the first element matched by the id selector "#second"
second_matches = parser.select("#second")
second_text = second_matches[0].text
print(second_text)

1er paragraphe class 2
      


## Associate CSS selectors

In [40]:
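# div p              -> every <p> nested inside a <div>
# div .first-item    -> elements with class "first-item" nested inside a <div>
# body div #first    -> the element with id "first" inside a <div> inside <body>
# .first-item #first -> the element with id "first" inside an element with class "first-item"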

In [46]:
# Download the match statistics page.
# timeout avoids hanging on a stalled connection; raise_for_status() fails
# fast on an HTTP error rather than parsing an error page.
response = requests.get(
    "https://raw.githubusercontent.com/codelikerod/web-scraping/master/psg-vs-chelsea.html",
    timeout=10,
)
response.raise_for_status()
content = response.content
parser = BeautifulSoup(content, "html.parser")
print(parser)

# Each statistic is a table row identified by its id. Within a row, cell 0 is
# the label, cell 1 the Chelsea value and cell 2 the Paris value.

# Number of fouls for Chelsea (second cell of the "fautes" row)
offences = parser.select("#fautes")[0]
chelsea_offences = offences.select("td")[1]
print(chelsea_offences.text)

# Number of passes for Paris (third cell of the "passes" row)
psg_pass_count = parser.select("#passes")[0].select("td")[2].text
print(psg_pass_count)

<html>
<head lang="en">
<meta charset="utf-8"/>
<title>PSG - Chelsea</title>
</head>
<body>
<table class="stats_table nav_table" id="team_stats">
<tbody>
<tr id="teams">
<th></th>
<th>Chelsea</th>
<th>Paris</th>
</tr>
<tr id="goals">
<td>Buts</td>
<td>2</td>
<td>2</td>
</tr>
<tr id="possession">
<td>Possession</td>
<td>51%</td>
<td>49%</td>
</tr>
<tr id="tirs">
<td>Nombre de tirs</td>
<td>14</td>
<td>12</td>
</tr>
<tr id="corners">
<td>Nombre de corners</td>
<td>7</td>
<td>11</td>
</tr>
<tr id="fautes">
<td>Fautes</td>
<td>24</td>
<td>17</td>
</tr>
<tr id="passes">
<td>Nombre de passes</td>
<td>537</td>
<td>545</td>
</tr>
</tbody>
</table>
</body>
</html>
24
545
