Skip to content

Commit 53883c0

Browse files
committed
try/error pipeline for GHA + update some webscraping codebase to avoid deprecation warning
1 parent 2267580 commit 53883c0

File tree

5 files changed

+197
-86
lines changed

5 files changed

+197
-86
lines changed

_quarto.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ project:
44
- index.qmd
55
- 404.qmd
66
- content/getting-started/index.qmd
7+
- content/manipulation/04a_webscraping_TP.qmd
78
- content/modelisation/index.qmd
89
- content/visualisation/index.qmd
910
- content/visualisation/matplotlib.qmd

content/manipulation/04_webscraping/_exo2_solution.qmd

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ import pandas as pd
88
```
99

1010
```{python}
11+
#| eval: false
12+
1113
# 1. We need to use Mozilla user-agent for that site
1214
import requests
1315
@@ -20,6 +22,35 @@ page = bs4.BeautifulSoup(req.content, "lxml")
2022
```
2123

2224

25+
```{python}
26+
#| echo: false
27+
import requests
28+
import bs4
29+
import time
30+
31+
url_root = "http://pokemondb.net/pokedex/national"
32+
33+
def fetch_page(url, max_retries=3, delay=2):
    """Download ``url`` and return it parsed as a ``BeautifulSoup`` tree.

    The request is sent with a Mozilla user-agent and a 10 second timeout.
    Network and HTTP errors are retried; any other error (for instance a
    parser problem) propagates immediately, since retrying cannot fix it.

    Parameters
    ----------
    url : str
        Address of the page to download.
    max_retries : int
        Total number of attempts before giving up (must be >= 1).
    delay : float
        Seconds to wait between two attempts.

    Returns
    -------
    bs4.BeautifulSoup
        The page content parsed with the ``lxml`` parser.

    Raises
    ------
    ValueError
        If ``max_retries`` is lower than 1.
    requests.RequestException
        The error of the last attempt, once every attempt has failed.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.get(
                url,
                headers={"User-Agent": "Mozilla/5.0"},
                timeout=10,  # avoid blocking forever on an unresponsive server
            )
            resp.raise_for_status()  # turn 4xx/5xx answers into exceptions
        except requests.RequestException as e:
            print(f"Échec {attempt}/{max_retries}: {e}")
            if attempt == max_retries:
                raise  # out of attempts: let the last error propagate
            time.sleep(delay)
        else:
            return bs4.BeautifulSoup(resp.content, "lxml")
49+
50+
# Exemple d’utilisation
51+
page = fetch_page(url_root)
52+
```
53+
2354
```{python}
2455
#| output: false
2556
@@ -48,21 +79,21 @@ page_pokemon = get_page("bulbasaur")
4879
indice_tableau = 0 #premier tableau : 0
4980
print("\n tableau", indice_tableau+1, " : deux premières lignes")
5081
tableau_1 = page_pokemon.findAll('table', { 'class' : "vitals-table"})[indice_tableau]
51-
for elements in tableau_1.find('tbody').findChildren(['tr'])[0:2]: #Afficher les 2 éléments du tableau
52-
print(elements.findChild('th'))
53-
print(elements.findChild('td'))
82+
# Show the header cell and the value cell of the first two rows of the table.
# `find_all` / `find` are the supported replacements for the deprecated
# `findChildren` / `findChild` aliases.
for elements in tableau_1.find('tbody').find_all(['tr'])[0:2]:
    print(elements.find('th'))
    print(elements.find('td'))
5485
print("\n\n\n")
5586
5687
# Generalization
5788
def get_cara_pokemon(pokemon_name):
5889
page = get_page(pokemon_name)
5990
data = {}
60-
for table in page.findAll('table', { 'class' : "vitals-table"})[0:4] :
91+
for table in page.find_all('table', { 'class' : "vitals-table"})[0:4] :
6192
table_body = table.find('tbody')
62-
for rows in table_body.findChildren(['tr']) :
93+
for rows in table_body.find_children(['tr']) :
6394
if len(rows) > 1 : # attention aux tr qui ne contiennent rien
64-
column = rows.findChild('th').getText()
65-
cells = rows.findChild('td').getText()
95+
column = rows.find_child('th').getText()
96+
cells = rows.find_child('td').getText()
6697
cells = cells.replace('\t','').replace('\n',' ')
6798
data[column] = cells
6899
data['name'] = pokemon_name

content/manipulation/04_webscraping/_exo2b.qmd

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,21 @@
55
Pour récupérer les informations, le code devra être divisé en plusieurs étapes :
66

77

8-
1. Trouvez la page principale du site et la transformer en un objet intelligible pour votre code.
9-
Les fonctions suivantes vous seront utiles :
10-
- `urllib.request.Request`
11-
- `urllib.request.urlopen`
8+
1. Trouvez la page principale du site et transformez-la en un objet intelligible pour votre code. Les fonctions suivantes vous seront utiles :
9+
10+
- `requests.get`
1211
- `bs4.BeautifulSoup`
1312

14-
2. Créez une fonction qui permet de récupérer la page d'un pokémon à partir de son nom.
13+
2. À partir de ce code, créez une fonction qui permet de récupérer le contenu de la page d'un pokémon à partir de son nom. Vous pouvez nommer cette fonction `get_page`.
1514

1615
3. À partir de la page de `bulbasaur`, obtenez les 4 tableaux qui nous intéressent :
16+
1717
- on va chercher l'élément suivant : `('table', { 'class' : "vitals-table"})`
1818
- puis stocker ses éléments dans un dictionnaire
1919

2020
4. Récupérez par ailleurs la liste de noms des pokémons qui nous permettra de faire une boucle par la suite. Combien trouvez-vous de pokémons ?
2121

22-
5. Écrivez une fonction qui récupère l'ensemble des informations sur les dix premiers pokémons de la liste et les intègre dans un `DataFrame`
22+
5. Écrivez une fonction qui récupère l'ensemble des informations sur les dix premiers pokémons de la liste et les intègre dans un `DataFrame`.
2323

2424
::::
2525
:::
@@ -28,24 +28,26 @@ Pour récupérer les informations, le code devra être divisé en plusieurs éta
2828
:::: {.callout-tip}
2929
## Exercise 2b: Pokémon (guided version)
3030

31-
To retrieve the information, the code will need to be divided into several steps:
31+
To retrieve the information, the code must be divided into several steps:
3232

33+
1. Find the site's main page and transform it into an intelligible object for your code. The following functions will be useful:
3334

34-
1. Find the main page of the site and transform it into an intelligible object for your code.
35-
The following functions will be useful:
36-
- `urllib.request.Request`
37-
- `urllib.request.urlopen`
35+
- `requests.get`
3836
- `bs4.BeautifulSoup`
3937

40-
2. Create a function that retrieves a Pokémon's page based on its name.
38+
2. From this code, create a function that retrieves a pokémon's page content from its name. You can name this function `get_page`.
4139

42-
3. From the `bulbasaur` page, obtain the 4 tables we are interested in:
43-
- We will look for the following element: `('table', { 'class' : "vitals-table"})`
44-
- Then store its elements in a dictionary
40+
3. From the `bulbasaur` page, obtain the 4 tables we're interested in:
41+
- look for the following element: `('table', { 'class' : "vitals-table"})`
42+
- then store its elements in a dictionary
4543

46-
4. Additionally, retrieve the list of Pokémon names that will allow us to loop through later. How many Pokémon do you find?
44+
4. Retrieve the list of pokémon names, which will let us loop over them later. How many pokémons do you find?
4745

48-
5. Write a function that retrieves all the information on the first ten Pokémon in the list and integrates it into a `DataFrame`.
46+
5. Write a function that retrieves all the information on the first ten pokémons in the list and integrates it into a `DataFrame`.
4947

5048
::::
5149
:::
50+
51+
```{python}
52+
# Correction above
53+
```
Lines changed: 46 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,34 @@
11
```{python}
22
#| echo: true
3+
#| eval: false
34
!pip install scikit-image
45
```
56

7+
```{python}
8+
# Question 2
9+
def get_image_from_name(pokemon_name):
    """
    Download the artwork of a pokémon and save it as a JPEG file in the
    current working directory.

    The image is fetched from
    https://img.pokemondb.net/artwork/<pokemon_name>.jpg

    Parameters
    ----------
    pokemon_name : str
        Name of the pokémon as used in pokemondb.net URLs, e.g. "bulbasaur".

    Returns
    -------
    str
        Name of the file that was written, "<pokemon_name>.jpg".

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    # Use the function argument, not a global variable that happens to exist
    url_pokemon = f"https://img.pokemondb.net/artwork/{pokemon_name}.jpg"
    response = requests.get(
        url_pokemon,
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=10,  # avoid blocking forever on an unresponsive server
    )
    response.raise_for_status()  # do not save an error page as an image

    name = f'{pokemon_name}.jpg'

    # Without `stream=True`, requests has already consumed `response.raw`,
    # so copying from it would produce an empty file; write the buffered
    # body instead.
    with open(name, 'wb') as out_file:
        out_file.write(response.content)

    return name
25+
```
626

727
```{python}
8-
#| include: false
9-
#| echo: false
28+
#| output: false
29+
#| message: false
30+
#| warning: false
31+
#| label: correction-exo2b-step2
1032
1133
# Correction de l'étape 2
1234
import shutil
@@ -17,21 +39,37 @@ import skimage.io as imio
1739
1840
nb_pokemons = 5
1941
fig, ax = plt.subplots(1, nb_pokemons, figsize=(12,4))
42+
2043
for indice_pokemon in range(0,nb_pokemons) :
44+
2145
pokemon = liste_pokemon[indice_pokemon]
22-
url = f"https://img.pokemondb.net/artwork/{pokemon}.jpg"
23-
response = requests.get(url, stream=True)
24-
with open(f'{pokemon}.jpg', 'wb') as out_file:
25-
shutil.copyfileobj(response.raw, out_file)
26-
name = f'{pokemon}.jpg'
46+
name = get_image_from_name(pokemon)
47+
2748
img = imio.imread(name)
2849
ax[indice_pokemon].imshow(img)
2950
ax[indice_pokemon].get_xaxis().set_visible(False)
3051
ax[indice_pokemon].get_yaxis().set_visible(False)
3152
```
3253

54+
55+
::: {.content-visible when-profile="fr"}
56+
3357
```{python}
3458
#| echo: false
35-
#plt.savefig('pokemon.png', bbox_inches='tight')
59+
#| fig-cap: "Les premiers pokemon du Pokédex"
60+
3661
ax[0].get_figure()
3762
```
63+
64+
:::
65+
66+
::: {.content-visible when-profile="en"}
67+
68+
```{python}
69+
#| echo: false
70+
#| fig-cap: "First pokemon in Pokedex list"
71+
72+
ax[0].get_figure()
73+
```
74+
75+
:::

0 commit comments

Comments
 (0)