Commit f7d7c83

solve problem with webscraping chapter
1 parent 0908656 commit f7d7c83

File tree

_quarto.yml
content/manipulation/04a_webscraping_TP.qmd

2 files changed: +197 -115 lines changed

_quarto.yml

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ project:
 - index.qmd
 - content/getting-started/index.qmd
 - content/manipulation/index.qmd
-- content/modelisation/06_pipeline.qmd
+- content/modelisation/04a_webscraping.qmd
 - content/visualisation/index.qmd
 - content/modelisation/index.qmd
 - content/NLP/index.qmd

content/manipulation/04a_webscraping_TP.qmd

Lines changed: 196 additions & 114 deletions
@@ -699,119 +699,194 @@ Try to understand step by step what is done in the following steps (retrieving a
 #| echo: true
 #| output: false
 
-import urllib
+import requests
+import bs4
 import pandas as pd
-import bs4
-
-division=[]
-equipe=[]
-stade=[]
-latitude_stade=[]
-longitude_stade=[]
-
-url_list=["http://fr.wikipedia.org/wiki/Championnat_de_France_de_football_2019-2020", "http://fr.wikipedia.org/wiki/Championnat_de_France_de_football_de_Ligue_2_2019-2020"]
-
-for url_ligue in url_list :
-
-    print(url_ligue)
-    sock = urllib.request.urlopen(url_ligue).read()
-    page=bs4.BeautifulSoup(sock)
-
-    # Rechercher les liens des équipes dans la liste disponible sur wikipedia
-
-    for team in page.findAll('span' , {'class' : 'toponyme'}) :
-
-        # Indiquer si c'est de la ligue 1 ou de la ligue 2
-
-        if url_ligue==url_list[0] :
-            division.append("L1")
-        else :
-            division.append("L2")
-
-        # Trouver le nom et le lien de l'équipe
-
-        if team.find('a')!=None :
-            team_url=team.find('a').get('href')
-            name_team=team.find('a').get('title')
-            equipe.append(name_team)
-            url_get_info = "http://fr.wikipedia.org"+team_url
-            print(url_get_info)
-
-            # aller sur la page de l'équipe
-
-            search = urllib.request.urlopen(url_get_info).read()
-            search_team=bs4.BeautifulSoup(search)
-
-            # trouver le stade
-            compteur = 0
-            for stadium in search_team.findAll('tr'):
-                for x in stadium.findAll('th' , {'scope' : 'row'} ) :
-                    if x.contents[0].string=="Stade" and compteur == 0:
-                        compteur = 1
-                        # trouver le lien du stade et son nom
-                        url_stade=stadium.findAll('a')[1].get('href')
-                        name_stadium=stadium.findAll('a')[1].get('title')
-                        stade.append(name_stadium)
-                        url_get_stade = "http://fr.wikipedia.org"+url_stade
-                        print(url_get_stade)
-
-                        # Aller sur la page du stade et trouver ses coodronnées géographiques
-
-                        search_stade = urllib.request.urlopen(url_get_stade).read()
-                        soup_stade=bs4.BeautifulSoup(search_stade)
-                        kartographer = soup_stade.find('a',{'class': "mw-kartographer-maplink"})
-                        if kartographer == None :
-                            latitude_stade.append(None)
-                            longitude_stade.append(None)
-                        else :
-                            for coordinates in kartographer :
-                                print(coordinates)
-                                liste = coordinates.split(",")
-                                latitude_stade.append(str(liste[0]).replace(" ", "") + "'")
-                                longitude_stade.append(str(liste[1]).replace(" ", "") + "'")
-
-
-dict = {'division' : division , 'equipe': equipe, 'stade': stade, 'latitude': latitude_stade, 'longitude' : longitude_stade}
-data = pd.DataFrame(dict)
-data = data.dropna()
-```
 
-```{python}
-#| echo: true
-data.head(5)
-```
 
-::: {.content-visible when-profile="fr"}
-On va transformer les coordonnées en degrés en coordonnées numériques
-afin d'être en mesure de faire une carte.
-:::
+def retrieve_page(url: str) -> bs4.BeautifulSoup:
+    """
+    Retrieves and parses a webpage using BeautifulSoup.
 
-::: {.content-visible when-profile="en"}
-We will convert the coordinates from degrees to numerical coordinates
-to be able to create a map.
-:::
+    Args:
+        url (str): The URL of the webpage to retrieve.
 
-```{python}
-#| echo: true
-import re
-
-def dms2dd(degrees, minutes, seconds, direction):
-    dd = float(degrees) + float(minutes)/60 + float(seconds)/(60*60);
-    if direction in ('S', 'O'):
-        dd *= -1
-    return dd
-
-def parse_dms(dms):
-    parts = re.split('[^\d\w]+', dms)
-    lat = dms2dd(parts[0], parts[1], parts[2], parts[3])
-    #lng = dms2dd(parts[4], parts[5], parts[6], parts[7])
-    return lat
+    Returns:
+        bs4.BeautifulSoup: The parsed HTML content of the page.
+    """
+    r = requests.get(url)
+    page = bs4.BeautifulSoup(r.content, 'html.parser')
+    return page
+
+
+def extract_team_name_url(team: bs4.element.Tag) -> dict:
+    """
+    Extracts the team name and its corresponding Wikipedia URL.
+
+    Args:
+        team (bs4.element.Tag): The BeautifulSoup tag containing the team information.
+
+    Returns:
+        dict: A dictionary with the team name as the key and the Wikipedia URL as the value, or None if not found.
+    """
+    try:
+        team_url = team.find('a').get('href')
+        equipe = team.find('a').get('title')
+        url_get_info = f"http://fr.wikipedia.org{team_url}"
+        print(f"Retrieving information for {equipe}")
+        return {equipe: url_get_info}
+    except AttributeError:
+        print(f"No <a> tag for \"{team}\"")
+        return None
+
+
+def explore_team_page(wikipedia_team_url: str) -> bs4.BeautifulSoup:
+    """
+    Retrieves and parses a team's Wikipedia page.
+
+    Args:
+        wikipedia_team_url (str): The URL of the team's Wikipedia page.
+
+    Returns:
+        bs4.BeautifulSoup: The parsed HTML content of the team's Wikipedia page.
+    """
+    r = requests.get(wikipedia_team_url)
+    page = bs4.BeautifulSoup(r.content, 'html.parser')
+    return page
+
+
+def extract_stadium_info(search_team: bs4.BeautifulSoup) -> tuple:
+    """
+    Extracts stadium information from a team's Wikipedia page.
+
+    Args:
+        search_team (bs4.BeautifulSoup): The parsed HTML content of the team's Wikipedia page.
+
+    Returns:
+        tuple: A tuple containing the stadium name, latitude, and longitude, or (None, None, None) if not found.
+    """
+    for stadium in search_team.findAll('tr'):
+        try:
+            header = stadium.find('th', {'scope': 'row'})
+            if header and header.contents[0].string == "Stade":
+                name_stadium, url_get_stade = extract_stadium_name_url(stadium)
+                if name_stadium and url_get_stade:
+                    latitude, longitude = extract_stadium_coordinates(url_get_stade)
+                    return name_stadium, latitude, longitude
+        except (AttributeError, IndexError) as e:
+            print(f"Error processing stadium information: {e}")
+    return None, None, None
+
+
+def extract_stadium_name_url(stadium: bs4.element.Tag) -> tuple:
+    """
+    Extracts the stadium name and URL from a stadium element.
+
+    Args:
+        stadium (bs4.element.Tag): The BeautifulSoup tag containing the stadium information.
+
+    Returns:
+        tuple: A tuple containing the stadium name and its Wikipedia URL, or (None, None) if not found.
+    """
+    try:
+        url_stade = stadium.findAll('a')[1].get('href')
+        name_stadium = stadium.findAll('a')[1].get('title')
+        url_get_stade = f"http://fr.wikipedia.org{url_stade}"
+        return name_stadium, url_get_stade
+    except (AttributeError, IndexError) as e:
+        print(f"Error extracting stadium name and URL: {e}")
+        return None, None
+
+
+def extract_stadium_coordinates(url_get_stade: str) -> tuple:
+    """
+    Extracts the coordinates of a stadium from its Wikipedia page.
+
+    Args:
+        url_get_stade (str): The URL of the stadium's Wikipedia page.
+
+    Returns:
+        tuple: A tuple containing the latitude and longitude of the stadium, or (None, None) if not found.
+    """
+    try:
+        soup_stade = retrieve_page(url_get_stade)
+        kartographer = soup_stade.find('a', {'class': "mw-kartographer-maplink"})
+        if kartographer:
+            coordinates = kartographer.get('data-lat') + "," + kartographer.get('data-lon')
+            latitude, longitude = coordinates.split(",")
+            return latitude.strip(), longitude.strip()
+        else:
+            return None, None
+    except Exception as e:
+        print(f"Error extracting stadium coordinates: {e}")
+        return None, None
+
+
+def extract_team_info(url_team_tag: bs4.element.Tag, division: str) -> dict:
+    """
+    Extracts information about a team, including its stadium and coordinates.
+
+    Args:
+        url_team_tag (bs4.element.Tag): The BeautifulSoup tag containing the team information.
+
+    Returns:
+        dict: A dictionary with details about the team, including its division, name, stadium, latitude, and longitude.
+    """
+    team_info = extract_team_name_url(url_team_tag)
+    url_team_wikipedia = next(iter(team_info.values()))
+    name_team = next(iter(team_info.keys()))
+    search_team = explore_team_page(url_team_wikipedia)
+    name_stadium, latitude, longitude = extract_stadium_info(search_team)
+    dict_stadium_team = {
+        'division': division,
+        'equipe': name_team,
+        'stade': name_stadium,
+        'latitude': latitude,
+        'longitude': longitude
+    }
+    return dict_stadium_team
+
+
+def retrieve_all_stadium_from_league(url_list: dict, division: str = "L1") -> pd.DataFrame:
+    """
+    Retrieves information about all stadiums in a league.
+
+    Args:
+        url_list (dict): A dictionary mapping divisions to their Wikipedia URLs.
+        division (str): The division for which to retrieve stadium information.
+
+    Returns:
+        pd.DataFrame: A DataFrame containing information about the stadiums in the specified division.
+    """
+    page = retrieve_page(url_list[division])
+    teams = page.findAll('span', {'class': 'toponyme'})
+    all_info = []
+
+    for team in teams:
+        all_info.append(extract_team_info(team, division))
+
+    stadium_df = pd.DataFrame(all_info)
+    return stadium_df
+
+
+# URLs for different divisions
+url_list = {
+    "L1": "http://fr.wikipedia.org/wiki/Championnat_de_France_de_football_2019-2020",
+    "L2": "http://fr.wikipedia.org/wiki/Championnat_de_France_de_football_de_Ligue_2_2019-2020"
+}
+
+# Retrieve stadium information for Ligue 1 and Ligue 2
+stades_ligue1 = retrieve_all_stadium_from_league(url_list, "L1")
+stades_ligue2 = retrieve_all_stadium_from_league(url_list, "L2")
+
+stades = pd.concat(
+    [stades_ligue1, stades_ligue2]
+)
 ```
 
 ```{python}
 #| echo: true
-data['latitude'] = data['latitude'].apply(parse_dms)
-data['longitude'] = data['longitude'].apply(parse_dms)
+stades.head(5)
 ```
 
 ::: {.content-visible when-profile="fr"}
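A quick, hypothetical sanity check of the refactored helpers (not part of this commit): fetch a single stadium page and read the `data-lat` / `data-lon` attributes that `extract_stadium_coordinates` relies on. The URL below is only an illustrative example, and the attribute names are assumed from the committed code above.

```python
# Illustrative spot-check, not part of the commit: inspect the Kartographer
# map link on one stadium article. The URL is an example; the data-lat /
# data-lon attributes are assumed, as in extract_stadium_coordinates above.
import requests
import bs4

url = "http://fr.wikipedia.org/wiki/Parc_des_Princes"
soup = bs4.BeautifulSoup(requests.get(url).content, "html.parser")
maplink = soup.find("a", {"class": "mw-kartographer-maplink"})
if maplink is not None:
    print(maplink.get("data-lat"), maplink.get("data-lon"))
else:
    print("No Kartographer map link found on this page")
```

When the map link is missing, `extract_stadium_coordinates` returns `(None, None)`, which is why the folium chunk further down drops rows with missing coordinates before plotting.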
@@ -839,23 +914,30 @@ use `folium` for this, which is introduced in the
 #| output: false
 
 import geopandas as gpd
-from pathlib import Path
 import folium
 
-gdf = gpd.GeoDataFrame(
-    data, geometry=gpd.points_from_xy(data.longitude, data.latitude))
-
-Path("leaflet").mkdir(parents=True, exist_ok=True)
+stades = stades.dropna(subset = ['latitude', 'longitude'])
+stades.loc[:, ['latitude', 'longitude']] = (
+    stades
+    .loc[:, ['latitude', 'longitude']]
+    .astype(float)
+)
+stadium_locations = gpd.GeoDataFrame(
+    stades, geometry = gpd.points_from_xy(stades.longitude, stades.latitude)
+)
 
-center = gdf[['latitude', 'longitude']].mean().values.tolist()
-sw = gdf[['latitude', 'longitude']].min().values.tolist()
-ne = gdf[['latitude', 'longitude']].max().values.tolist()
+center = stadium_locations[['latitude', 'longitude']].mean().values.tolist()
+sw = stadium_locations[['latitude', 'longitude']].min().values.tolist()
+ne = stadium_locations[['latitude', 'longitude']].max().values.tolist()
 
 m = folium.Map(location = center, tiles='openstreetmap')
 
 # I can add marker one by one on the map
-for i in range(0,len(gdf)):
-    folium.Marker([gdf.iloc[i]['latitude'], gdf.iloc[i]['longitude']], popup=gdf.iloc[i]['stade']).add_to(m)
+for i in range(0,len(stadium_locations)):
+    folium.Marker(
+        [stadium_locations.iloc[i]['latitude'], stadium_locations.iloc[i]['longitude']],
+        popup=stadium_locations.iloc[i]['stade']
+    ).add_to(m)
 
 m.fit_bounds([sw, ne])
 ```
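A natural follow-up to the chunk above (a sketch, not part of the commit) is to write the folium map to a standalone HTML file so it can be opened in a browser; the file name is arbitrary.

```python
# Hypothetical follow-up, not part of the commit: persist the map built above.
# Assumes `m` is the folium.Map from the previous chunk.
m.save("stades_ligue1_ligue2.html")
```

With the Jupyter engine, evaluating `m` as the last expression of a cell typically renders the interactive map inline in the rendered page.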
