@@ -699,119 +699,194 @@ Try to understand step by step what is done in the following steps (retrieving a
#| echo: true
#| output: false

- import urllib
+ import requests
+ import bs4
import pandas as pd
- import bs4
-
- division=[]
- equipe=[]
- stade=[]
- latitude_stade=[]
- longitude_stade=[]
-
- url_list=["http://fr.wikipedia.org/wiki/Championnat_de_France_de_football_2019-2020", "http://fr.wikipedia.org/wiki/Championnat_de_France_de_football_de_Ligue_2_2019-2020"]
-
- for url_ligue in url_list :
-
-     print(url_ligue)
-     sock = urllib.request.urlopen(url_ligue).read()
-     page=bs4.BeautifulSoup(sock)
-
-     # Rechercher les liens des équipes dans la liste disponible sur wikipedia
-
-     for team in page.findAll('span' , {'class' : 'toponyme'}) :
-
-         # Indiquer si c'est de la ligue 1 ou de la ligue 2
-
-         if url_ligue==url_list[0] :
-             division.append("L1")
-         else :
-             division.append("L2")
-
-         # Trouver le nom et le lien de l'équipe
-
-         if team.find('a')!=None :
-             team_url=team.find('a').get('href')
-             name_team=team.find('a').get('title')
-             equipe.append(name_team)
-             url_get_info = "http://fr.wikipedia.org"+team_url
-             print(url_get_info)
-
-         # aller sur la page de l'équipe
-
-         search = urllib.request.urlopen(url_get_info).read()
-         search_team=bs4.BeautifulSoup(search)
-
-         # trouver le stade
-         compteur = 0
-         for stadium in search_team.findAll('tr'):
-             for x in stadium.findAll('th' , {'scope' : 'row'} ) :
-                 if x.contents[0].string=="Stade" and compteur == 0:
-                     compteur = 1
-                     # trouver le lien du stade et son nom
-                     url_stade=stadium.findAll('a')[1].get('href')
-                     name_stadium=stadium.findAll('a')[1].get('title')
-                     stade.append(name_stadium)
-                     url_get_stade = "http://fr.wikipedia.org"+url_stade
-                     print(url_get_stade)
-
-                     # Aller sur la page du stade et trouver ses coordonnées géographiques
-
-                     search_stade = urllib.request.urlopen(url_get_stade).read()
-                     soup_stade=bs4.BeautifulSoup(search_stade)
-                     kartographer = soup_stade.find('a',{'class': "mw-kartographer-maplink"})
-                     if kartographer == None :
-                         latitude_stade.append(None)
-                         longitude_stade.append(None)
-                     else :
-                         for coordinates in kartographer :
-                             print(coordinates)
-                             liste = coordinates.split(",")
-                             latitude_stade.append(str(liste[0]).replace(" ", "") + "'")
-                             longitude_stade.append(str(liste[1]).replace(" ", "") + "'")
-
-
- dict = {'division' : division , 'equipe': equipe, 'stade': stade, 'latitude': latitude_stade, 'longitude' : longitude_stade}
- data = pd.DataFrame(dict)
- data = data.dropna()
- ```

- ``` {python}
- #| echo: true
- data.head(5)
- ```

- ::: {.content-visible when-profile="fr"}
- On va transformer les coordonnées en degrés en coordonnées numériques
- afin d'être en mesure de faire une carte.
- :::
+ def retrieve_page(url: str) -> bs4.BeautifulSoup:
+     """
+     Retrieves and parses a webpage using BeautifulSoup.

- ::: {.content-visible when-profile="en"}
- We will convert the coordinates from degrees to numerical coordinates
- to be able to create a map.
- :::
+     Args:
+         url (str): The URL of the webpage to retrieve.

- ``` {python}
- #| echo: true
- import re
-
- def dms2dd(degrees, minutes, seconds, direction):
-     dd = float(degrees) + float(minutes)/60 + float(seconds)/(60*60);
-     if direction in ('S', 'O'):
-         dd *= -1
-     return dd
-
- def parse_dms(dms):
-     parts = re.split('[^\d\w]+', dms)
-     lat = dms2dd(parts[0], parts[1], parts[2], parts[3])
-     #lng = dms2dd(parts[4], parts[5], parts[6], parts[7])
-     return lat
+     Returns:
+         bs4.BeautifulSoup: The parsed HTML content of the page.
+     """
+     r = requests.get(url)
+     page = bs4.BeautifulSoup(r.content, 'html.parser')
+     return page
+
+
+ def extract_team_name_url(team: bs4.element.Tag) -> dict:
+     """
+     Extracts the team name and its corresponding Wikipedia URL.
+
+     Args:
+         team (bs4.element.Tag): The BeautifulSoup tag containing the team information.
+
+     Returns:
+         dict: A dictionary with the team name as the key and the Wikipedia URL as the value, or None if not found.
+     """
+     try:
+         team_url = team.find('a').get('href')
+         equipe = team.find('a').get('title')
+         url_get_info = f"http://fr.wikipedia.org{team_url}"
+         print(f"Retrieving information for {equipe}")
+         return {equipe: url_get_info}
+     except AttributeError:
+         print(f"No <a> tag for \"{team}\"")
+         return None
+
+
+ def explore_team_page(wikipedia_team_url: str) -> bs4.BeautifulSoup:
+     """
+     Retrieves and parses a team's Wikipedia page.
+
+     Args:
+         wikipedia_team_url (str): The URL of the team's Wikipedia page.
+
+     Returns:
+         bs4.BeautifulSoup: The parsed HTML content of the team's Wikipedia page.
+     """
+     r = requests.get(wikipedia_team_url)
+     page = bs4.BeautifulSoup(r.content, 'html.parser')
+     return page
+
+
+ def extract_stadium_info(search_team: bs4.BeautifulSoup) -> tuple:
+     """
+     Extracts stadium information from a team's Wikipedia page.
+
+     Args:
+         search_team (bs4.BeautifulSoup): The parsed HTML content of the team's Wikipedia page.
+
+     Returns:
+         tuple: A tuple containing the stadium name, latitude, and longitude, or (None, None, None) if not found.
+     """
+     for stadium in search_team.findAll('tr'):
+         try:
+             header = stadium.find('th', {'scope': 'row'})
+             if header and header.contents[0].string == "Stade":
+                 name_stadium, url_get_stade = extract_stadium_name_url(stadium)
+                 if name_stadium and url_get_stade:
+                     latitude, longitude = extract_stadium_coordinates(url_get_stade)
+                     return name_stadium, latitude, longitude
+         except (AttributeError, IndexError) as e:
+             print(f"Error processing stadium information: {e}")
+     return None, None, None
+
+
+ def extract_stadium_name_url(stadium: bs4.element.Tag) -> tuple:
+     """
+     Extracts the stadium name and URL from a stadium element.
+
+     Args:
+         stadium (bs4.element.Tag): The BeautifulSoup tag containing the stadium information.
+
+     Returns:
+         tuple: A tuple containing the stadium name and its Wikipedia URL, or (None, None) if not found.
+     """
+     try:
+         url_stade = stadium.findAll('a')[1].get('href')
+         name_stadium = stadium.findAll('a')[1].get('title')
+         url_get_stade = f"http://fr.wikipedia.org{url_stade}"
+         return name_stadium, url_get_stade
+     except (AttributeError, IndexError) as e:
+         print(f"Error extracting stadium name and URL: {e}")
+         return None, None
+
+
+ def extract_stadium_coordinates(url_get_stade: str) -> tuple:
+     """
+     Extracts the coordinates of a stadium from its Wikipedia page.
+
+     Args:
+         url_get_stade (str): The URL of the stadium's Wikipedia page.
+
+     Returns:
+         tuple: A tuple containing the latitude and longitude of the stadium, or (None, None) if not found.
+     """
+     try:
+         soup_stade = retrieve_page(url_get_stade)
+         kartographer = soup_stade.find('a', {'class': "mw-kartographer-maplink"})
+         if kartographer:
+             coordinates = kartographer.get('data-lat') + "," + kartographer.get('data-lon')
+             latitude, longitude = coordinates.split(",")
+             return latitude.strip(), longitude.strip()
+         else:
+             return None, None
+     except Exception as e:
+         print(f"Error extracting stadium coordinates: {e}")
+         return None, None
+
+
+ def extract_team_info(url_team_tag: bs4.element.Tag, division: str) -> dict:
+     """
+     Extracts information about a team, including its stadium and coordinates.
+
+     Args:
+         url_team_tag (bs4.element.Tag): The BeautifulSoup tag containing the team information.
+         division (str): The division label ("L1" or "L2") attached to the team.
+
+     Returns:
+         dict: A dictionary with details about the team, including its division, name, stadium, latitude, and longitude.
+     """
+     team_info = extract_team_name_url(url_team_tag)
+     if team_info is None:
+         return None
+     url_team_wikipedia = next(iter(team_info.values()))
+     name_team = next(iter(team_info.keys()))
+     search_team = explore_team_page(url_team_wikipedia)
+     name_stadium, latitude, longitude = extract_stadium_info(search_team)
+     dict_stadium_team = {
+         'division': division,
+         'equipe': name_team,
+         'stade': name_stadium,
+         'latitude': latitude,
+         'longitude': longitude
+     }
+     return dict_stadium_team
+
+
+ def retrieve_all_stadium_from_league(url_list: dict, division: str = "L1") -> pd.DataFrame:
+     """
+     Retrieves information about all stadiums in a league.
+
+     Args:
+         url_list (dict): A dictionary mapping divisions to their Wikipedia URLs.
+         division (str): The division for which to retrieve stadium information.
+
+     Returns:
+         pd.DataFrame: A DataFrame containing information about the stadiums in the specified division.
+     """
+     page = retrieve_page(url_list[division])
+     teams = page.findAll('span', {'class': 'toponyme'})
+     all_info = []
+
+     for team in teams:
+         team_info = extract_team_info(team, division)
+         if team_info is not None:
+             all_info.append(team_info)
+
+     stadium_df = pd.DataFrame(all_info)
+     return stadium_df
+
+
+ # URLs for different divisions
+ url_list = {
+     "L1": "http://fr.wikipedia.org/wiki/Championnat_de_France_de_football_2019-2020",
+     "L2": "http://fr.wikipedia.org/wiki/Championnat_de_France_de_football_de_Ligue_2_2019-2020"
+ }
+
+ # Retrieve stadium information for Ligue 1 and Ligue 2
+ stades_ligue1 = retrieve_all_stadium_from_league(url_list, "L1")
+ stades_ligue2 = retrieve_all_stadium_from_league(url_list, "L2")
+
+ stades = pd.concat(
+     [stades_ligue1, stades_ligue2]
+ )
```

``` {python}
#| echo: true
- data['latitude'] = data['latitude'].apply(parse_dms)
- data['longitude'] = data['longitude'].apply(parse_dms)
+ stades.head(5)
```

::: {.content-visible when-profile="fr"}
@@ -839,23 +914,30 @@ use `folium` for this, which is introduced in the
#| output: false

import geopandas as gpd
- from pathlib import Path
import folium

- gdf = gpd.GeoDataFrame(
-     data, geometry=gpd.points_from_xy(data.longitude, data.latitude))
-
- Path("leaflet").mkdir(parents=True, exist_ok=True)
+ stades = stades.dropna(subset = ['latitude', 'longitude'])
+ stades.loc[:, ['latitude', 'longitude']] = (
+     stades
+     .loc[:, ['latitude', 'longitude']]
+     .astype(float)
+ )
+ stadium_locations = gpd.GeoDataFrame(
+     stades, geometry = gpd.points_from_xy(stades.longitude, stades.latitude)
+ )

- center = gdf [['latitude', 'longitude']].mean().values.tolist()
- sw = gdf [['latitude', 'longitude']].min().values.tolist()
- ne = gdf [['latitude', 'longitude']].max().values.tolist()
+ center = stadium_locations [['latitude', 'longitude']].mean().values.tolist()
+ sw = stadium_locations [['latitude', 'longitude']].min().values.tolist()
+ ne = stadium_locations [['latitude', 'longitude']].max().values.tolist()

m = folium.Map(location = center, tiles='openstreetmap')

# I can add marker one by one on the map
- for i in range(0,len(gdf)):
-     folium.Marker([gdf.iloc[i]['latitude'], gdf.iloc[i]['longitude']], popup=gdf.iloc[i]['stade']).add_to(m)
+ for i in range(0,len(stadium_locations)):
+     folium.Marker(
+         [stadium_locations.iloc[i]['latitude'], stadium_locations.iloc[i]['longitude']],
+         popup=stadium_locations.iloc[i]['stade']
+     ).add_to(m)

m.fit_bounds([sw, ne])
```
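
To check one helper in isolation before running the whole scraping pipeline, a minimal sketch along these lines can be useful. It assumes the functions defined in the diff above are already in scope, that Wikipedia is reachable, and that the stadium page exposes a kartographer link with `data-lat`/`data-lon` attributes; the example URL is only an illustration, not part of the commit.

```python
# Smoke test for the refactored helper (assumption: retrieve_page and
# extract_stadium_coordinates from the diff above are already defined).
# Illustrative French Wikipedia stadium page, chosen only as an example.
stadium_url = "https://fr.wikipedia.org/wiki/Parc_des_Princes"

lat, lon = extract_stadium_coordinates(stadium_url)
print(lat, lon)  # expected: two coordinate strings, or (None, None) if no kartographer link

# The coordinates come back as strings, which is why the folium hunk
# casts the latitude/longitude columns to float before building the map.
if lat is not None and lon is not None:
    print(float(lat), float(lon))
```

The same pattern works for `retrieve_all_stadium_from_league` with a single division while iterating on the parsing logic, before concatenating both leagues as the commit does.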