In [9]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
from io import StringIO

In [3]:
# Wikipedia pages to scrape for this project: nominee and winner lists for the
# major film awards (Academy Awards, César Awards, Cannes, Venice).

# Academy Awards
url_academy_bf = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture"
url_academy_bd = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Director"
url_academy_bc = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Cinematography"

# César Awards
url_cesar_bf = "https://en.wikipedia.org/wiki/C%C3%A9sar_Award_for_Best_Film"
url_cesar_bd = "https://en.wikipedia.org/wiki/C%C3%A9sar_Award_for_Best_Director"

# Cannes Film Festival
url_palme_or = "https://en.wikipedia.org/wiki/Palme_d%27Or"
url_cannes_jury = "https://en.wikipedia.org/wiki/Jury_Prize_(Cannes_Film_Festival)"
url_cannes_gp = "https://en.wikipedia.org/wiki/Grand_Prix_(Cannes_Film_Festival)"

# Venice Film Festival
url_lion_or = "https://en.wikipedia.org/wiki/Golden_Lion"
url_lion_argent = "https://en.wikipedia.org/wiki/Silver_Lion"
url_venice_gj = "https://en.wikipedia.org/wiki/Grand_Jury_Prize_(Venice_Film_Festival)"

In [10]:
def get_tables_from_wikipedia(url, timeout=30):
    """Download a web page and parse each of its ``wikitable`` tables.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block forever on a stalled connection.

    Returns
    -------
    list of pandas.DataFrame or None
        One DataFrame per table that could be parsed, in page order. The list
        can be empty if every table failed to parse. ``None`` is returned when
        the request fails, the HTTP status is not 200, or the page contains no
        ``<table class="wikitable">`` element.
    """
    # Fetch the HTML; network-level failures are reported the same way as
    # HTTP errors instead of raising out of the function.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the HTML. Request error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve the HTML. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table', {'class': 'wikitable'})

    if not tables:
        print("No tables found on the page.")
        return None

    dataframes = []
    for i, table in enumerate(tables):
        try:
            # read_html returns a list of frames; take the first one found in
            # this table's HTML.
            df = pd.read_html(StringIO(str(table)))[0]
        except Exception as e:
            # Best effort: one malformed table must not discard the others.
            print(f"Error processing table {i + 1}: {e}")
        else:
            dataframes.append(df)
            print(f"Table {i + 1} successfully scraped.")

    return dataframes

def save_tables_to_csv(tables, output_file='merged_tables.csv', folder_path='work/movies/wiki_scrap'):
    """Concatenate the given tables row-wise and write them to one CSV file.

    Parameters
    ----------
    tables : list of pandas.DataFrame or None
        Frames to stack on top of each other (``axis=0``, index reset).
        ``None`` or an empty list means there is nothing to save; a message is
        printed and no file or folder is created.
    output_file : str
        Name of the CSV file to create inside ``folder_path``.
    folder_path : str
        Destination folder, created if it does not exist. An empty string
        writes to the current working directory.

    Returns
    -------
    None
    """
    # pd.concat raises on None and on an empty list, so bail out early.
    if not tables:
        print(f"No tables to save for {output_file}.")
        return

    # exist_ok avoids the check-then-create race; os.makedirs('') would fail,
    # so an empty folder_path is simply skipped.
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)

    # Full path of the CSV file: destination folder + file name.
    file_path = os.path.join(folder_path, output_file)

    # Stack all tables into a single frame with a fresh 0..n-1 index.
    merged_df = pd.concat(tables, axis=0, ignore_index=True)

    merged_df.to_csv(file_path, index=False)
    print(f'Merged tables saved to {file_path}')


In [12]:
# Build one CSV per award: each key is the output file stem, each value the
# Wikipedia page whose tables are scraped.
dict_awards = {
    "academy_bf": url_academy_bf,
    "academy_bd": url_academy_bd,
    "academy_bc": url_academy_bc,
    "cesar_bf": url_cesar_bf,
    "cesar_bd": url_cesar_bd,
    "palme_or": url_palme_or,
    "cannes_jury": url_cannes_jury,
    "cannes_gp": url_cannes_gp,
    "lion_or": url_lion_or,
    "lion_argent": url_lion_argent,
    "venice_gj": url_venice_gj,
}
print(dict_awards)

# Scrape every page in turn. Awards that yield nothing are remembered so one
# bad page does not abort the run and the final message stays truthful.
failed_awards = []
for award_name, award_link in dict_awards.items():
    print(f"Processing {award_name}...")

    # Fetch and parse the wikitable tables of this award's page.
    tables = get_tables_from_wikipedia(award_link)

    # get_tables_from_wikipedia returns None on failure and may return an
    # empty list; in both cases there is nothing to write for this award.
    if not tables:
        print(f"Skipping {award_name}: no tables to save.")
        failed_awards.append(award_name)
        continue

    # Save the merged tables to a CSV file named after the award.
    awd_n = f"{award_name}.csv"
    save_tables_to_csv(tables, awd_n, folder_path='wiki_scrap')

if failed_awards:
    print(f"Tables could not be extracted for: {', '.join(failed_awards)}")
else:
    print("Toutes les tables ont été extraites et sauvegardées en fichiers CSV.")





{'academy_bf': 'https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture', 'academy_bd': 'https://en.wikipedia.org/wiki/Academy_Award_for_Best_Director', 'academy_bc': 'https://en.wikipedia.org/wiki/Academy_Award_for_Best_Cinematography', 'cesar_bf': 'https://en.wikipedia.org/wiki/C%C3%A9sar_Award_for_Best_Film', 'cesar_bd': 'https://en.wikipedia.org/wiki/C%C3%A9sar_Award_for_Best_Director', 'palme_or': 'https://en.wikipedia.org/wiki/Palme_d%27Or', 'cannes_jury': 'https://en.wikipedia.org/wiki/Jury_Prize_(Cannes_Film_Festival)', 'cannes_gp': 'https://en.wikipedia.org/wiki/Grand_Prix_(Cannes_Film_Festival)', 'lion_or': 'https://en.wikipedia.org/wiki/Golden_Lion', 'lion_argent': 'https://en.wikipedia.org/wiki/Silver_Lion', 'venice_gj': 'https://en.wikipedia.org/wiki/Grand_Jury_Prize_(Venice_Film_Festival)'}
Processing academy_bf...
Table 1 successfully scraped.
Table 2 successfully scraped.
Table 3 successfully scraped.
Table 4 successfully scraped.
Table 5 successfully scraped.
Tabl