In [2]:
from urllib.request import urlopen


### Extracting the url page content as string 

In [3]:
url = "https://fr.wikipedia.org/wiki/Liste_des_r%C3%A9cipiendaires_du_prix_Nobel"
page = urlopen(url)
html = page.read().decode('utf-8')

### Parsing the page to extract the necessary informations

In [4]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

In [5]:
# finding the table of Nobel laureates by category
table_nobel = soup.find_all("table",{"class":["wikitable","sortable","jquery-tablesorter"]})[0]
contents = table_nobel.tbody.contents
header_in_html = contents[0]
data_in_html = [x for x in contents[1:] if x!='\n'] 

In [6]:
header_in_html

<tr>
<th>Année
</th>
<th width="18%"><a href="/wiki/Prix_Nobel_de_physique" title="Prix Nobel de physique">Physique</a>
</th>
<th width="16%"><a href="/wiki/Prix_Nobel_de_chimie" title="Prix Nobel de chimie">Chimie</a>
</th>
<th width="18%"><a href="/wiki/Prix_Nobel_de_physiologie_ou_m%C3%A9decine" title="Prix Nobel de physiologie ou médecine">Physiologie<br/>ou médecine</a>
</th>
<th width="16%"><a href="/wiki/Prix_Nobel_de_litt%C3%A9rature" title="Prix Nobel de littérature">Littérature</a>
</th>
<th width="16%"><a href="/wiki/Prix_Nobel_de_la_paix" title="Prix Nobel de la paix">Paix</a>
</th>
<th width="15%"><a href="/wiki/Prix_de_la_Banque_de_Su%C3%A8de_en_sciences_%C3%A9conomiques_en_m%C3%A9moire_d%27Alfred_Nobel" title="Prix de la Banque de Suède en sciences économiques en mémoire d'Alfred Nobel">Économie</a><span><sup id="ref_Note_GG"><a href="#endnote_Note_GG">[G]</a></sup></span>
</th></tr>

In [7]:
data = {}


In [8]:
def parse_header(header):
    for th in header.contents:
        a = th.find('a')
        if a== -1 or a ==None:
            key = th.string.strip('\n')
            if key!='':
                data[key]=[]
        else:
            key= th.a.string
            if key is None:
                key = th.a['title'].replace('Prix Nobel de ','').strip()
            data[key] = []
            
        
parse_header(header_in_html)

In [9]:
data

{'Année': [],
 'Physique': [],
 'Chimie': [],
 'physiologie ou médecine': [],
 'Littérature': [],
 'Paix': [],
 'Économie': []}

In [10]:
def parse_row_data(tr_elt):
    global data
    td_list = tr_elt.findAll('td')
    if td_list:
        data['Année'].append(td_list[0].string.strip('\n'))
        keys = list(data.keys())[1:]
        vals = td_list[1:]
        for i in range(len(keys)):
            links = vals[i].findAll('a')
            if links:
                parsed_val = links[-1].string
            else:
                parsed_val = None
            data[keys[i]].append(parsed_val)
    else :
        print(f"It's the end, we are at the table's footer")

In [11]:
for tr_element in data_in_html:
    parse_row_data(tr_element)

It's the end, we are at the table's footer


## Data restructuration
I want to have a structure where each record a person or an organization in a given year to see and the name of the category in which it had a Nobel

In [12]:
laureats = {}

In [13]:
persons = []
laureats['Nom'] = []
laureats['category'] = []
for key,val in data.items():
    if key!= 'Année':
        persons = list(set(persons).union(val))
print(len(persons))

laureats['Annee'] = []
for val in persons:
    for year in data['Année']:
        for key in list(data.keys())[1:]:
            if val in data[key]:
                laureats['category'].append(key)
                laureats['Annee'].append(year)
                laureats['Nom'].append(val)

609


In [14]:
import pandas as pd

### Data persitence in csv file

In [15]:
df = pd.DataFrame(laureats)
df.head()

Unnamed: 0,Nom,category,Annee
0,Howard Temin,physiologie ou médecine,1901
1,Howard Temin,physiologie ou médecine,1902
2,Howard Temin,physiologie ou médecine,1903
3,Howard Temin,physiologie ou médecine,1904
4,Howard Temin,physiologie ou médecine,1905


In [16]:
df.to_csv('laureats.csv')