# In[1]:
from urllib.request import Request, urlopen, urlretrieve
from IPython.display import display, HTML
from bs4 import BeautifulSoup
import pandas as pd
import re, requests

# In[2]:
class HTMLTableParser:
    """Download a web page and turn its first HTML table into a DataFrame.

    Args:
        url: Address of the page that holds the table.
        timeout: Seconds to wait for the server before the request is
            abandoned. Defaults to 30.
    """

    # Captures the address fragment that follows ``mailto:`` in a cell's markup.
    _MAILTO_PATTERN = re.compile(r'mailto:([a-z0-9.-]+)')
    # Domain appended to every captured address fragment.
    _EMAIL_DOMAIN = '@umontreal.ca'

    def __init__(self, url, timeout=30):
        self.url = url
        self.timeout = timeout

    def find_table(self):
        """Fetch ``self.url`` and return the first ``<table>`` of the page.

        Returns:
            The first ``<table>`` element found by BeautifulSoup.

        Raises:
            requests.RequestException: If the request fails, times out or
                the server answers with an HTTP error status.
            IndexError: If the page contains no ``<table>`` element.
        """
        # Without a timeout a stalled server would block the call forever.
        response = requests.get(self.url, timeout=self.timeout)
        # Do not try to parse an error page as if it were the real content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find('table')
        if table is None:
            # IndexError is kept (it is what indexing an empty result list
            # raised before) so existing handlers keep working.
            raise IndexError(f"No <table> element found at {self.url}")
        return table

    def parse_html_table(self, table):
        """Convert a parsed ``<table>`` element into a DataFrame of strings.

        Every ``<tr>`` with at least one ``<td>`` becomes one row. The first
        ``<tr>`` that has ``<th>`` cells provides the column titles;
        otherwise the columns are numbered from 0. When a cell's markup
        contains a ``mailto:`` link, the captured address followed by
        ``@umontreal.ca`` is appended to the cell text after ``', '``.

        The column count is taken from the first data row. Shorter rows
        leave their remaining cells as NaN; rows with more cells than the
        first data row are not supported.

        Args:
            table: Element exposing ``find_all`` (e.g. the value returned by
                ``find_table``).

        Returns:
            pandas.DataFrame with one row per data row of the table.

        Raises:
            Exception: If column titles exist but their count differs from
                the number of cells in the first data row.
        """
        column_names = []
        data_rows = []

        # Single pass: remember the cells of each data row and the titles.
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if cells:
                data_rows.append(cells)

            # Only the first row carrying <th> cells defines the titles.
            if not column_names:
                column_names = [th.get_text() for th in row.find_all('th')]

        n_columns = len(data_rows[0]) if data_rows else 0

        # Safeguard on column titles
        if column_names and len(column_names) != n_columns:
            raise Exception("Column titles do not match the number of columns")

        columns = column_names if column_names else range(0, n_columns)
        df = pd.DataFrame(columns=columns, index=range(0, len(data_rows)))

        for row_marker, cells in enumerate(data_rows):
            for column_marker, cell in enumerate(cells):
                content = cell.get_text()
                # search() yields None instead of failing when "mailto:" is
                # followed by a character the pattern does not accept; the
                # cell then simply keeps its plain text.
                match = self._MAILTO_PATTERN.search(str(cell))
                if match:
                    email = match.group(1) + self._EMAIL_DOMAIN
                    content = f'{content}, {email}'
                df.iat[row_marker, column_marker] = content

        return df

# In[3]:
def format_header(x):
    """Return the column label associated with position ``x``.

    Positions 0, 1, 2 and 3 give 'Identité', 'Niveau', 'Labo' and
    'Téléphone' respectively; any other value gives ``None``.
    """
    labels = ('Identité', 'Niveau', 'Labo', 'Téléphone')
    # Compare against each position in turn and stop at the first match.
    for position, label in enumerate(labels):
        if x == position:
            return label
    return None

def custom_column(table):
    """Return a copy of ``table`` with its columns renamed by ``format_header``.

    Each existing column label is passed to ``format_header`` and replaced
    by the value it returns (``None`` for labels it does not know).

    Args:
        table: pandas.DataFrame whose columns should be relabelled.

    Returns:
        A new DataFrame with the renamed columns; ``table`` itself is left
        untouched.
    """
    # Build the mapping straight from the column labels. Writing the new
    # names into ``table.iloc[0]`` first could overwrite the caller's first
    # row (depending on the pandas version) and failed on an empty frame.
    new_names = {label: format_header(label) for label in table.columns}
    return table.rename(columns=new_names)

def get_table(url):
    """Scrape the first table found at ``url`` and return it relabelled.

    The page is fetched and parsed with ``HTMLTableParser``, the columns
    are renamed with ``custom_column``, the first row is dropped and the
    index is reset (the previous index is kept as an 'index' column).
    """
    parser = HTMLTableParser(url)
    raw_frame = parser.parse_html_table(parser.find_table())
    labelled = custom_column(raw_frame)
    # Skip the first row, then renumber the remaining rows.
    return labelled[1:].reset_index()

def custom_table(table):
    """Split the 'Identité' column into 'Nom', 'Prénom' and 'Email'.

    Each 'Identité' value is split on ``', '``. The first part becomes
    'Nom', the second 'Prénom', and the third 'Email' when the value has
    exactly three parts. Missing pieces are filled with the string 'NA'.

    Args:
        table: pandas.DataFrame with at least the columns 'Identité',
            'Niveau' and 'Labo'. It is not modified.

    Returns:
        A new DataFrame with the columns 'Nom', 'Prénom', 'Niveau', 'Labo'
        and 'Email', in that order.
    """
    last_names, first_names, emails = [], [], []
    for identity in table['Identité']:
        # Split once per row and reuse the parts for all three columns.
        parts = identity.split(', ')
        last_names.append(parts[0])
        # An identity without a separator has no first name to extract.
        first_names.append(parts[1] if len(parts) > 1 else 'NA')
        emails.append(parts[2] if len(parts) == 3 else 'NA')

    # assign() works on a copy, so the caller's frame gains no extra columns.
    enriched = table.assign(
        **{'Nom': last_names, 'Prénom': first_names, 'Email': emails}
    )
    return enriched[['Nom', 'Prénom', 'Niveau', 'Labo', 'Email']]

# In[9]:
##### Website visited on 02/02/2021
from pathlib import Path

session = 'H21'
student_all_url = 'https://biochimie.umontreal.ca/departement/repertoires/etudiants-gradues/'
all_student_table = custom_table(get_table(student_all_url))

#### Collect only bioinformatics students
# Plain substring match on 'BIN'. na=False makes rows whose 'Niveau' is
# missing count as non-matching instead of producing a NaN mask, which
# cannot be used for boolean indexing.
is_bioinfo = all_student_table['Niveau'].str.contains('BIN', regex=False, na=False)
bioinfo_students = all_student_table[is_bioinfo]

# Create the output folder first so the export does not fail when ./CSV
# does not exist yet.
output_path = Path(f'./CSV/bioinfo_student_database_{session}.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
bioinfo_students.to_csv(output_path, encoding='utf-8', index=False)