In [35]:
import os
from urllib.request import Request, urlopen, urlretrieve

import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import display, HTML

In [36]:
class HTMLTableParser:
    """Download a web page and turn its first HTML table into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the page that holds the table.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
    """

    def __init__(self, url, timeout=30):
        self.url = url
        self.timeout = timeout

    def find_table(self):
        """Fetch ``self.url`` and return the first ``<table>`` element of the page.

        Raises
        ------
        requests.HTTPError
            If the server answers with an error status (4xx / 5xx).
        IndexError
            If the page contains no ``<table>`` element.
        """
        # Without a timeout, requests can wait forever on a stalled server.
        response = requests.get(self.url, timeout=self.timeout)
        # Fail loudly on an error page instead of parsing it as if it were data.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find('table')
        if table is None:
            # Same exception type as the former `find_all('table')[0]`, with a usable message.
            raise IndexError(f"No <table> element found at {self.url}")
        return table

    def parse_html_table(self, table):
        """Convert a parsed ``<table>`` element into a DataFrame of cell texts.

        Every ``<tr>`` holding at least one ``<td>`` becomes a row. The texts of
        the first ``<tr>`` holding ``<th>`` cells become the column names;
        without any ``<th>``, columns are numbered from 0.

        Rows shorter than the widest row are padded with NaN.

        Parameters
        ----------
        table : object
            Element exposing ``find_all('tr')``, whose rows expose
            ``find_all('td')`` / ``find_all('th')`` and whose cells expose
            ``get_text()``.

        Returns
        -------
        pandas.DataFrame

        Raises
        ------
        Exception
            If column titles were found but their count differs from the
            number of data columns.
        """
        column_names = []
        body_rows = []

        # Single pass: collect the text of each data row and the first header row.
        for row in table.find_all('tr'):
            td_tags = row.find_all('td')
            if len(td_tags) > 0:
                body_rows.append([td.get_text() for td in td_tags])

            th_tags = row.find_all('th')
            if len(th_tags) > 0 and len(column_names) == 0:
                column_names = [th.get_text() for th in th_tags]

        # Size the frame on the widest row so a row longer than the first one
        # no longer overflows the frame with an IndexError.
        n_columns = max((len(cells) for cells in body_rows), default=0)

        # Safeguard on column titles
        if len(column_names) > 0 and len(column_names) != n_columns:
            raise Exception("Column titles do not match the number of columns")

        columns = column_names if len(column_names) > 0 else range(0, n_columns)
        # Pre-filled with NaN so that short rows keep NaN in their missing cells.
        df = pd.DataFrame(columns=columns, index=range(0, len(body_rows)))
        for row_marker, cells in enumerate(body_rows):
            for column_marker, text in enumerate(cells):
                df.iat[row_marker, column_marker] = text

        return df

In [48]:
#### Table preprocessing steps

def format_header(x):
    """Return the canonical column name for column position ``x``.

    Positions 0 to 4 map, in order, to 'Manuscrit_soumis',
    'Etudiant_Directeur', 'Titre_Projet', 'Programme' and 'Grade_obtenu'.
    Any other value yields ``None``.
    """
    ordered_names = (
        'Manuscrit_soumis',
        'Etudiant_Directeur',
        'Titre_Projet',
        'Programme',
        'Grade_obtenu',
    )
    # Compare with == (rather than indexing) so that any value equal to the
    # position matches, exactly like a chain of `if x == n` tests.
    for position, name in enumerate(ordered_names):
        if x == position:
            return name
    return None
    
def custom_column(table):
    """Return a copy of ``table`` whose columns carry the names given by ``format_header``.

    Each column label is passed to ``format_header``; labels it recognises are
    renamed, all other labels are kept unchanged (they are not replaced by
    ``None``). The input frame is never modified, and an empty frame is
    accepted.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame whose column labels should be translated.

    Returns
    -------
    pandas.DataFrame
        New frame with the renamed columns.
    """
    # Build the mapping from the column labels themselves. Writing into
    # `table.iloc[0]` could, depending on the pandas version, overwrite the
    # first row of the caller's frame, and it fails on an empty frame.
    new_names = {}
    for label in table.columns:
        name = format_header(label)
        if name is not None:
            new_names[label] = name
    return table.rename(columns=new_names)

def get_table(url):
    """Download the first HTML table found at ``url`` and return it as a
    DataFrame whose columns were renamed by ``custom_column``.
    """
    parser = HTMLTableParser(url)
    raw_frame = parser.parse_html_table(parser.find_table())
    return custom_column(raw_frame)

def format_content(x):
    """Replace every newline in ``x`` with ' / '; return ``x`` untouched when it holds none."""
    return x.replace('\n', ' / ') if '\n' in x else x

In [51]:
#### Table collection, filtration and saving steps
def _load_bioinfo_alumni(links):
    """Download the table behind each link and return the 'Bio-informatique' rows."""
    columns = ['Programme', 'Etudiant_Directeur', 'Titre_Projet', 'Manuscrit_soumis', 'Grade_obtenu']
    if len(links) == 0:
        # Nothing to download for this range (e.g. no master pages for
        # 2000-2001): return an empty table instead of failing on an
        # undefined accumulator.
        return pd.DataFrame(columns=columns)

    # DataFrame.append was removed in pandas 2.0; concatenate all tables at once.
    all_students = pd.concat([get_table(link) for link in links], sort=True)
    # Drop incomplete rows and renumber the rows from 0.
    all_students = all_students.dropna().reset_index(drop=True)
    all_students['Etudiant_Directeur'] = all_students['Etudiant_Directeur'].apply(format_content)

    is_bioinfo = all_students['Programme'] == 'Bio-informatique'
    return all_students.loc[is_bioinfo, columns]


def save_alumn_tables(start, end, phd_links, master_links, student_type):
    """Collect the Bio-informatique alumni and save them as CSV files.

    For each student category named in ``student_type`` the matching pages are
    downloaded, filtered on ``Programme == 'Bio-informatique'``, counted
    (printed) and written to ``./{end}/AEBINUM_<PHD|MASTER>_ALUMN_LIST_{end}.csv``.
    The ``./{end}`` directory is created when missing.

    Parameters
    ----------
    start : int
        Unused; kept for interface compatibility.
    end : int
        Last year of the collection; names the output directory and files.
    phd_links : list of str
        URLs of the PhD defence pages.
    master_links : list of str
        URLs of the master thesis pages.
    student_type : str
        Text containing 'phd' and/or 'master' to select what is saved.
    """
    year = end
    categories = (
        ('phd', phd_links, 'PHD'),
        ('master', master_links, 'MASTER'),
    )
    for keyword, links, label in categories:
        if keyword not in student_type:
            continue
        bioinfo_students = _load_bioinfo_alumni(links)
        print(f'Total number of {student_type} alumni: {len(bioinfo_students)}')

        # to_csv does not create missing directories.
        os.makedirs(f'./{year}', exist_ok=True)
        bioinfo_students.to_csv(f'./{year}/AEBINUM_{label}_ALUMN_LIST_{year}.csv', sep=',', encoding='utf-8', index=False)


def collect_alumn_tables(start, end):
    """Build the yearly page links for ``start``..``end`` (inclusive) and save the alumni tables.

    Years 2000 and 2001 only contribute a 'soutenances-<year>' link to the PhD
    list; every other year contributes one PhD link and one master link.
    ``save_alumn_tables`` is then called for 'master' and for 'phd'.
    """
    base_url = 'https://biochimie.umontreal.ca/etudes/memoires-et-soutenances/'
    phd_links, master_links = [], []

    for year in range(start, end + 1):
        if year in (2000, 2001):
            phd_links.append(f'{base_url}soutenances-{year}/')
            continue
        phd_links.append(f'{base_url}soutenances-de-these-{year}/')
        master_links.append(f'{base_url}memoires-de-maitrise-{year}/')

    for student_type in ('master', 'phd'):
        save_alumn_tables(start, end, phd_links, master_links, student_type)

    print(f'Alumni tables saved from {start} to {end}')


In [52]:
#### Collection of alumni from 2000 to 2019
# Builds the page links for every year from `start` to `end` (inclusive) and
# saves the master and PhD tables; `end` also names the output folder (./2019/).
start = 2000
end = 2019
collect_alumn_tables(start, end)

Total number of master alumni: 55
Total number of phd alumni: 26
Alumni tables saved from 2000 to 2019


In [53]:
#### Collection of alumni from 2000 to 2020
# Same collection with one more year; `start` and `end` are reassigned here and
# the CSV files go to the folder named after `end` (./2020/).
start = 2000
end = 2020
collect_alumn_tables(start, end)

Total number of master alumni: 58
Total number of phd alumni: 27
Alumni tables saved from 2000 to 2020


### 1.