In [1]:
import os
from urllib.request import Request, urlopen, urlretrieve

import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import display, HTML

class HTMLTableParser:
    """Download a web page and convert its first HTML table into a DataFrame.

    Typical use: ``parser.parse_html_table(parser.find_table())``.
    """

    def __init__(self, url):
        """Remember the address of the page to scrape; no request is made here."""
        self.url = url

    def find_table(self):
        """Fetch ``self.url`` and return the first ``<table>`` element of the page.

        Returns:
            The first table found by BeautifulSoup's ``html.parser``.

        Raises:
            requests.HTTPError: the server answered with a 4xx/5xx status.
            requests.Timeout: the server did not answer within 30 seconds.
            IndexError: the page contains no ``<table>`` element (same
                exception type as the previous ``find_all(...)[0]`` lookup,
                now with an explicit message).
        """
        # Without a timeout a stalled server would hang the notebook forever.
        response = requests.get(self.url, timeout=30)
        # Fail loudly instead of scraping the body of an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find('table')
        if table is None:
            raise IndexError("No <table> element found at {}".format(self.url))
        return table

    def parse_html_table(self, table):
        """Convert a parsed ``<table>`` element into a DataFrame of cell texts.

        Every ``<tr>`` holding at least one ``<td>`` becomes one row. The
        number of columns is the number of ``<td>`` cells in the first such
        row; shorter rows are padded with NaN. The first ``<tr>`` holding
        ``<th>`` cells supplies the column names; otherwise the columns are
        numbered ``0 .. n_columns - 1``.

        Args:
            table: object exposing ``find_all('tr')`` whose rows expose
                ``find_all('td')`` / ``find_all('th')`` and whose cells expose
                ``get_text()``.

        Returns:
            pandas.DataFrame with object dtype and a 0-based integer index.

        Raises:
            Exception: column titles were found but their count differs from
                the number of columns.
            IndexError: a row holds more cells than the table has columns
                (same exception type as the previous positional write, now
                with a message naming the offending row).
        """
        column_names = []
        records = []
        n_columns = 0

        # Single pass: collect the header (if any) and the text of every data row.
        for row in table.find_all('tr'):
            td_tags = row.find_all('td')
            if len(td_tags) > 0:
                if n_columns == 0:
                    # The first data row fixes the width of the table.
                    n_columns = len(td_tags)
                records.append([td.get_text() for td in td_tags])

            th_tags = row.find_all('th')
            if len(th_tags) > 0 and len(column_names) == 0:
                column_names = [th.get_text() for th in th_tags]

        # Safeguard on column titles
        if len(column_names) > 0 and len(column_names) != n_columns:
            raise Exception("Column titles do not match the number of columns")

        columns = column_names if len(column_names) > 0 else range(0, n_columns)
        if not records:
            return pd.DataFrame(columns=columns, index=range(0, 0))

        missing = float('nan')
        padded = []
        for position, cells in enumerate(records, start=1):
            if len(cells) > n_columns:
                raise IndexError(
                    "Table row {} has {} cells but the table has only {} columns".format(
                        position, len(cells), n_columns))
            # Short rows keep NaN in their trailing columns.
            padded.append(cells + [missing] * (n_columns - len(cells)))

        # Build the frame in one go instead of writing cell by cell with .iat.
        return pd.DataFrame(padded, columns=columns, dtype=object)

In [3]:
def format_header(x):
    """Translate a positional column index into its descriptive column name.

    Returns the name for ``x`` equal to 0 through 4, and None for any other value.
    """
    labels = (
        'Date',
        'Etudiant_Directeur',
        'Titre_soutenance',
        'Programme',
        'Grade_obtenu',
    )
    for position, label in enumerate(labels):
        if x == position:
            return label
    return None
    
def custom_column(table):
    """Return a copy of ``table`` whose columns carry the names given by ``format_header``.

    The mapping is built from the column labels alone, so the rows of
    ``table`` are never touched. The previous implementation wrote the new
    names into ``table.iloc[0]``, which could overwrite the first data row on
    pandas versions where that row is a view of the frame, and raised
    IndexError on a table without rows.

    Args:
        table: pandas.DataFrame, normally with positional column labels 0..4.

    Returns:
        pandas.DataFrame with renamed columns. Labels for which
        ``format_header`` has no name (it returns None) are kept unchanged
        instead of being renamed to None.
    """
    new_names = {}
    for label in table.columns:
        name = format_header(label)
        new_names[label] = label if name is None else name
    return table.rename(columns=new_names)
        
def get_table(url):
    """Scrape the first table of the page at ``url`` and return it with named columns.

    Chains ``HTMLTableParser.find_table``, ``HTMLTableParser.parse_html_table``
    and ``custom_column``.
    """
    parser = HTMLTableParser(url)
    raw_frame = parser.parse_html_table(parser.find_table())
    return custom_column(raw_frame)

def format_content(x):
    """Replace every newline in ``x`` with ' / '; values without a newline are returned as is."""
    return x.replace('\n', ' / ') if '\n' in x else x

### 1. Collect all links from 2000 to now

In [4]:
## To update each year
start = 2000
end = 2020  ## Change for the current year -1

url = 'https://biochimie.umontreal.ca/etudes/memoires-et-soutenances/'

# 2000 and 2001 use a single "soutenances-<year>" address and contribute no
# master link; every later year has one thesis page and one master page.
phd_links = [
    '{}soutenances-{}/'.format(url, Y) if Y in (2000, 2001)
    else '{}soutenances-de-these-{}/'.format(url, Y)
    for Y in range(start, end + 1)
]
master_links = [
    '{}memoires-de-maitrise-{}/'.format(url, Y)
    for Y in range(start, end + 1)
    if Y not in (2000, 2001)
]

### 2. Create table of UdeM bioinformatics PhD students

In [5]:
# Scrape every yearly PhD page, then concatenate once. DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop copies it each time.
phd_tables = [get_table(link) for link in phd_links]

## All student table
# sort=True keeps the column ordering behaviour of the former append(..., sort=True).
all_phd_student = (
    pd.concat(phd_tables, sort=True)
    .dropna()                  # drop rows with any missing cell
    .reset_index(drop=True)    # renumber rows 0..n-1 across all years
)
all_phd_student['Etudiant_Directeur'] = all_phd_student['Etudiant_Directeur'].apply(format_content)

In [6]:
# Keep only the rows whose Programme is exactly 'Bio-informatique', then report how many there are.
bioinfo_phd = all_phd_student.loc[all_phd_student['Programme'].eq('Bio-informatique')]
print(bioinfo_phd.shape[0])

27


In [7]:
# Show the filtered PhD table as this cell's output.
bioinfo_phd

Unnamed: 0,Date,Etudiant_Directeur,Grade_obtenu,Programme,Titre_soutenance
61,17/12/2007,Sivakumar Kannan / Gertraud Burger,07/02/2008,Bio-informatique,Molecular protein function prediction using se...
62,14/12/2007,Nicolas Rodrigue / Hervé Philippe,07/02/2008,Bio-informatique,Phylogenetic structural modeling of molecular ...
65,07/08/2007,Emmanuelle Permal / François Major,04/10/2007,Bio-informatique,Étude de la structure tertiaire des motifs ARN...
86,27/08/2009,Yan Zhou / Hervé Philippe,08/10/2009,Bio-informatique,Modeling heterotachy in phylogenetics
104,28/09/2010,Claudia Laura Kleinman / Hervé Philippe,04/11/2010,Bio-informatique,Statistical potentials for evolutionary studies
107,23/06/2010,Yu Liu / Franz Bernd Lang,05/08/2010,Bio-informatique,A phylogenomics approach to resolving fungal é...
112,20/01/2010,Yaoqing Shen / Gertraud Burger,01/04/2010,Bio-informatique,In silico analysis of mitochondrial proteins
113,29/01/2010,Mathieu Lajoie / Nadia El-Mabrouk,05/05/2010,Bio-informatique,Approches algorithmiques pour l’inférence d’hi...
117,13/12/2011,Lilianne Dupuis / Normand Mousseau,05/04/2012,Bio-informatique,Développement d’une méthode multi-échelle pour...
128,25/07/2011,Béatrice Roure / Hervé Philippe/Franz Lang,06/10/2011,Bio-informatique,Amélioration de l’exactitude de l’inférence ph...


In [8]:
year = 2021
# to_csv does not create missing folders, so make sure ./<year>/ exists first.
os.makedirs(f'./{year}', exist_ok=True)
bioinfo_phd.to_csv(f'./{year}/AEBINUM_PHD_LIST_{year}.csv', sep=',', encoding='utf-8', index=False)

### 3. Create table of UdeM bioinformatics master's students

In [9]:
# Scrape every yearly master page, then concatenate once. DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop copies it each time.
master_tables = [get_table(link) for link in master_links]

## All student table
# sort=True keeps the column ordering behaviour of the former append(..., sort=True).
all_master_student = (
    pd.concat(master_tables, sort=True)
    .dropna()                  # drop rows with any missing cell
    .reset_index(drop=True)    # renumber rows 0..n-1 across all years
)
all_master_student['Etudiant_Directeur'] = all_master_student['Etudiant_Directeur'].apply(format_content)

In [10]:
# Keep only the rows whose Programme is exactly 'Bio-informatique', then report how many there are.
bioinfo_master = all_master_student.loc[all_master_student['Programme'].eq('Bio-informatique')]
print(bioinfo_master.shape[0])

58


In [11]:
# Show the filtered master table as this cell's output.
bioinfo_master

Unnamed: 0,Date,Etudiant_Directeur,Grade_obtenu,Programme,Titre_soutenance
50,07/12/2005,Geneviève Boucher / Normand Mousseau,12/01/2006,Bio-informatique,Exploration des mécanismes d’agrégation de pep...
77,21/12/2006,Mohamed Tikah Marrakchi / Sylvie Hamel / co: A...,01/02/2007,Bio-informatique,Helix Explorer : une nouvelle base de données ...
82,07/11/2006,Valentin Guignon / Sylvie Hamel / co : Cedric ...,07/12/2006,Bio-informatique,Algorithme de comparaison de structures second...
88,28/04/2006,Tan Wang / Joelle Pelletier,05/10/2006,Bio-informatique,Linker-scanning analysis of the HIV-1 integras...
93,17/12/2007,Audrey Noël / Pavel Hamet / co : Ettore Merlo,07/02/2008,Bio-informatique,Recherche de déterminants génomiques impliqués...
94,19/11/2007,Louis Philippe Lavoie / François Major,10/01/2008,Bio-informatique,Analyse de motifs d’ARN
98,31/07/2007,Guy Larochelle / Hervé Philippe / co : Michel ...,06/09/2007,Bio-informatique,Recherche d’une empreinte phylogénétique relié...
99,03/07/2007,Fabrice Baro / Hervé Philippe,02/08/2007,Bio-informatique,Phylogénie et transferts horizontaux de gènes ...
100,27/05/2007,Sébastien Christin / Nadia El-Mabrouk,02/08/2007,Bio-informatique,Recherche de snoRNA dans le génome de S. cerev...
105,12/04/2007,Nicolas Saint-Onge / François Major,03/05/2007,Bio-informatique,"MC-MAP, un nouvel outil d’intégration de motifs"


In [12]:
year = 2021
# to_csv does not create missing folders, so make sure ./<year>/ exists first.
os.makedirs(f'./{year}', exist_ok=True)
bioinfo_master.to_csv(f'./{year}/AEBINUM_MASTER_LIST_{year}.csv', sep=',', encoding='utf-8', index=False)