In [106]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')

import requests
from bs4 import BeautifulSoup
import re

# Preamble — Prepare to fetch data

These are the root URLs for the two kinds of requests we need to perform: filters and lists.

First, the filter dialog is used to

1. Retrieve the IDs for the years and semesters we care about
2. Look for the ID of the associated list.

Lists contain the actual data we care about.

Both types of request use a fixed `ww_i_reportModel` constant, which we assume is used for versioning.

In [95]:
# Common prefix of every public-report request; the request kind
# ('filter' or 'html') is appended right after the trailing dot.
root_url = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.'

# Fixed report-model parameters that accompany both kinds of request.
report = '&ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&'

# Identifier sent as the ww_x_UNITE_ACAD parameter.
# NOTE(review): presumably the Informatique section -- confirm.
informatique_id = '249847'

# Both paths end with '&' so that further query parameters can simply be
# appended by string concatenation.
filter_path = ''.join([root_url, 'filter?ww_b_list=1', report])
list_path = ''.join([root_url, 'html?', report])

[filter_path, list_path]

['http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_b_list=1&ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&',
 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?&ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&']

Next, we look up the IDs for the years and semesters we're interested in.

In [100]:
# Find the IDs for the individual years

# Fetch the filter dialog for the unit. The timeout keeps the cell from
# hanging indefinitely if the server never answers.
req = requests.get(filter_path + 'ww_x_UNITE_ACAD=' + informatique_id, timeout=30)

if req.status_code != 200:
    raise Exception('Request failed')

# `parser` is reused by the following cell to read the period selector.
parser = BeautifulSoup(req.text, "html.parser")

year_select = parser.find('select', attrs={"name": "ww_x_PERIODE_ACAD"})
if year_select is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise Exception('Select element ww_x_PERIODE_ACAD not found in the filter page')

# Map each option label (e.g. '2016-2017') to the option's value attribute.
# The blank placeholder option is kept, so the mapping also holds '' -> 'null'.
years = year_select.find_all('option')
year_ids = {el.text: el['value'] for el in years}

year_ids

{'': 'null',
 '2007-2008': '978181',
 '2008-2009': '978187',
 '2009-2010': '978195',
 '2010-2011': '39486325',
 '2011-2012': '123455150',
 '2012-2013': '123456101',
 '2013-2014': '213637754',
 '2014-2015': '213637922',
 '2015-2016': '213638028',
 '2016-2017': '355925344'}

In [98]:
# Read the pedagogical periods from the filter page that was parsed into
# `parser` by the year lookup cell.
period_select = parser.find('select', attrs={'name': 'ww_x_PERIODE_PEDAGO'})
if period_select is None:
    # Fail with a clear message instead of a TypeError when iterating None.
    raise Exception('Select element ww_x_PERIODE_PEDAGO not found in the filter page')

# Iterate over the <option> elements only. Iterating the <select> tag itself
# yields every child node, including text nodes, for which el['value'] fails.
periods = period_select.find_all('option')

# Map each option label (e.g. 'Bachelor semestre 1') to its value attribute.
# The blank placeholder option is kept, so the mapping also holds '' -> 'null'.
period_ids = {el.text: el['value'] for el in periods}

period_ids

{'': 'null',
 'Bachelor semestre 1': '249108',
 'Bachelor semestre 2': '249114',
 'Bachelor semestre 3': '942155',
 'Bachelor semestre 4': '942163',
 'Bachelor semestre 5': '942120',
 'Bachelor semestre 5b': '2226768',
 'Bachelor semestre 6': '942175',
 'Bachelor semestre 6b': '2226785',
 'Master semestre 1': '2230106',
 'Master semestre 2': '942192',
 'Master semestre 3': '2230128',
 'Master semestre 4': '2230140',
 'Mineur semestre 1': '2335667',
 'Mineur semestre 2': '2335676',
 'Mise à niveau': '2063602308',
 'Projet Master automne': '249127',
 'Projet Master printemps': '3781783',
 'Semestre automne': '953159',
 'Semestre printemps': '2754553',
 'Stage automne 3ème année': '953137',
 'Stage automne 4ème année': '2226616',
 'Stage printemps 3ème année': '983606',
 'Stage printemps 4ème année': '2226626',
 'Stage printemps master': '2227132'}

In [109]:
def get_gps_id(period, year):
    """Return the ``ww_x_GPS`` identifier of the list for a period and year.

    The filter dialog is requested with the given period and academic year
    selected; the identifier is read from the ``onclick`` attribute of the
    second ``<a class="ww_x_GPS">`` link on the resulting page.

    Parameters
    ----------
    period : str
        A key of ``period_ids``, e.g. ``'Bachelor semestre 1'``.
    year : str
        A key of ``year_ids``, e.g. ``'2016-2017'``.

    Returns
    -------
    str
        The digits following ``ww_x_GPS=`` in the link's ``onclick`` value.

    Raises
    ------
    KeyError
        If ``period`` or ``year`` is not a known label.
    Exception
        If the HTTP status code is not 200.
    IndexError
        If the page contains fewer than two ``ww_x_GPS`` links.
    ValueError
        If the link's ``onclick`` value carries no ``ww_x_GPS=<digits>``.
    """
    query = 'ww_x_PERIODE_PEDAGO=' + period_ids[period] + '&ww_x_PERIODE_ACAD=' + year_ids[year]
    # The timeout keeps a stalled server from blocking the notebook forever.
    req = requests.get(
        filter_path + 'ww_x_UNITE_ACAD=' + informatique_id + '&' + query,
        timeout=30
    )

    if req.status_code != 200:
        raise Exception('Request failed')

    parser = BeautifulSoup(req.text, "html.parser")

    links = parser.find_all('a', class_='ww_x_GPS')
    # The second link is the one used.
    # NOTE(review): presumably the first link is a generic "all" entry -- confirm.
    if len(links) < 2:
        raise IndexError(
            'No list link found for period %r and year %r' % (period, year)
        )
    onclick_value = links[1]['onclick']

    match = re.search(r'ww_x_GPS=(\d+)', onclick_value)
    if match is None:
        raise ValueError('No ww_x_GPS id in onclick value: %r' % onclick_value)

    return match.group(1)

get_gps_id(period='Bachelor semestre 1', year='2016-2017')

'2021043255'

In [123]:
def get_table(period, year):
    """Fetch the list for a period and year as rows of ``<td>`` cells.

    Parameters
    ----------
    period : str
        Period label understood by ``get_gps_id``, e.g. ``'Bachelor semestre 6'``.
    year : str
        Academic-year label understood by ``get_gps_id``, e.g. ``'2008-2009'``.

    Returns
    -------
    list
        One entry per ``<tr>`` of the page after the first two; each entry is
        the collection of ``<td>`` tags found in that row.

    Raises
    ------
    Exception
        If the HTTP status code is not 200. Errors raised by ``get_gps_id``
        propagate unchanged.
    """
    query = 'ww_x_GPS=' + get_gps_id(period, year) + '&ww_x_UNITE_ACAD=' + informatique_id
    # The timeout keeps a stalled server from blocking the notebook forever.
    req = requests.get(list_path + query, timeout=30)

    if req.status_code != 200:
        raise Exception('Request failed')

    parser = BeautifulSoup(req.text, "html.parser")

    # The first two rows are skipped.
    # NOTE(review): presumably a title row and a column-header row -- confirm.
    return [row.find_all('td') for row in parser.find_all('tr')[2:]]

# NOTE(review): the rendered output of this cell shows a real student name
# and ID -- clear or redact it before sharing the notebook.
get_table(period='Bachelor semestre 6', year='2008-2009')[0]

[<td style="white-space:nowrap">Monsieur</td>,
 <td style="white-space:nowrap">Balas Martin</td>,
 <td style="white-space:nowrap"></td>,
 <td style="white-space:nowrap"></td>,
 <td style="white-space:nowrap"></td>,
 <td style="white-space:nowrap"></td>,
 <td style="white-space:nowrap"></td>,
 <td style="white-space:nowrap">Présent</td>,
 <td style="white-space:nowrap"></td>,
 <td style="white-space:nowrap"></td>,
 <td>174774</td>,
 <td style="white-space:nowrap"></td>]

# Task 1

> Obtain all the data for the Bachelor students, starting from 2007. Keep only the students for which you have an entry for both Bachelor semestre 1 and Bachelor semestre 6. Compute how many months it took each student to go from the first to the sixth semester. Partition the data between male and female students, and compute the average -- is the difference in average statistically significant?

In [119]:
# Print the list ID of every Bachelor semester (1 to 6) for every academic
# year from 2007-2008 through 2016-2017. Each call performs one HTTP request.
for semester in range(1, 7):
    period_label = 'Bachelor semestre ' + str(semester)
    for start_year in range(2007, 2017):
        year_label = str(start_year) + '-' + str(start_year + 1)
        gps_id = get_gps_id(period=period_label, year=year_label)
        print(gps_id)

71297531
109576936
213617925
357704486
736308968
1378362092
1650771864
1744377893
1897032870
2021043255
71297550
109576955
213617944
357704513
736308992
1378362120
1650771892
1744377921
1897032920
2021043310
39494788
71297569
109576974
213617963
357704537
736309015
1378362148
1650771920
1744377949
1897032970
39494823
71297587
109576992
213617981
357704559
736309037
1378362175
1650771947
1744377976
1897033019
19561980
39494874
71297605
109577010
213617999
357704581
736309059
1378362202
1650771974
1744378003
19561998
39494897
71297626
109577031
213618020
357704613
736309090
1378362238
1650772010
1744378039


# Task 2

> Perform a similar operation to what is described above, this time for Master students. Notice that this data is trickier, as there are many missing records in the IS-Academia database. Therefore, try to guess how much time a master student spent at EPFL by at least checking the distance in months between Master semestre 1 and Master semestre 2. If the Mineur field is not empty, the student should also appear registered in Master semestre 3. Last but not least, don't forget to check if the student also has an entry in the Projet Master tables. Once you can handle this data well, compute the "average stay at EPFL" for master students. Now extract all the students with a Spécialisation and compute the "average stay" for each category of that attribute -- compared to the general average, can you find any specialization for which the difference in average is statistically significant?

# Bonus-Task

> BONUS: perform the gender-based study also on the Master students, as explained in 1. Use scatterplots to visually identify changes over time. Plot males and females with different colors -- can you spot different trends that match the results of your statistical tests?