# ADA - Homework 2

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Global plot appearance for the whole notebook: 'notebook' scaling, white grid background.
sns.set_context(context='notebook')
sns.set_style(style='whitegrid')

## Requesting and parsing data from IS-Academia

In [None]:
import requests
from bs4 import BeautifulSoup

I first parse the filter page to collect every option field (label and value); these are needed to build the queries later.

In [None]:
# Download the filter form of the report. Fail loudly on an HTTP error instead
# of silently parsing an error page into an empty or incorrect `options` dict.
r = requests.get('http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?&ww_i_reportmodel=133685247')
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

# options[<name of the parent element>][<option label>] = <option value>
options = {}
for option in soup.find_all('option'):
    options.setdefault(option.parent.get('name'), {})[option.text] = option.get('value')
options

In [None]:
def get_data(unite_acad,periode_acad,periode_pedago,hiverete):
    """Download one IS-Academia report and return it as a DataFrame.

    Every argument is a human-readable label; it is translated into the value
    sent to the server through the notebook-level `options` dict.

    Parameters
    ----------
    unite_acad : str
        One of options['ww_x_UNITE_ACAD'].keys().
    periode_acad : str
        One of options['ww_x_PERIODE_ACAD'].keys().
    periode_pedago : str
        One of options['ww_x_PERIODE_PEDAGO'].keys().
    hiverete : str
        One of options['ww_x_HIVERETE'].keys().

    Returns
    -------
    pandas.DataFrame
        One column per header cell found in the second <tr> of the page and
        one row per following <tr> whose first child is a <td>, plus four
        columns ('ww_x_UNITE_ACAD', 'ww_x_PERIODE_ACAD', 'ww_x_PERIODE_PEDAGO',
        'ww_x_HIVERETE') holding the labels used for the query. The frame has
        no rows when the page contains fewer than two <tr> elements.

    Raises
    ------
    KeyError
        If a label is not present in `options`.
    requests.HTTPError
        If the server answers with an error status.
    """
    # Passing the parameters separately lets requests URL-encode the values;
    # a list of pairs keeps them in the same order as the original query string.
    params = [
        ('ww_x_GPS', '-1'),
        ('ww_i_reportModel', '133685247'),
        ('ww_i_reportModelXsl', '133685270'),
        ('ww_x_UNITE_ACAD', options['ww_x_UNITE_ACAD'][unite_acad]),
        ('ww_x_PERIODE_ACAD', options['ww_x_PERIODE_ACAD'][periode_acad]),
        ('ww_x_PERIODE_PEDAGO', options['ww_x_PERIODE_PEDAGO'][periode_pedago]),
        ('ww_x_HIVERETE', options['ww_x_HIVERETE'][hiverete]),
    ]
    response = requests.get(
        'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html',
        params=params,
    )
    # Do not try to parse an error page as if it were a report.
    response.raise_for_status()
    page = BeautifulSoup(response.text, 'html.parser')

    table_lines = page.find_all('tr')
    # Column names come from the second <tr>; the first one is not used.
    columns = [th.text for th in table_lines[1].contents] if len(table_lines)>1 else []

    data = {name: [] for name in columns}

    for tr in table_lines[2:]:
        content = tr.contents
        # An empty <tr> has no first child to inspect: skip it rather than
        # failing with an IndexError. Rows not starting with a <td> are skipped too.
        if not content or content[0].name != 'td':
            continue
        for i, name in enumerate(columns):
            data[name].append(content[i].text)

    result = pd.DataFrame(data=data)
    # Remember which query produced these rows.
    result['ww_x_UNITE_ACAD']=unite_acad
    result['ww_x_PERIODE_ACAD']=periode_acad
    result['ww_x_PERIODE_PEDAGO']=periode_pedago
    result['ww_x_HIVERETE']=hiverete
    return result

A small test of the `get_data` function

In [None]:
# Fetch a single report and show its first rows to check the parsing.
get_data(
    unite_acad='Informatique',
    periode_acad='2007-2008',
    periode_pedago='Bachelor semestre 1',
    hiverete="Semestre d'automne",
).head(5)

In [None]:
def get_all_data(unite_acads,periode_acads,periode_pedagos,hiveretes):
    """Fetch the report for every combination of the given labels.

    Parameters
    ----------
    unite_acads, periode_acads, periode_pedagos, hiveretes : iterable of str
        Labels passed, one combination at a time, to `get_data`. Combinations
        are visited with the last argument varying fastest.

    Returns
    -------
    pandas.DataFrame
        The frames returned by `get_data`, concatenated in visiting order
        (each frame keeps its own index, so index values repeat). An empty
        DataFrame when there is no combination to fetch.
    """
    from itertools import product

    frames = []
    for unite_acad, periode_acad, periode_pedago, hiverete in product(
            unite_acads, periode_acads, periode_pedagos, hiveretes):
        # Progress trace: one line per request.
        print("_".join([unite_acad,periode_acad,periode_pedago,hiverete]))
        frames.append(get_data(unite_acad,periode_acad,periode_pedago,hiverete))

    # pd.concat refuses an empty list, so handle "nothing to fetch" explicitly.
    if not frames:
        return pd.DataFrame({})
    # Concatenate once instead of re-copying the accumulated frame at every step.
    return pd.concat(frames)

## Question 1

### Importing the data

In [None]:
# Download every bachelor report of the section, for both semester types.
df_bachelor = get_all_data(
    unite_acads=['Informatique'],
    # Academic years '2007-2008' through '2016-2017'.
    periode_acads=['%d-%d' % (year, year + 1) for year in range(2007, 2017)],
    periode_pedagos=[
        'Bachelor semestre 1',
        'Bachelor semestre 2',
        'Bachelor semestre 3',
        'Bachelor semestre 4',
        'Bachelor semestre 5',
        'Bachelor semestre 6',
        'Bachelor semestre 5b',
        'Bachelor semestre 6b',
    ],
    hiveretes=["Semestre d'automne", 'Semestre de printemps'],
)
print(df_bachelor.shape)
df_bachelor.head(5)

### Exploring the data

In [None]:
# (rows, columns) of the collected data, then a preview of the first rows.
print((df_bachelor.shape[0], df_bachelor.shape[1]))
df_bachelor.head(5)

What about semesters 5b and 6b?
They contain no data, so they can be ignored.

In [None]:
# Shape of the rows labelled 6b, then of those labelled 5b.
print(df_bachelor.loc[df_bachelor['ww_x_PERIODE_PEDAGO'].eq('Bachelor semestre 6b')].shape)
print(df_bachelor.loc[df_bachelor['ww_x_PERIODE_PEDAGO'].eq('Bachelor semestre 5b')].shape)

Counting the students

In [None]:
# Number of distinct 'No Sciper' values (a missing value, if any, counts once).
print(df_bachelor['No Sciper'].nunique(dropna=False))

## Keeping only students with both a Bachelor semestre 1 and a Bachelor semestre 6 entry

In [None]:
# Rows belonging to the first and to the sixth bachelor semester, respectively.
df_bachelor_semestre_1 = df_bachelor.loc[df_bachelor['ww_x_PERIODE_PEDAGO'].eq('Bachelor semestre 1')]
df_bachelor_semestre_6 = df_bachelor.loc[df_bachelor['ww_x_PERIODE_PEDAGO'].eq('Bachelor semestre 6')]

In [None]:
# Keep every row of a student who appears in both the semester-1 and the
# semester-6 subsets.
df_bachelor_filtered = df_bachelor[
    df_bachelor['No Sciper'].isin(df_bachelor_semestre_1['No Sciper'])
    & df_bachelor['No Sciper'].isin(df_bachelor_semestre_6['No Sciper'])
]
print(df_bachelor_filtered.shape)
df_bachelor_filtered.head(5)

Count the remaining students

In [None]:
# Number of distinct 'No Sciper' values left after the filter.
print(df_bachelor_filtered['No Sciper'].nunique(dropna=False))

## Computing the average number of months
Method used:
- group by sex and student
- compute number_of_entries (the number of semesters studied) for each student
- compute the average of (6*number_of_entries)




In [None]:
# One group per (title, student); the size of a group is the number of rows
# that student has in the filtered data.
df_bachelor_grouped = (
    df_bachelor_filtered[['Civilité','No Sciper','ww_x_PERIODE_PEDAGO']]
    .groupby(['Civilité','No Sciper'])
)
semester_counts_by_student = df_bachelor_grouped.size()

print('number of F: %d' % (semester_counts_by_student['Madame'].count()))
print('number of M: %d' % (semester_counts_by_student['Monsieur'].count()))
# Each row is counted as six months.
print('F avg number of months: %f' % (semester_counts_by_student['Madame'].mean()*6))
print('M avg number of months: %f' % (semester_counts_by_student['Monsieur'].mean()*6))

### Statistical significance