# Homework 2 : Data from the web

In [14]:
import pandas as pd
import numpy as np
import requests                # Handle HTTP request
from bs4 import BeautifulSoup  # Pull data out of HTML and XML files
import time
import glob                    # Load files
import re                      # Regexp
import scipy.stats as stats    # Hypothesis Testing

## I - Scraping Data 

This part aims to scrap IS-Academia Data. We get Informatique students from 2007 to 2016 for each Bachelor and Master semester.

### a - Analysing form

In this first part, we get the form paramters (name and values) in order to build future HTTP requests.

In [47]:
base_url = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS'
form_url= base_url + ".filter"
form_get_params = {'ww_i_reportModel': '133685247'}
r = requests.get(form_url, params=form_get_params)
print(r.status_code)                                  # Status code = 200 => OK

200


In [49]:
soup = BeautifulSoup(r.text, 'html.parser')
inputs = soup.select('form select')               # Get all the form inputs
for i in range(len(inputs)):
    print('Param ',i+1,' : ',inputs[i]['name'])

Param  1  :  ww_x_UNITE_ACAD
Param  2  :  ww_x_PERIODE_ACAD
Param  3  :  ww_x_PERIODE_PEDAGO
Param  4  :  ww_x_HIVERETE


In [50]:
for i in range(len(inputs)):
    options = inputs[i].select('option')
    print('Param ',i+1,' : ',inputs[i]['name'], ' - ', len(options), 'values')
    for j in range(len(options)):
        if len(options[j].contents) == 0:
            print('Null', ' - Value : ',options[j]['value'])
        else:
            print(options[j].contents[0], ' - Value : ',options[j]['value'])
    print('\n')

Param  1  :  ww_x_UNITE_ACAD  -  20 values
Null  - Value :  null
Architecture  - Value :  942293
Chimie et génie chimique  - Value :  246696
Cours de mathématiques spéciales  - Value :  943282
EME (EPFL Middle East)  - Value :  637841336
Génie civil  - Value :  942623
Génie mécanique  - Value :  944263
Génie électrique et électronique   - Value :  943936
Humanités digitales  - Value :  2054839157
Informatique  - Value :  249847
Ingénierie financière  - Value :  120623110
Management de la technologie  - Value :  946882
Mathématiques  - Value :  944590
Microtechnique  - Value :  945244
Physique  - Value :  945571
Science et génie des matériaux  - Value :  944917
Sciences et ingénierie de l'environnement  - Value :  942953
Sciences et technologies du vivant  - Value :  945901
Section FCUE  - Value :  1574548993
Systèmes de communication  - Value :  946228


Param  2  :  ww_x_PERIODE_ACAD  -  11 values
Null  - Value :  null
2016-2017  - Value :  355925344
2015-2016  - Value :  213638028
20

### b - Scraping Data

In [173]:
def saveHTML(params, file):
    "Get the result of a request -which parameters are given in params- and save it into a file"
    
    base_url = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS'   
    data_url = base_url + ".html"
    
    form_params = {'ww_x_GPS': '-1', 
                  'ww_i_reportModel': '133685247', 
                  'ww_i_reportModelXsl': '133685270', # HTML
                  'ww_x_UNITE_ACAD': '249847', # Informatique
                  'ww_x_PERIODE_ACAD': '', 
                  'ww_x_PERIODE_PEDAGO': '',
                  'ww_x_HIVERETE': None }
    for key, value in params.items():
        form_params[key] = value
        
    r = requests.get(data_url, params=form_params)

    html_file = open(file, "w")
    html_file.write(r.text)
    html_file.close()
    
    return r 

In [52]:
def scrapBachelor():
    "Scrap the Bachelor Data from 2007 to 2016 for each pedagogic semester"
    
    params = {'ww_x_PERIODE_ACAD':'', 'ww_x_PERIODE_PEDAGO':''}
    ww_x_PERIODE_ACAD = {'2016':'355925344','2015':'213638028','2014':'213637922','2013':'213637754','2012':'123456101',
                        '2011':'123455150','2010 ':'39486325','2009 ':'978195','2008 ':'978187','2007 ':'978181'}
    ww_x_PERIODE_PEDAGO = {'BS1':'249108','BS2':'249114','BS3':'942155','BS4':'942163','BS5':'942120','BS6':'942175'}
    
    for key_acad, value_acad in ww_x_PERIODE_ACAD.items():
        for key_peda, value_peda in ww_x_PERIODE_PEDAGO.items():
            params['ww_x_PERIODE_ACAD'] = value_acad
            params['ww_x_PERIODE_PEDAGO'] = value_peda
            filename = "./DataApo/Bachelor/Informatique_"+key_acad+"_"+key_peda+".html"
            saveHTML(params, filename)
            time.sleep(0.05)
            
scrapBachelor()

In [53]:
def scrapMaster():
    "Scrap the Master Data from 2007 to 2016 for each pedagogic semester (and project)"
    
    params = {'ww_x_PERIODE_ACAD':'', 'ww_x_PERIODE_PEDAGO':''}
    ww_x_PERIODE_ACAD = {'2016':'355925344','2015':'213638028','2014':'213637922','2013':'213637754','2012':'123456101',
                        '2011':'123455150','2010 ':'39486325','2009 ':'978195','2008 ':'978187','2007 ':'978181'}
    ww_x_PERIODE_PEDAGO = {'MS1':'2230106','MS2':'942192','MS3':'2230128','MS4':'2230140',
                           'ProjetAutomne':'249127','ProjetPrintemps':'3781783'}
    
    for key_acad, value_acad in ww_x_PERIODE_ACAD.items():
        for key_peda, value_peda in ww_x_PERIODE_PEDAGO.items():
            params['ww_x_PERIODE_ACAD'] = value_acad
            params['ww_x_PERIODE_PEDAGO'] = value_peda
            filename = "./DataApo/Master/Informatique_"+key_acad+"_"+key_peda+".html"
            saveHTML(params, filename)
            time.sleep(0.05)
scrapMaster()

### c - Parsing HTML

In [2]:
def parseHTML(file_content):
    "Convert the HTML table into a DataFrame"
    table = BeautifulSoup(file_content, 'html.parser').find('table')
    rows_html = table.findAll('tr')
    header = [th.string for th in rows_html[1]] # row[0] is the title
    # we drop the last column (empty) and the first row (header)
    rows = [[td.string for td in row_html.findAll('td')][:-1] for row_html in rows_html[2:]]
    return pd.DataFrame(data=rows, columns=header)

In [3]:
def get_file_content(file_path):
    "Read a file content and return it as a string"
    file = open(file_path, "r")
    file_content = file.read()
    file.close() 
    return file_content

def getYearSemester(file_path):      # file_path example : './ApoData/Bachelor/Informatique_2007 _BS1.html'
    "Retrieve the year and the semester associated to a file using its name"
    year = int(re.search("[0-9]{4}", file_path).group(0))
    num_semester = int(re.findall("[0-9]", file_path)[-1])
    return year, num_semester

## II - Analysing Bachelor Data

### a - Loading the data

Bachelor Data are loaded from the HTML files previously scrapped.<br/>
Each HTML table is converted to a dataframe thanks to the <i>parseHTML</i> function.<br/>
Year and Bachelor Semester are also added to the dataFrame.<br/>
Then, the tables are concatenated by rows (which is an easy operation because all the dataframes have the same form).

In [4]:
def loadBachelorData():
    folder_path = "./DataApo/Bachelor"
    paths_iterator = glob.iglob(folder_path+'/*.html')
    for idx, file_path in enumerate(paths_iterator):
        file_content = get_file_content(file_path)
        df_temp = parseHTML(file_content)
        year, num_semester = getYearSemester(file_path)
        df_temp['Year'] = year
        df_temp['Bachelor Semester'] = num_semester
        if idx == 0:
            df = df_temp
        else: 
            df = pd.concat([df,df_temp], axis=0)
    return df

In [5]:
df = loadBachelorData()
df.to_csv("Bachelor.csv")
df.head()

Unnamed: 0,Civilité,Nom Prénom,Orientation Bachelor,Orientation Master,Spécialisation,Filière opt.,Mineur,Statut,Type Echange,Ecole Echange,No Sciper,Year,Bachelor Semester
0,Monsieur,Arévalo Christian,,,,,,Présent,,,169569,2007,1
1,Monsieur,Aubelle Flavien,,,,,,Présent,,,174905,2007,1
2,Monsieur,Badoud Morgan,,,,,,Présent,,,173922,2007,1
3,Monsieur,Baeriswyl Jonathan,,,,,,Présent,,,179406,2007,1
4,Monsieur,Barroco Michael,,,,,,Présent,,,179428,2007,1


Now, we want to keep the students who get their Bachelor.<br/>
In order to do this, we check if the students have at least one entry for Bachelor Semester 1 and Bachelor Semester 6 (otherwise, we consider that the student failed).<br/>
We will see later the problems we can encounter by applying this method naively.

In [174]:
def semesterOneToSix(sciper, df):
    "Tell if a student has entry for both Bachelor Semester 1 and Semester 6, and his number of Bachelor semesters"
    
    # Get all the semesters of a student
    studentSemesters = df.loc[df['No Sciper'] == sciper] # = df[df.isin([sciper])['No Sciper']]
    # Check semester 1 and 6
    semesterOne = studentSemesters.isin([1])['Bachelor Semester'].value_counts() 
    semesterSix = studentSemesters.isin([6])['Bachelor Semester'].value_counts()
    # At least one 1st and 6th semester
    if (True in semesterOne.keys() and semesterOne[True] > 0) and (True in semesterSix.keys() and semesterSix[True] > 0): 
        return True, len(studentSemesters)
    else:
        return False, len(studentSemesters)

We build a new DataFrame with all the information we need : the student sciper, the gender and the numbers of Bachelor semesters of each student who succeed his Bachelor.

In [7]:
success_bachelor = {'No Sciper': [], 'Gender': [], 'Number of semesters': []}
for row in df.iterrows():
    sciper = row[1]['No Sciper']
    if not (sciper in success_bachelor['No Sciper']):
        success, nb_semesters = semesterOneToSix(sciper, df)
        if success:             # Only the students who have entry for semester 1 and 6
            success_bachelor['No Sciper'].append(sciper)
            success_bachelor['Gender'].append(row[1]['Civilité'])
            success_bachelor['Number of semesters'].append(nb_semesters)      

In [9]:
success_bachelor = pd.DataFrame(success_bachelor)
success_bachelor.head()

Unnamed: 0,Gender,No Sciper,Number of semesters
0,Monsieur,169569,6
1,Monsieur,174905,10
2,Monsieur,179406,8
3,Monsieur,179428,8
4,Monsieur,179449,6


In [10]:
success_bachelor['Number of semesters'].describe()

count    397.000000
mean       7.083123
std        1.524428
min        4.000000
25%        6.000000
50%        6.000000
75%        8.000000
max       12.000000
Name: Number of semesters, dtype: float64

According to this data, it takes approximately 7 semesters to a student to go from the first to the sixth semester.<br/>
Besides, we observe that the minimum of semesters necessary to complete the Bachelor is 4 instead of 6.

In [346]:
success_bachelor.loc[success_bachelor['Number of semesters'] < 6]

Unnamed: 0,Gender,No Sciper,Number of semesters
241,Monsieur,204222,4


Only one student is concerned. Let's have a look at him :

In [99]:
def getStudentSemesters(df, sciper):
    "Get all the bachelor semesters of a student"
    
    return df.loc[df['No Sciper'] == sciper]

In [349]:
getStudentSemesters(df, '204222')

Unnamed: 0,Civilité,Nom Prénom,Orientation Bachelor,Orientation Master,Spécialisation,Filière opt.,Mineur,Statut,Type Echange,Ecole Echange,No Sciper,Year,Bachelor Semester
149,Monsieur,Séguy Louis Marie James,,,,,,Présent,,,204222,2011,1
108,Monsieur,Séguy Louis Marie James,,,,,,Présent,,,204222,2011,2
116,Monsieur,Séguy Louis Marie James,,,,,,Présent,,,204222,2014,5
95,Monsieur,Séguy Louis Marie James,,,,,,Présent,,,204222,2014,6


Semesters 4 and 5 are missing, just as years 2012 an 2013. We can suppose that this student is an outlier caused by missing data in IS-Academia.<br/>
So, we chose to delete it in order to not distort the stats:

In [98]:
outlier_index = success_bachelor.loc[success_bachelor['Number of semesters'] < 6].index
success_bachelor = success_bachelor.drop(outlier_index)
len(success_bachelor.loc[success_bachelor['Number of semesters'] < 6])

0

### b - Hypothesis Testing

We want to know if the male and female students need the same time to go from de 1st semester to the 6th semester.

In [11]:
men=success_bachelor.loc[success_bachelor['Gender'] == 'Monsieur']
men.describe()

Unnamed: 0,Number of semesters
count,368.0
mean,7.105978
std,1.536891
min,4.0
25%,6.0
50%,6.0
75%,8.0
max,12.0


In [12]:
women=success_bachelor.loc[success_bachelor['Gender'] == 'Madame']
women.describe()

Unnamed: 0,Number of semesters
count,29.0
mean,6.793103
std,1.346406
min,6.0
25%,6.0
50%,6.0
75%,8.0
max,11.0


- We observe that male are represented by 368 students whereas female are only 29.
- The women need on average 6.8 semesters whereas the men need 7.1 semester.<br/>
The difference is quite small, and the quantiles are the same for both of the groups.<br/>
- Let's see if the difference in average statistically significant.

Let's apply Hypothesis Testing:<br/>
- <b>H0</b> : mean(female) = mean(male)
- <b>H1</b> : mean(female) != mean(male)

Our test statistic is therefore mean(female) - mean(male)<br/>
As we compare 2 populations, we chose to use a <b>two-sample t-test</b>. For this test, we say the variances are different because we observed different std in the 2 precedent tables.<br/>

In [17]:
stats.ttest_ind(a=women['Number of semesters'], b=men['Number of semesters'], equal_var=False)

Ttest_indResult(statistic=-1.191705695448116, pvalue=0.24162457057331926)

We obtain a p-value = 0.22 > 0.05. Therefore, we keep the null hypothesis (H0) and conclude that the difference in average isn't statistically significant with a significance level of 0.05.

<b>Filtering Data</b>

Up to now, we considered that :
- a student succeeded his Bachelor if he had an entry for both Bachelor semester 1 and 6. 
- the number of his Bachelor semesters was egal to his number of entries.

However, there are 2 problems :
- A student who made his Bachelor semester 1 in 2007 can have made another semester 1 in 2006.
- A student who made a Bachelor semester 5 in 2016 and not Bachelor semester 6 in 2015 is likely to make 2 extra semesters. 

In [164]:
df2 = df

# unique index
id_range = np.array(range(len(df)))
df2.index = id_range
df2.index.is_unique

True

We get the No Sciper of students who registered to a Bachelor semester 1 in 2007 and who didn't make a second semester 1 in 2008 (we can't make 3 times the same semester)

In [117]:
index_2007 = df2.loc[df['Year'] == 2007].index
index_2008 = df2.loc[df['Year'] == 2008].index
index_ba1 = df2.loc[df['Bachelor Semester'] == 1].index

index_ba1_2007 = np.intersect1d(index_2007,index_ba1)
index_ba1_2008 = np.intersect1d(index_2008,index_ba1)

sciper_ba1_2007 = df2.iloc[index_ba1_2007]['No Sciper'] # here, iloc and df2.index are the same
sciper_ba1_2008 = df2.iloc[index_ba1_2008]['No Sciper']

sciper_ba1_2007_without_ba1_2008 = np.setdiff1d(sciper_ba1_2007, sciper_ba1_2008)

len(sciper_ba1_2007), len(sciper_ba1_2008), len(sciper_ba1_2007_without_ba1_2008)

(90, 96, 76)

We get the No Sciper of students who registered to a Bachelor semester 5 in 2016 and who didn't make semester 6 in 2015.

In [175]:
index_2015 = df2.loc[df['Year'] == 2015].index
index_2016 = df2.loc[df['Year'] == 2016].index
index_ba5 = df2.loc[df['Bachelor Semester'] == 5].index
index_ba6 = df2.loc[df['Bachelor Semester'] == 6].index

index_ba6_2015 = np.intersect1d(index_2015,index_ba6)
index_ba5_2016 = np.intersect1d(index_2016,index_ba5)

sciper_ba6_2015 = df2.iloc[index_ba6_2015]['No Sciper']
sciper_ba5_2016 = df2.iloc[index_ba5_2016]['No Sciper']

sciper_ba6_2015_and_ba5_2016 = np.setdiff1d(sciper_ba5_2016, sciper_ba6_2015)

len(sciper_ba6_2015), len(sciper_ba5_2016), len(sciper_ba6_2015_and_ba5_2016)

(104, 119, 88)

If we note :
- A : The ensemble of the students who have an entry for both semester 1 and 6
- B : The union of the students to drop

Then we obtain sciper of the students to keep by A\B.

In [176]:
sciperToDrop = np.union1d(sciper_ba1_2007_without_ba1_2008, sciper_ba6_2015_and_ba5_2016)
sciperDropped = np.intersect1d(sciperToDrop, success_bachelor['No Sciper'])
sciperToKeep = np.setdiff1d(success_bachelor['No Sciper'], sciperDropped)
len(sciperToDrop), len(sciperDropped), len(success_bachelor), len(sciperToKeep)

(164, 61, 396, 335)

An inner join enables to keep the desired students.

In [177]:
sciperToKeep_df = pd.DataFrame(sciperToKeep, columns=["No Sciper"])
success_bachelor2 = (pd.merge(success_bachelor,sciperToKeep_df, how="inner"))
success_bachelor2.head()

Unnamed: 0,Gender,No Sciper,Number of semesters
0,Monsieur,175379,8
1,Monsieur,181244,12
2,Monsieur,184772,6
3,Monsieur,186264,11
4,Monsieur,185949,6


The new dataframe is described by the following statistics values :

In [178]:
success_bachelor2.describe()

Unnamed: 0,Number of semesters
count,335.0
mean,7.197015
std,1.579241
min,6.0
25%,6.0
50%,6.0
75%,8.0
max,12.0


In [179]:
men2=success_bachelor2.loc[success_bachelor2['Gender'] == 'Monsieur']
men2.describe()

Unnamed: 0,Number of semesters
count,309.0
mean,7.223301
std,1.592996
min,6.0
25%,6.0
50%,6.0
75%,8.0
max,12.0


In [180]:
women2=success_bachelor2.loc[success_bachelor2['Gender'] == 'Madame']
women2.describe()

Unnamed: 0,Number of semesters
count,26.0
mean,6.884615
std,1.395046
min,6.0
25%,6.0
50%,6.0
75%,8.0
max,11.0


The difference between male and female students is a bit more elevated than previouly, but stays pretty small.

In [181]:
stats.ttest_ind(a=women['Number of semesters'], b=men['Number of semesters'], equal_var=False)

Ttest_indResult(statistic=-1.2242690386586714, pvalue=0.22927095396453942)

The two-sample t-test gives a p-value = 0.22 > 0.05.<br/> 
Again, we keep the null hypothesis (H0) and conclude that the difference in average isn't statistically significant with a significance level of 0.05.

## III - Analysing Master Data

### a - Loading the data

Master Data are loaded from the HTML files previously scrapped.
Each HTML table is converted to a dataframe thanks to the parseHTML function.
Year and Bachelor Semester are also added to the dataFrame.
Then, the tables are concatenated by rows (which is an easy operation because all the dataframes have the same form).

In [185]:
def loadMasterData():
    folder_path = "./DataApo/Master"
    paths_iterator = glob.iglob(folder_path+'/*.html')
    for idx, file_path in enumerate(paths_iterator):
        file_content = get_file_content(file_path)
        df_temp = parseHTML(file_content)
        year, num_semester = getYearSemester(file_path)
        df_temp['Year'] = year
        df_temp['Bachelor Semester'] = num_semester
        if idx == 0:
            df = df_temp
        else: 
            df = pd.concat([df,df_temp], axis=0)
    return df

In [186]:
df = loadBachelorData()
df.to_csv("Master.csv")
df.head()

Unnamed: 0,Civilité,Nom Prénom,Orientation Bachelor,Orientation Master,Spécialisation,Filière opt.,Mineur,Statut,Type Echange,Ecole Echange,No Sciper,Year,Bachelor Semester
0,Monsieur,Arévalo Christian,,,,,,Présent,,,169569,2007,1
1,Monsieur,Aubelle Flavien,,,,,,Présent,,,174905,2007,1
2,Monsieur,Badoud Morgan,,,,,,Présent,,,173922,2007,1
3,Monsieur,Baeriswyl Jonathan,,,,,,Présent,,,179406,2007,1
4,Monsieur,Barroco Michael,,,,,,Présent,,,179428,2007,1


In [188]:
# unique index
id_range = np.array(range(len(df)))
df.index = id_range
df.index.is_unique

True

### b - Hypothesis Testing