# Homework: Data from the Web

In [288]:
# setup
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  # default='warn'
import requests
from bs4 import BeautifulSoup
import html5lib
import scipy.stats as stats
import matplotlib.pyplot as plt
import math


#### Examination of IS Academia

We want to examine the IS-Academia page to see what kind of information to pass

In [39]:
# Send an URL request to see the prameters we will need later for getting the data
r = requests.get('http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_b_list=1&ww_i_reportmodel=133685247&ww_c_langue=&ww_i_reportModelXsl=133685270&zz_x_UNITE_ACAD=Informatique&ww_x_UNITE_ACAD=249847&zz_x_PERIODE_ACAD=2007-2008&ww_x_PERIODE_ACAD=978181&zz_x_PERIODE_PEDAGO=Bachelor+semestre+1&ww_x_PERIODE_PEDAGO=249108&zz_x_HIVERETE=Semestre+d%27automne&ww_x_HIVERETE=2936286&dummy=ok')


# get the data of the HTML file
soup = BeautifulSoup(r.text, 'html.parser')


# create the Series (like a dictionnary) that will give us later the code for each parameter that will allow us to get the data
keys = []
values = []
for test in soup.find_all('option') :
    if(test.contents) == [] :
        continue
    keys.append(test.contents[0])
    values.append(test['value'])



encode = pd.Series(data = values, index = keys)
encode


Architecture                                     942293
Chimie et génie chimique                         246696
Cours de mathématiques spéciales                 943282
EME (EPFL Middle East)                        637841336
Génie civil                                      942623
Génie mécanique                                  944263
Génie électrique et électronique                 943936
Humanités digitales                          2054839157
Informatique                                     249847
Ingénierie financière                         120623110
Management de la technologie                     946882
Mathématiques                                    944590
Microtechnique                                   945244
Physique                                         945571
Science et génie des matériaux                   944917
Sciences et ingénierie de l'environnement        942953
Sciences et technologies du vivant               945901
Section FCUE                                 157

#### Examination of one table

For examination we extract the html table for bachelor students in their first semester in 2007-2008

In [40]:
# dict to pass the right values to the form
bach0708 = {'ww_x_GPS':'-1', 'ww_i_reportmodel':'133685247','ww_i_reportModelXsl':'133685270','ww_x_UNITE_ACAD':'249847', 'ww_x_PERIODE_ACAD':'978181', 'ww_x_PERIODE_PEDAGO':'249108', 'ww_x_HIVERETE':'2936286' }
r = requests.get('http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html', params=bach0708)
soup = BeautifulSoup(r.text, 'html.parser')

In [1]:
# we examine the table (we commented it because it take too much place in the document)
#soup.find_all('table')

In [42]:
# we examine the headers of the table
soup.find_all('th')

[<th colspan="12"><font color="black">Informatique, 2007-2008, Bachelor semestre 1</font>
  (90 ét.)
     </th>,
 <th>Civilité</th>,
 <th>Nom Prénom</th>,
 <th>Orientation Bachelor</th>,
 <th>Orientation Master</th>,
 <th>Spécialisation</th>,
 <th>Filière opt.</th>,
 <th>Mineur</th>,
 <th>Statut</th>,
 <th>Type Echange</th>,
 <th>Ecole Echange</th>,
 <th>No Sciper</th>]

### Exercise 1 : Bachelor

Download all public data from http://isa.epfl.ch/imoniteur_ISAP/%21gedpublicreports.htm?ww_i_reportmodel=133685247 for Informatiques section bachelor students since 2007.

#### Bachelor records

Now that we have studied some particular cases to see how is encoded the data in HTML we can code more general methods

We define two methods "tosoup" that takes the pedagocic period as parameter and returns a parsed soup object, and "bachelortoframe" that takes a soup object as parameter and returns a data frame with student number, name, sex and year.

In [43]:
# takes code pedagogic period as parameter, returns the parsed soup object
def tosoup(periode_pedago):
    bach = {'ww_x_GPS':'-1', 'ww_i_reportmodel':'133685247','ww_i_reportModelXsl':'133685270','ww_x_UNITE_ACAD':encode['Informatique'], 'ww_x_PERIODE_ACAD':'', 'ww_x_PERIODE_PEDAGO':periode_pedago, 'ww_x_HIVERETE':'' }
    r = requests.get('http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html', params=bach)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

In [44]:
# takes a soup object as parameter and returns a data frame with all data that is linked to the pedagogic period of the soup parameter
def bachelortoframe(soup):
    
    # create headers for the data frame
    headers = soup.find_all('th')
    H = [h.text.strip() for h in headers]

    #remove the extra column with the table name "Informatique ..." and the repetitions of the header
    del H[0] 
    del H[11:len(H)]

    # find all rows
    datatable_rows = soup.find_all('tr')
    # iterate the data in td cells to an array
    # iterate the year data from the header to an year array
    data = []
    year = []
    for row in datatable_rows:
        # if row is a header and has a tag font it has information about the table: section, period and year
        if (row.th != None and row.th.font):
            # select the year from the header 
            actual_year = row.th.font.contents[0].split(",")[1]
        # if row is not a header find all tds    
        elif (row.th == None and int(actual_year.split("-")[0])>=2007):
            td = row.find_all('td')      
            rowvals = {}
            for i in range(len(H)):
                year.append(actual_year)
                rowvals[H[i]] = td[i].get_text()
                data.append(rowvals)
               
    df = pd.DataFrame(data, columns=H)
    # keep interesting columns     
    df= df[['No Sciper', 'Civilité', 'Nom Prénom']]
    # add year information
    df ['year'] = year
    
    return df




In [45]:
# take all bachelor semester 1 registrations and put them in a dataframe
bachelor1 = bachelortoframe(tosoup(encode['Bachelor semestre 1']))

# we want to keep only one value for bachelor semester 1 which should be the first entry 
bachelor1.drop_duplicates(subset = 'No Sciper' ,keep = 'first', inplace = True)

# we set the sciper number as the index
bachelor1 = bachelor1.set_index('No Sciper')

# add the starting year
start_year = [year.split("-")[0] for year in bachelor1.year]
bachelor1['Start year'] = start_year

# rename year column to Bachelor Semester 1
bachelor1.rename(columns={'year': 'Bachelor semester 1'}, inplace=True)

bachelor1

Unnamed: 0_level_0,Civilité,Nom Prénom,Bachelor semester 1,Start year
No Sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
169569,Monsieur,Arévalo Christian,2007-2008,2007
174905,Monsieur,Aubelle Flavien,2007-2008,2007
173922,Monsieur,Badoud Morgan,2007-2008,2007
179406,Monsieur,Baeriswyl Jonathan,2007-2008,2007
179428,Monsieur,Barroco Michael,2007-2008,2007
179324,Monsieur,Belfis Nicolas,2007-2008,2007
174597,Monsieur,Beliaev Stanislav,2007-2008,2007
179449,Monsieur,Bindschaedler Vincent,2007-2008,2007
178553,Monsieur,Bloch Marc-Olivier,2007-2008,2007
179426,Monsieur,Bloch Remi,2007-2008,2007


In [46]:
# take all bachelor semester 6 registrations and put them in a dataframe
bachelor6 = bachelortoframe(tosoup(encode['Bachelor semestre 6']))

# we want to keep only one value for bachelor semester 6 which should be the last entry (if the student didn't fail 3rd year)
bachelor6.drop_duplicates(subset = 'No Sciper',keep='last', inplace = True)

# we set the sciper number as the index
bachelor6 = bachelor6.set_index('No Sciper')

# add the Ending year
end_year = [year.split("-")[1] for year in bachelor6.year]
bachelor6['End year'] = end_year

# rename year column to Bachelor Semester 6
bachelor6.rename(columns={'year': 'Bachelor semester 6'}, inplace=True)

# we already have name and sex information associated with the student number so we drop those columns
bachelor6 = bachelor6.drop('Civilité', axis=1).drop('Nom Prénom', axis=1)

In [47]:
bachelor6.head()

Unnamed: 0_level_0,Bachelor semester 6,End year
No Sciper,Unnamed: 1_level_1,Unnamed: 2_level_1
161634,2007-2008,2008
170451,2007-2008,2008
170219,2007-2008,2008
153762,2007-2008,2008
166548,2007-2008,2008


In [48]:
# merge the two tables based on the student number that is index
bachelor1_6 = pd.merge(bachelor1, bachelor6, left_index=True, right_index=True)
bachelor1_6.head()

Unnamed: 0_level_0,Civilité,Nom Prénom,Bachelor semester 1,Start year,Bachelor semester 6,End year
No Sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
169569,Monsieur,Arévalo Christian,2007-2008,2007,2009-2010,2010
179449,Monsieur,Bindschaedler Vincent,2007-2008,2007,2009-2010,2010
178553,Monsieur,Bloch Marc-Olivier,2007-2008,2007,2009-2010,2010
178271,Monsieur,Boéchat Marc-Alexandre,2007-2008,2007,2009-2010,2010
180731,Monsieur,Bricola Jean-Charles,2007-2008,2007,2009-2010,2010


## in case someone repeated semester 5 after their last semester 6 

In [49]:
# take all bachelor semester 5 registrations and put them in a dataframe
bachelor5 = bachelortoframe(tosoup(encode['Bachelor semestre 5']))

# we want to keep only one value for bachelor semester 5 which should be the last entry 
bachelor5.drop_duplicates(subset = 'No Sciper', keep='last', inplace = True)

# we set the sciper number as the index
bachelor5 = bachelor5.set_index('No Sciper')

# add the Bachelor 5 Ending year
end_year = [year.split("-")[1] for year in bachelor5.year]
bachelor5['Bachelor 5 end year'] = end_year

# rename year column to Bachelor Semester 5
bachelor5.rename(columns={'year': 'Bachelor semester 5'}, inplace=True)

# we remove the repeted columns 'civilité' and 'Nom Prénom'
bachelor5 = bachelor5.drop('Civilité', axis=1).drop('Nom Prénom', axis=1)



In [50]:
bachelor5.head()

Unnamed: 0_level_0,Bachelor semester 5,Bachelor 5 end year
No Sciper,Unnamed: 1_level_1,Unnamed: 2_level_1
154157,2007-2008,2008
160213,2007-2008,2008
161634,2007-2008,2008
170451,2007-2008,2008
170219,2007-2008,2008


In [51]:
# we merge the merged data frame bachelor 1 and 6 with the extra bachelor 5 data frame
all_bachelor = pd.merge(bachelor1_6, bachelor5, left_index=True, right_index=True)

all_bachelor.head()

Unnamed: 0_level_0,Civilité,Nom Prénom,Bachelor semester 1,Start year,Bachelor semester 6,End year,Bachelor semester 5,Bachelor 5 end year
No Sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
169569,Monsieur,Arévalo Christian,2007-2008,2007,2009-2010,2010,2009-2010,2010
179449,Monsieur,Bindschaedler Vincent,2007-2008,2007,2009-2010,2010,2009-2010,2010
178553,Monsieur,Bloch Marc-Olivier,2007-2008,2007,2009-2010,2010,2009-2010,2010
178271,Monsieur,Boéchat Marc-Alexandre,2007-2008,2007,2009-2010,2010,2009-2010,2010
180731,Monsieur,Bricola Jean-Charles,2007-2008,2007,2009-2010,2010,2009-2010,2010


In [52]:
# we compare the end year of bachelor 5 and 6, if the end year for bachelor 5 is later than for bachelor 6
# we add 0.5 (years) in a new column "Extra semester, otherwise we add 0

all_bachelor['Extra semester'] = np.where(all_bachelor['Bachelor 5 end year'] > all_bachelor['End year'] , 0.5, 0)
all_bachelor.head(50)

Unnamed: 0_level_0,Civilité,Nom Prénom,Bachelor semester 1,Start year,Bachelor semester 6,End year,Bachelor semester 5,Bachelor 5 end year,Extra semester
No Sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
169569,Monsieur,Arévalo Christian,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0
179449,Monsieur,Bindschaedler Vincent,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0
178553,Monsieur,Bloch Marc-Olivier,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0
178271,Monsieur,Boéchat Marc-Alexandre,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0
180731,Monsieur,Bricola Jean-Charles,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0
171619,Monsieur,Buchschacher Nicolas,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0
178656,Monsieur,Curreli Alexandre,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0
178706,Madame,Falbriard Stéphanie,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0
181121,Monsieur,Frund Loïc,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0
180570,Monsieur,Habfast Paul,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0


In [53]:
#Convert the dataFrame columns so we can compute the number of year spent by each student in bachelor
all_bachelor_start = [int(i) for i in all_bachelor['Start year']]
all_bachelor_end   = [int(i) for i in all_bachelor['End year']]
all_bachelor_extra =  [float(i) for i in all_bachelor['Extra semester']]

#number of year spent by each student in bachelor
all_bachelor['Stay in years'] = [all_bachelor_end[i] - all_bachelor_start[i] + all_bachelor_extra[i] for i in range(len(all_bachelor))]
all_bachelor

Unnamed: 0_level_0,Civilité,Nom Prénom,Bachelor semester 1,Start year,Bachelor semester 6,End year,Bachelor semester 5,Bachelor 5 end year,Extra semester,Stay in years
No Sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
169569,Monsieur,Arévalo Christian,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0,3.0
179449,Monsieur,Bindschaedler Vincent,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0,3.0
178553,Monsieur,Bloch Marc-Olivier,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0,3.0
178271,Monsieur,Boéchat Marc-Alexandre,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0,3.0
180731,Monsieur,Bricola Jean-Charles,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0,3.0
171619,Monsieur,Buchschacher Nicolas,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0,3.0
178656,Monsieur,Curreli Alexandre,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0,3.0
178706,Madame,Falbriard Stéphanie,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0,3.0
181121,Monsieur,Frund Loïc,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0,3.0
180570,Monsieur,Habfast Paul,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0,3.0


In [54]:
all_bachelor['Stay in months'] = all_bachelor['Stay in years']*12
all_bachelor.head()


Unnamed: 0_level_0,Civilité,Nom Prénom,Bachelor semester 1,Start year,Bachelor semester 6,End year,Bachelor semester 5,Bachelor 5 end year,Extra semester,Stay in years,Stay in months
No Sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
169569,Monsieur,Arévalo Christian,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0,3.0,36.0
179449,Monsieur,Bindschaedler Vincent,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0,3.0,36.0
178553,Monsieur,Bloch Marc-Olivier,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0,3.0,36.0
178271,Monsieur,Boéchat Marc-Alexandre,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0,3.0,36.0
180731,Monsieur,Bricola Jean-Charles,2007-2008,2007,2009-2010,2010,2009-2010,2010,0.0,3.0,36.0


In [55]:
# for analysis we need the sex and duration in months
all_bachelor = all_bachelor[['Civilité','Stay in months']]

In [56]:
all_bachelor.groupby(['Civilité']).mean()

Unnamed: 0_level_0,Stay in months
Civilité,Unnamed: 1_level_1
Madame,40.758621
Monsieur,42.880435


In [57]:
all_bachelor.describe()

Unnamed: 0,Stay in months
count,397.0
mean,42.725441
std,9.372089
min,36.0
25%,36.0
50%,36.0
75%,48.0
max,84.0


As we need to see if the difference in average statistically significant for the months spent in bachelor in EPFL between man and woman, we need to so Two-Sample T-Test. Let us take a significant level of 5%

In [290]:
man = []
woman = []
for i in range(len(all_bachelor)) :
    if all_bachelor['Civilité'][i] == 'Monsieur':
        man.append(all_bachelor['Stay in months'][i])
    else :
        woman.append(all_bachelor['Stay in months'][i])
        

stats.ttest_ind(a= man,
                b= woman,
                equal_var=False)

Ttest_indResult(statistic=1.3437005678090845, pvalue=0.18785555340784144)

We can see that the pvalue is greater than our significant level. hence we cannot reject the hypothesis that the average numbers of months spent in bachelor at EPFL for a man or a woman isn't equal (it can be the same average)

### Exercise 2 : Master

The IS-Academia data for master students is more complicated. We take all students who finished M1 and M2 at EPFL into account. We consider first entry in M1 as start year and we double-check whether students registered for M1 again after completing the

For masters we modify the toframe method to include more columns as now we are interested in specialisations and minors.

In [146]:
def masterstoframe(soup):
    # create headers for the data frame
    headers = soup.find_all('th')
    H = [h.text.strip() for h in headers]

    #remove the extra column with the table name "Informatique ..." and the repetitions of the header
    del H[0] 
    del H[11:len(H)]

    # find all rows
    datatable_rows = soup.find_all('tr')

    # iterate the data in td cells to an array
    # iterate the year data from the header to an year array
    data = []
    year = []
    for row in datatable_rows:
        # if row is a header and has a tag font (information about the table: section, period and year)
        if (row.th != None and row.th.font):
            # select the year from the header 
            actual_year = row.th.font.contents[0].split(",")[1]
        # if row is not a header find all tds    
        elif (row.th == None and int(actual_year.split("-")[0])>=2007): 
            td = row.find_all('td')      
            rowvals = {}
            for i in range(len(H)):
                year.append(actual_year)
                rowvals[H[i]] = td[i].get_text()
                data.append(rowvals)
               
    df = pd.DataFrame(data, columns=H)
    
    # keep interesting columns
    df = df[['No Sciper', 'Civilité', 'Nom Prénom', 'Spécialisation', 'Mineur']]
    
    # add year information
    df['year'] = year
    
    # drop the duplicates of the data frame
    df = df.drop_duplicates()

    # set the student number as the Index
    df = df.set_index(['No Sciper'])
    
    return df

In [236]:

# take all master semester 1 registrations and put them in a dataframe
master1 = masterstoframe(tosoup(encode['Master semestre 1']))
# add master level column
master1['Level'] = 'MA1'


# take all master semester 2 registrations and put them in a dataframe
master2 = masterstoframe(tosoup(encode['Master semestre 2']))
#add master level column
master2['Level'] = 'MA2'



# take all master semester 3 registrations and put them in a dataframe
master3 = masterstoframe(tosoup(encode['Master semestre 3']))
#add master level column
master3['Level'] = 'MA3'

# take all master autumn and spring registrations and put them in two dataframes
autumn_project = masterstoframe(tosoup(encode['Projet Master automne']))
autumn_project['Level'] = 'Autumn Project'
spring_project = masterstoframe(tosoup(encode['Projet Master printemps']))
spring_project['Level'] = 'Semester Project'

# concatenate all masters semesters and projects
master123 = pd.concat([master1,master2,master3, autumn_project, spring_project
                       
# sort by index to get every master semester and project done by each student
master123 = master123.sort_index()

# show the obtained dataFrame
master123

Unnamed: 0_level_0,Civilité,Nom Prénom,Spécialisation,Mineur,year,Level
No Sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
128911,Monsieur,Gulati Asheesh,Internet computing,,2007-2008,MA3
128911,Monsieur,Gulati Asheesh,Internet computing,,2007-2008,MA2
129093,Monsieur,Zhou Maoan,,,2007-2008,MA2
129093,Monsieur,Zhou Maoan,,,2007-2008,MA3
129326,Monsieur,Ni Zhong Zhong,,,2007-2008,MA2
129326,Monsieur,Ni Zhong Zhong,,,2007-2008,MA3
138088,Monsieur,Droz-dit-Busset Arnault,,,2007-2008,MA1
145546,Monsieur,Clivaz Jean-Philippe,,,2007-2008,MA3
145957,Monsieur,Hügli Michael,,,2007-2008,MA3
145957,Monsieur,Hügli Michael,,,2007-2008,Semester Project


Now that we have a dataFrame describing the number of semesters of each student with the number of semester done ordered, we can count the duration of each student in year at EPFL. 
It is important to specify that for the masters, we don't need to take into account the year of begging and ending of each person as even if they stopped courses in between, they are registered as in 'congé' or  'stage'.
We also didn't count people who has done only one semester as they probably didn't get their master at EPFL.

In [170]:
# number of semesters spent by each student in months by Sciper numbers
master123.index.value_counts()

166491    8
146742    7
204393    7
194182    7
179988    7
201600    7
181244    7
192861    6
170224    6
202508    6
184814    6
153066    6
154573    6
226511    6
200932    6
170235    6
181248    6
170530    6
225434    6
178682    6
200246    5
214225    5
245293    5
217469    5
196065    5
204253    5
243164    5
179703    5
244136    5
160213    5
         ..
268464    1
172257    1
268695    1
235568    1
192555    1
153642    1
153506    1
200168    1
152238    1
227120    1
234033    1
235350    1
272620    1
210707    1
221053    1
252556    1
268907    1
233387    1
253734    1
225203    1
272025    1
215605    1
224640    1
224638    1
180161    1
268226    1
272324    1
269057    1
147445    1
153640    1
dtype: int64

In [325]:
# copy the previous merged master frame
semesters_frame = master123.copy()

# get the number of semesters done by each student
stay = master123.index.value_counts()

# just take the first No Sciper of each student (in other terms drop the duplicates by index)
semesters_frame = semesters_frame.groupby(semesters_frame.index).first()

# create a new column in the frame with the number of semesters done by each student
semesters_frame['stay_in_months'] = stay.values

# drop unecessary columns for our analysis
semesters_frame.drop(['year', 'Level'], axis = 1, inplace = True)

# remove student that were registered in one semester or less
semesters_frame = semesters_frame[semesters_frame.stay_in_months >= 2]

# convert the stay from semesters to months
semesters_frame.stay_in_months = semesters_frame.stay_in_months * 6

# show the frame
semesters_frame


Unnamed: 0_level_0,Civilité,Nom Prénom,Spécialisation,Mineur,stay_in_months
No Sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
128911,Monsieur,Gulati Asheesh,Internet computing,,48
129093,Monsieur,Zhou Maoan,,,42
129326,Monsieur,Ni Zhong Zhong,,,42
138088,Monsieur,Droz-dit-Busset Arnault,,,42
145546,Monsieur,Clivaz Jean-Philippe,,,42
145957,Monsieur,Hügli Michael,,,42
146330,Monsieur,Cardinaux Damien,,,42
146441,Madame,Henriot Laetitia,,,36
146742,Monsieur,Marx Clément,"Signals, Images and Interfaces",,36
146752,Monsieur,De Lamarter Erik,,,36


We want to remove students that has specialisation but are registered since less than 3 semesters i.e still doing their masters.

In [326]:
# create a new dataFrame to get indices to remove as explained above 
semesters_frame2 = semesters_frame.copy()
# take intersection of the two sets (<18 and have spécialisation)
semesters_frame2 = semesters_frame2[semesters_frame2.stay_in_months < 18]
semesters_frame2 = semesters_frame2[semesters_frame2.Spécialisation != '']
# indices to remove as a List
to_remove = semesters_frame2.index.values.tolist()
# remove thoses indices from our initial dataFrame
semesters_frame = semesters_frame.drop(to_remove, axis = 0)


In [327]:
semesters_frame.stay_in_months.describe()

count    761.000000
mean      20.475690
std        6.080145
min       12.000000
25%       18.000000
50%       18.000000
75%       24.000000
max       48.000000
Name: stay_in_months, dtype: float64

In [365]:
# take only the people who has Spécialisation from the data Frame
semester_frame_spec = semesters_frame[semesters_frame['Spécialisation'] != '']

# do the groupby by spécialisation
semester_frame_spec.groupby('Spécialisation').mean()

Unnamed: 0_level_0,stay_in_months
Spécialisation,Unnamed: 1_level_1
Biocomputing,28.0
Computer Engineering - SP,20.666667
Data Analytics,18.0
Foundations of Software,21.488372
Information Security - SP,18.0
Internet computing,22.909091
Service science,24.0
"Signals, Images and Interfaces",24.947368
Software Systems,19.5


We can see from the resutls that the new specializations have an average length of 18 months only (i.e the minimum) because they just have been launched. Those are the people in MA3 that started their master with no specialization but took the courses required to have it and asked for it as soon as it has been launched.


### Statistical test : 

As we need to see now if there is any specialization for which the difference in average is statistically significant compared to the general average, we need to do a one One-Sample T-Test as it checks whether a sample mean (spécialisation mean) differs from the population mean (average stay in Master Epfl).

In [None]:
# we loop on all spécialisation
for spec in semester_frame_spec['Spécialisation'].drop_duplicates() :
    # get the months spent by each student of the spec spécialisation to compute the mean
    data = np.array(semester_frame_spec[semester_frame_spec.Spécialisation == spec]['stay_in_months'])
    # if we have a really small number of samples we cannot rely on the statistic test
    if (len(data) < 5) :
        print("we cannot conclude for this spécialisation as we have less than five samples for the " + str(spec) + " spécialisation")
        continue
    # all students average stay at EPFL
    mean_spec = data.mean()
    # Spécialisation students average stay at EPFL
    mean_all = semesters_frame.stay_in_months.mean()
    # call the stats one function and get the p_value
    t, p = stats.ttest_1samp(mean_spec, mean_all)
    # if the p_value is less than 0.05 we reject at 95% level of confidence
    if (p < 0.05) :
        print("as the p value is equal to " + str(p) + " which is less than 0.05 we can say that we reject the hypothesis that the stay of a Spécialisation " + str(spec) + " student average duration is equal to the average stay of a general student at level 95%")
        # if the p_value is higher than 0.05 we accept at 95% level of confidence
    else : 
        print("as the p value is equel to " + str(p) + " which is higher than 0.05 we can say that we cannot reject the hypothesis that the Spécialisation " + str(spec) + "student average duration is equal to the average stay of a general student at level 95% (they can be equal)")