In [1]:
import pandas as pd
import scipy.stats as stats
import os

Concatenate our files. Reminder: we have a file per year, we all of 'Informatique' in it.

In [2]:
# requires os
def concatFiles(direc, fileType):
    files = os.listdir(direc)
    if '.DS_Store' in files:
        files.remove('.DS_Store')
    print(files)
    if fileType == 'csv':
        r = pd.read_csv
    for idx, file in enumerate(files):
        file = direc+file
        if idx == 0:
            df = r(file, header=0)
            print(file)
        else:
            new = r(file, header=0)
            df = pd.concat([df, new], axis=0)
            print(file)
    return df

In [3]:
info = concatFiles('data/', 'csv')

['informatique_2011_2012.csv', 'informatique_2014_2015.csv', 'informatique_2012_2013.csv', 'informatique_2008_2009.csv', 'informatique_2007_2008.csv', 'informatique_2016_2017.csv', 'informatique_2015_2016.csv', 'informatique_2010_2011.csv', 'informatique_2013_2014.csv', 'informatique_2009_2010.csv']
data/informatique_2011_2012.csv
data/informatique_2014_2015.csv
data/informatique_2012_2013.csv
data/informatique_2008_2009.csv
data/informatique_2007_2008.csv
data/informatique_2016_2017.csv
data/informatique_2015_2016.csv
data/informatique_2010_2011.csv
data/informatique_2013_2014.csv
data/informatique_2009_2010.csv


We define a locating function to grab only bachelor data

In [4]:
def locator(s):
    return s.find('Bachelor') != -1

In [5]:
ba = info.loc[info['Semester'].apply(locator)]

We define a function to locate a specific sciper #. They are unique.

In [6]:
def locSciper(df, sciper):
    return df.loc[df['No Sciper'] == sciper]

isOneSix checks if there is an entry for BA1 and BA6 in the given DataFrame

In [7]:
def isOneSix(df):
    """ Take a DataFrame and check that there is an entry for both BA1 and BA6
    
    Args:
        df (DataFrame): a DataFrame, typically a .loc on a specific student
        
    Returns:
        bool: True if it finds both BA1 and BA6, False otherwise
    """
    try:
        one = df.isin(['Bachelor semestre 1'])['Semester'].value_counts()[True] > 0
    except KeyError:
        one = 0
    try:
        six = df.isin(['Bachelor semestre 6'])['Semester'].value_counts()[True] > 0
    except KeyError:
        six = 0
    return (one and six)

In [8]:
def getGender(df):
    """ Take a DataFrame and checks the gender
    
    Args:
        df (DataFrame): a DataFrame, typically a .loc on a specific student
        
    Returns:
        bool: True if student is a woman, False otherwise
    """
    try:
        if df.isin(['Madame'])['Civilité'].value_counts()[True] > 0:
            return 1
    except KeyError:
        return 0

We loop through the bachelor rows, perform checks and fill a dict with appropriate data.

In [9]:
dico = {'sciper': [], 'gender': [], 'length': []}
for row in ba.iterrows():
    o = row[1]
    sciper = o['No Sciper']
    # Making sure we consider unique sciper #s
    if not sciper in dico['sciper']:
        df = locSciper(ba, sciper)
        if isOneSix(df):
            dico['sciper'].append(sciper)
            dico['gender'].append(getGender(df))
            # Calculating length of stay by nbr of rows
            dico['length'].append(len(df))

We transform this dict into a DataFrame, now containing only the students we're concerned with

In [10]:
data = pd.DataFrame(dico)

We do a quick describe to check for inconsistencies

In [11]:
data['length'].describe()

count    397.000000
mean       7.083123
std        1.524428
min        4.000000
25%        6.000000
50%        6.000000
75%        8.000000
max       12.000000
Name: length, dtype: float64

We notice the minimum is 4, which doesn't seem to make sense. So we check what it is:

In [12]:
data.loc[data.length < 6]

Unnamed: 0,gender,length,sciper
64,0,4,204222


In [13]:
locSciper(ba, 204222)

Unnamed: 0,Civilité,Nom Prénom,Orientation Bachelor,Orientation Master,Spécialisation,Filière opt.,Mineur,Statut,Type Echange,Ecole Echange,No Sciper,Subject,Period,Semester
165,Monsieur,Séguy Louis Marie James,,,,,,Présent,,,204222,Informatique,2011-2012,Bachelor semestre 1
290,Monsieur,Séguy Louis Marie James,,,,,,Présent,,,204222,Informatique,2011-2012,Bachelor semestre 2
721,Monsieur,Séguy Louis Marie James,,,,,,Présent,,,204222,Informatique,2014-2015,Bachelor semestre 5
840,Monsieur,Séguy Louis Marie James,,,,,,Présent,,,204222,Informatique,2014-2015,Bachelor semestre 6


It seems there are missing records for this person. We could take him out.

Setting up our statistical calculations

In [14]:
men = data.loc[data.gender == 0]

In [15]:
men.length.mean()

7.105978260869565

In [16]:
women = data.loc[data.gender == 1]

In [17]:
women.length.mean()

6.793103448275862

In [18]:
stats.ttest_ind(a=women.length, b=men.length, )

Ttest_indResult(statistic=-1.0643000334248733, pvalue=0.2878429746516184)

Not significant.

Now we try something a bit different. We will remove all students who started their Bachelor after 2012-2013. Indeed if we take the students who finished their Bachelor but started it in 2013-2014, we will only have the students who finished it in 3 years. We won't take into account the students who had to repeat a year. Let's see if it changes something in the stats

In [52]:
def started_before_2013(df):
    """ Take a DataFrame and check that there is an entry for both BA1 and BA6
    
    Args:
        df (DataFrame): a DataFrame, typically a .loc on a specific student
        
    Returns:
        bool: True if it finds both BA1 and BA6, False otherwise
    """
    wrong_years = ['2014-2015', '2013-2014', '2012-2013'] 
    
    try:
        periods_ba1 = df.loc[df['Semester'] == 'Bachelor semestre 1'].Period
        if any(years in wrong_years for years in periods_ba1):
            return False
        else:
            return True
    except KeyError:
        return False

In [53]:
dico_bf2013 = {'sciper': [], 'gender': [], 'length': []}
for row in ba.iterrows():
    o = row[1]
    sciper = o['No Sciper']
    # Making sure we consider unique sciper #s
    if not sciper in dico_bf2013 ['sciper']:
        df = locSciper(ba, sciper)
        if isOneSix(df) and started_before_2013(df):
            dico_bf2013 ['sciper'].append(sciper)
            dico_bf2013 ['gender'].append(getGender(df))
            # Calculating length of stay by nbr of rows
            dico_bf2013['length'].append(len(df))

In [54]:
data_bf2013 = pd.DataFrame(dico_bf2013)

In [55]:
data_bf2013.length.describe()

count    229.000000
mean       7.144105
std        1.603511
min        4.000000
25%        6.000000
50%        6.000000
75%        8.000000
max       12.000000
Name: length, dtype: float64

In [56]:
data_bf2013 = data_bf2013.drop(data_bf2013.loc[data_bf2013.length < 6].index)
data_bf2013.length.describe()

count    228.000000
mean       7.157895
std        1.593373
min        6.000000
25%        6.000000
50%        6.000000
75%        8.000000
max       12.000000
Name: length, dtype: float64

In [62]:
men_bf2013 = data_bf2013.loc[data_bf2013.gender == 0]
men_bf2013.length.mean()

7.20952380952381

In [63]:
women_bf2013 = data_bf2013.loc[data_bf2013.gender == 1]
women_bf2013.length.mean()

6.555555555555555

In [65]:
stats.ttest_ind(a=women_bf2013.length, b=men_bf2013.length, )

Ttest_indResult(statistic=-1.6778271909176858, pvalue=0.094763167414470079)