Gender Biases in Student Evaluations of Teachers
====================================================


In [1]:
%matplotlib inline
import math
import numpy as np
import scipy
import pandas as pd
import matplotlib.pyplot as plt
from __future__ import division
# import permute #Install instructions at https://github.com/statlab/permute

In [2]:
dat = pd.read_stata("sample_permutation.dta")
dat = dat[dat.admission_cep == 0]
dat.describe()

Unnamed: 0,student_id,year,entreescpoen,stu_male,stu_female,admission_exam,admission_cep,admission_bactb,admission_other,stu_avg_final,...,history,micro,ip,macro,socio,scpo,both_female,both_male,stumale_proffemale,stufemale_profmale
count,16.0,16.0,16.0,16.0,16.0,16.0,16,16.0,16,16.0,...,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0
mean,2687.5,2010.5625,2010.5,0.5625,0.4375,0.9375,0,0.0625,0,13.314236,...,0.25,0.1875,0.1875,0.0625,0.25,0.0625,0.1875,0.3125,0.25,0.25
std,1121.691401,1.152895,1.21106,0.512348,0.512348,0.25,0,0.25,0,1.401222,...,0.447214,0.403113,0.403113,0.25,0.447214,0.25,0.403113,0.478714,0.447214,0.447214
min,556.0,2008.0,2008.0,0.0,0.0,0.0,0,0.0,0,9.833334,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1922.0,2010.0,2010.0,0.0,0.0,1.0,0,0.0,0,12.368056,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2774.0,2010.5,2010.5,1.0,0.0,1.0,0,0.0,0,13.416667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3582.25,2011.25,2011.25,1.0,1.0,1.0,0,0.0,0,14.284722,...,0.25,0.0,0.0,0.0,0.25,0.0,0.0,1.0,0.25,0.25
max,4414.0,2012.0,2012.0,1.0,1.0,1.0,0,1.0,0,15.138889,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
evals = ['q16', 'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10', 'q13', 'q14', 'q15']
for e in evals:
    dat[e] = dat[e].replace(['nonpertinent', 'insuffisant', 'moyen', 'bon', 'excellent'], [0,1,2,3,4])
    
dat['course'] = dat['prof_id']
dat['course'][dat['history']==1] = 'history' 
dat['course'][dat['micro']==1] = 'micro' 
dat['course'][dat['ip']==1] = 'ip' 
dat['course'][dat['macro']==1] = 'macro' 
dat['course'][dat['socio']==1] = 'socio' 
dat['course'][dat['scpo']==1] = 'scpo' 

dat['gender'] = dat.prof_male
dat['gender'][dat.gender==1] = 'M'
dat['gender'][dat.gender==0] = 'F'

grouped = dat.groupby(['prof_id', 'course', 'gender'])
prof_ratings = grouped[evals].mean()

assess_grade_mean = grouped['note_finale'].agg([np.mean,pass_fail])

teacher_data = pd.concat([prof_ratings, pass_rate], axis=1)
teacher_data = teacher_data.rename(columns = {'mean':'mean_final_exam', '<lambda>':'pass_rate'}).reset_index()
teacher_data

Unnamed: 0,prof_id,course,gender,q16,q1,q2,q3,q4,q5,q6,q7,q8,q9,q10,q13,q14,q15,mean_final_exam,pass_rate
0,1,scpo,F,3,3,3,3,3,4,0,2,4,4,3,2,1,2,14.166667,1
1,2,micro,M,2,2,4,3,2,2,3,4,3,3,2,2,2,2,12.333333,1
2,8,socio,F,3,3,3,2,2,3,3,3,3,3,3,2,2,2,16.5,1
3,13,socio,M,3,3,4,3,4,4,3,3,3,3,2,2,2,1,9.0,0
4,16,socio,F,3,3,3,2,2,3,3,2,3,3,2,4,2,2,11.666667,1
5,21,micro,M,3,3,3,2,3,3,2,3,4,4,2,3,2,2,14.833333,1
6,28,ip,F,4,4,3,4,3,4,3,4,4,4,3,3,2,2,13.166667,1
7,34,history,M,3,4,3,4,3,3,4,4,3,3,2,2,2,2,11.333333,1
8,47,history,F,3,3,3,3,4,2,2,3,3,3,2,3,2,2,13.666667,1
9,51,micro,F,3,3,3,2,3,3,0,3,0,3,3,1,2,2,12.0,1


#Data Analysis

###First examine relationship between ratings and student performance, by course subject

In [34]:
# correlation between mean evaluation score and gender and between mean eval score and
# desirability of the course time, by course

print 'correlation between eval score and gender & between eval score and time\n'

theCols = evals + ['prof_male','plumTime']

grouped = dat.groupby(['prof_id'])
prof_ratings = grouped[theCols].agg(np.mean)
t, plo, pup, pboth, sims = corr(x=prof_ratings.q16,.... )
tt, plot, pupt, pbotht, sims = corr(x=prof_ratings.q16, y=prof_ratings.plumTime ,.... )

print 'overall (M,F, Tot)',
      prof_ratings['prof_male'].sum(), (1-prof_ratings['prof_male']).sum(),\
      prof_ratings['prof_male'].size,\
      '\tgender:', t, pup,\
      '\tplumTime:', tt, pupt, '\n'

for topic in np.unique(course):
    perTopic = dat[dat['course']== topic]
    grouped = perTopic.groupby(['prof_id'])
    prof_ratings = grouped[theCols].agg(np.mean)
    t, plo, pup, pboth, sims = corr(x=prof_ratings.16,.... )
    tt, plot, pupt, pbotht, sims = corr(x=prof_ratings.q16, y=prof_ratings.plumTime ,.... )
    print topic, '(M,F, Tot)',
      prof_ratings['prof_male'].sum(), (1-prof_ratings['prof_male']).sum(),\
      prof_ratings['prof_male'].size,\
      '\tgender:', t, pup,\
      '\tplumTime:', tt, pupt

['q16', 'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10', 'q13', 'q14', 'q15', 'prof_male']
         q16  q1  q2  q3  q4  q5  q6  q7  q8  q9  q10  q13  q14  q15  \
prof_id                                                                
1          3   3   3   3   3   4   0   2   4   4    3    2    1    2   
2          2   2   4   3   2   2   3   4   3   3    2    2    2    2   
8          3   3   3   2   2   3   3   3   3   3    3    2    2    2   
13         3   3   4   3   4   4   3   3   3   3    2    2    2    1   
16         3   3   3   2   2   3   3   2   3   3    2    4    2    2   
21         3   3   3   2   3   3   2   3   4   4    2    3    2    2   
28         4   4   3   4   3   4   3   4   4   4    3    3    2    2   
34         3   4   3   4   3   3   4   4   3   3    2    2    2    2   
47         3   3   3   3   4   2   2   3   3   3    2    3    2    2   
51         3   3   3   2   3   3   0   3   0   3    3    1    2    2   
58         4   4   4   4   4   4 

In [35]:
print dat.columns.values

['student_id' 'year' 'dual_degree' 'entreescpoen' 'datedenaissance'
 'stu_male' 'stu_female' 'admission_exam' 'admission_cep' 'admission_bactb'
 'admission_other' 'stu_avg_final' 'stu_avg_conf' 'stu_avg_cm' 'q16' 'q1'
 'q2' 'q3' 'q4' 'q5' 'q6' 'q7' 'q8' 'q9' 'q10' 'q13' 'q14' 'q15' 'prof_id'
 'teacher_birth_year' 'prof_male' 'prof_female' 'adjunct' 'age_prof'
 'age_prof_sq' 'heures_2008_09' 'heures_2009_10' 'heures_2010_11'
 'heures_2011_12' 'heures_2012_13' 'doctorat' 'mdc' 'profu' 'lawyer'
 'research_assis' 'phd_student' 'qualif_u' 'check' 'enseignant_autre'
 'bq_fr' 'banker_assurance' 'ministere' 'ass_natle_senat' 'magistrat'
 'conseil' 'gvt' 'economist_div' 'cour_comptes' 'alumni_scpo' 'politique'
 'journaliste' 'admin_scpo' 'cherchautre_autre' 'autre' 'already_taught'
 'first_course' 'note_cm' 'note_conf' 'note_finale' 'course_number'
 'triplette_new' 'nbredtudiants' 'day' 'thursday' 'monday' 'tuesday'
 'wednesday' 'friday' 'early_morning' 'mid_morning' 'noon' 'mid_afternoon'
 'la

In [45]:
dat['plumTime'] = (dat['extremetime'] < 2) & dat['day'].isin(['Tuesday','Wednesday','Thursday'])

Unnamed: 0,day,extremetime,plumTime
0,Tuesday,0,True
1,Wednesday,1,True
3,Friday,2,False
4,Wednesday,1,True
5,Wednesday,2,False
6,Thursday,2,False
7,Thursday,2,False
8,Monday,0,False
9,Monday,0,False
10,Thursday,1,True


In [48]:
dMale = dat[dat['prof_male']==1]
dFemale = dat[dat['prof_female']==1]

print corr(x=dMale['q16'], y=dMale['stu_male'], rs=rs)
print corr(x=dFemale['q16'], y=dMale['stu_female'], rs=rs)


count           16
mean        0.5625
std      0.5123475
min          False
25%              0
50%              1
75%              1
max           True
dtype: object

In [50]:
dat['day'][:10]

0       Tuesday
1     Wednesday
3        Friday
4     Wednesday
5     Wednesday
6      Thursday
7      Thursday
8        Monday
9        Monday
10     Thursday
Name: day, dtype: object