Gender Biases in Student Evaluations of Teachers
====================================================


In [1]:
# boilerplate
%matplotlib inline
import math
import numpy as np
import pandas as pd
from numpy.random import random
import scipy as sp
from scipy import special
import matplotlib.pyplot as plt
from __future__ import division
# import permute #Install instructions at https://github.com/statlab/permute

# initialize PRNG
rs = np.random.RandomState(seed=1)

Permutation test code
============

In [2]:
def corr(x, y, reps=10**4, rs=None):
    '''
    Simulate permutation p-value for Spearman correlation coefficient
    Returns test statistic, simulations, left-sided p-value, right-sided p-value, two-sided p-value
    '''
    if rs == None:
        rs = np.random.RandomState()
    t = np.corrcoef(x, y)[0,1]
    sims = [np.corrcoef(rs.permutation(x), y)[0,1] for i in range(reps)]
    return t, np.sum(sims <= t)/reps, np.sum(sims >= t)/reps, np.sum(np.abs(sims) >= math.fabs(t))/reps, sims

def stratCorrTst(x, y, group):
    '''
    Calculates sum of Spearman correlations between x and y, computed separately in each group.
    '''
    tst = 0.0
    for g in np.unique(group):
        gg = (group == g)
        tst += np.corrcoef(x[gg], y[gg])[0,1]
    return tst

def permuteWithinGroups(x, group, rs=None):
    '''
    Permutes the elements of x within groups
    Input: ndarray x to be permuted, ndarray group of group ids, np.random.RandomState object rs
    '''
    if rs == None:
        rs = np.random.RandomState()
    permuted = x.copy()
    for g in np.unique(group):
        gg = group == g
        permuted[gg] = rs.permutation(permuted[gg])      
    return permuted

def stratCorr(x, y, group, rs, reps=10**4):
    '''
    Simulate permutation p-value of stratified Spearman correlation test.
    Returns test statistic, simulations, left-sided p-value, right-sided p-value, two-sided p-value
    '''
    t = stratCorrTst(x, y, group)
    sims = [stratCorrTst(permuteWithinGroups(x, group, rs), y, group) for i in range(reps)]
    return t, np.sum(sims <= t)/reps, np.sum(sims >= t)/reps, np.sum(np.abs(sims) >= math.fabs(t))/reps, sims


## Read data and define new fields

In [3]:
dat = pd.read_stata("../../SET data/permutation_full.dta",  convert_categoricals=False )
#dat = dat[dat.admission_cep == 0]
dat.describe()

Unnamed: 0,student_id,year,dual_degree,entreescpoen,stu_male,stu_female,admission_exam,admission_cep,admission_bactb,admission_other,...,scpo,both_female,both_male,stumale_proffemale,stufemale_profmale,diff_final_cont,fall,spring,three_evals,sum_q10
count,22665.0,22665.0,22665.0,22665.0,22665.0,22665.0,22647.0,22647.0,22647.0,22647.0,...,22665.0,22665.0,22665.0,22665.0,22665.0,22502.0,22665.0,22665.0,22665.0,22665.0
mean,2381.468652,2010.239268,0.100993,2010.18844,0.433179,0.566821,0.674306,0.133704,0.1122,0.031527,...,0.109243,0.177101,0.295345,0.137834,0.38972,1.601022,0.589852,0.410148,0.864858,6.462696
std,1232.740116,1.345684,0.301326,1.36306,0.495526,0.495526,0.468644,0.340342,0.31562,0.174742,...,0.311951,0.381763,0.456208,0.344733,0.487697,2.793144,0.491871,0.491871,0.341883,1.693313
min,1.0,2008.0,0.0,2004.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-9.5,0.0,0.0,0.0,1.0
25%,1391.0,2009.0,0.0,2009.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.5,0.0,0.0,1.0,6.0
50%,2453.0,2010.0,0.0,2010.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.5,1.0,0.0,1.0,7.0
75%,3420.0,2011.0,0.0,2011.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,3.5,1.0,1.0,1.0,8.0
max,4423.0,2012.0,1.0,2012.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,16.0,1.0,1.0,1.0,9.0


In [4]:
evals = ['q16', 'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10', 'q13', 'q14', 'q15']
if isinstance(dat.q16[0], str):
    for e in evals:
        dat[e].replace(['nonpertinent', 'insuffisant', 'moyen', 'bon', 'excellent'],\
                       [0,1,2,3,4],\
                       inplace=True)
    
dat['course'] = dat['prof_id']
dat['course'][dat['history']==1] = 'history' 
dat['course'][dat['micro']==1] = 'micro' 
dat['course'][dat['ip']==1] = 'ip' 
dat['course'][dat['macro']==1] = 'macro' 
dat['course'][dat['socio']==1] = 'socio' 
dat['course'][dat['scpo']==1] = 'scpo' 

dat['gender'] = dat.prof_male
dat['gender'][dat.gender==1] = 'M'
dat['gender'][dat.gender==0] = 'F'
dat['student_gender'] = dat.stu_male
dat['student_gender'][dat.student_gender==1] = 'M'
dat['student_gender'][dat.student_gender==0] = 'F'

# Defining the fun times for class
dat['plumTime'] = (dat['extremetime'] < 2) & dat['day'].isin([2,3,4])

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a co

## Data filtering

Most of the analyses omit grades in PI, for two reasons: 
1. The final exam is oral
2. There is a large imbalance in the gender of the instructors: 52 male, 12 female
3. Since the course is "easy," that would bias results in favor of making male instructors look more effective

In [None]:
# remove the Political Institutions courses
datNoIp = dat.copy()
datNoIp = datNoIp[datNoIp['course'] != 'ip']

# Start of the analysis

### Is there a significant association between ratings and student performance?

In [None]:
# test association between ratings and performance, pooled genders.
# Since this uses the final exams, eliminate the IP courses

# group the data by instructor, discipline, gender, student gender

grouped = datNoIp.groupby(['prof_id', 'course', 'gender', 'student_gender'])
prof_ratings = grouped[evals].mean()

pass_fail = lambda x: np.mean([xx >= 10 for xx in x])
pass_rate = grouped['note_finale'].agg([np.mean,pass_fail])

teacher_data = pd.concat([prof_ratings, pass_rate], axis=1)
teacher_data = teacher_data.rename(columns = {'mean':'mean_final_exam', '<lambda>':'pass_rate'}).reset_index()

# Test association between ratings performance, pooled genders.
(t, plower, pupper, pboth, sims) = corr(x = teacher_data.q16, y = teacher_data.mean_final_exam, rs = rs)
print 'Ratings v. performance, pooled gender:',t, plower, pupper, pboth

# Test association between ratings performance, stratified by gender.
(t, plower, pupper, pboth, sims) = stratCorr(x = teacher_data.q16, y = teacher_data.mean_final_exam,
                                            group = teacher_data.gender, rs = rs)
print 'Ratings v. performance, stratified by gender:', t, plower, pupper, pboth

### Ratings v instructor gender

In [None]:
grouped = dat.groupby(['prof_id'])
theCols = evals + ['prof_male']  # evaluation columns, plus indicator for male prof

prof_ratings = grouped[theCols].agg(np.mean)

(t, plower, pupper, pboth, sims) = corr(x = prof_ratings.q16, y = prof_ratings.prof_male, rs = rs)

print 'mean rating for instructors vs. instructor gender (positive favors males):', t, plower, pupper, pboth

### Instructor gender and student performance

In [None]:
# remove students who took triads from instructors all of the same gender
# Since this uses final exams, remove IP courses

datNonzeroVar = datNoIp.copy()
for g in np.unique(dat['student_id']):
    gg = dat['student_id'] == g
    if (np.var(dat['note_cm'][gg] == 0.0) or (np.var(dat['prof_male'][gg] == 0.0):
        datNonzeroVar = datNonzeroVar.drop(datNonzeroVar['student_id'] == g].index

(t, plower, pupper, pboth, sims) = stratCorr(x = datNonzeroVar[~np.isnan(datNonzeroVar['note_cm'])]['prof_male'], 
                                            y = datNonzeroVar[~np.isnan(datNonzeroVar['note_cm'])]['note_cm'],
                                            group = datNonzeroVar['student_id'], rs = rs, reps=10**3)
                                           
print 'Student-level association between instructor gender and performance:', t, plower, pupper, pboth

### Association between instructor gender and evaluation scores, by subject

In [None]:
# Analyzing the correlation of avg evaluation score and gender, by course.  Includes IP courses.

print 'Analyzing the correlation btw avg evaluation score and gender, by course' 

theCols = evals + ['prof_male','plumTime']

grouped = dat.groupby(['prof_id'])
prof_ratings = grouped[theCols].agg(np.mean)
(t, plow, pupper, pboth, sims) = corr(x = prof_ratings.q16, y = prof_ratings.prof_male, rs = rs)
print 'overall', t, pupper, prof_ratings['prof_male'].sum(), \
      (1-prof_ratings['prof_male']).sum(),\
       prof_ratings['prof_male'].size, '\n'

for topic in np.unique(dat.course):
    perTopic = dat[dat['course']==topic]
    grouped = perTopic.groupby(['prof_id'])
    prof_ratings = grouped[theCols].agg(np.mean)
    (t, plow, pupper, pboth, sims) = corr(x = prof_ratings.q16, y = prof_ratings.prof_male, rs = rs)
    print topic, t, pupper, prof_ratings['prof_male'].sum(), \
    (1-prof_ratings['prof_male']).sum(),  prof_ratings['prof_male'].size   

### Association between evaluation scores and class meeting time

In [None]:
# Analyzing the correlation btw avg evaluation score and desirable time, by course. Includes IP courses
print 'Analyzing the correlation btw avg evaluation score and desirable time, by course' 

theCols = evals + ['prof_male','plumTime']

grouped = dat.groupby(['course_number'])
prof_ratings = grouped[theCols].agg(np.mean)
(t, plow, pupper, pboth, sims) = corr(x = prof_ratings.q16, y = prof_ratings.plumTime, rs = rs)
print 'overall', t, pupper, prof_ratings['plumTime'].sum(), \
      (1-prof_ratings['plumTime']).sum(),\
       prof_ratings['plumTime'].size, '\n'

for topic in np.unique(dat.course):
    perTopic = dat[dat['course']==topic]
    grouped = perTopic.groupby(['course_number'])
    prof_ratings = grouped[theCols].agg(np.mean)
    (t, plow, pupper, pboth, sims) = corr(x = prof_ratings.q16, y = prof_ratings.plumTime, rs = rs)
    print topic, t, pupper, prof_ratings['plumTime'].sum(), \
    (1-prof_ratings['plumTime']).sum(),  prof_ratings['plumTime'].size

### Association between instructors' average ratings and average continuous assessment grades

In [None]:
# Analyzing the correlation btw avg evaluation score and avg cont assessment grades, by instructor. 
# Includes IP courses

print 'Analyzing the correlation btw avg evaluation score and cont assessment, by instructor' 

theCols = evals + ['prof_male','plumTime','note_conf']

grouped = dat.groupby(['prof_id'])
prof_ratings = grouped[theCols].agg(np.mean)
(t, plow, pupper, pboth, sims) = corr(x = prof_ratings.q16, y = prof_ratings.note_conf, rs = rs)
print 'overall', t, pupper,\
       prof_ratings['note_conf'].size, '\n'

for topic in np.unique(dat.course):
    perTopic = dat[dat['course']==topic]
    grouped = perTopic.groupby(['prof_id'])
    prof_ratings = grouped[theCols].agg(np.mean)
    (t, plow, pupper, pboth, sims) = corr(x = prof_ratings.q16, y = prof_ratings.note_conf, rs = rs)
    print topic, t, pupper, prof_ratings['note_conf'].size
    
    

### Association between concordance of student and teacher genders and overall satisfaction

In [None]:
# Association between concordance of student and teacher genders and overall satisfaction. Includes IP courses.
# Male instructors first

dMale = dat[dat['prof_male']==1]
dFemale = dat[dat['prof_female']==1]

(t, plow, pupper, pboth, sims) = corr(x=dMale['q16'], y=dMale['stu_male'], reps=10**5, rs=rs)
print 'Male instructors:', t, pupper

# Female instructors

(t, plow, pupper, pboth, sims) = corr(x=dFemale['q16'], y=dFemale['stu_female'], reps=10**5, rs=rs)
print 'Female instructors:', t, pupper

## Association between student grades and teacher gender, by student

### Start with continuous assessment scores

In [None]:
# Analyzing the correlation btw avg evaluation score and avg cont assessment grades, by course number.
# Since this is in contrast to the using the final exam as a measure of instructor value-added, omit IP courses
print 'Analyzing the correlation btw avg evaluation score and cont assessment, by course number; no IP' 

theCols = evals + ['prof_male','plumTime','note_conf']

grouped = datNoIp.groupby(['course_number'])
prof_ratings = grouped[theCols].agg(np.mean)
(t, plow, pupper, pboth, sims) = corr(x = prof_ratings['q16'], y = prof_ratings['note_conf'], rs = rs)
print 'overall', t, pupper,\
       prof_ratings['note_conf'].size, '\n'

for topic in np.unique(datNoIp['course']):
    perTopic = datNoIp[datNoIp['course']==topic]
    grouped = perTopic.groupby(['course_number'])
    prof_ratings = grouped[theCols].agg(np.mean)
    (t, plow, pupper, pboth, sims) = corr(x = prof_ratings['q16'], y = prof_ratings['note_conf'], rs = rs)
    print topic, t, pupper, prof_ratings['note_conf'].size
    
    

In [None]:
# Analyzing the correlation btw avg evaluation score and final exam grade, by course number. Omit IP courses

print 'Analyzing the correlation btw avg evaluation score and final exam grade, by course number' 

theCols = evals + ['prof_male','plumTime','note_cm']

grouped = datNoIp.groupby(['course_number'])
prof_ratings = grouped[theCols].agg(np.mean)
(t, plow, pupper, pboth, sims) = corr(x = prof_ratings['q16'], y = prof_ratings['note_cm'], rs = rs)
print 'overall', t, pupper,\
       prof_ratings['note_cm'].size, '\n'

for topic in np.unique(datNoIp.course):
    perTopic = datNoIp[datNoIp['course']==topic]
    grouped = perTopic.groupby(['course_number'])
    prof_ratings = grouped[theCols].agg(np.mean)
    (t, plow, pupper, pboth, sims) = corr(x = prof_ratings['q16'], y = prof_ratings['note_cm'], rs = rs)
    print topic, t, pupper, prof_ratings['note_cm'].size

In [None]:
# Analyzing the correlation of avg teaching dimension scores and final exam grade, by course number.
# Since this uses final, omit IP courses

print 'Analyzing the correlation btw teaching dimension scores and final exam grade, by course number' 

theCols = evals + ['prof_male','plumTime','note_cm']

grouped = datNoIp.groupby(['course_number'])
prof_ratings = grouped[theCols].agg(np.mean)
(t, plow, pupper, pboth, sims) = corr(x = prof_ratings['q5'], y = prof_ratings['note_cm'], rs = rs)

print 'overall', t, pupper,\
       prof_ratings['note_cm'].size, '\n'

for topic in np.unique(datNoIp.course):
    perTopic = datNoIp[datNoIp['course']==topic]
    grouped = perTopic.groupby(['course_number'])
    prof_ratings = grouped[theCols].agg(np.mean)
    (t, plow, pupper, pboth, sims) = corr(x = prof_ratings['q5'], y = prof_ratings['note_cm'], rs = rs)
    print topic, t, pupper, prof_ratings['note_cm'].size

In [None]:
# Gender concordance versus rating of quality of animation. Include IP courses

dMale = dat[dat['prof_male']==1]
dFemale = dat[dat['prof_female']==1]

(t, plow, pupper, pboth, sims) = corr(x=dMale['q5'], y=dMale['stu_male'], rs=rs)
print 'Gender concordance v. animation, male instructors:', t, pupper

(t, plow, pupper, pboth, sims) = corr(x=dFemale['q5'], y=dFemale['stu_female'], rs=rs)
print 'Gender concordance v. animation, female instructors:', t, pupper

In [None]:
# Gender concordance v preparation & organization scores

# redundant, but safe:
dMale = dat[dat['prof_male']==1]
dFemale = dat[dat['prof_female']==1]

dMale = dMale[~np.isnan(dMale['q1'])]  # note! need to re-set in following analyses
dFemale = dFemale[~np.isnan(dFemale['q1'])]  # note! need to re-set in following analyses

# Male instructors
(t, plow, pupper, pboth, sims) = corr(x=dMale['q1'], y=dMale['stu_male'], rs=rs)
print 'Gender concordance v. preparation and organization, male instructors:', t, pupper

# Female instructors
(t, plow, pupper, pboth, sims) = corr(x=dFemale['q1'], y=dFemale['stu_female'], rs=rs)
print 'Gender concordance v. preparation and organization, female instructors:', t, pupper

In [None]:
# Gender concordance v preparation overall satisfaction

dMale_stu = dat[dat['stu_male']==1]
dFemale_stu = dat[dat['stu_female']==1]

(t, plow, pupper, pboth, sims) = corr(x=dMale_stu['q16'], y=dMale_stu['prof_male'], rs=rs)
print t, pupper

In [None]:
#Matching male teacher with student gender, concordance grade

dMale = dat[dat['prof_male']==1]
dFemale = dat[dat['prof_female']==1]

(t, plow, pupper, pboth, sims) = corr(x=dMale['note_conf'], y=dMale['stu_male'], rs=rs)
print t, pupper

In [None]:
# Analyzing the correlation btw avg evaluation score and avg cont assessment grades, by course number
print 'Analyzing the correlation btw avg evaluation score and cont assessment, by course number' 

theCols = evals + ['prof_male','plumTime','note_conf']

grouped = dat.groupby(['course_number'])
prof_ratings = grouped[theCols].agg(np.mean)
(t, plow, pupper, pboth, sims) = corr(x = prof_ratings.q16, y = prof_ratings.note_conf, rs = rs)
print 'overall', t, pupper,\
       prof_ratings['note_conf'].size, '\n'

for topic in np.unique(dat.course):
    perTopic = dat[dat['course']==topic]
    grouped = perTopic.groupby(['course_number'])
    prof_ratings = grouped[theCols].agg(np.mean)
    (t, plow, pupper, pboth, sims) = corr(x = prof_ratings.q16, y = prof_ratings.note_conf, rs = rs)
    print topic, t, pupper, prof_ratings['note_conf'].size
    
    

In [None]:
# Analyzing the correlation btw avg evaluation score and final exam grade, by course number
print 'Analyzing the correlation btw avg evaluation score and final exam grade, by course number' 

theCols = evals + ['prof_male','plumTime','note_cm']

grouped = dat.groupby(['course_number'])
prof_ratings = grouped[theCols].agg(np.mean)
(t, plow, pupper, pboth, sims) = corr(x = prof_ratings.q16, y = prof_ratings.note_cm, rs = rs)
print 'overall', t, pupper,\
       prof_ratings['note_cm'].size, '\n'

for topic in np.unique(dat.course):
    perTopic = dat[dat['course']==topic]
    grouped = perTopic.groupby(['course_number'])
    prof_ratings = grouped[theCols].agg(np.mean)
    (t, plow, pupper, pboth, sims) = corr(x = prof_ratings.q16, y = prof_ratings.note_cm, rs = rs)
    print topic, t, pupper, prof_ratings['note_cm'].size
    
    

In [None]:
# Analyzing the correlation btw avg teaching dimension scores and final exam grade, by course number
print 'Analyzing the correlation btw teaching dimension scores and final exam grade, by course number' 

theCols = evals + ['prof_male','plumTime','note_cm']

grouped = dat.groupby(['course_number'])
prof_ratings = grouped[theCols].agg(np.mean)
(t, plow, pupper, pboth, sims) = corr(x = prof_ratings.q5, y = prof_ratings.note_cm, rs = rs)
print 'overall', t, pupper,\
       prof_ratings['note_cm'].size, '\n'

for topic in np.unique(dat.course):
    perTopic = dat[dat['course']==topic]
    grouped = perTopic.groupby(['course_number'])
    prof_ratings = grouped[theCols].agg(np.mean)
    (t, plow, pupper, pboth, sims) = corr(x = prof_ratings.q5, y = prof_ratings.note_cm, rs = rs)
    print topic, t, pupper, prof_ratings['note_cm'].size
    
    

In [None]:
#Matching male teacher with student gender, quality of animation

dMale = dat[dat['prof_male']==1]
dFemale = dat[dat['prof_female']==1]

(t, plow, pupper, pboth, sims) = corr(x=dMale['q5'], y=dMale['stu_male'], rs=rs)
print t, pupper

In [None]:
#Matching male teacher with student gender, preparation & organization

dMale = dat[dat['prof_male']==1]
dFemale = dat[dat['prof_female']==1]

(t, plow, pupper, pboth, sims) = corr(x=dMale['q1'], y=dMale['stu_male'], rs=rs)
print t, pupper

In [None]:
#Matching female teacher with student gender, preparation & organization

dMale = dat[dat['prof_male']==1]
dFemale = dat[dat['prof_female']==1]

(t, plow, pupper, pboth, sims) = corr(x=dFemale['q1'], y=dFemale['stu_male'], rs=rs)
print t, pupper

In [None]:
#Matching male student with teacher gender, overall satisfaction
dMale_stu = dat[dat['stu_male']==1]
dFemale_stu = dat[dat['stu_female']==1]

(t, plow, pupper, pboth, sims) = corr(x=dMale_stu['q16'], y=dMale_stu['prof_male'], rs=rs)
print t, pupper

In [None]:
#Matching male teacher with student gender, concordance grade

dMale = dat[dat['prof_male']==1]
dFemale = dat[dat['prof_female']==1]

(t, plow, pupper, pboth, sims) = corr(x=dMale['note_conf'], y=dMale['stu_male'], rs=rs)
print t, pupper

In [None]:
# Analyzing the correlation btw avg evaluation score and difference btw continuous assessment and final grade, by course number
print 'Analyzing the correlation btw avg evaluation score and difference btw continuous assessment and final grade, \
by course number' 

theCols = evals + ['prof_male','plumTime','note_cm', 'diff_final_cont']

grouped = dat.groupby(['course_number'])
prof_ratings = grouped[theCols].agg(np.mean)
(t, plow, pupper, pboth, sims) = corr(x = prof_ratings.q16, y = prof_ratings.diff_final_cont, rs = rs)
print 'overall', t, pupper,\
       prof_ratings['diff_final_cont'].size, '\n'

for topic in np.unique(dat.course):
    perTopic = dat[dat['course']==topic]
    grouped = perTopic.groupby(['course_number'])
    prof_ratings = grouped[theCols].agg(np.mean)
    (t, plow, pupper, pboth, sims) = corr(x = prof_ratings.q16, y = prof_ratings.diff_final_cont, rs = rs)
    print topic, t, pupper, prof_ratings['diff_final_cont'].size
    
    

In [None]:
# dat with IP => to test for reliability on q10 (student self-assessed involvment in the course)

wrkld=dat[dat['three_evals']==1]['sum_q10']
plt.hist (wrkld)

    