Gender Biases in Student Evaluations of Teachers - A Randomized, Online Experiment
====================================================


In [1]:
# boilerplate
%matplotlib inline
import math
import numpy as np
import pandas as pd
from numpy.random import random
import scipy as sp
from scipy import special
import matplotlib.pyplot as plt
from __future__ import division

# One explicitly seeded generator, shared by the permutation tests below
rs = np.random.RandomState(1)

Permutation test code
============
You must install the _permute_ package to use this code. Installation instructions can be found at https://github.com/statlab/permute.

In [2]:
from permute.core import corr, two_sample, permute_within_groups

Read data
=================

Some notes on the variables:
* **group** identifies the section the student was placed in.
* **gender** refers to the student's gender: 1 = male, 2 = female.
* **tagender** is the instructor's true gender: 1 = male, 0 = female.
* **taidgender** is the instructor's reported gender: 1 = male, 0 = female.

In [None]:
# Load the student evaluations of the instructors.
ratings = pd.read_csv("Macnell-RatingsData.csv")

# The individual rating items run from 'professional' through 'clear'.
# Selecting them by label instead of by position keeps the list correct
# whether or not the CSV's leading index column is read in as a data column.
categories = ratings.loc[:, 'professional':'clear'].columns.tolist()
ratings.head()

Unnamed: 0,group,professional,respect,caring,enthusiastic,communicate,helpful,feedback,prompt,consistent,fair,responsive,praised,knowledgeable,clear,overall,gender,age,tagender,taidgender
0,3,5,5,4,4,4,3,4,4,4,4,4,4,3,5,4,2,1990,0,1
1,3,4,4,4,4,5,5,5,5,3,4,5,5,5,5,4,1,1992,0,1
2,3,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,2,1991,0,1
3,3,5,5,5,5,5,3,5,5,5,5,3,5,5,5,5,2,1991,0,1
4,3,5,5,5,5,5,5,5,3,4,5,5,5,5,5,5,2,1992,0,1


# Analysis

### Ratings vs reported instructor gender

In [None]:
# Two-sample permutation test: ratings given to male-identified vs.
# female-identified instructors (REPORTED gender, taidgender).
male_id = ratings.taidgender == 1
female_id = ratings.taidgender == 0

# The shared generator `rs` is passed as the seed so that a fresh
# "Restart & Run All" reproduces these p-values.
# NOTE(review): `interval = 'two-sided'` appears to control only the confidence
# interval reported for the p-value; the tail of the test itself is presumably
# set by a separate `alternative` argument that is not passed here. Confirm
# against the installed permute version that the "two-sided" label printed
# below describes the p-value actually computed.
(p, t, ci, dist) = two_sample(ratings['overall'][male_id], ratings['overall'][female_id],
                              stat = 't', interval = 'two-sided', keep_dist = True, seed = rs)

# Single-argument print(...) produces the same text on Python 2 and Python 3.
print('Overall rating:')
print('t statistic: {0}'.format(np.round(t, 5)))
print('P-value (two-sided): {0}'.format(np.round(p, 5)))
print('95% Confidence Interval for the P-value {0}'.format(np.round(ci, 5)))
print('Number of evaluations for male-identified instructors: {0}'.format(np.sum(male_id)))
print('Number of evaluations for female-identified instructors: {0}'.format(np.sum(female_id)))

# Same test, repeated for every individual rating item.
print('\n\n{0:24} {1:8} {2:8}'.format('Category', 't', 'p-value'))
for col in categories:
    (p, t, ci) = two_sample(ratings[col][male_id], ratings[col][female_id],
                            stat = 't', interval = 'two-sided', keep_dist = False, seed = rs)
    print('{0:20} {1:8.2f} {2:8.2f}'.format(col, t, p))

Overall rating:
t statistic: 1.82159
P-value (two-sided): 0.04505
95% Confidence Interval for the P-value [ 0.04377  0.04635]
Number of evaluations for male-identified instructors: 23
Number of evaluations for female-identified instructors: 20


Category                 t        p-value 
professional             1.93     0.03

### Ratings vs concordance of student and REPORTED instructor genders


In [None]:
# A student is "concordant" when their gender matches the instructor's REPORTED gender.
# gender is coded 1 = male / 2 = female, so gender % 2 gives 1 = male / 0 = female,
# the same coding that taidgender uses.
ratings['gender_concordance'] = ( (ratings['gender'] % 2) == ratings['taidgender'] )
stu_male = ratings[ratings['gender']==1]
stu_female = ratings[ratings['gender']==2]


def _report_concordance(students, label, columns, prng):
    """Print the correlation of each rating with gender concordance for one group.

    Parameters
    ----------
    students : DataFrame
        Rows of `ratings` for one student gender. Must contain 'overall',
        'gender_concordance' and every column named in `columns`.
    label : str
        Lower-case group name used in the printed text ('male' or 'female').
    columns : list of str
        Rating columns to tabulate after the overall rating.
    prng : object
        Passed unchanged as `seed` to every `corr` call.
    """
    (t, plow, pupper, pboth, sims) = corr(x = students['overall'],
                                          y = students['gender_concordance'], seed = prng)
    print('{0} students\n'.format(label.capitalize()))
    print('Number of {0} students: {1} \n'.format(label, students.shape[0]))
    print('Correlation for overall rating: {0}'.format(t))
    print('Upper p-value: {0}'.format(pupper))
    print('Two-sided p-value: {0}'.format(pboth))
    print('\n{0:15} {1:8} {2:8} {3:8}'.format('Category', 'Correlation',
                                              'Upper p-value', 'Two-sided p-value'))
    # Reuse the result above for the 'Overall' row instead of re-running the
    # test, so the summary and the table always report the same p-values.
    print('{0:15} {1:8.3f} {2:10.2f} {3:10.2f}'.format('Overall', t, pupper, pboth))

    for col in columns:
        (t, plow, pupper, pboth, sims) = corr(x = students[col],
                                              y = students['gender_concordance'], seed = prng)
        print('{0:15} {1:8.3f} {2:10.2f} {3:10.2f}'.format(col, t, pupper, pboth))


_report_concordance(stu_male, 'male', categories, rs)
print('')  # blank line separating the two groups
_report_concordance(stu_female, 'female', categories, rs)

### As a sanity check -- Ratings vs concordance of student and ACTUAL instructor genders

Since the students didn't know the instructors' actual gender, we expect to find no correlation between gender concordance and ratings.

In [None]:
# Concordance between the student's gender and the instructor's ACTUAL gender.
# gender % 2 maps the 1 = male / 2 = female coding onto tagender's 1 = male / 0 = female.
ratings['gender_concordance_actual'] = ( (ratings['gender'] % 2) == ratings['tagender'] )
stu_male = ratings[ratings['gender']==1]
stu_female = ratings[ratings['gender']==2]

# Run the same permutation test of correlation for each student gender.
# Single-argument print(...) produces the same text on Python 2 and Python 3.
for heading, label, students in [('Male students\n', 'male', stu_male),
                                 ('\nFemale students\n', 'female', stu_female)]:
    (t, plow, pupper, pboth, sims) = corr(x = students['overall'],
                                          y = students['gender_concordance_actual'], seed = rs)
    print(heading)
    print('Number of {0} students: {1}'.format(label, students.shape[0]))
    print('Correlation: {0}'.format(t))
    print('Upper p-value: {0}'.format(pupper))
    print('Two-sided p-value: {0}'.format(pboth))

### Difference in differences

We'd like to test whether the decrease in ratings associated with identifying as female is the same for the two instructors. We'll look at the test statistic $\left( \overline{\text{rating}}_{MM}-\overline{\text{rating}}_{MF}\right) - \left( \overline{\text{rating}}_{FM}-\overline{\text{rating}}_{FF}\right)$, where the first subscript refers to the instructor's actual gender and the second subscript refers to their reported gender. Under the null hypothesis of no difference, the expected value of this statistic is $0$.

In [None]:
# Signed per-student weights: +1/n for sections whose actual and reported
# instructor gender agree, -1/n for sections where they differ (n = section size).
# Assuming each section has a single (actual, reported) gender combination,
# sum(rating * weight) is (mean_MM - mean_MF) - (mean_FM - mean_FF).
weights = np.ones(len(ratings.taidgender))
for g in ratings.group.unique():
    gg = np.array(ratings.group == g)
    sign = 1.0 if (ratings.tagender[gg]==ratings.taidgender[gg]).all() else -1.0
    # Float numerator: true division even without `from __future__ import division`.
    weights[gg] = sign / gg.sum()


def tst(values):
    """Return the difference-in-differences statistic: the weighted sum of `values`.

    NOTE(review): assumes `values` has no missing entries. A pandas Series
    skips NaN in .sum() while the permuted ndarray does not. TODO confirm.
    """
    return (values * weights).sum()


def _null_distribution(values, reps, prng):
    """Return `reps` values of `tst` computed on random permutations of `values`."""
    null = np.empty(reps)
    for b in range(reps):
        null[b] = tst(prng.permutation(values))
    return null


# Number of random permutations; with 100, p-values are resolved only to 0.01.
B = 100

obs = tst(ratings['overall'])
dist = _null_distribution(ratings['overall'], B, rs)

print ('{0:15} {1:8} {2:8}'.format('Category', 'Test Statistic', \
                                           'Two-sided p-value'))
print ('{0:15} {1:8.3f} {2:10.2f}'.format('Overall', obs, np.mean(abs(dist)>=abs(obs))))

for col in categories:
    obs = tst(ratings[col])
    # Permute this category's own ratings. Permuting 'overall' here would
    # compare each category's statistic against the wrong null distribution.
    dist = _null_distribution(ratings[col], B, rs)
    print ('{0:15} {1:8.3f} {2:10.2f}'.format(col, obs, np.mean(abs(dist)>=abs(obs))))

## References

MacNell, L., Driscoll, A., and Hunt, A.N. (2014), "What’s in a Name: Exposing Gender Bias in Student Ratings of Teaching," Innovative Higher Education, 1-13.