Gender Biases in Student Evaluations of Teachers - A Randomized, Online Experiment
====================================================


In [56]:
# boilerplate
%matplotlib inline
import math
import numpy as np
import pandas as pd
from numpy.random import random
import scipy as sp
from scipy import special
import matplotlib.pyplot as plt
from __future__ import division
from scipy.stats import ttest_ind

# Initialize a single pseudorandom number generator with a fixed seed.
# The analysis cells that pass `seed = rs` all draw from this one stateful
# generator, so their results depend on the order in which cells are run.
rs = np.random.RandomState(seed=1)

Permutation test code
============
You must install the _permute_ package to use this code. Install instructions can be found at https://github.com/statlab/permute.

In [42]:
from permute.core import corr, two_sample, permute_within_groups
from permute.utils import get_prng

def stratified_corr(x, y, group, alternative="greater", reps=10**4, keep_dist=False, seed=None):
    """
    Stratified permutation test for the correlation between x and y.

    Tests the null hypothesis of 0 correlation between x and y, against the
    alternative hypothesis that the correlation is

    (a) greater than 0 if alternative == 'greater'
    (b) less than 0 if alternative == 'less'
    (c) different from 0 if alternative == 'two-sided'

    The correlation is the Pearson correlation coefficient,
    np.corrcoef(x, y)[0, 1].

    Permutations are carried out within the given groups: y is permuted
    within groups while x is held fixed.  Under the null hypothesis,
    observations within each group are exchangeable.

    The p-value is the fraction of the ``reps`` permutations whose statistic
    is at least as large as the observed one, where the statistic compared is
    the correlation ('greater'), its negative ('less') or its absolute
    value ('two-sided').

    If ``keep_dist``, also return the distribution of values of the test
    statistic over the permutations; otherwise, return only the p-value and
    the observed correlation.

    Parameters
    ----------
    x : array-like
        First variable
    y : array-like
        Second variable
    group : array-like
        Group assignments; permutations are done within each group
    alternative : {'greater', 'less', 'two-sided'}
        The alternative hypothesis to test.  Any other value raises KeyError.
    reps : int
        Number of permutations
    keep_dist : bool
        flag for whether to store and return the array of values
        of the test statistic
    seed : RandomState instance or {None, int, RandomState instance}
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`;
        If int, seed is the seed used by the random number generator;
        If RandomState instance, seed is the pseudorandom number generator.

    Returns
    -------
    float
        the estimated p-value
    float
        the observed correlation between x and y (signed, regardless of
        ``alternative``)
    numpy.ndarray
        The distribution of test statistics (negated for 'less', absolute
        values for 'two-sided'), one entry per permutation.
        Only returned if `keep_dist` == True
    """

    prng = get_prng(seed)

    observed_tst = np.corrcoef(x, y)[0, 1]
    theStat = {
        'greater': lambda u, v: np.corrcoef(u, v)[0, 1],
        'less': lambda u, v: -np.corrcoef(u, v)[0, 1],
        'two-sided': lambda u, v: math.fabs(np.corrcoef(u, v)[0, 1])
    }
    # Look the statistic up once rather than on every permutation.
    stat_fun = theStat[alternative]
    tst = stat_fun(x, y)

    # The same prng object is passed to every call so that successive
    # permutations continue one random stream.
    dist = np.empty(reps)
    for i in range(reps):
        dist[i] = stat_fun(x, permute_within_groups(y, group, seed=prng))
    hits = np.sum(dist >= tst)

    # float() keeps this a true division even when
    # `from __future__ import division` is not in effect (Python 2).
    p_value = hits / float(reps)

    if keep_dist:
        return p_value, observed_tst, dist
    return p_value, observed_tst
    
    
def stratified_two_sample(x, y, group_x, group_y, stat='mean', alternative="greater", reps=10**4, 
                          keep_dist=False, seed=None):
    """
    One-sided or two-sided, stratified two-sample permutation test for
    equality of two means, with p-value estimated by simulated random
    sampling with reps replications.

    Tests the hypothesis that x and y are a random partition of x,y
    against the alternative that x comes from a population with mean

    (a) greater than that of the population from which y comes,
        if alternative == 'greater'
    (b) less than that of the population from which y comes,
        if alternative == 'less'
    (c) different from that of the population from which y comes,
        if alternative == 'two-sided'

    The responses are pooled as [x, y] and permuted within the given groups,
    so that after each permutation the first len(x) entries play the role of
    x and the remaining entries the role of y.  Under the null hypothesis,
    observations within each group are exchangeable.

    The p-value is the fraction of the ``reps`` permutations whose statistic
    is at least as large as the observed one, where the statistic compared is
    the test statistic itself ('greater'), its negative ('less') or its
    absolute value ('two-sided').

    If ``keep_dist``, also return the distribution of values of the test
    statistic over the permutations; otherwise, return only the p-value and
    the observed test statistic.

    Parameters
    ----------
    x : array-like
        Sample 1
    y : array-like
        Sample 2
    group_x : array-like
        Group assignments for sample 1
    group_y : array-like
        Group assignments for sample 2
    stat : {'mean', 't'} or callable
        The test statistic.

        (a) If stat == 'mean', the test statistic is (mean(x) - mean(y))
            (equivalently, sum(x), since those are monotonically related)
        (b) If stat == 't', the test statistic is the two-sample t-statistic--
            but the p-value is still estimated by the randomization,
            approximating the permutation distribution.
            The t-statistic is computed using scipy.stats.ttest_ind
            with equal_var=True.
        (c) If stat is a function (a callable object), the test statistic is
            that function.  The function should take a permutation of the pooled
            data and compute the test function from it. For instance, if the
            test statistic is the Kolmogorov-Smirnov distance between the
            empirical distributions of the two samples, max_t |F_x(t) - F_y(t)|,
            the test statistic could be written:

            f = lambda u: np.max(
                [abs(sum(u[:len(x)]<=v)/len(x) - sum(u[len(x):]<=v)/len(y)) for v in u])
    alternative : {'greater', 'less', 'two-sided'}
        The alternative hypothesis to test.  Any other value raises KeyError.
    reps : int
        Number of permutations
    keep_dist : bool
        flag for whether to store and return the array of values
        of the test statistic
    seed : RandomState instance or {None, int, RandomState instance}
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`;
        If int, seed is the seed used by the random number generator;
        If RandomState instance, seed is the pseudorandom number generator.

    Returns
    -------
    float
        the estimated p-value
    float
        the observed test statistic on its original scale (signed; it is not
        negated or made absolute for the chosen ``alternative``)
    numpy.ndarray
        The distribution of test statistics (negated for 'less', absolute
        values for 'two-sided'), one entry per permutation.
        Only returned if `keep_dist` == True
    """

    prng = get_prng(seed)

    n_x = len(x)   # the first n_x pooled entries belong to sample 1
    z = np.concatenate([x, y])   # pooled responses
    groups = np.concatenate([group_x, group_y])   # pooled group assignments

    # If stat is callable, use it as the test function. Otherwise, look in the dictionary.
    # Both built-in statistics split the pooled vector at n_x = len(x); splitting
    # at len(y) would mix the two samples whenever their sizes differ.
    stats = {
        'mean': lambda u: np.mean(u[:n_x]) - np.mean(u[n_x:]),
        't': lambda u: ttest_ind(u[:n_x], u[n_x:], equal_var=True)[0]
    }
    if callable(stat):
        tst_fun = stat
    else:
        tst_fun = stats[stat]

    theStat = {
        'greater': tst_fun,
        'less': lambda u: -tst_fun(u),
        'two-sided': lambda u: math.fabs(tst_fun(u))
    }
    # Look the transformed statistic up once rather than on every permutation.
    stat_fun = theStat[alternative]
    tst = stat_fun(z)            # value compared against the permutation distribution
    observed_tst = tst_fun(z)    # untransformed value reported to the caller

    # The same prng object is passed to every call so that successive
    # permutations continue one random stream.
    dist = np.empty(reps)
    for i in range(reps):
        dist[i] = stat_fun(permute_within_groups(z, groups, seed=prng))
    hits = np.sum(dist >= tst)

    # float() keeps this a true division even when
    # `from __future__ import division` is not in effect (Python 2).
    p_value = hits / float(reps)

    if keep_dist:
        return p_value, observed_tst, dist
    return p_value, observed_tst

Read data
=================

Some notes on the variables:
* **group** identifies the section the student was placed in.
* **gender** refers to the student's gender: 1 = male, 2 = female.
* **tagender** is the instructor's true gender: 1 = male, 0 = female.
* **taidgender** is the instructor's reported gender: 1 = male, 0 = female.
* **grade** is on a scale from 0-100

Furthermore, the IRB did not allow grades to be linked to ratings. 4 students did not submit evaluations, but we do not know which ones. There are 43 ratings and 47 grades.

In [3]:
# Load the evaluation (ratings) data from CSV into a DataFrame.
ratings = pd.read_csv("Macnell-RatingsData.csv")
# Column names at positions 1 through 14; the analysis loops below iterate over
# these as the individual rating categories.
categories = ratings.columns.values.tolist()[1:15]
# Preview the first rows of the table.
ratings.head()

Unnamed: 0,group,professional,respect,caring,enthusiastic,communicate,helpful,feedback,prompt,consistent,fair,responsive,praised,knowledgeable,clear,overall,gender,age,tagender,taidgender
0,3,5,5,4,4,4,3,4,4,4,4,4,4,3,5,4,2,1990,0,1
1,3,4,4,4,4,5,5,5,5,3,4,5,5,5,5,4,1,1992,0,1
2,3,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,2,1991,0,1
3,3,5,5,5,5,5,3,5,5,5,5,3,5,5,5,5,2,1991,0,1
4,3,5,5,5,5,5,5,5,3,4,5,5,5,5,5,5,2,1992,0,1


In [4]:
# Load the course grade data from CSV into a DataFrame.
grades = pd.read_csv("Macnell-GradeData.csv")
# Preview the first rows of the table.
grades.head()

Unnamed: 0,group,grade,tagender,taidgender
0,3,77.4,0,1
1,3,89.02,0,1
2,3,53.5,0,1
3,3,88.32,0,1
4,3,90.02,0,1


# Analysis

## Evidence of gender bias

### Ratings vs reported instructor gender

In [5]:
# Unstratified two-sample permutation test on the overall rating:
# evaluations of male-identified instructors (taidgender == 1) versus
# female-identified instructors (taidgender == 0), using the t statistic.
# NOTE(review): no `seed` is passed to two_sample in this cell, unlike the cells
# that pass `seed = rs`; presumably the p-values vary between runs -- confirm
# against permute's default.
(p, t) = two_sample(ratings['overall'][ratings.taidgender==1], ratings['overall'][ratings.taidgender==0], \
                              stat = 't', alternative = "two-sided", keep_dist = False)
print 'Overall rating:'
print 't statistic:', np.round(t, 5)
print 'P-value (two-sided):', np.round(p, 5)
# Sample sizes of the two groups being compared.
print 'Number of evaluations for male-identified instructors:', np.sum(ratings.taidgender==1)
print 'Number of evaluations for female-identified instructors:', np.sum(ratings.taidgender==0)

# Repeat the same test for each individual rating category and print one table row per category.
print ('\n\n{0:24} {1:8} {2:8}'.format('Category', 't', 'p-value'))
for col in categories:
    (p, t) = two_sample(ratings[col][ratings.taidgender==1], ratings[col][ratings.taidgender==0], \
                              stat = 't', alternative = "two-sided", keep_dist = False)
    print ('{0:20} {1:8.2f} {2:8.2f}'.format(col, t, p))

Overall rating:
t statistic: 1.82159
P-value (two-sided): 0.0951
Number of evaluations for male-identified instructors: 23
Number of evaluations for female-identified instructors: 20


Category                 t        p-value 
professional             1.93     0.07
respect                  1.93     0.07
caring                   2.24     0.04
enthusiastic             2.14     0.05
communicate              2.06     0.06
helpful                  1.79     0.09
feedback                 1.63     0.14
prompt                   2.21     0.04
consistent               1.65     0.09
fair                     2.96     0.00
responsive               1.12     0.29
praised                  2.73     0.01
knowledgeable            1.64     0.13
clear                    1.37     0.19


### Ratings vs reported instructor gender (correlation)
We'll use a stratified test to evaluate the significance of the correlation between ratings and reported instructor gender.  The strata are defined by the instructor.  Permutations are done within strata (under the null, rating doesn't depend on reported gender, but may differ by teacher), then strata are pooled to compute an overall Pearson correlation.

In [41]:
# Stratified permutation test of the correlation between the overall rating and
# the reported instructor gender (taidgender), permuting within strata given by
# the actual instructor gender (tagender).
# NOTE(review): no `seed` is passed to stratified_corr in this cell, unlike the
# cells that pass `seed = rs`; presumably the p-values vary between runs -- confirm.
(p, t) = stratified_corr(ratings['overall'], ratings['taidgender'], ratings['tagender'], alternative = "two-sided")
print 'Overall rating:'
print 'Correlation:', t
print 'P-value (two-sided):', np.round(p, 5), "\n"

# Repeat the same test for each individual rating category and print one table row per category.
print ('\n\n{0:24} {1:8} {2:8}'.format('Category', 'Corr', 'P-value'))
for col in categories:
    (p, t) = stratified_corr(ratings[col], ratings['taidgender'], ratings['tagender'], alternative = "two-sided")
    print ('{0:20} {1:8.2f} {2:8.2f}'.format(col, t, p))

Overall rating:
Correlation: 0.233924970178
P-value (two-sided): 0.1308 



Category                 t        p-value 
professional             0.29     0.06
respect                  0.29     0.05
caring                   0.25     0.11
enthusiastic             0.28     0.05
communicate              0.27     0.09
helpful                  0.22     0.17
feedback                 0.21     0.16
prompt                   0.37     0.02
consistent               0.20     0.24
fair                     0.39     0.01
responsive               0.11     0.49
praised                  0.36     0.02
knowledgeable            0.18     0.29
clear                    0.17     0.31


### Ratings vs reported instructor gender (difference in means)
We'll use a stratified test to evaluate the significance of the difference in mean ratings between male-identified and female-identified instructors.  As above, strata are defined by the instructor (i.e., the instructor's actual gender), and permutations are done within strata.  There is a monotone transformation from the correlation to the difference in means statistic, so the two tests are equivalent.  P-values for these two tests should be roughly the same.

In [43]:
# Stratified two-sample permutation test on the overall rating (default stat,
# i.e. difference in means): sample 1 is evaluations of male-identified
# instructors (taidgender == 1), sample 2 is evaluations of female-identified
# instructors (taidgender == 0).  The group labels passed for each sample are
# the actual instructor gender (tagender).  Draws come from the shared generator `rs`.
(p, t) = stratified_two_sample(ratings['overall'][ratings.taidgender==1], ratings['overall'][ratings.taidgender==0], 
                               ratings['tagender'][ratings.taidgender == 1], 
                               ratings['tagender'][ratings.taidgender == 0],
                               alternative = "two-sided", seed = rs)
print 'Overall rating:'
print 'Difference in means:', t
print 'P-value (two-sided):', np.round(p, 5), "\n"

# Repeat the same test for each individual rating category and print one table row per category.
print ('\n\n{0:24} {1:8} {2:8}'.format('Category', 'Diff means', 'P-value'))
for col in categories:
    (p, t) = stratified_two_sample(ratings[col][ratings.taidgender==1], ratings[col][ratings.taidgender==0], 
                               ratings['tagender'][ratings.taidgender == 1],
                               ratings['tagender'][ratings.taidgender == 0],
                               alternative = "two-sided", seed = rs)
    print ('{0:20} {1:8.2f} {2:8.2f}'.format(col, t, p))

Overall rating:
Difference in means: 0.473913043478
P-value (two-sided): 0.1476 



Category                 Diff means P-value 
professional             0.61     0.06
respect                  0.61     0.06
caring                   0.52     0.12
enthusiastic             0.57     0.08
communicate              0.57     0.09
helpful                  0.46     0.19
feedback                 0.47     0.19
prompt                   0.80     0.02
consistent               0.46     0.24
fair                     0.76     0.01
responsive               0.22     0.54
praised                  0.67     0.02
knowledgeable            0.35     0.29
clear                    0.41     0.34


### Ratings vs concordance of student and REPORTED instructor genders
Unstratified analysis

In [6]:
# gender_concordance is True when the student's gender matches the REPORTED
# instructor gender.  Student gender is coded 1 = male, 2 = female, so
# `gender % 2` maps it onto the instructor coding (1 = male, 0 = female).
ratings['gender_concordance'] = ( (ratings['gender']% 2)==ratings['taidgender'] )
# Split the evaluations by student gender.
stu_male = ratings[ratings['gender']==1]
stu_female = ratings[ratings['gender']==2]

# --- Male students: unstratified permutation test of the correlation between
# rating and gender concordance.  corr's five return values are unpacked as
# (t, plow, pupper, pboth, sims); only t, pupper and pboth are reported.
(t, plow, pupper, pboth, sims) = corr(x = stu_male['overall'], \
                                      y = stu_male['gender_concordance'], seed = rs)
print 'Male students\n'
print 'Number of male students:', stu_male.shape[0], '\n'
print 'Correlation for overall rating:', t
print 'Upper p-value:', pupper
print 'Two-sided p-value:', pboth
print ('\n{0:15} {1:8} {2:8} {3:8}'.format('Category', 'Correlation',\
                                           'Upper p-value', 'Two-sided p-value'))
# NOTE(review): corr is called a second time with the same arguments for the
# 'Overall' table row.  Because `rs` is a stateful generator, presumably this run
# draws fresh permutations, so the p-values in the row need not equal the ones
# printed above -- confirm whether this is intentional.
(t, plow, pupper, pboth, sims) = corr(x = stu_male['overall'], \
                                      y = stu_male['gender_concordance'], seed = rs)
print ('{0:15} {1:8.3f} {2:10.2f} {3:10.2f}'.format('Overall', t, pupper, pboth))

# One table row per individual rating category.
for col in categories:
    (t, plow, pupper, pboth, sims) = corr(x = stu_male[col], \
                                          y = stu_male['gender_concordance'], seed = rs)
    print ('{0:15} {1:8.3f} {2:10.2f} {3:10.2f}'.format(col, t, pupper, pboth))


# --- Female students: same analysis as for male students.
(t, plow, pupper, pboth, sims) = corr(x = stu_female['overall'], \
                                      y = stu_female['gender_concordance'], seed = rs)
print '\nFemale students\n'
print 'Number of female students:', stu_female.shape[0], '\n'
print 'Correlation for overall rating:', t
print 'Upper p-value:', pupper
print 'Two-sided p-value:', pboth
print ('\n{0:15} {1:8} {2:8} {3:8}'.format('Category', 'Correlation', \
                                           'Upper p-value', 'Two-sided p-value'))
# Second call with the same arguments for the 'Overall' table row (see the
# review note in the male-student section of this cell).
(t, plow, pupper, pboth, sims) = corr(x = stu_female['overall'], \
                                      y = stu_female['gender_concordance'], seed = rs)
print ('{0:15} {1:8.3f} {2:10.2f} {3:10.2f}'.format('Overall', t, pupper, pboth))

# One table row per individual rating category.
for col in categories:
    (t, plow, pupper, pboth, sims) = corr(x = stu_female[col], \
                                          y = stu_female['gender_concordance'], seed = rs)
    print ('{0:15} {1:8.3f} {2:10.2f} {3:10.2f}'.format(col, t, pupper, pboth))

Male students

Number of male students: 20 

Correlation for overall rating: 0.089757421773
Upper p-value: 0.4227
Two-sided p-value: 0.8122

Category        Correlation Upper p-value Two-sided p-value
Overall            0.090       0.42       0.81
professional       0.217       0.27       0.42
respect            0.217       0.23       0.35
caring             0.020       0.53       0.99
enthusiastic       0.090       0.42       0.81
communicate        0.123       0.35       0.65
helpful            0.211       0.19       0.36
feedback           0.040       0.44       0.93
prompt             0.377       0.08       0.14
consistent         0.073       0.43       0.83
fair               0.408       0.06       0.08
responsive         0.181       0.28       0.53
praised            0.287       0.13       0.24
knowledgeable      0.078       0.41       0.80
clear              0.056       0.41       0.78

Female students

Number of female students: 23 

Correlation for overall rating: -0.363538598

### Ratings vs concordance of student and REPORTED instructor genders
Stratified analysis

In [53]:
# gender_concordance is True when the student's gender matches the REPORTED
# instructor gender.  Student gender is coded 1 = male, 2 = female, so
# `gender % 2` maps it onto the instructor coding (1 = male, 0 = female).
ratings['gender_concordance'] = ( (ratings['gender']% 2)==ratings['taidgender'] )
# Split the evaluations by student gender.
stu_male = ratings[ratings['gender']==1]
stu_female = ratings[ratings['gender']==2]

# Male students: stratified permutation test of the correlation between rating
# and gender concordance, permuting within strata given by the actual
# instructor gender (tagender).  Draws come from the shared generator `rs`.
print 'Male students\n'
print 'Number of male students:', stu_male.shape[0], '\n'
print ('\n{0:15} {1:8} {2:8}'.format('Category', 'Correlation','Two-sided p-value'))
(p, t) = stratified_corr(x = stu_male['overall'], y = stu_male['gender_concordance'],
                         group = stu_male['tagender'], alternative="two-sided", seed = rs)
print ('{0:15} {1:8.3f} {2:10.2f}'.format('Overall', t, p))

# One table row per individual rating category.
for col in categories:
    (p, t) = stratified_corr(x = stu_male[col], y = stu_male['gender_concordance'], 
                  group = stu_male['tagender'], alternative="two-sided", seed = rs)
    print ('{0:15} {1:8.3f} {2:10.2f}'.format(col, t, p))


Male students

Number of male students: 20 


Category        Correlation Two-sided p-value
Overall            0.090       0.81
professional       0.217       0.52
respect            0.217       0.34
caring             0.020       1.00
enthusiastic       0.090       0.82
communicate        0.123       0.68
helpful            0.211       0.41
feedback           0.040       0.90
prompt             0.377       0.15
consistent         0.073       0.85
fair               0.408       0.09
responsive         0.181       0.53
praised            0.287       0.27
knowledgeable      0.078       0.78
clear              0.056       0.76


In [52]:
# Female students: stratified permutation test of the correlation between rating
# and gender concordance, permuting within strata given by the actual
# instructor gender (tagender).  Relies on `stu_female` and the
# 'gender_concordance' column created in an earlier cell; draws come from the
# shared generator `rs`.
print 'Female students\n'
print 'Number of female students:', stu_female.shape[0], '\n'
print ('\n{0:15} {1:8} {2:8}'.format('Category', 'Correlation','Two-sided p-value'))
(p, t) = stratified_corr(x = stu_female['overall'], y = stu_female['gender_concordance'], 
                         group = stu_female['tagender'], alternative="two-sided", seed = rs)
print ('{0:15} {1:8.3f} {2:10.2f}'.format('Overall', t, p))

# One table row per individual rating category.
for col in categories:
    (p, t) = stratified_corr(x = stu_female[col], y = stu_female['gender_concordance'], 
                  group = stu_female['tagender'], alternative="two-sided", seed = rs)
    print ('{0:15} {1:8.3f} {2:10.2f}'.format(col, t, p))

Female students

Number of female students: 23 


Category        Correlation Two-sided p-value
Overall           -0.364       0.11
professional      -0.361       0.10
respect           -0.361       0.10
caring            -0.458       0.05
enthusiastic      -0.440       0.05
communicate       -0.394       0.10
helpful           -0.242       0.35
feedback          -0.373       0.10
prompt            -0.367       0.13
consistent        -0.335       0.18
fair              -0.431       0.04
responsive        -0.032       0.99
praised           -0.473       0.01
knowledgeable     -0.293       0.21
clear             -0.248       0.29


### As a sanity check -- Ratings vs concordance of student and ACTUAL instructor genders

Since the students didn't know the instructors' actual gender, we hope that there is no correlation between gender concordance and ratings.

(unstratified)

In [7]:
# gender_concordance_actual is True when the student's gender matches the
# ACTUAL instructor gender.  Student gender is coded 1 = male, 2 = female, so
# `gender % 2` maps it onto the instructor coding (1 = male, 0 = female).
ratings['gender_concordance_actual'] = ( (ratings['gender']% 2)==ratings['tagender'] )
# Re-create the per-gender subsets so that they include the new column.
stu_male = ratings[ratings['gender']==1]
stu_female = ratings[ratings['gender']==2]

# --- Male students: unstratified permutation test of the correlation between
# rating and concordance with the actual instructor gender.  corr's five return
# values are unpacked as (t, plow, pupper, pboth, sims); only t, pupper and
# pboth are reported.
(t, plow, pupper, pboth, sims) = corr(x = stu_male['overall'], \
                                      y = stu_male['gender_concordance_actual'], seed = rs)
print 'Male students\n'
print 'Number of male students:', stu_male.shape[0]
print 'Correlation:', t
print 'Upper p-value:', pupper
print 'Two-sided p-value:', pboth
print ('\n{0:15} {1:8} {2:8} {3:8}'.format('Category', 'Correlation',\
                                           'Upper p-value', 'Two-sided p-value'))
# NOTE(review): corr is called a second time with the same arguments for the
# 'Overall' table row.  Because `rs` is a stateful generator, presumably this run
# draws fresh permutations, so the p-values in the row need not equal the ones
# printed above -- confirm whether this is intentional.
(t, plow, pupper, pboth, sims) = corr(x = stu_male['overall'], \
                                      y = stu_male['gender_concordance_actual'], seed = rs)
print ('{0:15} {1:8.3f} {2:10.2f} {3:10.2f}'.format('Overall', t, pupper, pboth))

# One table row per individual rating category.
for col in categories:
    (t, plow, pupper, pboth, sims) = corr(x = stu_male[col], \
                                          y = stu_male['gender_concordance_actual'], seed = rs)
    print ('{0:15} {1:8.3f} {2:10.2f} {3:10.2f}'.format(col, t, pupper, pboth))


# --- Female students: same analysis as for male students.
(t, plow, pupper, pboth, sims) = corr(x = stu_female['overall'], \
                                      y = stu_female['gender_concordance_actual'], seed = rs)
print '\nFemale students\n'
print 'Number of female students:', stu_female.shape[0]
print 'Correlation:', t
print 'Upper p-value:', pupper
print 'Two-sided p-value:', pboth
print ('\n{0:15} {1:8} {2:8} {3:8}'.format('Category', 'Correlation',\
                                           'Upper p-value', 'Two-sided p-value'))
# Second call with the same arguments for the 'Overall' table row (see the
# review note in the male-student section of this cell).
(t, plow, pupper, pboth, sims) = corr(x = stu_female['overall'], \
                                      y = stu_female['gender_concordance_actual'], seed = rs)
print ('{0:15} {1:8.3f} {2:10.2f} {3:10.2f}'.format('Overall', t, pupper, pboth))

# One table row per individual rating category.
for col in categories:
    (t, plow, pupper, pboth, sims) = corr(x = stu_female[col], \
                                          y = stu_female['gender_concordance_actual'], seed = rs)
    print ('{0:15} {1:8.3f} {2:10.2f} {3:10.2f}'.format(col, t, pupper, pboth))

Male students

Number of male students: 20
Correlation: -0.0718144366713
Upper p-value: 0.6896
Two-sided p-value: 0.7214

Category        Correlation Upper p-value Two-sided p-value
Overall           -0.072       0.69       0.72
professional       0.080       0.37       0.74
respect            0.080       0.45       0.84
caring            -0.106       0.73       0.59
enthusiastic      -0.072       0.57       0.82
communicate       -0.010       0.61       0.84
helpful            0.014       0.52       0.96
feedback          -0.117       0.70       0.69
prompt            -0.049       0.64       0.89
consistent         0.054       0.49       0.85
fair              -0.034       0.63       0.88
responsive        -0.064       0.56       0.84
praised            0.010       0.56       1.00
knowledgeable      0.106       0.42       0.70
clear             -0.119       0.72       0.65

Female students

Number of female students: 23
Correlation: 0.132379460479
Upper p-value: 0.33
Two-sided p-value

## Grades and instructor gender

### Course grade and reported instructor gender
Do students of male- and female-identified instructors perform equally, as measured by course grade? We do a two-sample permutation t-test.

(unstratified)

In [11]:
# Unstratified two-sample permutation test on course grade: students of
# male-identified instructors (taidgender == 1) versus students of
# female-identified instructors (taidgender == 0), using the t statistic.
# NOTE(review): no `seed` is passed to two_sample in this cell, unlike the cells
# that pass `seed = rs`; presumably the p-value varies between runs -- confirm
# against permute's default.
(p, t) = two_sample(grades['grade'][grades.taidgender==1], grades['grade'][grades.taidgender==0], \
                              stat = 't', alternative = "two-sided")
print 'Course grade:'
print 't statistic:', np.round(t, 5)
print 'P-value (two-sided):', np.round(p, 5)
# Sample sizes of the two groups being compared.
print 'Number of students of male-identified instructors:', np.sum(grades.taidgender==1)
print 'Number of students of female-identified instructors:', np.sum(grades.taidgender==0)

Course grade:
t statistic: 0.21442
P-value (two-sided): 0.8322
Number of students of male-identified instructors: 23
Number of students of female-identified instructors: 24


In [57]:
# Stratified two-sample permutation test on course grade with the t statistic:
# sample 1 is students of male-identified instructors (taidgender == 1), sample 2
# is students of female-identified instructors (taidgender == 0).  The group
# labels passed for each sample are the actual instructor gender (tagender).
# NOTE(review): no `seed` is passed in this cell, unlike the cells that pass
# `seed = rs`; presumably the p-value varies between runs -- confirm.
(p, t) = stratified_two_sample(grades['grade'][grades.taidgender==1], grades['grade'][grades.taidgender==0], \
                               grades['tagender'][grades.taidgender==1], grades['tagender'][grades.taidgender==0], \
                              stat = 't', alternative = "two-sided")
print 'Course grade and reported instructor gender:'
print 't statistic:', np.round(t, 5)
print 'P-value (two-sided):', np.round(p, 5)
# Sample sizes of the two groups being compared.
print 'Number of students of male-identified instructors:', np.sum(grades.taidgender==1)
print 'Number of students of female-identified instructors:', np.sum(grades.taidgender==0)

Course grade:
t statistic: 0.21442
P-value (two-sided): 0.831
Number of students of male-identified instructors: 23
Number of students of female-identified instructors: 24


### Course grade and actual instructor gender
Do students of male and female instructors perform equally, as measured by course grade?  We do a two-sample permutation t-test.

In [12]:
(p, t) = two_sample(grades['grade'][grades.tagender==1], grades['grade'][grades.tagender==0], \
                              stat = 't', alternative = "two-sided")
print 'Course grade:'
print 't statistic:', np.round(t, 5)
print 'P-value (two-sided):', np.round(p, 5)
print 'Number of students of male instructors:', np.sum(grades.taidgender==1)
print 'Number of students of female instructors:', np.sum(grades.taidgender==0)

Course grade:
t statistic: 2.65325
P-value (two-sided): 0.00991
Number of students of male instructors: 23
Number of students of female instructors: 24


## References

MacNell, L., Driscoll, A., and Hunt, A.N. (2014), "What’s in a Name: Exposing Gender Bias in Student Ratings of Teaching," _Innovative Higher Education_, 1-13.