## Code Preamble

In [None]:
from math import pi, floor, ceil
from itertools import combinations, product
from time import monotonic_ns
from random import randrange

import numpy as np
import pandas as pd

from IPython.display import HTML
from bokeh.io import output_notebook, export_png
from bokeh.plotting import figure, ColumnDataSource, show
from bokeh.layouts import gridplot
from bokeh.models import HoverTool, LinearColorMapper, NumeralTickFormatter
from bokeh.models.tickers import BasicTicker
from bokeh.palettes import Blues9 as blues, Greys256 as greys, Category20_20 as category_palette
from bokeh.transform import dodge
from scipy.stats import chi2_contingency as two_way_chi_square

def _format_float(x):
    """Format a float for DataFrame display.

    Exact zeros and values whose magnitude exceeds 0.0001 are shown with four
    decimals; smaller non-zero magnitudes (e.g. tiny p-values) are shown in
    scientific notation with two decimals.
    """
    # compare magnitudes so that negative values and zero are not forced into
    # scientific notation
    if x == 0 or abs(x) > 0.0001:
        return f'{x:.4f}'
    return f'{x:.2e}'

pd.set_option('display.float_format', _format_float)

# Route Bokeh output to the notebook.
output_notebook()

# When True, create_hide_code() installs the "Show Code"/"Hide Code" buttons.
FOR_VIEWING = False
# When True, show_plot() also exports every plot to a PNG file.
EXPORT_TO_PNG = False
# Demographic attributes that the analysis functions iterate over.
ATTRIBUTES = ['race', 'sex', 'first_gen_status']

def create_hide_code():
    """Build the callable that notebook cells invoke to collapse their code.

    When ``FOR_VIEWING`` is false a do-nothing callable is returned. Otherwise
    the ``code_toggle`` JavaScript helper is displayed once, and the returned
    callable displays a toggle button whose script hides the input of the
    enclosing cell after a 500 ms timeout.
    """

    def _noop():
        return None

    if not FOR_VIEWING:
        return _noop

    toggle_script = '''
        <script>
            function code_toggle(button) {
                button = $(button);
                var input = button.parents(".cell").children(".input");
                input.toggle()
                if (input.css("display") === "none") {
                    button.html("Show Code");
                } else {
                    button.html("Hide Code");
                }
            }
        </script>
    '''
    display(HTML(toggle_script))

    def _hide_code():
        # the monotonic clock reading is used as the button's DOM id
        button_id = str(monotonic_ns())
        button_markup = '''
            <button id="''' + button_id + '''" onclick="javascript:code_toggle(this);">
                Hide Code
            </button>
            <script>
                setTimeout(function () {
                    var button = $("#''' + button_id + '''");
                    var input = button.parents(".cell").children(".input");
                    input.hide();
                    button.html("Show Code");
                }, 500);
            </script>
        '''
        display(HTML(button_markup))

    return _hide_code

hide_code = create_hide_code()

def grade_to_gpa(grade):
    """Convert a letter grade such as ``'B+'`` to a grade-point value.

    The letter maps F=0, D=1, C=2, B=3, A=4, and a ``+`` or ``-`` in the second
    position adds or subtracts 0.3. Missing values, non-string values, empty
    strings, strings that do not start with one of ``ABCDF``, and strings whose
    second character is not ``+`` or ``-`` all yield NaN.
    """
    # An identity test against np.nan only matches that one object; any other
    # NaN instance, a number, or '' would reach grade[0] and raise. Only
    # non-empty strings can be letter grades.
    if not isinstance(grade, str) or not grade:
        return np.nan
    letter = grade[0]
    if letter not in 'ABCDF':
        return np.nan
    points = 'FDCBA'.index(letter)
    if len(grade) < 2:
        return points
    modifier = grade[1]
    if modifier not in '+-':
        return np.nan
    # '-' -> -0.3, '+' -> +0.3
    return points + 0.6 * ('-+'.index(modifier) - 0.5)

def show_plot(plot, filename):
    """Display ``plot``; when ``EXPORT_TO_PNG`` is set, also export it to ``filename``."""
    show(plot)
    if not EXPORT_TO_PNG:
        return
    export_png(plot, filename=filename)

def read_twe_data():
    """Load the '2015-2019' sheet of ``demographics.xlsx`` as the TWE dataset.

    Returns a DataFrame tagged with ``assessment == 'twe'`` that holds the
    renamed demographic, grade and score columns plus derived GPA and pass
    flags. ``pass`` is true when at least two of the fall score, spring score
    and TWE score are above 3.
    """
    column_names = {
        'Fall Rubric Score': 'fall_score',
        'Spring Rubric Score': 'spring_score',
        'Blue Book': 'twe',
        'YEAR': 'year',
        'RACE': 'race',
        'SEX': 'sex',
        'FIRSTGEN': 'first_gen',
        'CSP_FallGRADE': 'fall_grade',
        'CSP_SpringGRADE': 'spring_grade',
    }
    sheet = pd.read_excel('demographics.xlsx', sheet_name='2015-2019')
    df = sheet[list(column_names)].rename(columns=column_names).copy()

    def _above_three(score):
        return score > 3

    df['assessment'] = 'twe'

    # demographic labels: anything other than 'Y' / 'F' falls into the second label
    df['first_gen_status'] = df['first_gen'].map(
        lambda flag: 'First Gen' if flag == 'Y' else 'Not First Gen'
    )
    df['sex'] = df['sex'].map(lambda code: 'Female' if code == 'F' else 'Male')

    # course grades as grade points
    for semester in ('fall', 'spring'):
        df[f'{semester}_gpa'] = df[f'{semester}_grade'].map(grade_to_gpa)

    # individual pass flags
    df['fall_pass'] = df['fall_score'].map(_above_three)
    df['spring_pass'] = df['spring_score'].map(_above_three)
    df['eval_pass'] = df['twe'] > 3

    # overall pass: any two of the three scores above 3
    fall_ok = df['fall_score'] > 3
    spring_ok = df['spring_score'] > 3
    twe_ok = df['twe'] > 3
    df['pass'] = (fall_ok & spring_ok) | (fall_ok & twe_ok) | (spring_ok & twe_ok)

    output_columns = [
        'assessment',
        'year', 'race', 'sex', 'first_gen_status',
        'fall_grade', 'fall_gpa',
        'spring_grade', 'spring_gpa',
        'fall_score', 'spring_score', 'twe',
        'fall_pass', 'spring_pass', 'eval_pass', 'pass',
    ]
    return df[output_columns]

def read_portfolio_data():
    """Load the '2020-2021' sheet of ``demographics.xlsx`` as the portfolio dataset.

    Returns a DataFrame tagged with ``assessment == 'portfolio'`` that holds the
    renamed demographic, grade and rater-score columns plus derived GPA, the
    row-wise mean of the available rater scores, and pass flags. Rows marked as
    an extension and rows without both a low and a high score are dropped.
    ``pass`` is true when the final determination reads 'completed'
    (case-insensitive).
    """
    raw_df = pd.read_excel('demographics.xlsx', sheet_name='2020-2021')
    df = raw_df[['low score', 'high score', 'adjudicator score', 'final determination', 'YEAR', 'RACE', 'SEX', 'FIRSTGEN', 'CSP_FallGRADE', 'CSP_SpringGRADE']].copy()
    df.columns = ['low', 'high', 'tiebreaker', 'pass', 'year', 'race', 'sex', 'first_gen', 'fall_grade', 'spring_grade']
    df['assessment'] = 'portfolio'
    df['first_gen_status'] = df['first_gen'].map(lambda value: 'First Gen' if value == 'Y' else 'Not First Gen')
    df['sex'] = df['sex'].map(lambda value: 'Female' if value == 'F' else 'Male')
    df['fall_gpa'] = df['fall_grade'].map(grade_to_gpa)
    df['spring_gpa'] = df['spring_grade'].map(grade_to_gpa)
    # '-' is the sheet's placeholder for an absent value
    df = df.replace('-', np.nan)
    # A column that contained '-' can remain object dtype after the replacement,
    # and mean(numeric_only=True) would then silently leave it out. Coerce the
    # score columns explicitly so every available score enters the mean.
    scores = df[['low', 'high', 'tiebreaker']].apply(
        lambda column: pd.to_numeric(column, errors='coerce')
    )
    df['mean'] = scores.mean(axis=1)
    df['fall_pass'] = df['fall_gpa'].map(lambda gpa: gpa > 2)
    df['spring_pass'] = df['spring_gpa'].map(lambda gpa: gpa > 2)
    df['eval_pass'] = df['mean'] >= 3
    # drop students with extensions or did not turn in their portfolios
    submitted = (
        (df['pass'] != 'EXTENSION UNTIL 5/08/20')
        & df['low'].notna()
        & df['high'].notna()
    )
    # copy so the assignment below writes to an independent frame rather than
    # to a filtered view of the previous one
    df = df[submitted].copy()
    # a missing determination is not a string; treat it as not completed
    # instead of failing on .lower()
    df['pass'] = df['pass'].map(
        lambda value: isinstance(value, str) and value.lower() == 'completed'
    )
    df = df[[
        'assessment',
        'year', 'race', 'sex', 'first_gen_status',
        'fall_grade', 'fall_gpa',
        'spring_grade', 'spring_gpa',
        'low', 'high', 'tiebreaker', 'mean',
        'fall_pass', 'spring_pass', 'eval_pass', 'pass',
    ]]
    return df

# Both datasets are loaded once here and read as module-level frames below.
TWE_DATA = read_twe_data()
PORT_DATA = read_portfolio_data()
# For every demographic attribute: its distinct values across both datasets,
# ordered from most to least frequent.
ATTR_VALUES = {
    attr: tuple(
        pd.concat([TWE_DATA[attr], PORT_DATA[attr]])
        .value_counts()
        .sort_values(ascending=False)
        .index
    )
    for attr in ATTRIBUTES
}
# Sorted cohort labels from both datasets.
ATTR_VALUES['year'] = sorted([*TWE_DATA['year'].unique(), *PORT_DATA['year'].unique()])
# Display color assigned to each value of each demographic attribute.
COLOR_MAPS = {
    'race': {
        'White': category_palette[5],
        'Hispanic or Latino': category_palette[11],
        'Asian': category_palette[3],
        'Two or More Races': category_palette[9],
        'Non Resident Alien': category_palette[1],
        'Black or African American': category_palette[15],
        'Unknown': category_palette[13],
        'Native Hawaiian or Other Pacific Islander': category_palette[7],
    },

    'sex': {
        'Female': category_palette[7],
        'Male': category_palette[1],
    },
    'first_gen_status': {
        'Not First Gen': category_palette[1],
        'First Gen': category_palette[3],
    },
}

hide_code()

## Sanity Check: College Demographics Over Time

In [None]:
def plot_demographics_old():
    """Show one stacked-bar plot of cohort composition per attribute, side by side.

    For each attribute in ``ATTRIBUTES`` the overall share of every value is
    displayed as a table, then the per-cohort counts are drawn as stacked bars.
    A dashed vertical line is drawn after the last TWE cohort.
    """
    # On a categorical axis the n-th factor spans [n - 1, n] in numeric
    # coordinates, so the count of TWE cohorts is the boundary between the two
    # assessments.  NOTE(review): assumes every TWE cohort sorts before every
    # portfolio cohort.
    years_before = len(TWE_DATA['year'].unique())
    grid = []
    for attr in ATTRIBUTES:
        attr_df = pd.concat([
            TWE_DATA.groupby(['year', attr]).count()['fall_grade'],
            PORT_DATA.groupby(['year', attr]).count()['fall_grade'],
        ])
        display(attr_df.groupby([attr]).sum().sort_values() / attr_df.sum())
        attr_df = attr_df.unstack(attr, fill_value=0).reset_index()
        fig = figure(
            width=960//3, height=960//3,
            x_range=ATTR_VALUES['year'], y_range=[0, 600],
            title=f'{attr.title().replace("_", " ")} Distribution of Cohorts',
            x_axis_label='Cohort',
        )
        fig.vbar_stack(
            ATTR_VALUES[attr],
            x='year',
            width=0.9,
            # look each color up by value: the stack order comes from
            # ATTR_VALUES (frequency order), which need not match the key
            # order of COLOR_MAPS
            color=[COLOR_MAPS[attr][attr_val] for attr_val in ATTR_VALUES[attr]],
            source=ColumnDataSource(attr_df),
        )
        fig.line(
            x=[years_before, years_before],
            y=[0, 600],
            color='black',
            line_width=1.5,
            line_dash='dashed',
        )
        fig.xaxis.major_label_orientation = -pi/4
        grid.append(fig)
    show_plot(gridplot([grid]), filename='1-demographic-baseline.png')

def plot_demographics(attr):
    """Plot per-cohort counts of each value of ``attr`` as grouped bars.

    A wide grey bar shows the cohort total ('All'); one narrower bar per
    attribute value is drawn side by side within the same cohort slot. A dashed
    vertical line is drawn after the last TWE cohort, and the figure is handed
    to ``show_plot``.
    """
    attr_df = pd.concat([
        TWE_DATA.groupby(['year', attr]).count()['fall_grade'],
        PORT_DATA.groupby(['year', attr]).count()['fall_grade'],
    ])
    attr_df = attr_df.unstack(attr, fill_value=0)
    attr_df['All'] = attr_df.sum(axis=1)
    attr_df = pd.concat(
        [
            attr_df.stack(),
            # the row sum now includes the 'All' column and is therefore twice
            # the cohort size; the factor 2 turns the ratio into a cohort share
            (2 * attr_df.div(attr_df.sum(axis=1), axis=0)).stack(),
        ],
        axis=1,
    ).rename(columns={0:'count', 1:'percent'}).reset_index()

    fig = figure(
        width=960, height=480,
        x_range=ATTR_VALUES['year'], y_range=[0, 600],
        title=f'{attr.title().replace("_", " ")} Distribution of Cohorts',
        x_axis_label='Cohort',
    )
    year_width = 0.9
    renderers = []
    renderer = fig.vbar(
        x='year',
        top='count',
        width=year_width,
        color='#C0C0C0',
        fill_alpha=0.5,
        source=ColumnDataSource(attr_df[attr_df[attr] == 'All']),
        legend_label='All',
    )
    renderers.append(renderer)
    num_attr_vals = len(ATTR_VALUES[attr])
    # Centers of num_attr_vals equal slots spread symmetrically around the
    # cohort position. This works for odd and even counts alike; the previous
    # floor-division formula shifted every bar left for odd counts.
    slot_width = year_width / num_attr_vals
    dodge_offsets = [
        (i - (num_attr_vals - 1) / 2) * slot_width
        for i in range(num_attr_vals)
    ]
    # leave a small gap between neighbouring bars
    dodge_width = slot_width - 0.01
    for i, attr_val in enumerate(ATTR_VALUES[attr]):
        renderer = fig.vbar(
            x=dodge('year', dodge_offsets[i], range=fig.x_range),
            top='count',
            width=dodge_width,
            color=COLOR_MAPS[attr][attr_val],
            source=ColumnDataSource(attr_df[attr_df[attr] == attr_val]),
            legend_label=attr_val,
        )
        renderers.append(renderer)

    fig.add_tools(HoverTool(renderers=renderers, tooltips=[
        ('Cohort', '@year'),
        (attr.replace('_', ' ').title(), f'@{attr}'),
        ('Count', '@count'),
        ('Percent', '@percent{0.00%}'),
    ]))
    # the number of TWE cohorts is used as the x position of the separator
    years_before = len(TWE_DATA['year'].unique())
    fig.line(
        x=[years_before, years_before],
        y=[0, 1000],
        color='black',
        line_width=1.5,
        line_dash='dashed',
    )
    fig.xgrid.visible = False
    fig.ygrid.visible = False
    fig.legend.location = 'top_left'
    show_plot(fig, filename=f'1-demographic-baseline-{attr}.png')

hide_code()

The plots here do not directly say anything about the writing assessment, but they lay the groundwork for the demographic analysis we do later. In particular, these plots show that the demographics before and after the change in assessment are not significantly different. The exception is for the 2020-2021 academic year, which was completely remote due to the COVID-19 pandemic. This led to an overall decrease in enrollment, as well as a reduction in Asian and non-resident alien students. The distribution of sex and first-generation status of students was not significantly affected.

In [None]:
plot_demographics('race')

In [None]:
plot_demographics('sex')

In [None]:
plot_demographics('first_gen_status')

## Ideological Lens

### Pass Rates for Non-Course Assessment (TWE and Portfolio)

In [None]:
def pass_rate_df(attr, overall):
    """Tabulate pass/fail counts and rates per assessment and cohort.

    Rows are grouped by assessment and year, and additionally by ``attr`` when
    it is not None. ``overall`` selects the ``pass`` column as the outcome;
    otherwise ``eval_pass`` is used. The result has the grouping columns
    followed by ``failed``, ``passed``, ``count``, ``passed_percent`` and
    ``failed_percent``.
    """
    group_keys = ['assessment', 'year'] + ([] if attr is None else [attr])
    outcome_column = 'pass' if overall else 'eval_pass'

    def _outcome_counts(data):
        # aggfunc=len counts the rows in each (group, outcome) cell
        return data.pivot_table(
            index=group_keys,
            columns=[outcome_column],
            values=['fall_grade'],
            aggfunc=len,
            fill_value=0,
        )

    counts = pd.concat([_outcome_counts(TWE_DATA), _outcome_counts(PORT_DATA)])
    # the two outcome columns are relabelled positionally as failed, passed
    counts = counts.reset_index().set_axis([*group_keys, 'failed', 'passed'], axis=1)
    totals = counts['passed'] + counts['failed']
    return counts.assign(
        count=totals,
        passed_percent=counts['passed'] / totals,
        failed_percent=counts['failed'] / totals,
    )

def plot_pass_rates(attr=None, overall=True, row_size=1):
    """Build a line plot of pass rate per cohort.

    Parameters:
        attr: demographic attribute to split the series by, or None for a
            single series over all students.
        overall: forwarded to ``pass_rate_df`` to choose the pass outcome.
        row_size: number of plots intended to share a row; the figure's width
            and height are divided by it.

    Returns the Bokeh figure (it is not shown here).
    """
    attr_df = pass_rate_df(attr, overall)
    attr_label = None if attr is None else attr.replace('_', ' ').title()
    fig = figure(
        width=960 // row_size, height=480 // row_size,
        x_range=ATTR_VALUES['year'], y_range=[0, 1],
        # the attribute name used to be appended without any separator
        title=('Pass Rate' if attr is None else f'Pass Rate by {attr_label}'),
        x_axis_label='Cohort',
    )

    def _add_series(data, label, color=None):
        # markers and connecting line for one series; the marker renderer is
        # returned so that hover can be attached to it
        style = {} if color is None else {'color': color}
        marker = fig.square(
            x='year',
            y='passed_percent',
            size=8,
            source=ColumnDataSource(data),
            legend_label=label,
            **style,
        )
        fig.line(
            x='year',
            y='passed_percent',
            line_width=2,
            source=ColumnDataSource(data),
            **style,
        )
        return marker

    if attr is None:
        renderers = [_add_series(attr_df, 'Pass Rate')]
    else:
        renderers = [
            _add_series(attr_df[attr_df[attr] == attr_val], attr_val, COLOR_MAPS[attr][attr_val])
            for attr_val in ATTR_VALUES[attr]
        ]

    # label the attribute row after the attribute actually plotted, and omit
    # it when there is no attribute
    tooltips = [('Cohort', '@year')]
    if attr is not None:
        tooltips.append((attr_label, f'@{attr}'))
    tooltips.extend([
        ('Count', '@count'),
        ('Pass Rate', '@passed_percent{0%}'),
    ])
    fig.add_tools(HoverTool(renderers=renderers, tooltips=tooltips))

    # On a categorical axis the n-th factor spans [n - 1, n], so the number of
    # TWE cohorts marks the switch of assessment.  NOTE(review): assumes every
    # TWE cohort sorts before every portfolio cohort.
    years_before = len(TWE_DATA['year'].unique())
    fig.line(
        x=[years_before, years_before],
        y=[0, 1],
        color='black',
        line_dash='dashed',
    )
    fig.yaxis.formatter = NumeralTickFormatter(format='0 %')
    fig.legend.location = 'bottom_left'
    return fig

def chi_square_pass_rate_between_assessments(attr, overall, attr_val=None):
    """Chi-square test of pass/fail counts across the assessments.

    The counts from ``pass_rate_df(attr, overall)`` are optionally restricted
    to rows where ``attr`` equals ``attr_val``, summed per assessment, and the
    resulting table is passed to the chi-square contingency test, whose result
    is returned unchanged.
    """
    rates = pass_rate_df(attr, overall)
    selected = rates if attr_val is None else rates[rates[attr] == attr_val]
    contingency = selected.groupby('assessment')[['failed', 'passed']].sum().to_numpy()
    return two_way_chi_square(contingency)

def assessment_chi_square_results(attr, overall, p_level=0.01):
    """Run the between-assessment chi-square test for every value of ``attr``.

    Returns a DataFrame indexed by attribute value with the p-value and a
    ``significant`` flag that is true when the p-value is below ``p_level``.
    """

    def _result_row(attr_val):
        chi_square, p, _dof, _expected = chi_square_pass_rate_between_assessments(
            attr, overall, attr_val
        )
        return [attr_val, chi_square, p]

    results_df = pd.DataFrame(
        [_result_row(attr_val) for attr_val in ATTR_VALUES[attr]],
        columns=[attr, 'chi_square', 'p'],
    )
    flagged = results_df.assign(significant=results_df['p'] < p_level)
    return flagged[[attr, 'p', 'significant']].set_index([attr])

hide_code()

Here we are interested in whether a portfolio-based evaluation is more representative of student ability than a TWE-based evaluation. A simple metric for answering this question is looking at the percentage of students who pass the evaluation:

In [None]:
show(plot_pass_rates(overall=False))

## Curricular Lens

### Pass Rate of Fall and Spring FYS

In [None]:
def plot_course_pass_rate():
    """Build a plot of the per-cohort pass rate of the fall course, the spring
    course and the non-course evaluation.

    Returns the Bokeh figure (it is not shown here).
    """
    columns = ['year', 'assessment', 'fall_pass', 'spring_pass', 'eval_pass']
    df = pd.concat([TWE_DATA[columns], PORT_DATA[columns]])
    eval_types = ['fall', 'spring', 'eval']
    dfs = []
    for i, eval_type in enumerate(eval_types):
        # aggfunc=len only counts rows, so one of the other pass columns is
        # borrowed as the value column
        counted_column = f'{eval_types[(i + 1) % len(eval_types)]}_pass'
        dfs.append(
            df.pivot_table(
                index=['year', 'assessment'],
                columns=[f'{eval_type}_pass'],
                values=[counted_column],
                aggfunc=len,
                # a cohort with no failures (or no passes) must count as 0,
                # otherwise its total and pass rate become NaN
                fill_value=0,
            )
            .reset_index()
            .set_axis(['year', 'assessment', 'fail', 'pass'], axis=1)
            .assign(
                total=(lambda df: df['pass'] + df['fail']),
                pass_portion=(lambda df: df['pass'] / df['total']),
                eval_type=eval_type,
            )
        )
    df = pd.concat(dfs)
    fig = figure(
        x_range=sorted(df['year'].unique()),
        y_range=[0, 1],
    )
    # one distinct color per series so that the legend can tell them apart
    colors = dict(zip(eval_types, category_palette[::2]))
    for eval_type in eval_types:
        fig.square(
            x='year',
            y='pass_portion',
            color=colors[eval_type],
            source=ColumnDataSource(df[df['eval_type'] == eval_type]),
            legend_label=eval_type,
        )
        fig.line(
            x='year',
            y='pass_portion',
            color=colors[eval_type],
            source=ColumnDataSource(df[df['eval_type'] == eval_type]),
        )
    return fig

show(plot_course_pass_rate())

FIXME scatter plot matrix of TWE

This should show that the increase in TWE pass rate doesn't help anyone - almost everyone who passes the TWE that way would have passed the two FYS courses anyway.

In [None]:
def plot_matrix_twe():
    """Display the share of students in each pass/fail combination per assessment.

    Students are grouped by overall pass, by how many of the two courses they
    passed ('neither', 'one' or 'both') and by evaluation pass. For each group
    the table shows its share of all TWE students, its share of all portfolio
    students, and the difference (portfolio minus TWE).
    """
    columns = ['year', 'assessment', 'fall_pass', 'spring_pass', 'eval_pass', 'pass']

    def _course_pass(row):
        # collapse the two course flags into the number of courses passed
        if not (row['fall_pass'] or row['spring_pass']):
            return 'neither'
        if row['fall_pass'] and row['spring_pass']:
            return 'both'
        return 'one'

    with pd.option_context('display.float_format', '{:.2f}'.format):
        display(
            pd.concat([TWE_DATA[columns], PORT_DATA[columns]])
            .groupby(['assessment', 'eval_pass', 'fall_pass', 'spring_pass', 'pass']).count()
            ['year']
            .reset_index()
            .rename(columns={'year':'count'})
            .assign(
                course_pass=(lambda df: df.apply(_course_pass, axis=1)),
            )
            .pivot_table(
                index=['pass', 'course_pass', 'eval_pass'],
                columns=['assessment'],
                values=['count'],
                # 'one' merges the fall-only and spring-only rows; their counts
                # must be added, whereas the default aggfunc would average them
                aggfunc='sum',
                fill_value=0,
            )
            .reset_index()
            # assessment columns are relabelled positionally: portfolio, then twe
            .set_axis(['pass', 'course_pass', 'eval_pass', 'port_count', 'twe_count'], axis=1)
            .assign(
                port_portion=(lambda df: df['port_count'] / df['port_count'].sum()),
                twe_portion=(lambda df: df['twe_count'] / df['twe_count'].sum()),
                diff=(lambda df: df['port_portion'] - df['twe_portion']),
            )
            [['course_pass', 'eval_pass', 'pass', 'twe_portion', 'port_portion', 'diff']]
        )

hide_code()
plot_matrix_twe()

fundamental contradiction: is the TWE/portfolio supposed to align with courses, or to provide an independent third evaluation?
* if the former, should correlate with course grade
* if the latter, should have lower correlation (due to _independence_)

## Performance Lens

In [None]:
def chi_square_pass_rate_between_demographics(assessment, attr, overall, attr_val=None):
    """Chi-square test of pass/fail counts across values of ``attr`` within one assessment.

    With ``attr_val`` omitted, all values of ``attr`` are tested together.
    Otherwise only ``attr_val`` and the first entry of ``ATTR_VALUES[attr]`` are
    compared; if ``attr_val`` has no rows for this assessment, the tuple
    ``(nan, nan, nan, None)`` is returned instead of a test result.
    """
    rates = pass_rate_df(attr, overall)
    rates = rates[rates['assessment'] == assessment]
    counts = rates.groupby(attr)[['failed', 'passed']].sum()
    if attr_val is None:
        return two_way_chi_square(counts.to_numpy())
    if attr_val not in counts.index:
        return np.nan, np.nan, np.nan, None
    reference_val = ATTR_VALUES[attr][0]
    pair_counts = counts.loc[[attr_val, reference_val]]
    return two_way_chi_square(pair_counts.to_numpy())


def demographic_chi_square_results(attr, overall, p_level=0.01):
    """Compare every value of ``attr`` except the first against the first, per assessment.

    Returns a DataFrame indexed by (attribute value, assessment) with the
    p-value and a ``significant`` flag that is true when the p-value is below
    ``p_level``.
    """

    def _result_row(assessment, attr_val):
        chi_square, p, _dof, _expected = chi_square_pass_rate_between_demographics(
            assessment, attr, overall, attr_val
        )
        return [assessment, attr_val, chi_square, p]

    results_df = pd.DataFrame(
        [
            _result_row(assessment, attr_val)
            for attr_val in ATTR_VALUES[attr][1:]
            for assessment in ['twe', 'portfolio']
        ],
        columns=['assessment', attr, 'chi_square', 'p'],
    )
    flagged = results_df.assign(significant=results_df['p'] < p_level)
    return flagged[[attr, 'assessment', 'p', 'significant']].set_index([attr, 'assessment'])

hide_code()

for TWE: two raters, if both pass/fail, will take low score
if split score, will just use third score

### Pass Rates for Non-Course Assessment (TWE and Portfolio)

### Pass Rate for Overall Assessment

### Pass Rate

Here we look at the overall pass rate and whether that differs by demographic attributes. We are specifically interested in two questions:

1. Are there differences in pass rates between races, sexes, and first-generation students?
2. Did the change in assessment lead to a change in pass rate?

We first look at the pass rate by race in the TWE versus the portfolio:

In [None]:
show(plot_pass_rates('race', overall=False))

Note that the Native Hawaiian or Other Pacific Islander population is small (never more than two a year), hence the dramatic changes it exhibits.

It is immediately obvious that switching to a portfolio-based evaluation had a substantial impact on pass rates for all students, regardless of race. We can confirm this statistically with a chi-square test of independence. In our case, our null hypothesis is that the evaluation method is independent of pass rate, which is to say pass rate is _not_ correlated with whether the TWE or the portfolio was used. A low p-value (p < 0.01) would reject the null hypothesis, suggesting that the evaluation method _does_ have an effect on pass rate.

In [None]:
assessment_chi_square_results('race', overall=False)

As expected, these results show that the change to a portfolio-based evaluation led to significant changes in the pass rates for all races, with the exception of students identified as Native Hawaiian or Other Pacific Islander or of unknown race, of which there are too few for the chi-square test to be meaningful.

---

We first look at the pass rate by race:

In [None]:
show(plot_pass_rates('race', overall=True))

At a glance, the only significant difference in pass rates is for non-resident aliens, who seem to have a lower pass rate than other students both before and after the change in assessment. This appearance, however, can be deceiving. To better quantify whether race plays a role in the pass rate, we can use a chi-square test of independence. In our case, our null hypothesis is that race is independent of pass rate, which is to say that a student's race is _not_ correlated with whether they will pass. A low p-value (p < 0.01) would reject the null hypothesis, suggesting that race _does_ have an effect on pass rate.

Since like must be compared against like, we look at the old TWE assessment and the new portfolio assessment separately, and aggregate over all years when that specific assessment was used. Performing the chi-square test on the TWE results gives a p-value of:

In [None]:
chi_square_pass_rate_between_demographics('twe', 'race', overall=True).pvalue

Performing the chi-square test on the portfolio results gives a p-value of:

In [None]:
chi_square_pass_rate_between_demographics('portfolio', 'race', overall=True).pvalue

Both of these p-values are below the p-level of 0.01, meaning that for both TWE and portfolio assessments, race _does_ have an effect on pass rate. We can further explore which race(s) have pass rates that differ significantly from the pass rates of white students by performing pairwise chi-square tests. Since we already know there will be significant results, we apply the Bonferroni correction to use a p-level of 0.01 / 7 ~= 0.00143, which raises the standard for finding a result significant. In this case, the null hypothesis is that race _does not_ have an effect on pass rate compared to white students; a low p-value (p < 0.00143) would reject the null hypothesis, suggesting that race _does_ have an effect on pass rate compared to white students.

In [None]:
demographic_chi_square_results('race', overall=True, p_level=0.00143)

FIXME description of these results
* does a higher P-value after the change in assessment mean anything? I don't think so, hence the next test

We could also look at whether the change in assessment led to differences in pass rates. The null hypothesis here is that the assessment method _does not_ have an effect on pass rate; a low p-value (p < 0.01) would reject the null hypothesis, suggesting that the assessment _does_ have an effect on pass rate.

In [None]:
assessment_chi_square_results('race', overall=True)

FIXME description of these results

Comparatively, the pass rates for sex and first-generation status are clearer, where male students and first-generation students have lower pass rates than female and non-first-generation students respectively.

In [None]:
show(gridplot([[
    plot_pass_rates('sex', overall=True, row_size=2),
    plot_pass_rates('first_gen_status', overall=True, row_size=2),
]]))

Although unnecessary, the chi-square tests confirm that sex and first-generation status do affect pass rates, with p-values < 0.01.

In [None]:
chi_square_pass_rate_between_demographics('twe', 'sex', overall=True).pvalue

In [None]:
chi_square_pass_rate_between_demographics('twe', 'first_gen_status', overall=True).pvalue

Similarly, it is also clear that the change from TWE to portfolio did not affect pass rates:

In [None]:
assessment_chi_square_results('sex', overall=True)

In [None]:
assessment_chi_square_results('first_gen_status', overall=True)

### Rubric Category Scores

[This is interesting, but potentially irrelevant for this paper.]

## Feedback Lens

The "feedback" lens can be seen as a combination of the ecological lens and FIXME

### Grade Correlation with Assessment Score

In [None]:
# Letter grades from lowest to highest; used as the categorical x-axis of the
# grade/score heatmaps.
GRADES = ['F', 'D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A']

def _kendalls_tau(pairs):
    """Return Kendall's tau-a for an iterable of ``(x, y)`` observations.

    A pair of observations tied in either coordinate counts as neither
    concordant nor discordant, while the denominator is the total number of
    pairs, n(n-1)/2. Observations containing NaN are left out, and NaN is
    returned when fewer than two observations remain.
    """
    # NaN is the only value that differs from itself. Such observations would
    # fail every ordering comparison below and be miscounted as discordant.
    complete = [(x, y) for x, y in pairs if x == x and y == y]
    n = len(complete)
    if n < 2:
        # no pairs to compare; avoid dividing by zero
        return float('nan')
    concordant = 0
    discordant = 0
    for (x1, y1), (x2, y2) in combinations(complete, 2):
        if x1 == x2 or y1 == y2:
            continue
        # with ties excluded, the pair is concordant exactly when both
        # coordinates are ordered the same way
        if (x1 < x2) == (y1 < y2):
            concordant += 1
        else:
            discordant += 1
    return (concordant - discordant) / (n * (n - 1) / 2)

def kendalls_tau(df, semester):
    """Return Kendall's tau-a between ``{semester}_gpa`` and ``mod_score`` in ``df``."""
    return _kendalls_tau(list(df[[f'{semester}_gpa', 'mod_score']].itertuples(index=False)))

def create_fys_correlation_df(df, assessment, split_fn=None):
    """Return a copy of the rows of ``df`` for ``assessment`` with a ``mod_score`` column.

    ``mod_score`` equals ``score`` when ``split_fn`` is None; otherwise it is
    ``split_fn`` applied to each score. ``df`` itself is not modified.
    """
    subset = df.loc[df['assessment'] == assessment].copy()
    scores = subset['score']
    subset['mod_score'] = scores if split_fn is None else scores.map(split_fn)
    return subset

def create_fys_correlation_plot(df, assessment, semester):
    """Build a heatmap of course grade against assessment score.

    Each cell shows how many rows of ``df`` for ``assessment`` share a
    ``{semester}_grade`` and score; the title reports Kendall's tau between the
    semester GPA and the score. Returns the Bokeh figure.
    """
    is_twe = assessment == 'twe'
    assessment_name = 'TWE' if is_twe else 'Portfolio'
    grade_column = f'{semester}_grade'

    scored = create_fys_correlation_df(df, assessment)
    # tau is computed on the individual rows, before they are aggregated
    tau = kendalls_tau(scored, semester)

    cell_counts = scored.groupby([grade_column, 'mod_score']).count()['assessment']
    cells = cell_counts.reset_index().rename(columns={'assessment':'count'})
    cells['text'] = cells['count'].map(str)
    max_count = cells['count'].max()

    plot_width = 960//2
    fig = figure(
        width=plot_width, height=int((6/11)*plot_width),
        x_range=GRADES, y_range=[-0.5, 5.5],
        match_aspect=True,
        title=f'Correlation between {semester.title()} Grade and {assessment_name} Score (tau = {tau:.3f})',
    )
    # oversized rectangle that acts as the background color of empty cells
    fig.rect(x=0, y=0, width=100, height=100, fill_color=blues[-1])
    # colored cells: TWE cells are one unit tall, portfolio cells half a unit
    cell_fill = {
        'field': 'count',
        'transform': LinearColorMapper(palette=[*reversed(blues)], low=1, high=max_count),
    }
    fig.rect(
        x=grade_column,
        y='mod_score',
        width=1, height=(1 if is_twe else 0.5),
        fill_color=cell_fill,
        line_color=None,
        source=ColumnDataSource(cells),
    )
    # count labels, with the text color mapped from the count
    label_color = {
        'field': 'count',
        'transform': LinearColorMapper(palette=(greys[:50] + greys[-50:]), low=1, high=max_count),
    }
    fig.text(
        x=grade_column,
        y='mod_score',
        text='text',
        source=ColumnDataSource(cells),
        text_align='center',
        text_baseline='middle',
        text_color=label_color,
        text_font_size='18px',
        y_offset=2,
    )
    fig.yaxis.ticker = BasicTicker(num_minor_ticks=(1 if is_twe else 2))
    return fig

def plot_fys_correlations():
    """Show a 2x2 grid of grade/score heatmaps: one row per assessment, one column per semester."""
    shared_columns = ['assessment', 'fall_grade', 'fall_gpa', 'spring_grade', 'spring_gpa']
    twe_part = TWE_DATA[[*shared_columns, 'twe']].rename(columns={'twe':'score'})
    port_part = PORT_DATA[[*shared_columns, 'mean']].rename(columns={'mean':'score'})
    # rows with any missing grade, GPA or score are dropped
    # FIXME why is this necessary?
    fys_df = pd.concat([twe_part, port_part]).dropna()
    grid = [
        [
            create_fys_correlation_plot(fys_df, assessment, semester)
            for semester in ['fall', 'spring']
        ]
        for assessment in ['twe', 'portfolio']
    ]
    show_plot(gridplot(grid), '3-grade-correlations.png')

def check_binning_tau():
    """Tabulate Kendall's tau for the portfolio under several ways of rounding its mean score.

    Returns a DataFrame with one row per rounding method and one column per
    semester.
    """

    def _round_toward(target):
        # round down above the target and up otherwise
        return (lambda x: floor(x) if x > target else ceil(x))

    split_fns = {
        'ceil': ceil,
        'floor': floor,
        'random': (lambda x: x if x == int(x) else floor(x) + randrange(2)),
    }
    split_fns.update({f'to-{target}': _round_toward(target) for target in range(6)})

    shared_columns = ['assessment', 'fall_grade', 'fall_gpa', 'spring_grade', 'spring_gpa']
    twe_part = TWE_DATA[[*shared_columns, 'twe']].rename(columns={'twe':'score'})
    port_part = PORT_DATA[[*shared_columns, 'mean']].rename(columns={'mean':'score'})
    # rows with any missing grade, GPA or score are dropped
    # FIXME why is this necessary?
    fys_df = pd.concat([twe_part, port_part]).dropna()

    rows = []
    for name, split_fn in split_fns.items():
        binned = create_fys_correlation_df(fys_df.copy(), 'portfolio', split_fn=split_fn)
        rows.extend(
            [name, semester, kendalls_tau(binned, semester)]
            for semester in ['fall', 'spring']
        )
    tau_df = pd.DataFrame(rows, columns=['method', 'semester', 'tau'])
    return tau_df.pivot_table(index=['method'], columns=['semester']).droplevel(0, axis=1)

hide_code()

Note that some TWE scores are NANs; why?

In [None]:
plot_fys_correlations()

Reasons why correlation is low:
* course grades include non-writing components (eg. participation, knowledge-based content, etc.)
* FIXME

Areas of concern:

1. There are more years of TWE than of the portfolio (and since FYS switched to S/U, we will never get more data for the latter)
2. There are finer distinctions in the portfolio, due to averaging

In [None]:
check_binning_tau()