## Code Preamble

In [None]:
from uuid import uuid4 as uuid
from math import pi as PI

import numpy as np
import pandas as pd

from IPython.display import HTML
from bokeh.io import output_notebook, export_png
from bokeh.plotting import figure, ColumnDataSource, show
from bokeh.layouts import gridplot, column, layout
from bokeh.models import HoverTool, LinearColorMapper, NumeralTickFormatter, ColorBar
from bokeh.palettes import Blues9 as blues, Oranges9 as oranges, Category20_20 as category_palette
from bokeh.transform import dodge
from scipy.stats import chi2_contingency as two_way_chi_square

# Show floats with four decimals, switching to scientific notation only for
# non-zero values whose magnitude is at most 1e-4 (e.g. very small p-values).
# The magnitude is tested with abs() so that negative numbers and exact zeros
# are not pushed into scientific notation.
pd.set_option(
    'display.float_format',
    (lambda x: f'{x:.4f}' if x == 0 or abs(x) > 0.0001 else f'{x:.2e}'),
)

# Configure Bokeh to render figures inline in notebook cells.
output_notebook()

# When true, hide_code() adds a Show/Hide Code button to a cell and collapses
# that cell's input shortly after it renders.
FOR_VIEWING = False
# When true, show_plot() also exports each figure to output-images/ as a PNG.
EXPORT_TO_PNG = True
# Demographic columns the analysis is broken down by.
ATTRIBUTES = ['race', 'sex', 'first_gen_status']
# Rubric category columns read from the assessment files.
CATEGORIES = ['reflection', 'focus', 'organization', 'evidence & development', 'writing']


# Inject the code_toggle() JavaScript helper only once: the flag set at the end
# keeps a re-run of this cell from emitting the <script> again.
# code_toggle(button) walks up from the button to the enclosing ".cell" element
# and shows or hides that cell's ".input" child, relabelling the button.
if 'HIDE_CODE_INJECTED' not in locals():
    display(HTML('''
        <script>
            function code_toggle(button) {
                let cell = button;
                while (!cell.matches(".cell")) {
                    cell = cell.parentElement;
                }
                let input = null;
                for (const child of cell.children) {
                    if (child.matches(".input")) {
                        input = child;
                        break;
                    }
                }
                if (input.style.display === "none") {
                    button.innerHTML = "Hide Code";
                    input.style.display = "";
                } else {
                    input.style.display = "none";
                    button.innerHTML = "Show Code";
                }
            }
        </script>
    '''))
    HIDE_CODE_INJECTED = True

def hide_code():
    """Add a code-toggle button to the current cell and collapse its input.

    Does nothing unless FOR_VIEWING is set. The emitted script calls the
    page-level code_toggle() helper on the new button after 500 ms.
    """
    if not FOR_VIEWING:
        return
    # A fresh id lets the script find exactly the button emitted by this call.
    button_id = uuid()
    markup = f'''
        <button id="{button_id}" onclick="javascript:code_toggle(this);"></button>
        <script>
            setTimeout(function () {{
                code_toggle(document.getElementById("{button_id}"));
            }}, 500);
        </script>
    '''
    display(HTML(markup))

def grade_to_gpa(grade):
    """Convert a letter grade string to grade points.

    'S' maps to 3 and 'U' to 0. Letters A-F map to 4-0, and a trailing '+'
    or '-' adds or subtracts 0.3. Anything else yields NaN.

    Args:
        grade: the grade as a string, or a missing value (NaN/None).

    Returns:
        The grade points as a number, or np.nan when the grade is missing
        or not recognised.
    """
    # Missing cells are not guaranteed to be the np.nan singleton, so test the
    # type instead of identity; this also covers None and empty strings,
    # which would otherwise fail on grade[0].
    if not isinstance(grade, str) or not grade:
        return np.nan
    letter = grade[0]
    if letter == 'S':
        return 3
    if letter == 'U':
        return 0
    if letter not in 'ABCDF':
        return np.nan
    points = 'FDCBA'.index(letter)
    if len(grade) < 2:
        return points
    modifier = grade[1]
    if modifier not in '+-':
        return np.nan
    # '-' is index 0 and '+' is index 1, giving -0.3 and +0.3 respectively.
    return points + 0.6 * ('-+'.index(modifier) - 0.5)

def show_plot(fig, filename):
    """Show a figure or layout and, when EXPORT_TO_PNG is set, save it as a PNG.

    Args:
        fig: the Bokeh figure or layout to display.
        filename: file name of the PNG, written under output-images/.
    """
    show(fig)
    if not EXPORT_TO_PNG:
        return
    # Strip toolbars before exporting. Anything that has a toolbar is treated
    # as a leaf; otherwise its children (if any) are visited in turn.
    pending = [fig]
    while pending:
        node = pending.pop()
        if hasattr(node, 'toolbar'):
            node.toolbar.logo = None
            node.toolbar_location = None
        elif hasattr(node, 'children'):
            pending.extend(node.children)
    export_png(fig, filename=f'output-images/{filename}')

def read_twe_data():
    """Load the '2015-2019' sheet of demographics.xlsx and derive pass flags.

    Returns:
        A DataFrame with one row per record, tagged assessment='twe', holding
        the demographic columns, the course grades with their grade points
        and FYS pass flags, the three rubric scores and the pass flags
        derived from them.
    """
    raw_df = pd.read_excel('demographics.xlsx', sheet_name='2015-2019')
    df = raw_df[['Fall Rubric Score', 'Spring Rubric Score', 'Blue Book', 'YEAR', 'RACE', 'SEX', 'FIRSTGEN', 'CSP_FallGRADE', 'CSP_SpringGRADE']].copy()
    df.columns = ['fall_score', 'spring_score', 'twe', 'cohort', 'race', 'sex', 'first_gen', 'fall_grade', 'spring_grade']
    df['assessment'] = 'twe'
    # Any value other than 'Y' (including missing) counts as not first-gen,
    # and any value other than 'F' counts as male.
    df['first_gen_status'] = df['first_gen'].map(lambda value: 'First Gen' if value == 'Y' else 'Not First Gen')
    df['sex'] = df['sex'].map(lambda value: 'Female' if value == 'F' else 'Male')
    df['fall_gpa'] = df['fall_grade'].map(grade_to_gpa)
    df['spring_gpa'] = df['spring_grade'].map(grade_to_gpa)
    # Comparisons against NaN are False, so a missing grade or score is a fail.
    df['fall_fys_pass'] = df['fall_gpa'] >= 2
    df['spring_fys_pass'] = df['spring_gpa'] >= 2
    df['fall_pass'] = df['fall_score'] > 3
    df['spring_pass'] = df['spring_score'] > 3
    df['twe_pass'] = df['twe'] > 3
    # The evaluation is passed when at least two of the three scores pass.
    # The third pair must use the twe_pass flag, not the raw twe score.
    df['eval_pass'] = (
        (df['fall_pass'] & df['spring_pass'])
        | (df['fall_pass'] & df['twe_pass'])
        | (df['spring_pass'] & df['twe_pass'])
    )
    # NOTE(review): read_portfolio_data builds fswr_pass from the FYS course
    # flags (fall_fys_pass / spring_fys_pass); here the rubric-score flags are
    # used, which makes the eval_pass term redundant. Confirm which is intended.
    df['fswr_pass'] = df['fall_pass'] & df['spring_pass'] & df['eval_pass']
    df = df[[
        'assessment',
        'cohort', 'race', 'sex', 'first_gen_status',
        'fall_grade', 'fall_gpa', 'fall_fys_pass',
        'spring_grade', 'spring_gpa', 'spring_fys_pass',
        'fall_score', 'spring_score', 'twe',
        'fall_pass', 'spring_pass', 'twe_pass', 'eval_pass', 'fswr_pass',
    ]]
    return df

def read_portfolio_data():
    """Load the '2020-2023' sheet of demographics.xlsx and derive pass flags.

    Returns:
        A DataFrame tagged assessment='portfolio', restricted to rows that
        have both a low and a high score, holding the demographic columns,
        course grades with grade points and FYS pass flags, the portfolio
        scores with their median, and the derived pass flags.
    """
    raw_df = pd.read_excel('demographics.xlsx', sheet_name='2020-2023')
    df = raw_df[['low score', 'high score', 'adjudicator score', 'YEAR', 'RACE', 'SEX', 'FIRSTGEN', 'CSP_FallGRADE', 'CSP_SpringGRADE']].copy()
    df.columns = ['low', 'high', 'tiebreaker', 'cohort', 'race', 'sex', 'first_gen', 'fall_grade', 'spring_grade']
    df['assessment'] = 'portfolio'
    # Any value other than 'Y' (including missing) counts as not first-gen,
    # and any value other than 'F' counts as male.
    df['first_gen_status'] = df['first_gen'].map(lambda value: 'First Gen' if value == 'Y' else 'Not First Gen')
    df['sex'] = df['sex'].map(lambda value: 'Female' if value == 'F' else 'Male')
    # Rewrite this sheet's abbreviated race labels to longer ones
    # (presumably the labels used for the earlier cohorts -- confirm).
    df['race'] = df['race'].replace({
        'Black/African A': 'Black or African American',
        'Hispanic/Latinx': 'Hispanic or Latino',
        'International': 'Non Resident Alien',
        'Native Hawaiian': 'Native Hawaiian or Other Pacific Islander',
        'Two or More': 'Two or More Races',
    })
    df['fall_gpa'] = df['fall_grade'].map(grade_to_gpa)
    df['spring_gpa'] = df['spring_grade'].map(grade_to_gpa)
    # Comparisons against NaN are False, so a missing grade is a fail.
    df['fall_fys_pass'] = df['fall_gpa'] >= 2
    df['spring_fys_pass'] = df['spring_gpa'] >= 2
    # '-' cells are treated as missing values.
    df = df.replace('-', np.nan)
    df['tiebreaker'] = pd.to_numeric(df['tiebreaker'])
    df['median'] = df[['low', 'high', 'tiebreaker']].median(axis=1, numeric_only=True)
    df['port_pass'] = df['median'] >= 3
    df['eval_pass'] = df['port_pass']
    # drop students who did not turn in their portfolios; the explicit copy
    # makes the column assignment below act on an independent frame instead of
    # on a slice of the unfiltered one (avoids SettingWithCopyWarning)
    df = df[df['low'].notna() & df['high'].notna()].copy()
    df['fswr_pass'] = df['fall_fys_pass'] & df['spring_fys_pass'] & df['eval_pass']
    df = df[[
        'assessment',
        'cohort', 'race', 'sex', 'first_gen_status',
        'fall_grade', 'fall_gpa', 'fall_fys_pass',
        'spring_grade', 'spring_gpa', 'spring_fys_pass',
        'low', 'high', 'tiebreaker', 'median',
        'port_pass', 'eval_pass', 'fswr_pass',
    ]]
    return df

# Load both datasets once; the plotting and chi-square helpers read these globals.
TWE_DATA = read_twe_data()
PORT_DATA = read_portfolio_data()
# For each demographic attribute, its distinct values ordered from most to
# least frequent across both datasets. The first entry serves as the
# reference group in the pairwise demographic chi-square tests.
ATTR_VALUES = {
    attr: tuple(
        pd.concat([TWE_DATA[attr], PORT_DATA[attr]])
        .value_counts()
        .sort_values(ascending=False)
        .index
    )
    for attr in ATTRIBUTES
}
# Sorted cohort labels from both datasets; used as the x-axis range in plot_pass_rates.
ATTR_VALUES['cohort'] = sorted([*TWE_DATA['cohort'].unique(), *PORT_DATA['cohort'].unique()])
# Colour for each attribute value, looked up by plot_pass_rates.
COLOR_MAPS = {
    'race': {
        'White': category_palette[4],
        'Hispanic or Latino': category_palette[10],
        'Asian': category_palette[2],
        'Two or More Races': category_palette[8],
        'Non Resident Alien': category_palette[0],
        'Black or African American': category_palette[14],
        'Unknown': category_palette[12],
        'Native Hawaiian or Other Pacific Islander': category_palette[6],
    },
    'sex': {
        'Female': category_palette[7],
        'Male': category_palette[1],
    },
    'first_gen_status': {
        'Not First Gen': category_palette[1],
        'First Gen': category_palette[3],
    },
}

# Add the show/hide button to this cell (no-op unless FOR_VIEWING is set).
hide_code()

## Curricular Lens

In [None]:
def plot_course_pass_rate():
    """Plot, per cohort, the pass rate of every assessment component.

    Combines TWE_DATA and PORT_DATA, counts passes and fails for each
    component listed in eval_types, and draws one marker-plus-line series
    per component. The figure is shown (and optionally exported) through
    show_plot.
    """
    df = (
        pd.concat([
            TWE_DATA[[
                'cohort', 'assessment',
                'fall_fys_pass', 'spring_fys_pass',
                'fall_pass', 'spring_pass', 'twe_pass',
                'eval_pass', 'fswr_pass'
            ]],
            (
                PORT_DATA[[
                    'cohort', 'assessment',
                    'fall_fys_pass', 'spring_fys_pass',
                    'eval_pass', 'fswr_pass'
                ]]
                # The portfolio data has no rubric-score flags; NaN placeholders
                # keep the columns aligned with the TWE frame.
                .assign(
                    fall_pass=np.nan,
                    spring_pass=np.nan,
                    twe_pass=np.nan,
                )
            ),
        ])
        # Constant column for the pivot below to count rows over.
        .assign(dummy=1)
    )
    # component key -> (legend label, colour)
    eval_types = {
        'fall_fys': ('Fall FYS', category_palette[6]),
        'spring_fys': ('Spring FYS', category_palette[2]),
        'fall': ('Fall Score', category_palette[4]),
        'spring': ('Spring Score', category_palette[0]),
        'twe': ('TWE Score', category_palette[8]),
        'eval': ('Overall Evaluation', '#000000'),
    }
    plot_df = (
        pd.concat([
            df.pivot_table(
                index=['cohort', 'assessment'],
                columns=[f'{eval_type}_pass'],
                values=['dummy'],
                aggfunc=len,
                # Without this, a cohort with no failures (or no passes) gets a
                # NaN count and therefore a NaN pass rate instead of 1 (or 0).
                fill_value=0,
            )
            .assign(
                eval_type=eval_type,
            )
            for eval_type in eval_types
        ])
        .reset_index()
        .set_axis(['cohort', 'assessment', 'fail', 'pass', 'eval_type'], axis=1)
        .assign(
            total=(lambda df: df['pass'] + df['fail']),
            pass_portion=(lambda df: df['pass'] / df['total']),
        )
    )
    fig = figure(
        width=960, height=480,
        x_range=sorted(plot_df['cohort'].unique()),
        y_range=[0, 1],
        title='Pass Rates of FYW Assessment Components',
        x_axis_label='Cohort',
        y_axis_label='Pass Rate',
    )
    renderers = []
    for eval_type, (legend, color) in eval_types.items():
        # The markers and the connecting line share one data source.
        source = ColumnDataSource(plot_df[plot_df['eval_type'] == eval_type])
        renderer = fig.square(
            x='cohort',
            y='pass_portion',
            source=source,
            legend_label=legend,
            color=color,
        )
        fig.line(
            x='cohort',
            y='pass_portion',
            color=color,
            source=source,
            line_width=(1.5 if eval_type == 'eval' else 1),
            line_dash=('dashed' if eval_type == 'eval' else 'solid'),
        )
        # Only the markers get hover tooltips.
        renderers.append(renderer)
    # Dashed vertical marker at x=4 on the categorical axis
    # (presumably the switch between assessment methods -- confirm).
    fig.line(
        x=[4, 4],
        y=[0, 600],
        color='black',
        line_dash='dashed',
    )
    fig.add_tools(HoverTool(renderers=renderers, tooltips=[
        ('cohort', '@cohort'),
        ('pass rate', '@pass_portion'),
        ('num students', '@total'),
        ('num_pass', '@pass'),
    ]))
    fig.yaxis.formatter = NumeralTickFormatter(format='0 %')
    fig.legend.location = 'bottom_right'
    show_plot(fig, filename='1-curricular-component-pass-rates.png')

def plot_outcome_flow():
    """Display evaluation pass rates by number of FYS courses passed.

    For each combination of courses passed (0-2) and assessment, shows the
    number of students and the share that passed the evaluation, with one
    column group per assessment.

    NOTE(review): the previous version selected a 'pass' column that neither
    dataset has, and 'fall_pass'/'spring_pass', which PORT_DATA lacks, so it
    always raised KeyError. The FYS course flags are used here because both
    datasets provide them and the result is labelled 'courses_passed' --
    confirm that this is the intended breakdown.
    """
    columns = ['cohort', 'assessment', 'fall_fys_pass', 'spring_fys_pass', 'eval_pass']
    display(
        pd.concat([
            TWE_DATA[columns],
            PORT_DATA[columns],
        ])
        .assign(
            courses_passed=(lambda df: df['fall_fys_pass'].astype(int) + df['spring_fys_pass'].astype(int)),
        )
        # Count students per (courses_passed, assessment) split by eval outcome;
        # 'cohort' is only the column whose rows are counted.
        .pivot_table(
            index=['courses_passed', 'assessment'],
            columns=['eval_pass'],
            values=['cohort'],
            aggfunc=len,
            fill_value=0,
        )
        .reset_index()
        .assign(
            pass_rate=(lambda df: df[('cohort', True)] / (df[('cohort', True)] + df[('cohort', False)])),
        )
        # After dropping the outcome level both count columns are named
        # 'cohort', so df['cohort'] selects the pair and the row sum is the total.
        .droplevel(1, axis=1)
        .assign(
            count=(lambda df: df['cohort'].sum(axis=1)),
        )
        .pivot_table(
            index=['courses_passed'],
            columns=['assessment'],
            values=['count', 'pass_rate'],
        )
        .sort_values(['courses_passed'], ascending=False)
        .swaplevel(axis=1)
        .sort_index(axis=1, ascending=(False, True))
    )

# Add the show/hide button to this cell (no-op unless FOR_VIEWING is set).
hide_code()

In [None]:
# Render the per-cohort pass rates of each assessment component.
plot_course_pass_rate()

## Performance Lens

In [None]:
def pass_rate_df(attr):
    """Count evaluation passes and fails per assessment and cohort.

    Args:
        attr: optional demographic column to break the counts down by;
            None groups by assessment and cohort only.

    Returns:
        A DataFrame with the grouping columns plus 'failed', 'passed',
        'count', 'passed_percent' and 'failed_percent'. The outcome is
        'twe_pass' for TWE_DATA and 'port_pass' for PORT_DATA.
    """
    group_keys = ['assessment', 'cohort'] if attr is None else ['assessment', 'cohort', attr]

    def outcome_counts(frame, outcome_column):
        # 'fall_grade' is only a placeholder column for len() to count rows over.
        return frame.pivot_table(
            index=group_keys,
            columns=[outcome_column],
            values=['fall_grade'],
            aggfunc=len,
            fill_value=0,
        )

    counts = pd.concat([
        outcome_counts(TWE_DATA, 'twe_pass'),
        outcome_counts(PORT_DATA, 'port_pass'),
    ]).reset_index()
    counts = counts.set_axis([*group_keys, 'failed', 'passed'], axis=1)
    totals = counts['passed'] + counts['failed']
    return counts.assign(
        count=totals,
        passed_percent=counts['passed'] / totals,
        failed_percent=counts['failed'] / totals,
    )

def plot_pass_rates(attr=None, row_size=1, legend_location='bottom_left'):
    """Build a figure of evaluation pass rate per cohort.

    Args:
        attr: optional demographic column; when given, one series is drawn per
            value of that attribute in addition to the 'All Students' series.
        row_size: number of figures meant to share a row; the figure's width
            and height are divided by it and the legend is compacted when > 1.
        legend_location: Bokeh legend location string.

    Returns:
        The Bokeh figure (not yet shown).
    """
    attr_label = None if attr is None else attr.title().replace('_', ' ')
    fig = figure(
        width=960 // row_size, height=480 // row_size,
        x_range=ATTR_VALUES['cohort'], y_range=[0, 1],
        # No dangling "by" when there is no attribute to break down by.
        title=(
            'Non-Course Evaluation Pass Rate'
            if attr is None
            else f'Non-Course Evaluation Pass Rate by {attr_label}'
        ),
        x_axis_label='Cohort',
        y_axis_label='Pass Rate',
    )
    renderers = []
    overall_df = pass_rate_df(None)
    if attr is not None:
        attr_df = pass_rate_df(attr)
        for attr_val in ATTR_VALUES[attr]:
            # These two groups are left off the plot.
            if attr_val in ['Unknown', 'Native Hawaiian or Other Pacific Islander']:
                continue
            source = ColumnDataSource(attr_df[attr_df[attr] == attr_val])
            renderer = fig.square(
                x='cohort',
                y='passed_percent',
                color=COLOR_MAPS[attr][attr_val],
                source=source,
                legend_label=attr_val,
            )
            fig.line(
                x='cohort',
                y='passed_percent',
                color=COLOR_MAPS[attr][attr_val],
                source=source,
            )
            renderers.append(renderer)
        # Give the overall series a value for the attribute column so its
        # tooltip shows a label instead of a missing-field placeholder.
        overall_df = overall_df.assign(**{attr: 'All Students'})
    overall_source = ColumnDataSource(overall_df)
    renderer = fig.square(
        x='cohort',
        y='passed_percent',
        size=8,
        source=overall_source,
        color='#000000',
        legend_label='All Students',
    )
    fig.line(
        x='cohort',
        y='passed_percent',
        line_width=1.5,
        line_dash='dashed',
        color='#000000',
        source=overall_source,
    )
    renderers.append(renderer)
    # Tooltip fields must name real columns: the cohort column is 'cohort',
    # and the attribute row is labelled after (and only shown for) attr.
    tooltips = [('Cohort', '@cohort')]
    if attr is not None:
        tooltips.append((attr_label, f'@{{{attr}}}'))
    tooltips.extend([
        ('Count', '@count'),
        ('Pass Rate', '@passed_percent{0%}'),
    ])
    fig.add_tools(HoverTool(renderers=renderers, tooltips=tooltips))
    # Dashed vertical marker at x=4 on the categorical axis
    # (presumably the switch between assessment methods -- confirm).
    fig.line(
        x=[4, 4],
        y=[0, 600],
        color='black',
        line_dash='dashed',
    )
    fig.yaxis.formatter = NumeralTickFormatter(format='0 %')
    fig.legend.location = legend_location
    # Move 'All Students' (added last) to the top of the legend.
    fig.legend.items = [fig.legend.items[-1], *fig.legend.items[:-1]]
    if row_size > 1:
        fig.xaxis.major_label_orientation = -PI / 4
        fig.legend.spacing = 0
        fig.legend.label_height = 10
        fig.legend.glyph_height = 10
    return fig

def chi_square_pass_rate_between_assessments(attr, attr_val=None):
    """Chi-square test of pass/fail counts against assessment method.

    Args:
        attr: demographic column passed to pass_rate_df.
        attr_val: optional value of attr; when given, only rows with that
            value are tested.

    Returns:
        The result of scipy's chi2_contingency on the assessment-by-outcome
        table of failed/passed counts.
    """
    counts = pass_rate_df(attr)
    if attr_val is not None:
        counts = counts[counts[attr] == attr_val]
    # One row per assessment, columns (failed, passed), summed over cohorts.
    contingency = counts.groupby('assessment')[['failed', 'passed']].sum().to_numpy()
    return two_way_chi_square(contingency)

def assessment_chi_square_results(attr, p_level=0.01):
    """Tabulate, per value of attr, whether pass rate differs between assessments.

    Args:
        attr: demographic column whose values are tested one at a time.
        p_level: p-values strictly below this are marked significant.

    Returns:
        A DataFrame indexed by attr with the 'p-value' and a Yes/No
        'significant' column.
    """
    rows = []
    for attr_val in ATTR_VALUES[attr]:
        statistic, p_value, _, _ = chi_square_pass_rate_between_assessments(attr, attr_val)
        rows.append([attr_val, statistic, p_value])
    table = pd.DataFrame(rows, columns=[attr, 'chi_square', 'p-value'])
    table['significant'] = table['p-value'] < p_level
    summary = table[[attr, 'p-value', 'significant']].set_index([attr])
    return summary.replace({True: 'Yes', False: 'No'})

def chi_square_pass_rate_between_demographics(assessment, attr, attr_val=None):
    """Chi-square test of pass/fail counts against a demographic attribute.

    Args:
        assessment: value of the 'assessment' column to restrict to.
        attr: demographic column to group by.
        attr_val: optional value of attr; when given, only that group and
            the first entry of ATTR_VALUES[attr] are compared.

    Returns:
        The result of scipy's chi2_contingency, or the tuple
        (nan, nan, nan, None) when attr_val does not occur for this assessment.
    """
    rates = pass_rate_df(attr)
    per_group = (
        rates[rates['assessment'] == assessment]
        .groupby(attr)[['failed', 'passed']]
        .sum()
    )
    if attr_val is None:
        # Test all groups of the attribute at once.
        return two_way_chi_square(per_group.to_numpy())
    if attr_val not in per_group.index:
        return np.nan, np.nan, np.nan, None
    reference = ATTR_VALUES[attr][0]
    return two_way_chi_square(per_group.loc[[attr_val, reference]].to_numpy())


def demographic_chi_square_results(attr, p_level=0.01):
    """Tabulate pairwise chi-square tests of each group against the reference group.

    Every value of attr except the first entry of ATTR_VALUES[attr] (the
    reference group) is compared with that reference, separately for the
    'twe' and 'portfolio' assessments.

    Args:
        attr: demographic column to compare groups of.
        p_level: p-values strictly below this are marked significant.

    Returns:
        A DataFrame indexed by attr, ordered as in ATTR_VALUES[attr], with a
        'p-value' and a Yes/No 'significant' column for each assessment.
    """
    data = []
    for attr_val in ATTR_VALUES[attr][1:]:
        for assessment in ['twe', 'portfolio']:
            chi_square, p, _, _ = chi_square_pass_rate_between_demographics(assessment, attr, attr_val)
            data.append([assessment, attr_val, chi_square, p])
    results_df = pd.DataFrame(data, columns=['assessment', attr, 'chi_square', 'p-value'])
    # A NaN p-value (group absent from an assessment) compares as not significant.
    results_df['significant'] = results_df['p-value'] < p_level
    results_df = results_df[[attr, 'assessment', 'p-value', 'significant']].set_index([attr, 'assessment'])
    results_df = results_df.rename_axis('results', axis=1)
    return (
        results_df
        .unstack('assessment')
        .swaplevel('results', 'assessment', axis=1)
        .sort_index(axis=1, level=['assessment', 'results'], ascending=[False, True])
        # Sort on the attribute actually requested; a hard-coded 'race' here
        # breaks every other attribute.
        .sort_values(
            [attr],
            key=(lambda series: series.map(lambda value: ATTR_VALUES[attr].index(value)))
        )
        .replace({True: 'Yes', False: 'No'})
    )

# Add the show/hide button to this cell (no-op unless FOR_VIEWING is set).
hide_code()

Here we look at the overall pass rate and whether that differs by demographic attributes. We are specifically interested in two questions:

1. Did the change in assessment lead to a change in pass rate?
2. Are there differences in pass rates between races, sexes, and first-generation students?

In [None]:
# Pass rate per cohort with one series per race plus the all-students series.
show_plot(
    plot_pass_rates(
        attr='race',
        legend_location='top_left',
    ),
    filename='2-performance-eval-by-race.png',
)

Note that the Native Hawaiian or Other Pacific Islander population is small (never more than two students a year), so its pass rate swings dramatically from year to year; it is left off the plot above, along with students of unknown race.

It is immediately obvious that switching to a portfolio-based evaluation had a substantial impact on pass rates for all students, regardless of race. We can confirm this statistically with a chi-square test of independence. In our case, our null hypothesis is that the evaluation method is independent of pass rate, which is to say pass rate is _not_ correlated with whether the TWE or the portfolio was used. A low p-value (p < 0.01) would reject the null hypothesis, suggesting that the evaluation method _does_ have an effect on pass rate.

In [None]:
# For each race, test whether pass rate differs between the two assessment methods.
assessment_chi_square_results('race', p_level=0.01)

As expected, these results show that the change to a portfolio-based evaluation led to significant changes in the pass rates for all races, with the exception of students identified as Native Hawaiian or Other Pacific Islander or of unknown race, of whom there are too few for the chi-square test to be meaningful.

Both of these p-values are below the p-level of 0.01, meaning that for both TWE and portfolio assessments, race _does_ have an effect on pass rate. We can further explore which race(s) have pass rates that differ significantly from the pass rates of white students by performing pairwise chi-square tests. Since we already know there will be significant results, and we are making seven comparisons, we apply the Bonferroni correction and use a p-level of 0.01 / 7 ≈ 0.00143, which raises the standard for finding a result significant. In this case, the null hypothesis is that race _does not_ have an effect on pass rate compared to white students; a low p-value (p < 0.00143) would reject the null hypothesis, suggesting that race _does_ have an effect on pass rate compared to white students.

In [None]:
# Bonferroni-corrected threshold: seven pairwise comparisons against the
# reference group, so the 0.01 level is divided by 7.
demographic_chi_square_results('race', p_level=0.01 / 7)

FIXME description of these results
* does a higher P-value after the change in assessment mean anything? I don't think so, hence the next test

We could also look at whether the change in assessment led to differences in pass rates. The null hypothesis here is that the assessment method _does not_ have an effect on pass rate; a low p-value (p < 0.01) would reject the null hypothesis, suggesting that the assessment _does_ have an effect on pass rate.

Comparatively, the pass rates for sex and first-generation status are clearer, where male students and first-generation students have lower pass rates than female and non-first-generation students respectively.

In [None]:
# Side-by-side half-size plots of pass rate by sex and by first-generation status.
show_plot(
    gridplot([[
        plot_pass_rates('sex', row_size=2, legend_location='bottom_right'),
        plot_pass_rates('first_gen_status', row_size=2, legend_location='bottom_right'),
    ]]),
    filename='2-performance-overall-gender-gen.png',
)

Although unnecessary given how clear the plots are, the chi-square tests confirm that sex and first-generation status do affect pass rates, with p-values < 0.01.

In [None]:
# p-value of the chi-square test of pass rate against sex.
# NOTE(review): only the 'twe' assessment is tested here; confirm whether
# 'portfolio' should be reported as well.
chi_square_pass_rate_between_demographics('twe', 'sex').pvalue

In [None]:
# p-value of the chi-square test of pass rate against first-generation status.
# NOTE(review): only the 'twe' assessment is tested here; confirm whether
# 'portfolio' should be reported as well.
chi_square_pass_rate_between_demographics('twe', 'first_gen_status').pvalue

## Reliability Lens

In [None]:
def read_assessments():
    """Load the tab-separated assessment and tiebreaker score files.

    Rows from assessments.csv get a 'total' column (the sum of the CATEGORIES
    columns) and tiebreaker=False; rows from tiebreakers.csv get
    tiebreaker=True.

    Returns:
        The two frames concatenated, assessments first.
    """
    regular = pd.read_csv('assessments.csv', sep='\t')
    regular = regular.assign(
        total=(lambda frame: sum(frame[category] for category in CATEGORIES)),
        tiebreaker=False,
    )
    tiebreaks = pd.read_csv('tiebreakers.csv', sep='\t').assign(tiebreaker=True)
    return pd.concat([regular, tiebreaks])

# All assessor scores (regular and tiebreaker rows), loaded once for the
# reliability functions below.
ASS_DF = read_assessments()

def intra_category_agreement():
    """Display, per rubric category, the mean score and the assessor agreement rate.

    The output has one column per category (in CATEGORIES order) and two rows:
    'pass_rate' (sum of scores divided by number of scores) and
    'same_proportion' (share of doubly-scored students whose two scores agree).
    """
    display(
        pd.concat(
            [
                (
                    # Agreement: reshape to one row per (student, category, score).
                    ASS_DF[['student', *CATEGORIES]]
                    .dropna()
                    .set_index(['student'])
                    .stack()
                    .reset_index()
                    .rename(columns={'level_1':'category', 0:'score'})
                    .groupby(['student', 'category']).aggregate(['count', 'mean'])
                    .droplevel(0, axis=1)
                    # Keep only student/category pairs that have exactly two scores.
                    .where(lambda df: df['count'] == 2)
                    .dropna()
                    .drop(columns=['count'])
                    # A mean of 0 or 1 means both scores agree and 0.5 means they
                    # differ (presumably each score is 0 or 1 -- confirm).
                    .replace({0:'same', 0.5:'diff', 1:'same'})
                    .reset_index()
                    # Count students per category and agreement label.
                    .groupby(['category', 'mean']).aggregate(['count'])
                    .unstack()
                    .droplevel([0, 1], axis=1)
                    .assign(
                        total=(lambda df: df['diff'] + df['same']),
                        same_proportion=(lambda df: df['same'] / df['total']),
                        diff_proportion=(lambda df: df['diff'] / df['total']),
                    )
                ),
                (
                    # Per-category sum and count over rows with every category scored.
                    ASS_DF[[*CATEGORIES]]
                    .dropna()
                    .aggregate(['sum', 'count'])
                    .transpose()
                    .assign(
                        pass_rate=(lambda df: df['sum'] / df['count']),
                    )
                ),
            ],
            axis=1,
        )
        # Order the rows as the categories are listed in CATEGORIES.
        .sort_index(key=(lambda index: index.map(CATEGORIES.index)))
        [['pass_rate', 'same_proportion']]
        .transpose()
    )

def inter_category_correlation():
    """Plot heatmaps of the pairwise correlations between rubric categories.

    Uses the rows of ASS_DF from cohort '2022-2023'. One heatmap covers all of
    those rows and four more cover individual assessors picked from
    choosing_df by the criteria commented below. The heatmaps, a text legend
    for the category initials and a colour bar are arranged in one layout and
    passed to show_plot.
    """
    # Category name -> upper-cased first letter, used as the axis labels.
    initial_map = {category: category[0].upper() for category in CATEGORIES}
    # Per assessor: number of rows (n) and summary statistics of the
    # correlations between each unordered pair of categories.
    choosing_df = (
        pd.concat(
            [
                (
                    ASS_DF[ASS_DF['cohort'] == '2022-2023']
                    .groupby(['assessor'])['cohort'].count()
                ),
                (
                    ASS_DF[ASS_DF['cohort'] == '2022-2023']
                    .groupby(['assessor'])[CATEGORIES].corr()
                    .stack()
                    .reset_index()
                    .rename(columns={'level_1':'x', 'level_2':'y', 0:'corr'})
                    .assign(
                        x_index=(lambda df: df['x'].apply(lambda s: list(initial_map.keys()).index(s))),
                        y_index=(lambda df: df['y'].apply(lambda s: list(initial_map.keys()).index(s))),
                    )
                    # Keep one triangle of the matrix: each pair once, no diagonal.
                    .loc[lambda df: df['x_index'] > df['y_index']]
                    .groupby(['assessor'])['corr'].describe()
                ),
            ],
            axis=1,
        )
        .rename(columns={'cohort':'n'})
        # Only consider assessors with between 21 and 29 rows ...
        .loc[lambda df: (20 < df['n']) & (df['n'] < 30)]
        # ... for whom all ten category pairs have a correlation value.
        .loc[lambda df: df['count'] == 10]
        .reset_index()
    )
    chosen = [
        # assessor with the most positively correlated categories
        choosing_df.sort_values(['max'], ascending=[False]).iloc[0]['assessor'],
        # assessor with the lowest all-positive correlations
        choosing_df[choosing_df['min'] > 0].sort_values(['max'], ascending=[True]).iloc[0]['assessor'],
        # assessor with the smallest standard deviation
        choosing_df.sort_values(['std'], ascending=[True]).iloc[0]['assessor'],
        # assessor with the most negatively correlated categories
        choosing_df.sort_values(['min'], ascending=[True]).iloc[0]['assessor'],
    ]
    heatmaps = []
    # One colour scale over [-1, 1], shared by every heatmap and the colour bar.
    color_mapper = LinearColorMapper(palette=[*oranges, *reversed(blues)], low=-1, high=1)
    # The leading None produces the heatmap over all assessors.
    for i, assessor in enumerate([None, *chosen]):
        assessor_df = (
            ASS_DF[
                (ASS_DF['cohort'] == '2022-2023')
                # For None, `not assessor` is True, so no assessor filter is applied.
                & (not assessor or (ASS_DF['assessor'] == assessor))
            ][CATEGORIES].corr()
            .unstack()
            .reset_index()
            .rename(columns={'level_0':'x', 'level_1':'y', 0:'corr'})
            .assign(
                x_index=(lambda df: df['x'].apply(lambda s: list(initial_map.keys()).index(s))),
                y_index=(lambda df: df['y'].apply(lambda s: list(initial_map.keys()).index(s))),
            )
            # Keep one triangle of the matrix: each pair once, no diagonal.
            .loc[lambda df: df['x_index'] > df['y_index']]
            .assign(
                x_initial=(lambda df: df['x'].apply(lambda s: initial_map[s])),
                y_initial=(lambda df: df['y'].apply(lambda s: initial_map[s])),
            )
        )
        if assessor:
            n = choosing_df.set_index(['assessor']).at[assessor, 'n']
            # Titles use the position in `chosen` rather than the assessor value.
            title = f'Assessor #{i} (n={n})'
            #title = f'{assessor} (n={n})'
        else:
            n = len(ASS_DF[ASS_DF['cohort'] == '2022-2023'].dropna())
            title = f'Overall (n={n})'
        fig = figure(
            height=175, width=175,
            # The first category is dropped from x and the last from y, matching
            # the pairs kept by the triangle filter above.
            x_range=list(initial_map.values())[1:],
            y_range=list(reversed(initial_map.values()))[1:],
            title=title,
        )
        # Large grey rectangle drawn first (presumably as a background for
        # cells that have no data -- confirm).
        fig.rect(
            x=0,
            y=0,
            width=10,
            height=10,
            color='#808080',
        )
        # One unit square per category pair, coloured by its correlation.
        fig.rect(
            x='x_initial',
            y='y_initial',
            width=1, height=1,
            fill_color={
                'field': 'corr',
                'transform': color_mapper,
            },
            line_color=None,
            source=ColumnDataSource(assessor_df),
        )
        heatmaps.append(fig)
    # Text legend mapping each initial to a category name; 'y' is the vertical
    # position of each line of text.
    legend_df = pd.DataFrame(
        [
            [3.5, 'R:', 'Reflection'],
            [3, 'F:', 'Focus'],
            [2.5, 'O:', 'Organization'],
            [2, 'E:', 'Evidence and'],
            [1.5, '  ', 'Development'],
            [1, 'W:', 'Writing Features'],
        ],
        columns=['y', 'initial', 'category'],
    )
    legend_fig = figure(
        height=175, width=175,
        x_range=[0, 6], y_range=[0, 6],
    )
    legend_fig.text(
        x=0,
        y='y',
        text='initial',
        text_font_size='14px',
        source=ColumnDataSource(legend_df),
    )
    legend_fig.text(
        x=1,
        y='y',
        text='category',
        text_font_size='14px',
        source=ColumnDataSource(legend_df),
    )
    legend_fig.axis.visible = False
    legend_fig.xgrid.visible = False
    legend_fig.ygrid.visible = False
    legend_fig.outline_line_width = 0
    # Zero-width figure that exists only to carry the colour bar.
    colorbar_fig = figure(frame_width=0, height=350)
    colorbar_fig.rect(
        x=0,
        y=0,
        width=10,
        height=10,
        color='#808080',
    )
    colorbar_fig.axis.visible = False
    colorbar_fig.add_layout(
        ColorBar(
            color_mapper=color_mapper,
            formatter=NumeralTickFormatter(format='0.00'),
        ),
        'right',
    )
    # Columns: overall heatmap above the legend, then the four assessor
    # heatmaps in two columns, then the colour bar.
    show_plot(
        layout([[
            column(heatmaps[0], legend_fig),
            column(heatmaps[1], heatmaps[3]),
            column(heatmaps[2], heatmaps[4]),
            colorbar_fig,
        ]]),
        filename='3-reliability-category-correlation.png'
    )

In [None]:
# Show the per-category mean score and assessor agreement table.
intra_category_agreement()

In [None]:
# Show (and optionally export) the category-correlation heatmaps.
inter_category_correlation()