In [None]:
%load_ext autoreload
%autoreload 2

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from create_program_df import create_program_df, add_selectivity_info, add_offer_rates
from create_student_df import create_student_df
from plotting import create_bar_chart, create_stacked_bar_chart, plot_portfolio_outcome

In [None]:
# Student-level table that every analysis below reads from.
students_csv = '../clean_dfs/students_extended_export.csv'
df = pd.read_csv(students_csv)

In [None]:
# A missing matched offer rate is replaced by 1.
df['offer_rate_match'] = df['offer_rate_match'].fillna(1)

# Per-student list of the non-missing offer rates across the 12 ranked slots.
offer_rate_cols = [f'offer_rate_{i}' for i in range(1, 13)]
df['offer_rates'] = df.apply(
    lambda row: [row[col] for col in offer_rate_cols if pd.notnull(row[col])],
    axis=1,
)

# Where a match-level quality value is missing, fall back to the `<metric>_mp` column.
for metric in ['program_college_rate', 'program_grad_rate', 'impact', 'performance', 'aggregated_quality']:
    df[f'{metric}_match'] = df[f'{metric}_match'].fillna(df[f'{metric}_mp'])

In [None]:
# Folder that the tables and figures below are written to. It is created up
# front so the first `open(...)` / `savefig(...)` does not fail on a fresh checkout.
savefolder = '../export_nature_cities_revisions'
os.makedirs(savefolder, exist_ok=True)

# Undermatching

In [None]:
# Offer-rate undermatching: offer rate of the match minus the offer rate of the
# best counterfactual, overall and restricted to the student's own portfolio.
for suffix in ['', '_withinportfolio']:
    df[f'undermatching{suffix}'] = df['offer_rate_match'] - df[f'offer_rate_best_cf_pareto{suffix}']

# Per-metric undermatching is counterfactual minus match, i.e. the opposite
# direction to the offer-rate columns above.
# NOTE(review): 'offer_rate' is part of this list, so `undermatching_offer_rate`
# is the negative of `undermatching` - confirm that sign is what the summary table expects.
undermatching_metrics = ['offer_rate', 'program_college_rate', 'program_grad_rate', 'impact', 'performance', 'aggregated_quality']
for metric in undermatching_metrics:
    matched_value = df[f'{metric}_match']
    df[f'undermatching_{metric}'] = df[f'{metric}_best_cf_pareto'] - matched_value
    df[f'undermatching_withinportfolio_{metric}'] = df[f'{metric}_best_cf_pareto_withinportfolio'] - matched_value

## Summary table for undermatching

In [None]:
# Body of the undermatching summary table.
# For every metric three rows are written: all students, poverty == True and
# poverty == False. Each row holds, for the match value, the best counterfactual
# and the undermatching column in turn, the overall mean followed by the mean
# for each of the four ethnicities. Every cell is rounded to two decimals and
# followed by ' & '.
# NOTE(review): each row therefore ends in ' & ' and carries no LaTeX line
# terminator - presumably finished by hand when pasted into the paper.
table_ethnicities = ['asian', 'black', 'hispanic', 'white']


def _mean_cells(frame, column):
    """Format the overall mean of `column` in `frame`, then one mean per ethnicity."""
    cells = str(round(frame[column].mean(), 2)) + ' & '
    for ethnicity in table_ethnicities:
        cells += str(round(frame[frame['ethnicity']==ethnicity][column].mean(), 2)) + ' & '
    return cells


table = ''
for metric in ['offer_rate', 'program_college_rate', 'program_grad_rate', 'impact', 'performance']:
    table_columns = [f'{metric}_match', f'{metric}_best_cf_pareto', f'undermatching_{metric}']
    for subset in [df, df[df['poverty']==True], df[df['poverty']==False]]:
        table_row = ''.join(_mean_cells(subset, column) for column in table_columns)
        table += table_row + '\n'

with open(f'{savefolder}/undermatching_table.txt', 'w') as f:
    f.write(table)

## Offer Rate

In [None]:
# Offer rate of the matched program (displayed only, not saved).
create_bar_chart(
    df,
    'offer_rate_match',
    title='Offer Rate of Match',
    save=False,
    savefolder=savefolder,
)

In [None]:
# One saved bar chart per quality metric of the matched program.
metric_titles = [
    ('program_grad_rate', 'Graduation Rate'),
    ('program_college_rate', 'College Attendance Rate'),
    ('impact', 'Impact'),
    ('performance', 'Performance'),
    ('aggregated_quality', 'Aggregated Quality'),
]
for metric, title in metric_titles:
    create_bar_chart(df, f'{metric}_match', title=title, save=True, savefolder=savefolder)

In [None]:
# Offer rate of the match next to the two counterfactual variants (displayed only).
create_stacked_bar_chart(
    df,
    ['offer_rate_match', 'offer_rate_best_cf_pareto_withinportfolio', 'offer_rate_best_cf_pareto'],
    labels=['Match', 'CF (In Portfolio)', 'CF'],
    title='Offer Rate',
    save=False,
    savefolder=savefolder,
)

In [None]:
# Same comparison for each quality metric; note the column order here is
# counterfactual, in-portfolio counterfactual, match.
metric_titles = [
    ('program_grad_rate', 'Graduation Rate'),
    ('program_college_rate', 'College Attendance Rate'),
    ('impact', 'Impact'),
    ('performance', 'Performance'),
    ('aggregated_quality', 'Aggregated Quality'),
]
for metric, title in metric_titles:
    stacked_columns = [f'{metric}_best_cf_pareto', f'{metric}_best_cf_pareto_withinportfolio', f'{metric}_match']
    create_stacked_bar_chart(df, stacked_columns, labels=['CF', 'CF (In Portfolio)', 'Match'], title=title, save=True, savefolder=savefolder)

# Undermatching

In [None]:
# Offer-rate undermatching (displayed only).
create_bar_chart(
    df,
    'undermatching',
    title='Undermatching',
    save=False,
    savefolder=savefolder,
)

In [None]:
# Overall versus within-portfolio offer-rate undermatching (displayed only).
create_stacked_bar_chart(
    df,
    ['undermatching', 'undermatching_withinportfolio'],
    labels=['Overall', 'In Portfolio'],
    title='Undermatching',
    save=False,
    savefolder=savefolder,
)

In [None]:
# Overall versus within-portfolio undermatching for each quality metric (saved).
metric_titles = [
    ('program_grad_rate', 'Graduation Rate'),
    ('program_college_rate', 'College Attendance Rate'),
    ('impact', 'Impact'),
    ('performance', 'Performance'),
    ('aggregated_quality', 'Aggregated Quality'),
]
for metric, title in metric_titles:
    undermatching_columns = [f'undermatching_{metric}', f'undermatching_withinportfolio_{metric}']
    create_stacked_bar_chart(df, undermatching_columns, labels=['Overall', 'In Portfolio'], title=f'Undermatching ({title})', save=True, savefolder=savefolder)

In [None]:
# Overall undermatching for each quality metric as a plain bar chart (saved).
metric_titles = [
    ('program_grad_rate', 'Graduation Rate'),
    ('program_college_rate', 'College Attendance Rate'),
    ('impact', 'Impact'),
    ('performance', 'Performance'),
    ('aggregated_quality', 'Aggregated Quality'),
]
for metric, title in metric_titles:
    create_bar_chart(df, f'undermatching_{metric}', title=f'Undermatching ({title})', save=True, savefolder=savefolder)

### Undermatching by student competitiveness (i.e., offer rate of the most selective counterfactual)

In [None]:
def get_bin(offer_rate, bins):
    """Return the index of the half-open interval [bins[i], bins[i+1]) containing `offer_rate`.

    Values that fall in no interval - below the first edge, at or above the
    last edge, or NaN (every comparison is False) - are assigned to the last
    bin, index ``len(bins) - 2``.
    """
    fallback_bin = len(bins) - 2
    for index, (lower, upper) in enumerate(zip(bins[:-1], bins[1:])):
        if lower <= offer_rate < upper:
            return index
    return fallback_bin

# Five 0.2-wide bins on [0, 1) plus a final [1, 100) bin for an offer rate of exactly 1.
bins = [0, 0.2, 0.4, 0.6, 0.8, 1, 100]
df['offer_rate_cf_bin'] = df['offer_rate_best_cf_pareto'].apply(get_bin, args=(bins,))

In [None]:
# Display names for the groups used in plot legends and titles.
group_names_pretty = {
    'asian': 'Asian', 'black': 'Black', 'hispanic': 'Hispanic', 'white': 'white',
    'M': 'Male',
    'F': 'Female',
}

# Ethnicity groups, in the order they are plotted here and in later cells.
groups = ['asian', 'black', 'hispanic', 'white']

# Offer-rate undermatching against the competitiveness bin, one line per ethnicity.
for group in groups:
    group_df = df[df['ethnicity']==group]
    sns.lineplot(x='offer_rate_cf_bin', y='undermatching', data=group_df, label=group_names_pretty[group])

bin_labels = ['[0, 0.2)', '[0.2, 0.4)', '[0.4, 0.6)', '[0.6, 0.8)', '[0.8, 1)', '1']
plt.legend(fontsize=12)
plt.xticks(range(len(bins)-1), bin_labels)
plt.xlabel('Offer Rate of Most Selective Counterfactual', fontsize=12)
plt.ylabel('Undermatching', fontsize=12)
sns.despine()
plt.savefig(f'{savefolder}/offer_rate_by_best_cf.png', dpi=600)
plt.show()

In [None]:
# Legend names and group order do not change between figures, so they are
# defined once instead of being rebuilt on every loop iteration.
group_names_pretty = {
    'asian': 'Asian', 'black': 'Black', 'hispanic': 'Hispanic', 'white': 'white',
    'M': 'Male',
    'F': 'Female',
}
groups = ['asian', 'black', 'hispanic', 'white']
bin_labels = ['[0, 0.2)', '[0.2, 0.4)', '[0.4, 0.6)', '[0.6, 0.8)', '[0.8, 1)', '1']

# One figure per quality metric: undermatching against the competitiveness bin,
# one line per ethnicity. The figure file is named after the metric's title.
for metric, title in [('program_grad_rate', 'Graduation Rate'), ('program_college_rate', 'College Attendance Rate'), ('impact', 'Impact'), ('performance', 'Performance'), ('aggregated_quality', 'Aggregated Quality')]:
    for group in groups:
        sns.lineplot(x='offer_rate_cf_bin', y=f'undermatching_{metric}', data=df[df['ethnicity']==group], label=group_names_pretty[group])

    plt.legend(fontsize=12)
    plt.xticks(range(len(bins)-1), bin_labels)
    plt.xlabel('Offer Rate of Most Selective Counterfactual', fontsize=12)
    plt.ylabel(f'Undermatching ({title})', fontsize=12)
    sns.despine()
    plt.savefig(f'{savefolder}/{title}_by_best_cf.png', dpi=600)
    plt.show()

In [None]:
# Grade tier from `avg_grades_for_tier`: >= 350 -> 1, >= 250 -> 2, >= 150 -> 3, otherwise 4.
# NOTE(review): a missing average fails every comparison and so lands in tier 4 -
# confirm that is intended.
df['tier'] = df['avg_grades_for_tier'].apply(
    lambda x: 1 if x >= 350 else (2 if x >= 250 else (3 if x >= 150 else 4))
)

# Offer-rate undermatching by tier, one line per ethnicity.
for group in groups:
    group_df = df[df['ethnicity']==group]
    sns.lineplot(x='tier', y='undermatching', data=group_df, label=group_names_pretty[group])

plt.legend(fontsize=12)
plt.xticks([1, 2, 3, 4], ['1 (Top)', '2', '3', '4 (Bottom)'])
plt.xlabel('Tier', fontsize=12)
plt.ylabel('Undermatching', fontsize=12)
sns.despine()
plt.savefig(f'{savefolder}/offer_rate_by_tier.png', dpi=600)
plt.show()

In [None]:
# The tier depends only on `avg_grades_for_tier`, so it is computed once rather
# than once per metric: >= 350 -> 1, >= 250 -> 2, >= 150 -> 3, otherwise 4.
df['tier'] = df['avg_grades_for_tier'].apply(
    lambda x: 1 if x >= 350 else (2 if x >= 250 else (3 if x >= 150 else 4))
)

# One figure per quality metric: undermatching by tier, one line per ethnicity.
for metric, title in [('program_grad_rate', 'Graduation Rate'), ('program_college_rate', 'College Attendance Rate'), ('impact', 'Impact'), ('performance', 'Performance'), ('aggregated_quality', 'Aggregated Quality')]:
    for group in groups:
        sns.lineplot(x='tier', y=f'undermatching_{metric}', data=df[df['ethnicity']==group], label=group_names_pretty[group])

    plt.legend(fontsize=12)
    plt.xticks([1, 2, 3, 4], ['1 (Top)', '2', '3', '4 (Bottom)'])
    plt.xlabel('Tier', fontsize=12)
    plt.ylabel(f'Undermatching ({title})', fontsize=12)
    sns.despine()
    plt.savefig(f'{savefolder}/{title}_by_tier.png', dpi=600)
    plt.show()

In [None]:
def get_tiebreaker_num(tiebreaker):
    """Map a hexadecimal tiebreaker string to a number.

    The first five characters are read as a base-16 integer and divided by
    1048518. Anything that is not exactly a `str` yields NaN.
    """
    # NOTE(review): five hex digits range up to 0xFFFFF = 1048575 and 16**5 is
    # 1048576, so the divisor 1048518 lets the result slightly exceed 1 -
    # confirm the constant is intentional.
    if type(tiebreaker) is not str:
        return np.nan
    return int(tiebreaker[:5], 16) / 1048518

bins_tiebreaker = [0, 0.2, 0.4, 0.6, 0.8, 1]
df['tiebreaker_num'] = df['tiebreaker'].apply(get_tiebreaker_num)

# `get_bin` assigns every value that fits no interval - NaN included - to the
# last bin, which would count students without a usable tiebreaker as [0.8, 1).
# Their bin is left missing instead. Non-missing values are binned as before.
df['tiebreaker_bin'] = df['tiebreaker_num'].apply(
    lambda x: get_bin(x, bins_tiebreaker) if pd.notnull(x) else np.nan
)

# Offer-rate undermatching by lottery-number bin, one line per ethnicity.
for group in groups:
    sns.lineplot(x='tiebreaker_bin', y='undermatching', data=df[df['ethnicity']==group], label=group_names_pretty[group])

plt.legend(fontsize=12)
plt.xticks(range(len(bins_tiebreaker)-1), ['[0, 0.2)', '[0.2, 0.4)', '[0.4, 0.6)', '[0.6, 0.8)', '[0.8, 1)'])
plt.xlabel('Lottery Number', fontsize=12)
plt.ylabel('Undermatching', fontsize=12)
sns.despine()
plt.savefig(f'{savefolder}/offer_rate_by_lottery.png', dpi=600)
plt.show()

In [None]:
# One figure per quality metric: undermatching by lottery-number bin, one line
# per ethnicity. The figure file is named after the metric's title.
lottery_bin_labels = ['[0, 0.2)', '[0.2, 0.4)', '[0.4, 0.6)', '[0.6, 0.8)', '[0.8, 1)']
for metric, title in [('program_grad_rate', 'Graduation Rate'), ('program_college_rate', 'College Attendance Rate'), ('impact', 'Impact'), ('performance', 'Performance'), ('aggregated_quality', 'Aggregated Quality')]:
    for group in groups:
        sns.lineplot(x='tiebreaker_bin', y=f'undermatching_{metric}', data=df[df['ethnicity']==group], label=group_names_pretty[group])

    plt.legend(fontsize=12)
    plt.xticks(range(len(bins_tiebreaker)-1), lottery_bin_labels)
    plt.xlabel('Lottery Number', fontsize=12)
    plt.ylabel(f'Undermatching ({title})', fontsize=12)
    sns.despine()
    plt.savefig(f'{savefolder}/{title}_by_lottery.png', dpi=600)
    plt.show()

## Lottery number analysis

In [None]:
# Two figures against the lottery-number bin, one line per ethnicity each:
# offer rate of the top-ranked program, then list length. Each figure is saved
# as '<column>_by_lottery.png'.
lottery_bin_labels = ['[0, 0.2)', '[0.2, 0.4)', '[0.4, 0.6)', '[0.6, 0.8)', '[0.8, 1)']
for column, ylabel in [('offer_rate_1', 'Offer Rate of Top Choice'), ('ofchoices', 'List Length')]:
    for group in groups:
        sns.lineplot(x='tiebreaker_bin', y=column, data=df[df['ethnicity']==group], label=group_names_pretty[group])

    plt.legend(fontsize=12)
    plt.xticks(range(len(bins_tiebreaker)-1), lottery_bin_labels)
    plt.xlabel('Lottery Number', fontsize=12)
    plt.ylabel(ylabel, fontsize=12)
    sns.despine()
    plt.savefig(f'{savefolder}/{column}_by_lottery.png', dpi=600)
    plt.show()

In [None]:
# Mean and maximum of each student's listed offer rates. `offer_rates` holds
# only the non-missing rates, so it can be empty; `np.max` raises on an empty
# list, hence both statistics are NaN in that case.
df['offer_rates_mean'] = df['offer_rates'].apply(lambda x: np.mean(x) if len(x) > 0 else np.nan)
df['offer_rates_max'] = df['offer_rates'].apply(lambda x: np.max(x) if len(x) > 0 else np.nan)

# One figure per statistic against the lottery-number bin, one line per
# ethnicity, saved as '<column>_by_lottery.png'.
lottery_bin_labels = ['[0, 0.2)', '[0.2, 0.4)', '[0.4, 0.6)', '[0.6, 0.8)', '[0.8, 1)']
for column, ylabel in [('offer_rates_mean', 'Mean Offer Rate'), ('offer_rates_max', 'Max Offer Rate')]:
    for group in groups:
        sns.lineplot(x='tiebreaker_bin', y=column, data=df[df['ethnicity']==group], label=group_names_pretty[group])

    plt.legend(fontsize=12)
    plt.xticks(range(len(bins_tiebreaker)-1), lottery_bin_labels)
    plt.xlabel('Lottery Number', fontsize=12)
    plt.ylabel(ylabel, fontsize=12)
    sns.despine()
    plt.savefig(f'{savefolder}/{column}_by_lottery.png', dpi=600)
    plt.show()

In [None]:
import statsmodels.formula.api as smf

# OLS of four portfolio features on ethnicity, lottery number and their
# interaction. Each summary is written as LaTeX and displayed.
# The output folder is relative to the working directory and is created here
# so the first write does not fail.
regression_folder = 'regressions-20250809'
os.makedirs(regression_folder, exist_ok=True)

# (outcome column, output file name)
lottery_regressions = [
    ('offer_rate_1', 'lottery_offer_rate_1.txt'),
    ('ofchoices', 'lottery_ofchoices.txt'),
    ('offer_rates_mean', 'lottery_offer_rate_mean.txt'),
    ('offer_rates_max', 'lottery_offer_rate_max.txt'),
]

for outcome, filename in lottery_regressions:
    formula = f"{outcome} ~ ethnicity + tiebreaker_num + tiebreaker_num:ethnicity"
    mod = smf.ols(formula=formula, data=df)
    # Fit once per model; the same result is written to disk and displayed.
    res = mod.fit()
    with open(f'{regression_folder}/{filename}', 'w') as f:
        f.write(res.summary().as_latex())
    display(res.summary())

# Interpretable Portfolio Behaviors

## Deviations from reach-match-safety

In [None]:
# No reach: the top-ranked program has an offer rate of exactly 1.
df['unselective_first'] = df['offer_rate_1'].eq(1)

# No match: matched to a program with offer rate 1 that was ranked below first place.
matched_unselective = df['offer_rate_match'] == 1
ranked_below_first = df['matched_choice_num'] > 1
was_matched = df['matched'] == True
df['no_match_program'] = matched_unselective & ranked_below_first & was_matched

# No safety: `matched` is anything other than 1 (a missing value also counts here).
df['not_matched'] = df['matched'].ne(1)

In [None]:
# A student has an interpretable deviation if any of the three flags is set.
deviation_flags = ['unselective_first', 'no_match_program', 'not_matched']
has_deviation = df[deviation_flags[0]]
for flag in deviation_flags[1:]:
    has_deviation = has_deviation | df[flag]
df['interpretable_deviation'] = has_deviation

# Share of students per flag, then the share with at least one flag.
print(*(df[flag].mean() for flag in deviation_flags))
print(df['interpretable_deviation'].mean())

In [None]:
# One bar chart per deviation flag (displayed only).
chart_options = dict(save=False, savefolder=savefolder)
create_bar_chart(df, 'unselective_first', title='No Reach Program', **chart_options)
create_bar_chart(df, 'no_match_program', title='No Match Program', **chart_options)
create_bar_chart(df, 'not_matched', title='No Safety Program', **chart_options)

In [None]:
# Offer rate of the first-ranked program (displayed only).
create_bar_chart(
    df,
    'offer_rate_1',
    title='Offer Rate of Top-Ranked Program',
    save=False,
    savefolder=savefolder,
)

In [None]:
# Flag students whose match is their first-ranked choice.
df['matched_to_1'] = df['matched_choice_num'].eq(1)
create_bar_chart(
    df,
    'matched_to_1',
    title='Matched to Top Choice',
    save=False,
    savefolder=savefolder,
)

In [None]:
# Highest offer rate anywhere on the list (missing slots ignored).
offer_rate_cols = [f'offer_rate_{i}' for i in range(1, 13)]
df['offer_rate_safety'] = df.apply(lambda row: np.nanmax([row[col] for col in offer_rate_cols]), axis=1)

# NOTE(review): the flag is `< 0.9`, so a maximum of exactly 0.9 is not flagged,
# while the chart title describes unselective as "> 0.9" - confirm the boundary.
df['no_unselective'] = df['offer_rate_safety'].lt(0.9)
create_bar_chart(
    df,
    'no_unselective',
    title='No Unselective (Offer Rate > 0.9)',
    save=False,
    savefolder=savefolder,
)

In [None]:
# Unmatched share among students without, then with, an unselective program on their list.
unmatched_share = [df[df['no_unselective']==flag]['not_matched'].mean() for flag in (True, False)]
print(*unmatched_share)

### Example portfolios from each deviation

In [None]:
# One example portfolio per deviation type, drawn with a fixed seed (displayed only).
example_options = dict(save=False, savefolder=savefolder)
plot_portfolio_outcome(df[df['unselective_first']].sample(1, random_state=8), title='No Reach Program', **example_options)
plot_portfolio_outcome(df[df['no_match_program']==1].sample(1, random_state=8), title='No Match Program', **example_options)
plot_portfolio_outcome(df[df['not_matched']==1].sample(1, random_state=9), title='No Safety Program', **example_options)

### Inversions and Effective List Length

In [None]:
# Within-portfolio undermatching; this is the same formula the notebook already
# applies when the undermatching columns are first built.
df['undermatching_withinportfolio'] = df['offer_rate_match'].sub(df['offer_rate_best_cf_pareto_withinportfolio'])

In [None]:
# Overall versus within-portfolio offer-rate undermatching (displayed only).
create_stacked_bar_chart(
    df,
    ['undermatching', 'undermatching_withinportfolio'],
    labels=['Overall', 'In Portfolio'],
    title='Undermatching',
    save=False,
    savefolder=savefolder,
)

In [None]:
# An inversion is any strictly positive within-portfolio undermatching.
df['inversion'] = df['undermatching_withinportfolio'].gt(0)
df['inversion'].mean()

In [None]:
# Share of portfolios containing an inversion (displayed only).
create_bar_chart(
    df,
    'inversion',
    title='Portfolio Contains Inversion',
    save=False,
    savefolder=savefolder,
)

In [None]:
def get_ofchoices_effective(row):
    """Return the effective list length of one student row.

    This is the rank of the first program with an offer rate of exactly 1 among
    ranks 1 .. ofchoices - 1, or `ofchoices` itself when there is none. The last
    rank is not inspected; it would give `ofchoices` either way.
    """
    list_length = row['ofchoices']
    for rank in range(1, int(list_length)):
        if row[f'offer_rate_{rank}'] == 1:
            return rank
    return list_length
    
# Effective list length for every student.
df['ofchoices_effective'] = df.apply(get_ofchoices_effective, axis=1)

In [None]:
# List length next to effective list length (displayed only).
create_stacked_bar_chart(
    df,
    ['ofchoices', 'ofchoices_effective'],
    labels=['List Length', 'Effective List Length'],
    title='List Length',
    save=False,
    savefolder=savefolder,
)

In [None]:
# A full list uses all 12 slots.
df['full_list'] = df['ofchoices'].eq(12)
print(df['full_list'].mean())
create_bar_chart(
    df,
    'full_list',
    title='Full List',
    save=False,
    savefolder=savefolder,
)

In [None]:
# Unmatched share among students who listed 12 programs.
full_list_students = df[df['ofchoices']==12]
print(full_list_students['not_matched'].mean())
create_bar_chart(
    full_list_students,
    'not_matched',
    title='Not Matched Conditional on Full List',
    save=False,
    savefolder=savefolder,
)

## Summary table of portfolio behaviors

In [None]:
def get_group_prevalences(df, metric):
    """Return six group means of `metric`, each rounded to two decimals.

    The order is: poverty == 0, poverty == 1, then the ethnicities asian, black,
    hispanic and white.
    """
    def _rounded_mean(mask):
        return round(df[mask][metric].mean(), 2)

    by_poverty = [_rounded_mean(df['poverty']==FRL) for FRL in [0, 1]]
    by_ethnicity = [
        _rounded_mean(df['ethnicity']==ethnicity)
        for ethnicity in ['asian', 'black', 'hispanic', 'white']
    ]
    return by_poverty + by_ethnicity

# Behaviours summarised in the table, one row each. 'no_unselective' was listed
# twice, which produced a duplicated row; it is kept once, at its first position.
metrics = ['unselective_first', 'offer_rate_1', 'no_match_program', 'not_matched', 'no_unselective', 'inversion', 'ofchoices', 'ofchoices_effective']

rows = [[metric, *get_group_prevalences(df, metric)] for metric in metrics]

# Column order follows get_group_prevalences: poverty == 0, poverty == 1, then the four ethnicities.
df_behaviors = pd.DataFrame(rows, columns=['behavior', 'no FRL', 'FRL', 'Asian', 'Black', 'Hispanic', 'white'])
df_behaviors

# Model-based recommendations

## Utilities

In [None]:
# Program-level table, left-joined with the per-school `mean_SQR` value on the school DBN.
df_programs = pd.read_csv('../clean_dfs/programs_with_apps_export.csv')
df_sqr = pd.read_csv('../data/202223-hs-sqr-with-mean.csv')
# The rename only has an effect when the merge produced a `mean_SQR_x` column,
# i.e. when df_programs already had its own `mean_SQR` column before the merge.
df_programs = df_programs.merge(df_sqr[['DBN','mean_SQR']], how='left', left_on='dbn', right_on='DBN').rename(columns={'mean_SQR_x':'mean_SQR'})

# Load 2021 offer rate data to be used for recommendations
# NOTE(review): absolute path on a mapped R: drive; this cell only runs where that drive is available.
cohort_directory = f'R:/CR4239/Cohort 2021-22/'
df_2021 = pd.read_csv(cohort_directory+f'2021 HSAPS_Scrambled_for2939.csv', dtype={'student_id_scram':'string'}, low_memory=False)
# Keep the first row for each scrambled student id.
df_2021 = df_2021.drop_duplicates(subset = 'student_id_scram')
    
# filter for students who apply to at least one school, and GE only
df_2021 = df_2021[df_2021['ofchoices'] >= 1]
df_2021 = df_2021[df_2021['type'] == 'GE']
# Blank out program codes that do not end in four alphanumeric characters;
# missing values count as non-matching (na=False) and stay NaN.
df_2021.loc[~df_2021['matched_program'].str.contains(r'[A-Za-z0-9]{4}$', na=False), 'matched_program'] = np.nan
df_2021.loc[~df_2021['finalprogramcode'].str.contains(r'[A-Za-z0-9]{4}$', na=False), 'finalprogramcode'] = np.nan

# 2021 program table with offer rates computed from the filtered 2021 applications.
df_programs_2021 = create_program_df(2021, include_ms_tests=False)
df_programs_2021 = add_offer_rates(df_programs_2021, df_2021)

from collections import defaultdict
from collections import Counter
# programcode -> 2021 offer rate. Codes absent from the 2021 table default to 1.
# Indexing a defaultdict with a missing key also inserts that key, so later
# membership tests on this dict see every code that was ever looked up with [].
offer_rate_dict_2021 = dict(zip(df_programs_2021['programcode'], df_programs_2021['offer_rate']))
offer_rate_dict_2021 = defaultdict(lambda: 1, offer_rate_dict_2021)

# Number of matched students per program code; codes never matched count as 0.
num_matches_dict_2021 = defaultdict(int, Counter(df_2021['matched_program']))
num_matches_dict_2022 = defaultdict(int, Counter(df['matched_program']))

In [None]:
def get_score(probs):
    """Score a ranked list of offer rates; `get_best_move` minimises this value.

    The list is reduced to its strictly increasing running maxima, a final 1 is
    appended, and the score is the sum over consecutive levels of
    ``level * (level - previous_level)``, starting from a previous level of 0.
    An empty list scores 1.
    """
    score = 0
    low = 0
    # The trailing 1 closes the ladder at an offer rate of 1.
    for prob in [*get_increasing_probs(probs), 1]:
        score += prob * (prob - low)
        low = prob
    return score


def get_increasing_probs(probs):
    """Return the strictly increasing running maxima of `probs`, in order.

    The first entry is always kept; a later entry is kept only if it is
    strictly greater than the last kept one.
    """
    increasing_probs = []
    for i, prob in enumerate(probs):
        if i == 0 or prob > increasing_probs[-1]:
            increasing_probs.append(prob)
    return increasing_probs


def _get_increasing_args(probs):
    """Return the positions of the entries that `get_increasing_probs` keeps."""
    increasing_args = []
    running_max = None
    for i, prob in enumerate(probs):
        if i == 0 or prob > running_max:
            increasing_args.append(i)
            running_max = prob
    return increasing_args


# best way to change the program with a given rank
def get_best_move(probs, candidate_args=False, only_reach=False):
    """Find the single-position change to `probs` that lowers `get_score` the most.

    Parameters
    ----------
    probs : list of float
        Offer rates in rank order. The list is not modified.
    candidate_args : list of int, optional
        Positions that may be changed. Any falsy value (the default) means
        every position.
    only_reach : bool
        If True, a positive change (a move to a higher offer rate) is reported
        as a change of 0; the returned position is left as found.

    Returns
    -------
    (best_arg, best_move) : tuple
        The position to change and the signed change in offer rate. Both are 0
        when no candidate strictly improves the score.
    """
    best_arg = 0
    best_move = 0
    best_score = get_score(probs)
    if not candidate_args:
        candidate_args = range(len(probs))

    # `probs` itself is never modified below, so the positions of its running
    # maxima are the same for every candidate and are computed once.
    increasing_args = set(_get_increasing_args(probs))

    for arg in candidate_args:
        # Offer rate of the nearest running maximum ranked above `arg` (0 if none).
        low_neighbor = 0
        for i in range(arg - 1, -1, -1):
            if i in increasing_args:
                low_neighbor = probs[i]
                break

        # Levels ranked below `arg` that rise above the low neighbour, plus 1.
        potential_high_neighbors = get_increasing_probs([low_neighbor, *probs[arg + 1:]])[1:]
        potential_high_neighbors.append(1)

        for high_neighbor in potential_high_neighbors:
            # Candidate replacement: midpoint between the two neighbouring levels.
            candidate_rate = (low_neighbor + high_neighbor)/2
            probs_potential = probs.copy()
            probs_potential[arg] = candidate_rate
            score = get_score(probs_potential)
            if score < best_score:
                best_move = candidate_rate - probs[arg]
                best_score = score
                best_arg = arg

    if only_reach and best_move > 0:
        best_move = 0
    return best_arg, best_move

In [None]:
def plot_portfolio_recommendation(row, title='recommended portfolio', save=False, savefolder='../plots_export_20250112/'):
    """Plot one student's ranked offer rates together with the recommended change.

    `row` is a one-row DataFrame with an `offer_rates` list and a `best_move`
    tuple of (position, change in offer rate). Current choices are drawn as
    black dots, the recommended offer rate as a green square, and an arrow
    points from the current choice towards it. With `save=True` the figure is
    written to `<savefolder>/<title>.pdf`.
    """
    # NOTE(review): `best_move` is derived elsewhere in this notebook from
    # `offer_rates_2021`, while the list plotted here is `offer_rates`, which
    # drops missing slots - the position may not line up. Confirm.
    current_rates = row['offer_rates'].to_list()[0]
    position, change = row['best_move'].to_list()[0]

    # Shorten the arrow by 0.1 so its head stops short of the green marker;
    # changes of at most 0.1 get a zero-length arrow.
    if np.abs(change) <= 0.1:
        arrow_length = 0
    elif change < 0:
        arrow_length = -(np.abs(change) - 0.1)
    else:
        arrow_length = change - 0.1

    rank = position + 1
    plt.figure(figsize=(6, 2))
    plt.scatter(range(1, len(current_rates)+1), current_rates, color='black', label='old choices')
    plt.scatter([rank], [current_rates[position] + change], marker='s', color='green', label='recommended choice')
    plt.arrow(rank, current_rates[position], 0, arrow_length, head_width=0.2, head_length=0.05)
    plt.xlim(0.5, 12.5)
    plt.ylim(-0.05, 1.05)
    plt.xlabel('rank')
    plt.ylabel('offer rate')
    plt.title(title)
    plt.legend()
    if save:
        plt.savefig(f'{savefolder}/{title}.pdf', dpi=600)
    plt.show()

In [None]:
# Get only close programs with counterfactuals
df_zipzip = pd.read_csv('../zipzipdf.csv')

# zip code -> program codes located in that zip code
zip_to_programs = defaultdict(lambda: [])
for zipcode, programcode in zip(df_programs['zipcode'], df_programs['programcode']):
    zip_to_programs[zipcode].append(programcode)

# Expand every (source zip, destination zip) pair into one row per program at the destination.
df_zipzip['programcode'] = df_zipzip['zip_dest'].apply(lambda dest: zip_to_programs[dest])
df_zipzip = df_zipzip.explode('programcode', ignore_index=True)

# Programs flagged as having counterfactual information.
programs_counterfactual = df_programs.loc[df_programs['counterfactual'], 'programcode'].to_list()

def filter_programs_by_radius(student_row, radius=30):
    """Return the program codes whose commute time from the student's zip code is at most `radius`."""
    from_student_zip = df_zipzip['zip_source']==student_row['zipcode']
    within_radius = df_zipzip['commute_time'] <= radius
    return df_zipzip.loc[from_student_zip & within_radius, 'programcode'].to_list()

def get_close_programs_with_counterfactual(row):
    """Return the counterfactual programs no farther from the student than their match.

    The radius is the commute time from the student's zip code to the matched
    program when that pair is present in `df_zipzip`, and 30 otherwise (also
    for unmatched students). The result is sorted so that it is the same on
    every run: the previous `list(set & set)` order depended on string hashing,
    which changes between kernels and decided ties in later nearest-offer-rate
    lookups.
    """
    radius = 30
    if row['matched']:
        result = df_zipzip.loc[(df_zipzip['zip_source']==row['zipcode']) & (df_zipzip['programcode']==row['matched_program']), 'commute_time']
        if not result.empty:
            radius = result.iloc[0]

    close_programs = filter_programs_by_radius(row, radius)
    close_programs_with_counterfactual = set(close_programs) & set(programs_counterfactual)
    # key=str keeps the sort well-defined even if a non-string code slips in.
    return sorted(close_programs_with_counterfactual, key=str)

# Candidate pool per student: nearby programs that have counterfactual information.
df['close_programs_with_counterfactual'] = df.apply(get_close_programs_with_counterfactual, axis=1)

In [None]:
# further filter to programs that had at least 25 matched students the previous year

from collections import Counter
min_matches_2021 = 25
num_matches_dict_2021 = defaultdict(int, Counter(df_2021['matched_program']))
df['candidate_programs'] = df['close_programs_with_counterfactual'].apply(
    lambda programs: [code for code in programs if num_matches_dict_2021[code] >= min_matches_2021]
)

## Analysis

In [None]:
# this is for creating the best move to adjust individual portfolios

# 2021 offer rate of each of the 12 ranked programs, 1 when the code has no 2021 rate.
# `.get(code, 1)` returns the same values as indexing the defaultdict but does
# not insert the missing code. Indexing added every looked-up code to the dict,
# which made the later `in offer_rate_dict_2021.keys()` filter accept programs
# that have no 2021 data.
portfolio_cols = [f'programcode{i}' for i in range(1, 13)]
df['offer_rates_2021'] = df.apply(lambda row: [offer_rate_dict_2021.get(row[col], 1) for col in portfolio_cols], axis=1)

# Best single change anywhere on the list, and the best "reach" change at the top rank only.
df['best_move'] = df['offer_rates_2021'].apply(lambda x: get_best_move(x))
df['best_move_first'] = df['offer_rates_2021'].apply(lambda x: get_best_move(x, candidate_args=[0], only_reach=True))

# Unpack the (position, change) tuples.
df['best_move_arg'] = df['best_move'].apply(lambda x: x[0])
df['best_move_offer_rate_change'] = df['best_move'].apply(lambda x: x[1])
df['best_move_arg_first'] = df['best_move_first'].apply(lambda x: x[0])
df['best_move_offer_rate_change_first'] = df['best_move_first'].apply(lambda x: x[1])

def get_offer_rate(row, i):
    """Return the 2021 offer rate at list position `i` (0-based) of `row`."""
    return row['offer_rates_2021'][i]

# Target offer rate of the recommended replacement: current rate plus recommended change.
df['recommended_offer_rate'] = df.apply(lambda row: get_offer_rate(row, row['best_move_arg']) + row['best_move_offer_rate_change'], axis=1)
df['recommended_offer_rate_first'] = df.apply(lambda row: get_offer_rate(row, row['best_move_arg_first']) + row['best_move_offer_rate_change_first'], axis=1)

# The submitted portfolio as a list of 12 program codes.
df['portfolio'] = df.apply(lambda row: [row[col] for col in portfolio_cols], axis=1)

In [None]:
# Four example recommendations, each for one student drawn with a fixed seed (displayed only).
recommendation_options = dict(save=False, savefolder=savefolder)
plot_portfolio_recommendation(df.sample(1, random_state=8), title='Recommended portfolio 1', **recommendation_options)
plot_portfolio_recommendation(df.sample(1, random_state=9), title='Recommended portfolio 2', **recommendation_options)
plot_portfolio_recommendation(df.sample(1, random_state=10), title='Recommended portfolio 3', **recommendation_options)
plot_portfolio_recommendation(df.sample(1, random_state=105), title='Recommended portfolio 4', **recommendation_options)

In [None]:
# given a list of candidate programs, choose the program with the nearest offer rate to offer_rate, randomizing among the top r options
def get_program_with_nearest_offer_rate(offer_rate, programs, r=1):
    """Return a program whose 2021 offer rate is among the `r` closest to `offer_rate`.

    Only programs present in `offer_rate_dict_2021` are considered; None is
    returned when none qualify. With r=1 the closest program is returned; with
    r > 1 the choice among the closest `r` uses NumPy's global random state.
    """
    known_programs = [program for program in programs if program in offer_rate_dict_2021.keys()]
    if len(known_programs) == 0:
        return None

    known_rates = np.array([offer_rate_dict_2021[program] for program in known_programs])
    pick = np.random.randint(np.min([r, len(known_programs)]))
    by_distance = np.argsort(np.abs(offer_rate - known_rates))
    return known_programs[by_distance[pick]]
    
# given a portfolio, compute the match
def get_match(row, portfolio):
    """Return the first program in `portfolio` the student would be matched to, or None.

    A program qualifies if it is the student's recorded match, or if it is a
    counterfactual program and the row's column named after that program is truthy.
    """
    for program in portfolio:
        is_recorded_match = program == row['matched_program']
        if is_recorded_match or (program in programs_counterfactual and row[program]):
            return program
    return None

# return a modified portfolio, changing the program in a given rank
def make_change(portfolio, rank, program):
    """Return a copy of `portfolio` with position `rank` replaced by `program`.

    The copy is returned unchanged when the portfolio is empty or `program` is
    missing (None / NaN). The input list is never modified.
    """
    updated = portfolio.copy()
    can_replace = len(updated) > 0 and pd.notnull(program)
    if can_replace:
        updated[rank] = program
    return updated

# programcode -> attribute lookups built from the program table.
program_codes = df_programs['programcode']
offer_rate_dict = dict(zip(program_codes, df_programs['offer_rate']))
grad_rate_dict = dict(zip(program_codes, df_programs['program_grad_rate']))
college_rate_dict = dict(zip(program_codes, df_programs['program_college_rate']))
impact_dict = dict(zip(program_codes, df_programs['impact']))
performance_dict = dict(zip(program_codes, df_programs['performance']))
aggregated_quality_dict = dict(zip(program_codes, df_programs['aggregated_quality']))

In [None]:
# Program closest to the recommended offer rate, for the best move anywhere and the best move at the top rank.
df['recommended_program_one'] = df.apply(
    lambda row: get_program_with_nearest_offer_rate(row['recommended_offer_rate'], row['candidate_programs'], r=1),
    axis=1,
)
df['recommended_program_first'] = df.apply(
    lambda row: get_program_with_nearest_offer_rate(row['recommended_offer_rate_first'], row['candidate_programs'], r=1),
    axis=1,
)

# Portfolios with that single position swapped.
df['portfolio_one'] = df.apply(
    lambda row: make_change(row['portfolio'], row['best_move'][0], row['recommended_program_one']),
    axis=1,
)
df['portfolio_first'] = df.apply(
    lambda row: make_change(row['portfolio'], row['best_move_first'][0], row['recommended_program_first']),
    axis=1,
)

# Full rebuilt portfolios: programs nearest to offer rates 1/13, ..., 12/13; the
# "with safety" variant adds a 13th slot targeting an offer rate of 13/13 = 1.
df['portfolio_optimal'] = df['close_programs_with_counterfactual'].apply(
    lambda programs: [get_program_with_nearest_offer_rate(i/13, programs, r=1) for i in range(1, 13)]
)
df['portfolio_optimal_with_safety'] = df['close_programs_with_counterfactual'].apply(
    lambda programs: [get_program_with_nearest_offer_rate(i/13, programs, r=1) for i in range(1, 14)]
)

In [None]:
def add_portfolio_result_cols(df, portfolio_col):
    """Add the simulated match under `portfolio_col` and its outcome differences to `df` in place.

    Creates `<portfolio_col>_match` plus one `<portfolio_col>_match_<metric>_diff`
    column per metric: the value for the simulated match minus the student's
    observed match value. When there is no simulated match, the offer rate is
    taken as 1 and every quality metric as the mean of its `_mp` column.
    Returns None.
    """
    # (diff suffix, program lookup, value when unmatched, observed-outcome column)
    outcome_specs = [
        ('offer_rate', offer_rate_dict, 1, 'offer_rate_match'),
        ('grad_rate', grad_rate_dict, df['program_grad_rate_mp'].mean(), 'program_grad_rate_match'),
        ('college_rate', college_rate_dict, df['program_college_rate_mp'].mean(), 'program_college_rate_match'),
        ('impact', impact_dict, df['impact_mp'].mean(), 'impact_match'),
        ('performance', performance_dict, df['performance_mp'].mean(), 'performance_match'),
        ('aggregated_quality', aggregated_quality_dict, df['aggregated_quality_mp'].mean(), 'aggregated_quality_match'),
    ]

    match_col = f'{portfolio_col}_match'
    df[match_col] = df.apply(lambda row: get_match(row, row[portfolio_col]), axis=1)

    for suffix, lookup, unmatched_value, observed_col in outcome_specs:
        simulated = df[match_col].apply(lambda code: lookup[code] if pd.notnull(code) else unmatched_value)
        df[f'{match_col}_{suffix}_diff'] = simulated - df[observed_col]

    return

# Short display names for the portfolio variants used in chart titles.
pretty_names = {'portfolio_one': 'Single', 'portfolio_first': 'Top', 'portfolio_optimal': 'Full'}
def plot_portfolio_results(df, portfolio_col):
    """Save one bar chart per outcome showing the change under `portfolio_col`.

    Reads the `<portfolio_col>_match_<metric>_diff` columns. A portfolio column
    without an entry in `pretty_names` is labelled with its own name instead of
    raising a KeyError.
    """
    label = pretty_names.get(portfolio_col, portfolio_col)
    for metric, title in [('offer_rate', 'Offer Rate'), ('grad_rate', 'Grad. Rate'), ('college_rate', 'College Rate'), ('impact', 'Impact'), ('performance', 'Performance'), ('aggregated_quality', 'Aggr. Quality')]:
        create_bar_chart(df, f'{portfolio_col}_match_{metric}_diff', title=f'Change in {title} ({label})', save=True, savefolder=savefolder)

In [None]:
# For each recommendation strategy: add the result columns, print the mean offer-rate
# change relative to mean undermatching, then save the bar charts.
portfolio_cols = ['portfolio_one', 'portfolio_first', 'portfolio_optimal']
for portfolio_col in portfolio_cols:
    add_portfolio_result_cols(df, portfolio_col)
    mean_offer_rate_change = df[f'{portfolio_col}_match_offer_rate_diff'].mean()
    print(mean_offer_rate_change/df['undermatching'].mean())
    plot_portfolio_results(df, portfolio_col)

## Heatmap of recommendations

In [None]:
# 2x2 grid of 2-D histograms, one panel per entry of `groups`:
# recommended list position (x) against recommended change in offer rate (y).
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
axes = axes.flatten()
position_bin_edges = np.linspace(-0.5,11.5,13)  # one bin per list position 0..11
change_bin_edges = np.linspace(-1,1,21)  # bins of width 0.1 on [-1, 1]
for i, ax in enumerate(axes):
    group = groups[i]
    df_group = df[df['ethnicity']==group]
    sns.histplot(x=df_group['best_move_arg'], y=df_group['best_move_offer_rate_change'], bins=[position_bin_edges, change_bin_edges], ax=ax, cbar=True, cmap='Blues', vmin=0)
    # dashed reference line at zero change in offer rate
    ax.plot([-0.5,11.5], [0,0], color='black', linestyle="--", linewidth=1)
    ax.set_xlabel('List Position')
    ax.set_ylabel('Change in Offer Rate')
    ax.set_xlim(-0.5,11.5)
    ax.set_ylim(-1.1,1.1)
    # tick locations 0..11 are labelled 1..12
    ax.set_xticks(range(12), range(1, 13))
    ax.set_title(f'Recommendations for {group_names_pretty[group]} students')

plt.tight_layout()
plt.savefig(f'{savefolder}/heatmap.png', dpi=600)
plt.show()

In [None]:
# Share of students whose `best_move_arg` equals each value 0..11, plotted against positions 1..12.
share_by_position = [(df['best_move_arg']==i).sum()/len(df) for i in range(12)]
plt.plot(range(1, 13), share_by_position)
# share with best_move_arg == 0
print(share_by_position[0])

In [None]:
# Share of students whose recommended move has a negative offer-rate change, overall and by ethnicity.
print('Proportion recommended to reach')
recommended_to_reach = df['best_move_offer_rate_change'] < 0
print('overall', recommended_to_reach.mean())
for ethnicity in ['asian', 'black', 'hispanic', 'white']:
    print(ethnicity, recommended_to_reach[df['ethnicity']==ethnicity].mean())

In [None]:
# Mean of `best_move_arg`, overall and by ethnicity.
# NOTE(review): values appear to be 0-based (the plots label 0..11 as 1..12) — confirm before reporting.
print('Average position')
best_move_position = df['best_move_arg']
print('overall', best_move_position.mean())
for ethnicity in ['asian', 'black', 'hispanic', 'white']:
    print(ethnicity, best_move_position[df['ethnicity']==ethnicity].mean())

# Supplemental

Offer rate and program quality metrics by position in the list

In [None]:
# One figure per metric: mean value at each list rank, with a separate line
# for each list length (`ofchoices` = 1..12).
for metric, title in [('offer_rate', 'Offer Rate'), ('program_grad_rate', 'Graduation Rate'), ('program_college_rate', 'College Attendance Rate'), ('impact', 'Impact'), ('performance', 'Performance'), ('aggregated_quality', 'Aggregated Quality')]:

    for i in range(1, 13):
        # students whose `ofchoices` equals i
        df_temp = df[df['ofchoices']==i]
        offer_rates_by_position = [df_temp[f'{metric}_{j}'].mean() for j in range(1, i+1)]
        plt.plot(range(1, i+1), offer_rates_by_position, label=str(i), marker='o')

    plt.xlabel('Rank in List',  fontsize=12)
    plt.ylabel(f'{title}',  fontsize=12)
    plt.legend(title='# of choices')
    sns.despine()
    plt.savefig(f'{savefolder}/{metric}_by_position_by_ofchoices.png', dpi=300)
    plt.show()

In [None]:
df_12 = df[df['ofchoices']==12]
# NOTE(review): `df_12` is built but not used below; the averages are taken over all of `df`,
# so each rank averages only the students who have a value at that rank. Confirm whether
# the plot was meant to be restricted to `df_12`.

# One figure per metric: mean value at each list rank, one line per ethnicity.
for metric, title in [('offer_rate', 'Offer Rate'), ('program_grad_rate', 'Graduation Rate'), ('program_college_rate', 'College Attendance Rate'), ('impact', 'Impact'), ('performance', 'Performance'), ('aggregated_quality', 'Aggregated Quality')]:
    for ethnicity in ['asian', 'black', 'hispanic', 'white']:
        df_ethnicity = df[df['ethnicity']==ethnicity]
        offer_rates_by_position = [df_ethnicity[f'{metric}_{i}'].mean() for i in range(1, 13)]
        plt.plot(range(1, 13), offer_rates_by_position, label=group_names_pretty[ethnicity], marker='o')

    plt.xlabel('Rank in List', fontsize=12)
    plt.ylabel(f'{title}', fontsize=12)
    plt.legend()
    sns.despine()
    plt.savefig(f'{savefolder}/{metric}_by_position_by_ethnicity.png', dpi=300)
    plt.show()

In [None]:
df_12 = df[df['ofchoices']==12]
# NOTE(review): `df_12` is built but not used below; the averages are taken over all of `df`.
# Confirm whether the plot was meant to be restricted to `df_12`.

# Legend labels indexed by the value of the `poverty` column (0 or 1).
FRL_labels = ['No FRL', 'FRL']
# One figure per metric: mean value at each list rank, one line per `poverty` value.
for metric, title in [('offer_rate', 'Offer Rate'), ('program_grad_rate', 'Graduation Rate'), ('program_college_rate', 'College Attendance Rate'), ('impact', 'Impact'), ('performance', 'Performance'), ('aggregated_quality', 'Aggregated Quality')]:
    for FRL in [0,1]:
        df_frl = df[df['poverty']==FRL]
        offer_rates_by_position = [df_frl[f'{metric}_{i}'].mean() for i in range(1, 13)]
        plt.plot(range(1, 13), offer_rates_by_position, label=FRL_labels[FRL], marker='o')

    plt.xlabel('Rank in List', fontsize=12)
    plt.ylabel(f'{title}', fontsize=12)
    plt.legend()
    sns.despine()
    plt.savefig(f'{savefolder}/{metric}_by_position_by_FRL.png', dpi=300)
    plt.show()

# Simulate DA

## Compute program / seat group capacities

For programs that have excess capacity, we assume capacity = max(# students who match, stated capacity)

In [None]:
from create_counterfactual_with_scores import get_seatgroup_scores
import deferred_acceptance as da

In [None]:
# Inputs for the DA simulation. As used below: `df_scores` has one column per seat group
# (indexed by '<programcode>-<i>'), `seat_group_fractions_dict[program]` is iterated to split
# a program's seats across its seat groups, and `programs_counterfactual` is tested for membership.
# NOTE(review): the exact contents are produced by `get_seatgroup_scores` and are not visible here — confirm.
df_scores, seat_group_fractions_dict, programs_counterfactual = get_seatgroup_scores(df, df_programs)

In [None]:
# Number of students matched to each program (0 for programs nobody matched to).
num_matches_dict = defaultdict(int, Counter(df['matched_program']))
df_programs['num_matches'] = df_programs['programcode'].apply(lambda code: num_matches_dict[code])

programs = df_programs['programcode'].to_list()
# Programs in `programs_counterfactual` get one seat group per entry of their fractions list;
# every other program is treated as a single seat group.
num_seat_groups_dict = {
    program: len(seat_group_fractions_dict[program]) if program in programs_counterfactual else 1
    for program in programs
}

# Flat list of seat-group names '<programcode>-<i>', in program order.
seatgroups = [f'{programcode}-{i}' for programcode in programs for i in range(num_seat_groups_dict[programcode])]
# Position of each seat group in `seatgroups`; these positions are the seat-group ids used by the simulation.
seatgroup_arg_dict = {seatgroup: i for i, seatgroup in enumerate(seatgroups)}

# Map every seat-group name '<programcode>-<i>' back to its program code.
# Built from the same (program, i) pairs that generate the names rather than by slicing the
# first four characters, so it stays correct for program codes of any length.
seatgroup_to_program_dict = {
    f'{programcode}-{i}': programcode
    for programcode in programs
    for i in range(num_seat_groups_dict[programcode])
}

# For each program, the students matched to it in the data, identified by row POSITION in `df`.
# Positions (not index labels) are used because the rest of the simulation is positional:
# score rankings come from np.argsort and student preference lists follow the row order of `df`.
# With a default RangeIndex the positions equal the labels.
matched_students_dict = {programcode: [] for programcode in programs}

for position, programcode in enumerate(df['matched_program'].to_list()):
    if pd.notnull(programcode):
        matched_students_dict[programcode].append(position)

# Ranking of students for every seat group, in the order of `seatgroups`.
seatgroup_prefs = []
for seatgroup in seatgroups:
    seatgroup_program = seatgroup_to_program_dict[seatgroup]
    if seatgroup_program in programs_counterfactual:
        # rank students by descending score for this seat group
        ranking = np.argsort(-np.array(df_scores[seatgroup].to_list()))
    else:
        # no scores: the seat group only lists the students actually matched to the program
        ranking = matched_students_dict[seatgroup_program]
    seatgroup_prefs.append(ranking)

# Capacity of every seat group. A program's total capacity is the number of students matched
# to it; for programs with several seat groups it is split by `seat_group_fractions_dict`,
# truncating each share and giving the remainder to the last seat group so the caps sum to the total.
# NOTE(review): the original code branched on `offer_rate_dict[programcode] == 1` but both branches
# were identical, so the branch is removed here. The section text describes
# capacity = max(# students who match, stated capacity) for programs with excess capacity;
# that rule is not implemented — confirm which is intended.
seatgroup_caps_dict = {}
for programcode in programs:
    total_seats = num_matches_dict[programcode]
    if programcode not in programs_counterfactual:
        seatgroup_caps_dict[f'{programcode}-0'] = total_seats
        continue

    fractions = seat_group_fractions_dict[programcode]
    seat_count = 0
    for i, frac in enumerate(fractions):
        if i == len(fractions) - 1:
            # last seat group absorbs whatever the truncated shares left over
            cap = total_seats - seat_count
        else:
            cap = int(total_seats * frac)
            seat_count += cap
        seatgroup_caps_dict[f'{programcode}-{i}'] = cap

# Capacities aligned with the order of `seatgroups`.
seatgroup_caps = [seatgroup_caps_dict[seatgroup] for seatgroup in seatgroups]

In [None]:
def programs_to_seatgroup_args(programs):
    """Expand an ordered list of program codes into seat-group ids.

    Null entries are skipped. Each remaining program contributes the ids of all of its
    seat groups ('<programcode>-0', '<programcode>-1', ...), in order, looked up in the
    global ``seatgroup_arg_dict``; the number of seat groups comes from ``num_seat_groups_dict``.

    Parameters
    ----------
    programs : iterable
        Program codes in preference order; may contain nulls.

    Returns
    -------
    list
        Seat-group ids in preference order.
    """
    return [
        seatgroup_arg_dict[f'{programcode}-{i}']
        for programcode in programs
        if pd.notnull(programcode)
        for i in range(num_seat_groups_dict[programcode])
    ]

def add_predicted_equilibrium_match(df, student_prefs, seatgroup_prefs, seatgroup_caps, outcome_col):
    """Run ``da.get_match`` and store each student's predicted program in ``df[outcome_col]``.

    The first value returned by ``da.get_match`` is read as one seat-group id per student;
    non-negative ids are translated to program codes through the global ``seatgroups`` and
    ``seatgroup_to_program_dict``, negative ids become NaN. ``df`` is modified in place.
    """
    student_matches, _ = da.get_match(student_prefs, seatgroup_prefs, seatgroup_caps)

    predicted_programs = []
    for seatgroup_arg in student_matches:
        if seatgroup_arg >= 0:
            predicted_programs.append(seatgroup_to_program_dict[seatgroups[seatgroup_arg]])
        else:
            predicted_programs.append(np.nan)
    df[outcome_col] = predicted_programs

In [None]:
# Rebuild the recommendation portfolios used as inputs to the DA simulation, this time with r=5.
# NOTE(review): the meaning of `r` and the "nearest offer rate" selection are inferred from the
# helper's name/signature — confirm.

# Program recommended for the single-change and first-position strategies.
df['recommended_program_one'] = df.apply(lambda row: get_program_with_nearest_offer_rate(row['recommended_offer_rate'], row['candidate_programs'], r=5), axis=1)
df['recommended_program_first'] = df.apply(lambda row: get_program_with_nearest_offer_rate(row['recommended_offer_rate_first'], row['candidate_programs'], r=5), axis=1)
# Apply the recommended change to the student's original portfolio via `make_change`.
df['portfolio_one'] = df.apply(lambda row: make_change(row['portfolio'], row['best_move'][0], row['recommended_program_one']), axis=1)
df['portfolio_first'] = df.apply(lambda row: make_change(row['portfolio'], row['best_move_first'][0], row['recommended_program_first']), axis=1)
# Full-list portfolios: one program per target offer rate i/13 (12 slots; 13 slots in the "with_safety" variant).
df['portfolio_optimal'] = df['close_programs_with_counterfactual'].apply(lambda programs: [get_program_with_nearest_offer_rate(i/13, programs, r=5) for i in range(1, 13)])
df['portfolio_optimal_with_safety'] = df['close_programs_with_counterfactual'].apply(lambda programs: [get_program_with_nearest_offer_rate(i/13, programs, r=5) for i in range(1, 14)])

In [None]:
def _student_prefs_from(portfolio_col):
    """Return, for every row of `df`, the seat-group ids of the portfolio stored in `portfolio_col`."""
    return [programs_to_seatgroup_args(portfolio) for portfolio in df[portfolio_col].to_list()]

# Student preference lists (as seat-group ids) for the original and each recommended portfolio.
student_prefs_original = _student_prefs_from('portfolio')
student_prefs_one = _student_prefs_from('portfolio_one')
student_prefs_first = _student_prefs_from('portfolio_first')
student_prefs_optimal = _student_prefs_from('portfolio_optimal')
student_prefs_optimal_with_safety = _student_prefs_from('portfolio_optimal_with_safety')

In [None]:
# Simulate the match under each set of student preferences; seat-group rankings and
# capacities are the same in every run.
for student_prefs, outcome_col in [
    (student_prefs_original, 'equilibrium_match_original'),
    (student_prefs_one, 'equilibrium_match_one'),
    (student_prefs_first, 'equilibrium_match_first'),
    (student_prefs_optimal, 'equilibrium_match_optimal'),
    (student_prefs_optimal_with_safety, 'equilibrium_match_optimal_with_safety'),
]:
    add_predicted_equilibrium_match(df, student_prefs, seatgroup_prefs, seatgroup_caps, outcome_col)

In [None]:
# Share of students whose simulated match on the original preferences equals the observed
# match; unmatched students are compared through the placeholder 'no match'.
simulated_match = df['equilibrium_match_original'].fillna('no match')
observed_match = df['matched_program'].fillna('no match')
accuracy = (simulated_match == observed_match).mean()
print(f'Accuracy of simulation on original preferences: {accuracy}')

In [None]:
# Share of students without a match: observed data first, then each simulated scenario.
for match_column in [
    'matched_program',
    'equilibrium_match_one',
    'equilibrium_match_first',
    'equilibrium_match_optimal',
    'equilibrium_match_optimal_with_safety',
]:
    print(df[match_column].isna().mean())

In [None]:
def add_match_column_post_manual_placement(df, match_col, match_col_post, rng=None):
    """Fill leftover seats with students who are unmatched in ``match_col``.

    A program's leftover seats are ``num_matches_dict[program]`` minus the number of
    students assigned to it in ``df[match_col]``, computed for every key of the global
    ``offer_rate_dict``. Unmatched students are visited in random order and each is
    placed in the program with the lowest offer rate that still has a leftover seat.
    Placement stops once no seats remain; students not reached stay unmatched.

    Parameters
    ----------
    df : pandas.DataFrame
        Student-level frame; modified in place.
    match_col : str
        Column with the simulated match (null = unmatched). Not modified.
    match_col_post : str
        Column to write: a copy of ``match_col`` with placed students filled in.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of the random visiting order. Defaults to NumPy's global random state,
        which is the original behaviour; pass a seeded generator for reproducible output.

    Returns
    -------
    None
    """
    predicted_counts = Counter(df[match_col])
    leftover_seats = {program: num_matches_dict[program] - predicted_counts[program] for program in offer_rate_dict}

    # Programs ordered from lowest to highest offer rate.
    offer_rate_programs = list(offer_rate_dict.keys())
    offer_rates = [offer_rate_dict[program] for program in offer_rate_programs]
    programs_by_offer_rate = [offer_rate_programs[i] for i in np.argsort(offer_rates)]
    num_programs = len(programs_by_offer_rate)

    df[match_col_post] = df[match_col].copy()

    if rng is None:
        shuffled_indices = np.random.permutation(df.index)
    else:
        shuffled_indices = rng.permutation(df.index)

    # `j` points at the current program; it only moves forward because seats are never returned.
    j = 0
    for idx in shuffled_indices:
        if not pd.isna(df.at[idx, match_col]):
            continue
        # find next lowest offer rate program with seats open
        while j < num_programs and leftover_seats[programs_by_offer_rate[j]] < 1:
            j += 1
        if j == num_programs:
            # every leftover seat has been filled (or there were none to begin with)
            break

        program = programs_by_offer_rate[j]
        df.at[idx, match_col_post] = program
        leftover_seats[program] -= 1
            
def add_match_col_results(df, match_col):
    """Add, for six metrics, the change between the match in ``match_col`` and the realized match.

    Writes ``{match_col}_{metric}_diff`` = metric of the program in ``match_col`` minus the
    corresponding ``*_match`` column. Rows with a null ``match_col`` use an offer rate of 1
    and, for the other metrics, the mean of the respective ``*_mp`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Student-level frame; modified in place.
    match_col : str
        Column holding program codes (null = unmatched).

    Returns
    -------
    None
    """
    # Fallback values for students without a match in `match_col`.
    unmatched_defaults = {
        'grad_rate': df['program_grad_rate_mp'].mean(),
        'college_rate': df['program_college_rate_mp'].mean(),
        'impact': df['impact_mp'].mean(),
        'performance': df['performance_mp'].mean(),
        'aggregated_quality': df['aggregated_quality_mp'].mean(),
    }

    # (metric label, lookup keyed by program code, value if unmatched, realized-match column)
    metric_specs = [
        ('offer_rate', offer_rate_dict, 1, 'offer_rate_match'),
        ('grad_rate', grad_rate_dict, unmatched_defaults['grad_rate'], 'program_grad_rate_match'),
        ('college_rate', college_rate_dict, unmatched_defaults['college_rate'], 'program_college_rate_match'),
        ('impact', impact_dict, unmatched_defaults['impact'], 'impact_match'),
        ('performance', performance_dict, unmatched_defaults['performance'], 'performance_match'),
        ('aggregated_quality', aggregated_quality_dict, unmatched_defaults['aggregated_quality'], 'aggregated_quality_match'),
    ]
    for label, lookup, unmatched_value, realized_col in metric_specs:
        new_values = df[match_col].apply(lambda code: lookup[code] if pd.notnull(code) else unmatched_value)
        df[f'{match_col}_{label}_diff'] = new_values - df[realized_col]

# Display names of the three post-placement equilibrium match columns, used in chart titles.
EQUILIBRIUM_PRETTY_NAMES = {'equilibrium_match_one_post': 'Single', 'equilibrium_match_first_post': 'Top', 'equilibrium_match_optimal_post': 'Full'}
# Kept for backward compatibility. The global `pretty_names` is also assigned earlier in the
# notebook with different keys, so the function below reads its own constant and keeps
# working whichever cell was executed last.
pretty_names = EQUILIBRIUM_PRETTY_NAMES

def plot_equilibrium_match_results(df, match_col):
    """Save one bar chart per metric for the ``{match_col}_{metric}_diff`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the diff columns (see ``add_match_col_results``).
    match_col : str
        A key of ``EQUILIBRIUM_PRETTY_NAMES``; a KeyError is raised otherwise.

    Charts are produced by ``create_bar_chart`` with ``save=True`` into the global ``savefolder``.
    """
    match_label = EQUILIBRIUM_PRETTY_NAMES[match_col]
    for metric, title in [('offer_rate', 'Offer Rate'), ('grad_rate', 'Grad. Rate'), ('college_rate', 'College Rate'), ('impact', 'Impact'), ('performance', 'Performance'), ('aggregated_quality', 'Aggr. Quality')]:
        create_bar_chart(df, f'{match_col}_{metric}_diff', title=f'Eq. Change in {title} ({match_label})', save=True, savefolder=savefolder)

In [None]:
# For each simulated scenario: fill leftover seats, compute the changes versus the
# realized match, and save the bar charts. All three steps use the '<match_col>_post' column.
for match_col in ['equilibrium_match_one', 'equilibrium_match_first', 'equilibrium_match_optimal']:
    post_col = match_col + '_post'
    add_match_column_post_manual_placement(df, match_col, post_col)
    add_match_col_results(df, post_col)
    plot_equilibrium_match_results(df, post_col)

In [None]:
# White-minus-Black gap in the realized match offer rate, and the same gap in the simulated
# offer-rate change for each scenario (printed as a level and as a ratio to the realized gap).
white_students = df['ethnicity']=='white'
black_students = df['ethnicity']=='black'
disparity = df[white_students]['offer_rate_match'].mean() - df[black_students]['offer_rate_match'].mean()
for match_col in ['equilibrium_match_one', 'equilibrium_match_first', 'equilibrium_match_optimal']:
    diff_col = f'{match_col}_post_offer_rate_diff'
    disparity_reduction = df[white_students][diff_col].mean() - df[black_students][diff_col].mean()
    print(disparity_reduction, disparity_reduction/disparity)

In [None]:
# Same white-minus-Black comparison, restricted to rows with offer_rate_cf_bin == 0,
# for the single-change scenario only.
print('Change in disparity among most competitive students (best counterfactual offer rate)')
top_bin_white = df[(df['offer_rate_cf_bin']==0) & (df['ethnicity']=='white')]
top_bin_black = df[(df['offer_rate_cf_bin']==0) & (df['ethnicity']=='black')]
disparity = top_bin_white['offer_rate_match'].mean() - top_bin_black['offer_rate_match'].mean()
disparity_reduction = top_bin_white['equilibrium_match_one_post_offer_rate_diff'].mean() - top_bin_black['equilibrium_match_one_post_offer_rate_diff'].mean()
print(disparity_reduction, disparity_reduction/disparity)

# Comparing to offer rates in past years

In [None]:
# Paired offer rates for programs present in both years, keeping only programs
# with more than 25 matches in 2021.
offer_rates_2021, offer_rates_2022 = [], []
for programcode, offer_rate_current in offer_rate_dict.items():
    if programcode not in offer_rate_dict_2021.keys():
        continue
    if not (num_matches_dict_2021[programcode] > 25):
        continue
    offer_rates_2021.append(offer_rate_dict_2021[programcode])
    offer_rates_2022.append(offer_rate_current)

plt.figure(figsize=(6,6))
plt.scatter(offer_rates_2021, offer_rates_2022, alpha=0.3)
plt.xlabel('Offer Rate 2021', fontsize=12)
plt.ylabel('Offer Rate 2022', fontsize=12)
sns.despine()

plt.tight_layout()
plt.savefig(f'{savefolder}/offer_rate_2021_2022.png', dpi=600)
plt.show()

In [None]:
# Among programs with an offer rate of exactly 1 in 2021: the share that still has an
# offer rate of 1 in 2022, and their mean 2022 offer rate.
offer_rates_2021 = np.array(offer_rates_2021)
offer_rates_2022 = np.array(offer_rates_2022)
unselective_2021 = offer_rates_2021 == 1
print('percent that stay unselective', (offer_rates_2022[unselective_2021]==1).sum()/unselective_2021.sum())
print('offer rate of previously unselective', offer_rates_2022[unselective_2021].mean())

# Results restricting to students who eventually enroll

In [None]:
# Attach the school each student enrolled in; students without a row in the enrollment
# file get a null `enrolled_dbn`.
# NOTE(review): absolute path to a mapped drive — this cell only runs where R: is mounted.
df_enrollment = pd.read_csv('R:/CR4239/Cohort 2022-23/2023-24_HsCourseAndGrades_Scrambled_for2939.csv')
id_to_enrolled_dbn = dict(zip(df_enrollment['student_id_scram'], df_enrollment['dbn']))
df['enrolled_dbn'] = df['student_id_scram'].apply(lambda x: id_to_enrolled_dbn.get(x))

# The two "percent enrolled" summaries report the share WITH an enrollment record (notna).
# They previously used isna(), i.e. the share not enrolled, which contradicted the printed label.
print('percent enrolled')
for ethnicity in ['asian', 'black', 'white', 'hispanic']:
    print(ethnicity, df[df['ethnicity']==ethnicity]['enrolled_dbn'].notna().mean())

print('percent enrolled among those who did not match')
df['not_matched'] = df['matched'] != 1
for ethnicity in ['asian', 'black', 'white', 'hispanic']:
    print(ethnicity, df[(df['not_matched']) & (df['ethnicity']==ethnicity)]['enrolled_dbn'].notna().mean())

print('percent did not match among those who did enroll')
for ethnicity in ['asian', 'black', 'white', 'hispanic']:
    print(ethnicity, df[(df['enrolled_dbn'].notna()) & (df['ethnicity']==ethnicity)]['not_matched'].mean())

print('percent did not match among those who did not enroll')
for ethnicity in ['asian', 'black', 'white', 'hispanic']:
    print(ethnicity, df[(df['enrolled_dbn'].isna()) & (df['ethnicity']==ethnicity)]['not_matched'].mean())

In [None]:
# Share of students recommended a move with a negative offer-rate change,
# restricted to students with an enrollment record, by ethnicity.
print('Proportion recommended to reach')
enrolled = df['enrolled_dbn'].notna()
for ethnicity in ['asian', 'black', 'hispanic', 'white']:
    enrolled_group = df[enrolled & (df['ethnicity']==ethnicity)]
    print(ethnicity, (enrolled_group['best_move_offer_rate_change'] < 0).mean())

# Undermatching results when accounting for other preferences

In [None]:
# Offer-rate undermatching (realized minus best counterfactual) under each restricted
# counterfactual set. Note the 'safety' column reads the '..._safe' counterfactual.
for undermatching_col, best_cf_col in [
    ('undermatching_homophily', 'offer_rate_best_cf_pareto_homophily'),
    ('undermatching_safety', 'offer_rate_best_cf_pareto_safe'),
    ('undermatching_within_33', 'offer_rate_best_cf_pareto_within_33'),
]:
    df[undermatching_col] = df['offer_rate_match'] - df[best_cf_col]

In [None]:
# Per-metric undermatching (best counterfactual minus realized) for each restricted counterfactual set.
restricted_sets = ['homophily', 'safe', 'within_33']
undermatching_metrics = ['offer_rate', 'program_college_rate', 'program_grad_rate', 'impact', 'performance', 'aggregated_quality']
for suffix in restricted_sets:
    for metric in undermatching_metrics:
        best_counterfactual = df[f'{metric}_best_cf_pareto_{suffix}']
        df[f'undermatching_{metric}_{suffix}'] = best_counterfactual - df[f'{metric}_match']

In [None]:
def _undermatching_table_row(frame, metric, suffix):
    """Build one table row: rounded means of three columns, overall and for each ethnicity.

    The three columns are the realized value, the best counterfactual for `suffix`, and the
    undermatching column. Every cell, including the last, is followed by ' & '.
    """
    row = ''
    for column in [f'{metric}_match', f'{metric}_best_cf_pareto_{suffix}', f'undermatching_{metric}_{suffix}']:
        row += str(round(frame[column].mean(), 2)) + ' & '
        for ethnicity in ['asian', 'black', 'hispanic', 'white']:
            row += str(round(frame[frame['ethnicity']==ethnicity][column].mean(), 2)) + ' & '
    return row

# One text file per restricted counterfactual set. For every metric three rows are written:
# all students, students with poverty == True, students with poverty == False.
for suffix in ['homophily', 'safe', 'within_33']:
    table = ''
    for metric in ['offer_rate', 'program_college_rate', 'program_grad_rate', 'impact', 'performance']:
        for frame in (df, df[df['poverty']==True], df[df['poverty']==False]):
            table += _undermatching_table_row(frame, metric, suffix) + '\n'

    with open(f'{savefolder}/undermatching_table_{suffix}.txt', 'w') as f:
        f.write(table)