In [None]:
# IPython autoreload in mode 2: re-import edited modules before each cell executes.
%load_ext autoreload
%autoreload 2

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from scipy.stats import spearmanr
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the two cleaned CSV exports used throughout the notebook.
programs_csv = '../clean_dfs/programs_with_apps_export.csv'
students_csv = '../clean_dfs/students_extended_export.csv'
df_programs = pd.read_csv(programs_csv)
df_students = pd.read_csv(students_csv)

In [None]:
# Quick size check: distinct student ids and distinct program codes in the loaded tables.
n_students = df_students['student_id_scram'].nunique()
n_programs = df_programs['programcode'].nunique()
print("number of GE students", n_students)
print("number of programs with apps", n_programs)

In [None]:
# Shared configuration for the feature-engineering cells.
k = 12  # full list length
# student attributes used as grouping variables
groups = [
    'ethnicity',
    'poverty',
    'gender',
    'homelang',
    'type_of_ms',
]
# program metrics; used as column-name prefixes such as '<metric>_1' and '<metric>_match'
metrics = [
    'offer_rate',
    'impact',
    'performance',
    'program_grad_rate',
    'program_college_rate',
    'mean_SQR',
]

# computing additional predictors

In [None]:
# combined ethnicity: fold 'multiple_not_rep' and 'unknown' into one 'multiple_other' level.
# The raw labels are preserved in 'ethnicity_old'.
# The guard keeps the cell re-runnable: renaming unconditionally on a second run would
# add a second 'ethnicity_old' column built from the already-combined values.
if 'ethnicity_old' not in df_students.columns:
    df_students = df_students.rename(columns={'ethnicity': 'ethnicity_old'})
raw_ethnicity = df_students['ethnicity_old']
df_students['ethnicity'] = raw_ethnicity.mask(
    raw_ethnicity.isin(['multiple_not_rep', 'unknown']), 'multiple_other'
)

# type of middle school
def get_ms_type(row):
    """Classify a student's middle school from three 0/1 indicator fields.

    Reads row['private_ms'], row['charter_ms'] and row['homeschool_ms'], casts each
    to int, and returns 'private', 'charter' or 'homeschool' for the single flag
    that is set, or 'public' when none is set. Any other combination (for example
    two flags set at once) raises KeyError.
    """
    flag_fields = ('private_ms', 'charter_ms', 'homeschool_ms')
    code = ''.join(str(int(row[field])) for field in flag_fields)
    label_by_code = {
        '000': 'public',
        '001': 'homeschool',
        '010': 'charter',
        '100': 'private',
    }
    return label_by_code[code]

df_students['type_of_ms'] = df_students.apply(get_ms_type, axis='columns')

# mean transit: per-student average over transit_1 .. transit_k, skipping missing entries
transit_cols = [f'transit_{rank}' for rank in range(1, k + 1)]
df_students['mean_transit'] = df_students[transit_cols].mean(axis='columns')

In [None]:
## match-independent portfolio stats

# quality metrics -- only include schools to which the student actually applied (i.e. don't count imputed values!)
def _applied_metric_values(row, metric):
    """Return the entries `<metric>_1` .. `<metric>_<ofchoices>` of `row`.

    `row['ofchoices']` is cast to int; ranks beyond it are left out.
    """
    n_listed = int(row['ofchoices'])
    return row[[f'{metric}_{rank}' for rank in range(1, n_listed + 1)]]

def compute_portfolio_max(row, metric):
    """Largest value of `metric` over the ranks the student listed."""
    return _applied_metric_values(row, metric).max()

def compute_portfolio_min(row, metric):
    """Smallest value of `metric` over the ranks the student listed."""
    return _applied_metric_values(row, metric).min()

def compute_portfolio_avg(row, metric):
    """Mean value of `metric` over the ranks the student listed."""
    return _applied_metric_values(row, metric).mean()


# For every metric add <metric>_portfolio_max / _min / _avg, computed row by row.
portfolio_stat_fns = {
    'max': compute_portfolio_max,
    'min': compute_portfolio_min,
    'avg': compute_portfolio_avg,
}
for metric in metrics:
    for stat_name, stat_fn in portfolio_stat_fns.items():
        # extra keyword arguments to DataFrame.apply are forwarded to the row function
        df_students[f'{metric}_portfolio_{stat_name}'] = df_students.apply(stat_fn, axis=1, metric=metric)

# no reach == 'applied to an unselective program first'
# (True when the first-ranked program has an offer rate of exactly 1)
df_students['no_reach'] = df_students['offer_rate_1'].eq(1)

# no match == 'no program with offer rate between 30% and 60%'
def no_match_indep(row):
    """True when no listed program has an offer rate strictly between 0.3 and 0.6.

    Only ranks 1 .. int(row['ofchoices']) are inspected; both bounds are exclusive.
    """
    listed_ranks = range(1, int(row['ofchoices']) + 1)
    return not any(0.3 < row[f'offer_rate_{rank}'] < 0.6 for rank in listed_ranks)
df_students['no_match_indep'] = df_students.apply(no_match_indep, axis=1)

# no safety == 'no application to a program with offer rate > 90%'
def no_safety_indep(row):
    """True when no listed program has an offer rate strictly above 0.9.

    Only ranks 1 .. int(row['ofchoices']) are inspected.
    """
    listed_ranks = range(1, int(row['ofchoices']) + 1)
    return not any(row[f'offer_rate_{rank}'] > 0.9 for rank in listed_ranks)
df_students['no_safety_indep'] = df_students.apply(no_safety_indep, axis=1)

# ranking inversions (max non-negative offer rate difference)
def compute_inversions_max(row):
    """Largest positive gap offer_rate_i - offer_rate_j over listed rank pairs i < j.

    Only ranks 1 .. int(row['ofchoices']) are compared. Returns 0 when no
    higher-ranked program has a larger offer rate than a lower-ranked one.
    """
    n_listed = int(row['ofchoices'])
    gaps = (
        row[f'offer_rate_{upper}'] - row[f'offer_rate_{lower}']
        for upper in range(1, n_listed)
        for lower in range(upper + 1, n_listed + 1)
    )
    # the `> 0` filter also discards NaN gaps, which never compare as greater
    return max((gap for gap in gaps if gap > 0), default=0)
df_students['inversions_max_indep'] = df_students.apply(compute_inversions_max, axis=1)

# ranking inversions (number of strictly positive offer rate differences)
def compute_inversions_count(row):
    """Count listed rank pairs i < j with offer_rate_i strictly greater than offer_rate_j.

    Only ranks 1 .. int(row['ofchoices']) are compared; ties are not counted.
    """
    n_listed = int(row['ofchoices'])
    return sum(
        1
        for upper in range(1, n_listed)
        for lower in range(upper + 1, n_listed + 1)
        if row[f'offer_rate_{upper}'] - row[f'offer_rate_{lower}'] > 0
    )
df_students['inversions_count_indep'] = df_students.apply(compute_inversions_count, axis=1)

# effective list length (i.e. first non-selective program)
def compute_effective_list_len(row):
    """Rank of the first listed program whose offer rate equals 1.

    Ranks are scanned from 1 upward while they are below row['ofchoices'];
    if none of those has offer rate 1 the scan stops at row['ofchoices'] and
    that rank is returned. The result is never smaller than 1.
    """
    position = 1
    # the second condition is only evaluated while position < ofchoices
    while position < row['ofchoices'] and row[f'offer_rate_{position}'] != 1:
        position += 1
    return position
df_students['effective_list_length'] = df_students.apply(compute_effective_list_len, axis=1)

In [None]:
## match-dependent portfolio stats

# no match program == 'student is matched to a program with offer rate 1, after getting rejected by more selective programs'
# expressed as three column-wise conditions combined with &
matched_to_unselective = df_students['offer_rate_match'].eq(1)
matched_below_first_choice = df_students['matched_choice_num'].gt(1)
has_match = df_students['matched'].eq(True)
df_students['no_match_dep'] = matched_to_unselective & matched_below_first_choice & has_match

# ranking inversions == 'portfolio contains undermatching'
# (match offer rate exceeds the best within-portfolio counterfactual offer rate)
offer_rate_gap = df_students['offer_rate_match'] - df_students['offer_rate_best_cf_pareto_withinportfolio']
df_students['inversions_binary_dep'] = offer_rate_gap.gt(0)

# no safety = un-match rate
df_students['unmatched'] = df_students['matched'].ne(1)

In [None]:
# various forms of undermatching: realised value minus the '<metric>_best_cf_pareto' value
for metric in metrics:
    # offer rate uses the '_match' column; every other metric uses '_final',
    # which imputes manual placement for unmatched students, if mp value exists
    realised_suffix = 'match' if metric == 'offer_rate' else 'final'
    realised = df_students[f'{metric}_{realised_suffix}']
    best_counterfactual = df_students[f'{metric}_best_cf_pareto']
    df_students[f'undermatching_{metric}'] = realised - best_counterfactual

In [None]:
# Mapping from internal column names to the human-readable labels used in tables, plots and formulas.
# for all non-offerrate metrics, use final as ''match'' (to address imputations)
# NOTE(review): the '*_standardized_*' keys are not created anywhere in the visible notebook;
# DataFrame.rename ignores keys that are absent, so they only matter if the CSV provides them -- confirm.
rename_dict = {'ofchoices':'list length',
               # offer-rate portfolio and match columns
               'offer_rate_portfolio_avg':'portfolio avg offer rate',
               'offer_rate_portfolio_min':'portfolio min offer rate',
               'offer_rate_match':'match offer rate',
               # quality metrics: portfolio avg/max, with '<metric>_final' labelled as the match value
               'impact_portfolio_avg':'portfolio avg impact',
               'impact_portfolio_max':'portfolio max impact',
               'impact_final':'match impact',
               'performance_portfolio_avg':'portfolio avg performance',
               'performance_portfolio_max':'portfolio max performance',
               'performance_final':'match performance',
               'program_grad_rate_portfolio_avg':'portfolio avg graduation rate',
               'program_grad_rate_portfolio_max':'portfolio max graduation rate',
               'program_grad_rate_final':'match graduation rate',
               'program_college_rate_portfolio_avg':'portfolio avg college rate',
               'program_college_rate_portfolio_max':'portfolio max college rate',
               'program_college_rate_final':'match college rate',
               'mean_SQR_portfolio_avg':'portfolio avg SQR',
               'mean_SQR_portfolio_max':'portfolio max SQR',
               'mean_SQR_final':'match SQR',
               'effective_list_length':'effective list length',
               # standardized variants (see NOTE(review) above)
               'impact_standardized_portfolio_avg':'portfolio avg impact (standardized)',
               'impact_standardized_portfolio_max':'portfolio max impact (standardized)',
               'impact_standardized_match':'match impact (standardized)',
               'performance_standardized_portfolio_avg':'portfolio avg performance (standardized)',
               'performance_standardized_portfolio_max':'portfolio max performance (standardized)',
               'performance_standardized_match':'match performance (standardized)',
               # portfolio-behaviour flags, match-independent then match-dependent
               'no_reach':'no reach',
               'no_match_indep':'no match (match-independent)',
               'no_safety_indep':'no safety (match-independent)',
               'inversions_max_indep':'inversions (max diff, match-independent)',
               'inversions_count_indep':'inversions (count, match-independent)',
               'no_match_dep':'no match (match-dependent)',
               'unmatched':'no safety (match-dependent)',
               'inversions_binary_dep':'inversions (binary, match-dependent)',
               # undermatching gaps, one per metric
               'undermatching_offer_rate':'undermatching (offer rate)',
               'undermatching_impact':'undermatching (impact)',
               'undermatching_performance':'undermatching (performance)',
               'undermatching_program_grad_rate':'undermatching (graduation rate)',
               'undermatching_program_college_rate':'undermatching (college rate)',
               'undermatching_mean_SQR':'undermatching (SQR)',
               # student attributes
               'gender':'sex',
               'poverty':'FRL status',
               'homelang':'home language',
               'type_of_ms':'type of MS',
               'avg_ms_grades':'avg MS grades',
               'mean_transit':'mean transit time'}
               
# Labels after renaming, in the same order as `groups` and `metrics`.
groups_renamed = [
    'ethnicity',
    'FRL status',
    'sex',
    'home language',
    'type of MS',
]
metrics_renamed = [
    'offer rate',
    'impact',
    'performance',
    'graduation rate',
    'college rate',
    'SQR',
]
# Renamed frame used by the summary tables, plots and regressions.
df_renamed = df_students.rename(columns=rename_dict)

In [None]:
# Unweighted average of the three quality measures (SQR, impact, performance) of the student's match.
df_renamed['match SQR_agg'] = (df_renamed['match SQR'] + df_renamed['match impact'] + df_renamed['match performance'])/3
# The same average over the corresponding undermatching gaps.
# Fix: this line previously repeated the three 'match ...' columns, which made
# 'undermatching_SQR_agg' an exact duplicate of 'match SQR_agg'.
df_renamed['undermatching_SQR_agg'] = (df_renamed['undermatching (SQR)'] + df_renamed['undermatching (impact)'] + df_renamed['undermatching (performance)'])/3

# summary stats

In [None]:
# school-level means and SDs (impact and performance)
# keep the first program row per dbn so that each school contributes once
df_schoolquality = df_programs.drop_duplicates(subset='dbn')[['impact', 'performance']]
school_moments = df_schoolquality.agg(['mean', 'std'])  # rows: 'mean', 'std'
mean_impact, sd_impact = school_moments['impact']
mean_performance, sd_performance = school_moments['performance']

df_schoolquality.describe()

In [None]:
# Columns of df_renamed reported in the summary-statistics tables.
table_stats = [
    'list length',
    # offer rate
    'portfolio avg offer rate', 'portfolio min offer rate', 'match offer rate',
    # quality metrics
    'portfolio avg impact', 'portfolio max impact', 'match impact',
    'portfolio avg performance', 'portfolio max performance', 'match performance',
    'portfolio avg graduation rate', 'portfolio max graduation rate', 'match graduation rate',
    'portfolio avg college rate', 'portfolio max college rate', 'match college rate',
    # portfolio behaviours
    'effective list length',
    'no reach',
    'no match (match-independent)',
    'no safety (match-independent)',
    'inversions (max diff, match-independent)',
    # 'inversions (count, match-independent)',
    'no match (match-dependent)',
    'no safety (match-dependent)',
    'inversions (binary, match-dependent)',
    # other student covariates and the headline outcome
    'mean transit time',
    'avg MS grades',
    'undermatching (offer rate)',
]

In [None]:
# One LaTeX table of describe() statistics per grouping variable, each followed by a
# newline, concatenated and written to a single text file.
# (Replaces repeated string concatenation and a stray line-continuation backslash.)
table_string = ''.join(
    df_renamed.groupby(group)[table_stats].describe().transpose().to_latex() + '\n'
    for group in groups_renamed
)

with open('summarystats_20250128.txt', 'w') as f:
    f.write(table_string)

In [None]:
# further prettification for plots
# Series.map turns any value missing from these dictionaries into NaN.
ethnicity_labels = {
    'multiple_other': 'multiple/other',
    'white': 'white',
    'black': 'Black',
    'asian': 'Asian',
    'hispanic': 'Hispanic',
}
frl_labels = {1: 'FRL eligible', 0: 'not FRL eligible'}
sex_labels = {'F': 'female', 'M': 'male'}

df_plots = df_renamed.copy()
df_plots['ethnicity'] = df_plots['ethnicity'].map(ethnicity_labels)
df_plots['FRL status'] = df_plots['FRL status'].map(frl_labels)
df_plots['sex'] = df_plots['sex'].map(sex_labels)

In [None]:
# Global matplotlib font sizes for text, axis labels and legends.
plt.rcParams.update({
    'font.size': 12,
    'axes.labelsize': 12,
    'legend.title_fontsize': 12,
    'legend.fontsize': 12,
})

In [None]:
sns.set_context(context='paper', font_scale=2)
sns.set_style('white')
# savefig does not create missing folders, so make sure the output directory exists
os.makedirs('plots-20250224', exist_ok=True)
# one step histogram of undermatching per grouping variable, normalised within each group
for group in groups_renamed:
    fig = plt.figure(figsize=(10,10))
    sns.histplot(data=df_plots, x='undermatching (offer rate)', hue=group, stat='proportion', common_norm=False, fill=False,
            element='step', bins=5)
    plt.tight_layout()
    plt.savefig(f'plots-20250224/undermatching_by_{group}_hist.pdf', format='pdf', dpi=300, bbox_inches='tight')

    # close this specific figure so the loop does not accumulate open figures
    plt.close(fig)

In [None]:
sns.set_context(context='paper', font_scale=1.5)
# savefig does not create missing folders, so make sure the output directory exists
os.makedirs('plots-20250224', exist_ok=True)
# one empirical CDF of undermatching per grouping variable
for group in groups_renamed:
    # displot is figure-level and builds its own figure; a plt.figure() call before it
    # would only leave an extra empty figure open on every iteration
    g = sns.displot(data=df_plots, x='undermatching (offer rate)', hue=group, kind='ecdf')
    g.legend.set_frame_on(True)
    g.ax.set_ylabel('cumulative density function')
    g.figure.tight_layout()
    g.figure.savefig(f'plots-20250224/undermatching_by_{group}_ECDF.pdf', format='pdf', dpi=300, bbox_inches='tight')

    plt.close(g.figure)

In [None]:
# order in which to plot homelang: most frequent first, with 'Other' always placed last
home_language_counts = df_plots['home language'].value_counts()
initial_order = home_language_counts.index.tolist()
language_order = [language for language in initial_order if language != 'Other']
language_order.append('Other')     # move 'Other' to end
language_order

In [None]:
plt.figure(figsize=(10,10))
    
sns.barplot(data=df_plots, x='home language', y='match offer rate', order=language_order, color='#1f77b4')
plt.xlabel(None)
plt.ylabel('Offer Rate of Match')
plt.tight_layout()

In [None]:
plt.figure(figsize=(10,10))
    
sns.barplot(data=df_plots, x='home language', y='undermatching (offer rate)', order=language_order, color='#1f77b4')
plt.xlabel(None)
plt.ylabel('Undermatching')
plt.tight_layout()

In [None]:
df_plots_asian = df_plots[df_plots.ethnicity=='Asian']

In [None]:
plt.figure(figsize=(10,10))
    
sns.barplot(data=df_plots_asian, x='home language', y='match offer rate', order=language_order, color='#1f77b4')
plt.xlabel(None)
plt.ylabel('Offer Rate of Match')
plt.tight_layout()

In [None]:
plt.figure(figsize=(10,10))
    
sns.barplot(data=df_plots_asian, x='home language', y='undermatching (offer rate)', order=language_order, color='#1f77b4')
plt.xlabel(None)
plt.ylabel('Undermatching')
plt.tight_layout()

In [None]:
sns.set_context(context='paper', font_scale=1.8)

plt.figure(figsize=(10,10))
    
sns.barplot(data=df_plots, x='home language', y='match offer rate', order=language_order, hue='FRL status', hue_order = ['not FRL eligible', 'FRL eligible'])
plt.xlabel(None)
plt.ylabel('Offer Rate of Match')
plt.xticks(fontsize=12)
plt.tight_layout()

In [None]:
plt.figure(figsize=(10,10))
    
sns.barplot(data=df_plots, x='home language', y='undermatching (offer rate)', order=language_order, hue='FRL status', hue_order = ['not FRL eligible', 'FRL eligible'])
plt.xlabel(None)
plt.ylabel('Undermatching')
plt.xticks(fontsize=12)
plt.tight_layout()

In [None]:
plt.figure(figsize=(10,10))
    
sns.barplot(data=df_plots_asian, x='home language', y='match offer rate', order=language_order, hue='FRL status', hue_order = ['not FRL eligible', 'FRL eligible'])
plt.xlabel(None)
plt.ylabel('Offer Rate of Match')
plt.xticks(fontsize=12)
plt.tight_layout()

In [None]:
plt.figure(figsize=(10,10))
    
sns.barplot(data=df_plots_asian, x='home language', y='undermatching (offer rate)', order=language_order, hue='FRL status', hue_order = ['not FRL eligible', 'FRL eligible'])
plt.xlabel(None)
plt.ylabel('Undermatching')
plt.xticks(fontsize=12)
plt.tight_layout()

# standardizing features for regressions

In [None]:
# Regression predictors. The two specifications share everything except the
# no-match / no-safety / inversions measures (match-independent vs match-dependent).
_shared_predictors = [
    'sex', 'ethnicity', 'FRL status', 'avg MS grades',
    'list length', 'effective list length',
    'portfolio min offer rate', 'portfolio avg offer rate', 'cluster',
    'no reach',
]
_behaviors_INDEP = [
    'no match (match-independent)',
    'no safety (match-independent)',
    'inversions (max diff, match-independent)',
    # 'inversions (count, match-independent)',
]
_behaviors_DEP = [
    'no match (match-dependent)',
    'no safety (match-dependent)',
    'inversions (binary, match-dependent)',
]

predictors_INDEP = _shared_predictors + _behaviors_INDEP + ['mean transit time']

predictors_DEP = _shared_predictors + _behaviors_DEP + ['mean transit time']

In [None]:
# divide predictors into categorical and continuous
cts_predictors_INDEP = ['avg MS grades', 'list length', 'effective list length', 'portfolio min offer rate', 'portfolio avg offer rate', 'inversions (max diff, match-independent)', # 'inversions (count, match-independent)', 
                        'mean transit time']
# list comprehensions (rather than set differences) keep the original predictor order
cat_predictors_INDEP = [pred for pred in predictors_INDEP if pred not in cts_predictors_INDEP]
# NOTE(review): 'inversions (binary, match-dependent)' is a True/False flag but is z-scored
# with the continuous predictors here -- confirm this is intended.
cts_predictors_DEP = ['avg MS grades', 'list length', 'effective list length', 'portfolio min offer rate', 'portfolio avg offer rate', 'inversions (binary, match-dependent)', 'mean transit time']
cat_predictors_DEP = [pred for pred in predictors_DEP if pred not in cts_predictors_DEP]

# make a copy of the df to standardize
# (the explicit .copy() means the assignments below write into independent frames
# and avoids pandas' SettingWithCopyWarning)
targets = [f'match {metric}' for metric in metrics_renamed] + [f'undermatching ({metric})' for metric in metrics_renamed]
df_scaled_INDEP = df_renamed[['student_id_scram'] + predictors_INDEP + targets].copy()
df_scaled_DEP = df_renamed[['student_id_scram'] + predictors_DEP + targets].copy()

# initialize and fit scaler: z-score the continuous predictors only; targets stay on their original scale
scaler_INDEP, scaler_DEP = StandardScaler(), StandardScaler()
df_scaled_INDEP[cts_predictors_INDEP] = scaler_INDEP.fit_transform(df_scaled_INDEP[cts_predictors_INDEP])
df_scaled_DEP[cts_predictors_DEP] = scaler_DEP.fit_transform(df_scaled_DEP[cts_predictors_DEP])

# confirm number of students is unchanged
print(df_scaled_INDEP.student_id_scram.nunique(), df_scaled_DEP.student_id_scram.nunique())

# regressions

In [None]:
def build_formula_rhs(predictors):
    """Join predictor names into the right-hand side of a regression formula.

    'cluster' becomes C(cluster), names containing a space are wrapped in Q("..."),
    and every other name is used unchanged. Terms are joined with ' + ', so the
    result has no trailing separator or trailing space (the earlier version
    sliced with [:-2] and left a trailing space behind).
    """
    terms = []
    for pred in predictors:
        if pred == 'cluster':
            terms.append('C(cluster)')
        elif ' ' in pred:
            terms.append(f'Q("{pred}")')
        else:
            terms.append(pred)
    return ' + '.join(terms)

dep_str = build_formula_rhs(predictors_DEP)
indep_str = build_formula_rhs(predictors_INDEP)
print(dep_str)
print(indep_str)

## match offer rate

In [None]:
# match-dependent portfolio behaviors
f_match_MD = 'Q("match offer rate") ~ ' + dep_str
mod = smf.ols(formula = f_match_MD, data = df_scaled_DEP)
res = mod.fit()
res.summary()

In [None]:
# match-independent portfolio behaviors
f_match_MI = 'Q("match offer rate") ~ ' + indep_str
mod = smf.ols(formula = f_match_MI, data = df_scaled_INDEP)
res = mod.fit()
res.summary()

In [None]:
# no portfolio info
f_match_NO = 'Q("match offer rate") ~ ' + 'sex + ethnicity + Q("FRL status") + Q("avg MS grades")'
mod = smf.ols(formula = f_match_NO, data = df_scaled_DEP)
res = mod.fit()
res.summary()

## undermatching (offer rate)

In [None]:
# match-dependent portfolio behaviors
f_under_MD = 'Q("undermatching (offer rate)") ~ ' + dep_str
mod = smf.ols(formula = f_under_MD, data = df_scaled_DEP)
res = mod.fit()
res.summary()

In [None]:
# match-independent portfolio behaviors
f_under_MI = 'Q("undermatching (offer rate)") ~ ' + indep_str
mod = smf.ols(formula = f_under_MI, data = df_scaled_INDEP)
res = mod.fit()
res.summary()

In [None]:
# no portfolio info
f_under_NO = 'Q("undermatching (offer rate)") ~ ' + 'sex + ethnicity + Q("FRL status") + Q("avg MS grades")'
mod = smf.ols(formula = f_under_NO, data = df_scaled_DEP)
res = mod.fit()
res.summary()

## all targets/metrics (to tex)

In [None]:
# REGRESSIONS TO EXPORT FOR TABLE: each group
# For every metric, fit 2 outcomes (match, undermatching) x 3 specifications and write
# each summary as LaTeX. File names and write order match the earlier copy-pasted stanzas.

# open(..., 'w') does not create missing folders, so make sure the output directory exists
os.makedirs('regressions-20250129', exist_ok=True)

noport_rhs = 'sex + ethnicity + Q("FRL status") + Q("avg MS grades")'
# (file-name suffix, formula right-hand side, standardized frame to fit on)
specifications = [
    ('matchdep', dep_str, df_scaled_DEP),        # portfolio and match-dependent behaviors
    ('matchindep', indep_str, df_scaled_INDEP),  # portfolio and match-independent behaviors
    ('noport', noport_rhs, df_scaled_INDEP),     # no portfolio
]

for metric in metrics_renamed:
    # (file-name prefix, outcome column)
    outcomes = [
        ('match', f'match {metric}'),
        ('undermatching', f'undermatching ({metric})'),
    ]
    for outcome_label, target in outcomes:
        for spec_label, rhs, data in specifications:
            res = smf.ols(formula=f'Q("{target}") ~ {rhs}', data=data).fit()
            with open(f'regressions-20250129/regression_{outcome_label}_{metric}_{spec_label}.txt', 'w') as f:
                f.write(res.summary().as_latex())

# program metrics

In [None]:
# Headline counts for the program and student tables.
n_unique_programs = df_programs['programcode'].nunique()
n_unique_schools = df_programs['dbn'].nunique()
n_unique_students = df_students['student_id_scram'].nunique()
n_matched_students = df_students.loc[df_students['matched'] == 1, 'student_id_scram'].nunique()
print('number of unique programs (with apps)', n_unique_programs)
print('number of unique schools (with apps)', n_unique_schools)
print('number of unique students', n_unique_students)
print('number of matched students', n_matched_students)

## category counts

In [None]:
df_programs['method'].value_counts()

In [None]:
df_students['match_method'] = df_students['matched_program'].map(dict(zip(df_programs['programcode'], df_programs['method'])))
df_students['match_method'].value_counts()

In [None]:
df_students[['shs_offer', 'lga_offer']].count()

## quality by category

In [None]:
# admission methods grouped into 'screened' versus everything else
screened_list = ['Screened', 'Screened: Language', 'Screened: Language & Academics']
other_list = ['ASD/ACES Program', 'Audition', 'D75 Special Education Inclusive Services', 'Ed. Opt.', 'For Continuing 8th Graders',
              'Open', 'Transfer', 'Zoned Guarantee', 'Zoned Priority']    # excluding test schools
# NOTE(review): 'impact_standardized' and 'performance_standardized' are only assigned to
# df_programs in a later cell of this notebook; confirm the CSV already provides them,
# otherwise selecting quality_metrics fails on Restart & Run All.
quality_metrics = ['impact', 'performance', 'program_grad_rate', 'program_college_rate', 'offer_rate', 'impact_standardized', 'performance_standardized']   

# Boolean indexing keeps only the matching program rows. DataFrame.where kept every row
# and blanked the non-matching ones to NaN, which gives the same column means but a
# misleading row count.
df_screened_progs = df_programs[df_programs['method'].isin(screened_list)]
df_other_progs = df_programs[df_programs['method'].isin(other_list)]

In [None]:
df_students['impact_match'].describe()

In [None]:
df_screened_progs[quality_metrics].mean()

In [None]:
df_other_progs[quality_metrics].mean()

## quality metric plots

In [None]:
def correlation_pairplot(df, 
                         value_names,
                         plot_correlations=True,
                         correlation_fontsize=12,
                         label_fontsize=30,
                         alpha=0.5):
    """Corner pairplot of `df[value_names]`, optionally annotated with Spearman correlations.

    Parameters
    ----------
    df : DataFrame containing the columns listed in `value_names`.
    value_names : columns to plot against each other.
    plot_correlations : if True, write the Spearman rho (NaNs omitted) in a boxed
        label on every lower-triangle panel.
    correlation_fontsize : font size of the rho labels.
    label_fontsize : axis-label size, applied through sns.set_context (a global setting).
    alpha : transparency passed to the pairplot through plot_kws.

    Returns the seaborn grid produced by sns.pairplot.
    """
    sns.set_context('paper', rc={'axes.labelsize':label_fontsize})
    grid = sns.pairplot(df[value_names], corner=True, plot_kws={'alpha':alpha})

    if not plot_correlations:
        return grid

    label_box = dict(facecolor='white', edgecolor='black', boxstyle='round,pad=.5')

    def annotate_rho(x, y, ax=None, **kwargs):
        # called once per lower-triangle panel; falls back to the current axes
        rho, _pvalue = spearmanr(x, y, nan_policy='omit')
        panel = plt.gca() if ax is None else ax
        panel.annotate(fr'$\rho$ = {rho:.2f}', xy=(.6, .1),
                       size=correlation_fontsize,
                       bbox=label_box,
                       xycoords=panel.transAxes)

    grid.map_lower(annotate_rho)
    return grid

In [None]:
df_programs['impact_standardized'] = (df_programs['impact'] - df_programs['impact'].mean())/df_programs['impact'].std()
df_programs['performance_standardized'] = (df_programs['performance'] - df_programs['performance'].mean())/df_programs['performance'].std()
df_programs['mean_SQR_standardized'] = (df_programs['mean_SQR'] - df_programs['mean_SQR'].mean())/df_programs['mean_SQR'].std()
df_programs['agg_sqr'] = df_programs['mean_SQR_standardized']/2 + (df_programs['impact_standardized'] + df_programs['performance_standardized'])/4

In [None]:
df_programs['selectivity'] = 1 - df_programs['offer_rate']

In [None]:
old_names = ['impact', 'performance', 'program_grad_rate', 'program_college_rate', 'selectivity', 'agg_sqr']
pretty_names = ['impact', 'performance', 'graduation rate', 'college enrollment rate', 'selectivity', 'aggregate quality']
df_programs.rename(columns={i:j for i,j in zip(old_names,pretty_names)}, inplace=True)

In [None]:
corr_matrix = df_programs[pretty_names].corr()
corr_matrix

In [None]:
plt.figure(figsize=(10,6))
sns.heatmap(corr_matrix, annot=True, vmin=0, vmax=1)
plt.title('Correlation matrix between program quality metrics')
plt.tight_layout()

In [None]:
# correlation_pairplot draws through sns.pairplot, which creates its own figure;
# a plt.figure() call before it only leaves an extra empty figure in the output.
graph = correlation_pairplot(df_programs, pretty_names)
plt.tight_layout()

## pairplot for programs with cfs only

In [None]:
df_programs_cf = df_programs[df_programs['counterfactual']==True]

In [None]:
plt.figure(figsize=(10,10))
graph = correlation_pairplot(df_programs_cf, pretty_names)
plt.tight_layout()

## likelihood of matching to program with cf

In [None]:
programs_cf = set(df_programs[df_programs['counterfactual']==True]['programcode'].to_list())

In [None]:
df_students['matched_to_program_with_cf'] = df_students['matched_program'].apply(lambda x: x in programs_cf)

In [None]:
for ethnicity in ['asian', 'black', 'hispanic', 'white']:
    print(df_students[(df_students['ethnicity']==ethnicity) & (df_students['matched']==True)]['matched_to_program_with_cf'].mean())