In [52]:
import pandas as pd
from matplotlib import pyplot as plt
from glob import glob
from scipy.stats import mannwhitneyu, ttest_ind
from tqdm import tqdm
import numpy as np 
import math
%matplotlib inline
def process_unis(path='../../data/IPEDS/university_info/hd2021.csv'):
    """Load the IPEDS institution directory and normalise it for later joins.

    Parameters
    ----------
    path : str, optional
        Location of the directory CSV (read with cp1252 encoding). Defaults
        to the 2021 file that the notebook has always used.

    Returns
    -------
    pandas.DataFrame
        The directory table with selected ``INSTNM`` values rewritten to the
        spellings used in the rankings data, and with ``OBEREG`` values of
        -3 replaced by 10.
    """
    unis = pd.read_csv(path, encoding='cp1252')

    ## Preprocessing meta-data files
    ### Updating names to match in rankings
    # Each rule is ({column: required value, ...}, replacement INSTNM).
    # Rules are applied in order against the current state of the table.
    rename_rules = [
        ({'IALIAS': 'Virginia Tech'}, 'Virginia Tech'),
        ({'INSTNM': 'University of Illinois Urbana-Champaign'},
         'University of Illinois at Urbana-Champaign'),
        ({'INSTNM': "St. John's College", 'STABBR': 'MD'}, "St. John's College - MD"),
        ({'INSTNM': "St. John's College", 'STABBR': 'NM'}, "St. John's College - NM"),
        ({'INSTNM': "William & Mary"}, "College of William and Mary"),
        ({'INSTNM': "Brigham Young University", 'CITY': 'Provo'},
         "Brigham Young University-Provo"),
        ({'INSTNM': "The University of the South", 'CITY': 'Sewanee'},
         "Sewanee - The University of the South"),
    ]
    for conditions, new_name in rename_rules:
        mask = pd.Series(True, index=unis.index)
        for column, value in conditions.items():
            mask &= unis[column] == value
        unis.loc[mask, 'INSTNM'] = new_name

    # Recode OBEREG -3 as 10 so every row carries a non-negative region code.
    unis.loc[unis['OBEREG'] == -3, 'OBEREG'] = 10

    return unis

def process_completions(year, data_dir='../../data/IPEDS/completions'):
    """Load the IPEDS completions table for one year.

    Parameters
    ----------
    year : int or str
        Year inserted into the file name ``c<year>_a.csv``.
    data_dir : str, optional
        Directory holding the completions CSVs. Defaults to the location the
        notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        The CSV contents exactly as parsed by ``pandas.read_csv``.
    """
    csv_path = '%s/c%s_a.csv' % (data_dir, year)
    return pd.read_csv(csv_path)
# NOTE: hard-coded because of assignment errors (presumably this refers to the fixed hd2021.csv path in process_unis — confirm).

In [56]:
def startup():
    """Load the per-year input tables for 2008 through 2021.

    Returns
    -------
    tuple
        ``(all_unis, all_completions, cols)`` where ``all_unis`` and
        ``all_completions`` are dicts keyed by year holding the institution
        directory and the completions table respectively, and ``cols`` is the
        list of column names ``['UNITID', 'CTOTALW', 'CAIANT', 'CBKAAT',
        'CHISPT']``.
    """
    # process_unis() takes no year and returns the same table on every call,
    # so read the file once and hand each year its own independent copy
    # instead of re-parsing the CSV on every loop iteration.
    unis_template = process_unis()

    all_unis = {}
    all_completions = {}
    for year in range(2008, 2022):
        all_unis[year] = unis_template.copy()
        all_completions[year] = process_completions(year)
    cols = ['UNITID', 'CTOTALW', 'CAIANT', 'CBKAAT', 'CHISPT']
    return all_unis, all_completions, cols

In [6]:
def region_processing(uni_data, completion_data, region):
    """Restrict the institution and completions tables to one region.

    Parameters
    ----------
    uni_data : pandas.DataFrame
        Institution table with ``OBEREG`` and ``UNITID`` columns.
    completion_data : pandas.DataFrame
        Completions table with a ``UNITID`` column.
    region : scalar
        Value that ``OBEREG`` must equal for an institution to be kept.

    Returns
    -------
    tuple
        ``(region_data, region_unis)``: the completions rows whose ``UNITID``
        belongs to a kept institution, and the kept institution rows.
    """
    region_unis = uni_data[uni_data['OBEREG'] == region]
    region_ids = set(region_unis['UNITID'])
    in_region = completion_data['UNITID'].isin(region_ids)
    return completion_data[in_region], region_unis

In [50]:
# Display label for each region code (matched against the OBEREG column) that
# the regional plots iterate over.
reg_code_to_label = {
    0: 'US Service Schools',
    1: 'New England',
    2: 'Mid East',
    3: 'Great Lakes',
    4: 'Plains',
    5: 'Southeast',
    6: 'Southwest',
    7: 'Rocky Mountains',
    8: 'Far West',
    9: 'Outlying Areas',
}

In [28]:
def make_years(completions, unis, region, start, end):
    """Collect one region's tables for every year from ``start`` to ``end``.

    Parameters
    ----------
    completions, unis : mapping
        Year-keyed completions and institution tables.
    region : scalar
        Region code forwarded to ``region_processing``.
    start, end : int
        First and last year to include (both inclusive).

    Returns
    -------
    dict
        ``{year: [region_completions, region_unis]}``.
    """
    return {
        year: list(region_processing(unis[year], completions[year], region))
        for year in range(start, end + 1)
    }

In [36]:
def groupTotals(years):
    """Stack the per-year CS and non-CS institution totals into two frames.

    Parameters
    ----------
    years : dict
        ``{year: [completions, unis]}`` where both tables have already been
        filtered down to the rows of interest.

    Returns
    -------
    tuple
        ``(allStateCsData, allStateNonCsData)``: the concatenated outputs of
        ``csVsNonCs`` for every year, each with an added ``year`` column.
    """
    cs_frames = []
    non_cs_frames = []
    for year, (completions, unis) in years.items():
        cs_part, non_cs_part = csVsNonCs(completions, unis)
        cs_frames.append(cs_part.assign(year=year))
        non_cs_frames.append(non_cs_part.assign(year=year))
    return pd.concat(cs_frames), pd.concat(non_cs_frames)
    
        
        
def csVsNonCs(data, unis):
    """Sum completions per institution for CS programs and for the comparison set.

    Parameters
    ----------
    data : pandas.DataFrame
        Completions rows with ``UNITID``, ``CIPCODE`` and the count columns
        ``CTOTALW``, ``CAIANT``, ``CBKAAT``, ``CHISPT`` and ``CTOTALT``.
    unis : pandas.DataFrame
        Institution rows with ``UNITID``, ``LONGITUD`` and ``LATITUDE``.

    Returns
    -------
    tuple
        ``(csMerge, nonCsMerge)``. Each has one row per institution that has
        at least one row with ``11 <= CIPCODE < 12``, the summed count columns,
        and the institution's coordinates (left-joined, so they are missing
        when the institution is absent from ``unis``).
    """
    # CTOTALT must be carried through: make_percents divides every count by it.
    cols = ['UNITID', 'CTOTALW', 'CAIANT', 'CBKAAT', 'CHISPT', 'CTOTALT']
    coords = unis[['UNITID', 'LONGITUD', 'LATITUDE']]

    # Select the needed columns before summing so unrelated (possibly
    # non-numeric) columns never reach the aggregation.
    is_cs = (data['CIPCODE'] >= 11) & (data['CIPCODE'] < 12)
    csSum = data.loc[is_cs, cols].groupby('UNITID').sum().reset_index()

    # NOTE(review): the comparison set is every row of the institutions that
    # offer CS, so it still includes the CS rows themselves — confirm this is
    # intended rather than excluding the CS rows.
    has_cs = data['UNITID'].isin(csSum['UNITID'])
    nonCsSum = data.loc[has_cs, cols].groupby('UNITID').sum().reset_index()

    csMerge = pd.merge(csSum, coords, on='UNITID', how='left')
    nonCsMerge = pd.merge(nonCsSum, coords, on='UNITID', how='left')
    return csMerge, nonCsMerge

In [37]:
def make_percents(cs, nonCs):
    """Express every count column as a fraction of that row's ``CTOTALT``.

    Parameters
    ----------
    cs, nonCs : pandas.DataFrame
        Frames holding the identifier columns ``UNITID``, ``year``,
        ``LONGITUD`` and ``LATITUDE`` plus numeric count columns, one of
        which must be ``CTOTALT``.

    Returns
    -------
    tuple
        ``(cs, nonCs)`` as new frames in which the identifier columns come
        first and are unchanged, and every other column (``CTOTALT``
        included) has been divided by the row's ``CTOTALT``. Rows whose
        ``CTOTALT`` is 0 are not special-cased.

    Raises
    ------
    KeyError
        If either frame lacks ``CTOTALT`` or one of the identifier columns.
    """
    id_cols = ['UNITID', 'year', 'LONGITUD', 'LATITUDE']

    def _as_share(frame):
        # Park the identifiers in the index so only the counts are divided,
        # then divide column-wise in one vectorised step instead of a
        # Python-level lambda per row.
        counts = frame.set_index(id_cols)
        return counts.div(counts['CTOTALT'], axis=0).reset_index()

    return _as_share(cs), _as_share(nonCs)

In [45]:
def cs_nonCs_difference(cs, nonCs):
    """Subtract the non-CS values from the CS values, row-matched by identifier.

    Both frames are keyed on ``LONGITUD``, ``LATITUDE``, ``UNITID`` and
    ``year``; the remaining columns are subtracted (``cs`` minus ``nonCs``)
    and the keys are restored as ordinary columns in the returned frame.
    """
    keys = ['LONGITUD', 'LATITUDE', 'UNITID', 'year']
    cs_keyed = cs.set_index(keys)
    non_cs_keyed = nonCs.set_index(keys)
    return cs_keyed.sub(non_cs_keyed).reset_index()

Unnamed: 0,LONGITUD,LATITUDE,UNITID,year,CTOTALW,CAIANT,CBKAAT,CHISPT,CTOTALT
0,-86.568502,34.783368,100654,2020,-1191,0,-1493,-18,-1763
1,-86.799345,33.505697,100663,2020,-7675,-24,-2126,-426,-11798
2,-86.640449,34.724557,100706,2020,-2306,-58,-343,-191,-4481
3,-86.295677,32.364317,100724,2020,-921,-6,-1187,-12,-1381
4,-87.545978,33.211875,100751,2020,-11907,-82,-1954,-1009,-20611
...,...,...,...,...,...,...,...,...,...
1438,-93.771686,32.480613,490498,2021,-12,0,-4,-3,-27
1439,-82.463419,28.488302,491783,2021,-8,0,0,-17,-52
1440,-92.343582,34.775996,493725,2021,-301,-6,-94,-4,-458
1441,-81.567371,30.809739,494269,2021,-6,-2,-4,0,-29


In [47]:
# Convert both sets of counts to fractions of CTOTALT, then take CS minus
# non-CS for each institution and year.
# NOTE(review): groupCs and groupNonCs are assigned by the groupTotals(...) cell
# that appears further down in this file; unless they are also defined earlier,
# this cell fails on a top-to-bottom run and the cells should be reordered.
percent_cs, percent_nonCs = make_percents(groupCs, groupNonCs)
difference = cs_nonCs_difference(percent_cs, percent_nonCs)
difference

Unnamed: 0,LONGITUD,LATITUDE,UNITID,year,CTOTALW,CAIANT,CBKAAT,CHISPT,CTOTALT
0,-86.568502,34.783368,100654,2020,-0.223599,0.000000,-0.120728,-0.010045,0.0
1,-86.799345,33.505697,100663,2020,-0.427109,-0.002015,-0.091596,0.016366,0.0
2,-86.640449,34.724557,100706,2020,-0.266791,-0.001582,-0.035592,-0.024700,0.0
3,-86.295677,32.364317,100724,2020,-0.308466,-0.004213,0.046022,-0.008427,0.0
4,-87.545978,33.211875,100751,2020,-0.432348,-0.003956,-0.044129,-0.023608,0.0
...,...,...,...,...,...,...,...,...,...
1438,-93.771686,32.480613,490498,2021,-0.400000,0.000000,-0.133333,0.200000,0.0
1439,-82.463419,28.488302,491783,2021,-0.142857,0.000000,0.000000,-0.071429,0.0
1440,-92.343582,34.775996,493725,2021,-0.274516,0.039361,-0.138776,-0.008097,0.0
1441,-81.567371,30.809739,494269,2021,-0.200000,-0.066667,-0.133333,0.000000,0.0


In [48]:
def get_avg(data):
    """Average every column within each ``UNITID``.

    Returns a frame indexed by ``UNITID`` whose remaining columns hold the
    per-institution mean of the corresponding input column.
    """
    per_institution = data.groupby('UNITID')
    return per_institution.mean()

In [49]:
# Collapse the per-year CS-minus-non-CS differences to one mean row per UNITID.
avg = get_avg(difference)
avg

Unnamed: 0_level_0,LONGITUD,LATITUDE,year,CTOTALW,CAIANT,CBKAAT,CHISPT,CTOTALT
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
100654,-86.568502,34.783368,2020.5,-0.246475,-0.001218,-0.053430,0.036778,0.0
100663,-86.799345,33.505697,2020.5,-0.391327,-0.002273,-0.090531,-0.006682,0.0
100706,-86.640449,34.724557,2020.5,-0.246830,-0.005934,-0.033921,-0.013460,0.0
100724,-86.295677,32.364317,2020.5,-0.312118,-0.002107,-0.032655,-0.011882,0.0
100751,-87.545978,33.211875,2020.5,-0.386076,-0.003384,-0.035323,-0.031543,0.0
...,...,...,...,...,...,...,...,...
490498,-93.771686,32.480613,2020.5,-0.305263,0.098684,-0.000877,0.198684,0.0
491783,-82.463419,28.488302,2020.5,-0.138095,0.000000,-0.050000,0.055952,0.0
493725,-92.343582,34.775996,2020.5,-0.329543,0.019681,-0.113417,-0.006873,0.0
494269,-81.567371,30.809739,2020.5,-0.266667,-0.200000,-0.233333,0.000000,0.0


In [44]:
# Load the per-year tables here so this cell does not depend on variables left
# over from an earlier kernel session (no other visible cell assigns them).
all_unis, all_completions, cols = startup()
# Region code 5 is 'Southeast' in reg_code_to_label; use the years 2020-2021.
years = make_years(all_completions, all_unis, 5, 2020, 2021)
groupCs, groupNonCs = groupTotals(years)
groupCs

Unnamed: 0,UNITID,CTOTALW,CAIANT,CBKAAT,CHISPT,CTOTALT,LONGITUD,LATITUDE,year
0,100654,13,0,21,0,29,-86.568502,34.783368,2020
1,100663,25,0,10,6,114,-86.799345,33.505697,2020
2,100706,42,2,7,3,177,-86.640449,34.724557,2020
3,100724,15,0,39,0,43,-86.295677,32.364317,2020
4,100751,17,0,6,3,119,-87.545978,33.211875,2020
...,...,...,...,...,...,...,...,...,...
715,490498,0,0,0,1,3,-93.771686,32.480613,2021
716,491783,0,0,0,1,4,-82.463419,28.488302,2021
717,493725,13,2,2,0,36,-92.343582,34.775996,2021
718,494269,0,0,0,0,1,-81.567371,30.809739,2021


In [54]:
def uni_plot(data, demos=None):
    """Draw one longitude/latitude scatter panel per demographic column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``LONGITUD``, ``LATITUDE`` and one column per entry of
        ``demos``.
    demos : sequence of str, optional
        Columns to plot, one panel each. Defaults to ``CTOTALW``, ``CAIANT``,
        ``CBKAAT`` and ``CHISPT`` so the function no longer depends on a
        notebook-level ``cols`` variable being defined.

    The figure is left as the current pyplot figure so the caller can lay it
    out, save it and close it.
    """
    if demos is None:
        demos = ['CTOTALW', 'CAIANT', 'CBKAAT', 'CHISPT']

    # squeeze=False always yields a 2-D grid; row 0 is then a 1-D array of
    # axes regardless of how many demographics are requested.
    fig, grid = plt.subplots(1, len(demos), figsize=(5, 5), squeeze=False)
    axs = grid[0]
    for i, demo in enumerate(demos):
        demo_plot(data, demo, axs, i)

In [57]:
def demo_plot(data, demo, axs, i):
    """Scatter institutions by longitude/latitude on panel ``i``, coloured by ``demo``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``LONGITUD``, ``LATITUDE`` and the ``demo`` column.
    demo : str
        Column whose values set the point colours; also used as panel title.
    axs : array-like of Axes
        The axes created for the figure (a single row as built by
        ``plt.subplots(1, n)``).
    i : int
        Position of the panel to draw on.
    """
    # For a single-row grid the previous `axs[1][i]` lookup tried to index
    # into an Axes object and raised; flatten and take the i-th panel instead.
    ax = np.ravel(axs)[i]
    ax.scatter(data['LONGITUD'], data['LATITUDE'], c=data[demo], alpha=0.5)
    # Label the panel so the demographics can be told apart in the figure.
    ax.set_title(demo)

In [58]:
def regional_difference_mapping(start, end):
    """Save one figure per region of the mean CS-minus-non-CS share differences.

    Parameters
    ----------
    start, end : int
        First and last year (inclusive) to include in the averages.

    For every entry of ``reg_code_to_label`` the per-institution averages are
    plotted with ``uni_plot`` and written to
    ``../out/Regional Line Plots/<region label>.png``.
    """
    all_unis, all_completions, _ = startup()
    for region, label in reg_code_to_label.items():
        years = make_years(all_completions, all_unis, region, start, end)
        cs_group, nonCs_group = groupTotals(years)
        cs_percent, nonCs_percent = make_percents(cs_group, nonCs_group)
        difference = cs_nonCs_difference(cs_percent, nonCs_percent)
        average = get_avg(difference)
        uni_plot(average)
        # Lay out, save and close inside the loop: doing this once after the
        # loop wrote only the last region's file and left every other figure
        # open.
        plt.tight_layout()
        plt.savefig('../out/Regional Line Plots/%s.png' % label)
        plt.close()