In [1]:

import pandas as pd
from matplotlib import pyplot as plt
from glob import glob
from scipy.stats import mannwhitneyu, ttest_ind
from tqdm import tqdm
import numpy as np 
import math
import seaborn as sns
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
# Route Bokeh output to the notebook so that show() renders figures inline.
output_notebook()
# IPython magic: draw matplotlib figures inline in the cell output.
%matplotlib inline
def process_unis():
    """Load the IPEDS 2021 institution directory and patch selected names.

    Reads ``hd2021.csv`` (cp1252-encoded) and overwrites ``INSTNM`` for a
    handful of institutions so their names match the spelling used in the
    rankings data.

    Returns:
        pandas.DataFrame: the directory table with the patched ``INSTNM``
        column; all other columns are returned as read.
    """
    directory = pd.read_csv('../../data/IPEDS/university_info/hd2021.csv', encoding='cp1252')

    # Each rule pairs {column: required value} conditions (all must hold) with
    # the replacement institution name. Rules run top to bottom, each one
    # evaluated against the INSTNM values as they stand at that point.
    rename_rules = [
        ({'IALIAS': 'Virginia Tech'},
         'Virginia Tech'),
        ({'INSTNM': 'University of Illinois Urbana-Champaign'},
         'University of Illinois at Urbana-Champaign'),
        ({'INSTNM': "St. John's College", 'STABBR': 'MD'},
         "St. John's College - MD"),
        ({'INSTNM': "St. John's College", 'STABBR': 'NM'},
         "St. John's College - NM"),
        ({'INSTNM': "William & Mary"},
         "College of William and Mary"),
        ({'INSTNM': "Brigham Young University", 'CITY': 'Provo'},
         "Brigham Young University-Provo"),
        ({'INSTNM': "The University of the South", 'CITY': 'Sewanee'},
         "Sewanee - The University of the South"),
    ]

    for conditions, new_name in rename_rules:
        mask = None
        for column, expected in conditions.items():
            matches = directory[column] == expected
            mask = matches if mask is None else mask & matches
        directory.loc[mask, 'INSTNM'] = new_name

    return directory

def process_completions(year):
    """Load the IPEDS completions table for a single year.

    Args:
        year: value substituted into the file name ``c<year>_a.csv``.

    Returns:
        pandas.DataFrame: the completions file exactly as parsed by
        ``pd.read_csv`` (no post-processing is applied).
    """
    csv_path = '../../data/IPEDS/completions/c%s_a.csv' % year
    return pd.read_csv(csv_path)
# hard coded it because of assignment errors

In [2]:
def startup(first_year=2008, last_year=2021):
    """Load the per-year input tables and the shared plotting configuration.

    Args:
        first_year: first completions year to load (inclusive).
        last_year: last completions year to load (inclusive).

    Returns:
        tuple: ``(all_unis, all_completions, cols, colors)`` where
        ``all_unis`` and ``all_completions`` are dicts keyed by year,
        ``cols`` is the list of completion columns used downstream
        (``UNITID`` first) and ``colors`` is the line-colour palette.
    """
    # process_unis() takes no year and always reads the same file, so parse it
    # once and hand every year its own copy instead of re-reading the CSV on
    # each loop iteration.
    unis_template = process_unis()

    all_unis = {}
    all_completions = {}
    for year in range(first_year, last_year + 1):
        all_unis[year] = unis_template.copy()
        all_completions[year] = process_completions(year)

    cols = ['UNITID', 'CTOTALW', 'CBKAAT', 'CHISPT']
    colors = ['blue', 'green', 'red', 'orange', 'purple', 'navy']
    return all_unis, all_completions, cols, colors

In [3]:
def region_processing(uni_data, completion_data, region):
    """Restrict institutions and completions to a single ``OBEREG`` region.

    Args:
        uni_data: institution table with ``UNITID`` and ``OBEREG`` columns.
        completion_data: completions table with a ``UNITID`` column.
        region: value compared for equality against ``OBEREG``.

    Returns:
        tuple: ``(region_data, region_unis)`` -- the completion rows whose
        ``UNITID`` belongs to an institution in the region, and the
        institution rows of that region.
    """
    region_unis = uni_data[uni_data['OBEREG'] == region]

    # Vectorised membership test. Unlike a per-element apply(), isin() always
    # yields a boolean mask, so an empty completions table comes back with
    # zero rows and all of its columns intact.
    in_region = completion_data['UNITID'].isin(region_unis['UNITID'])
    region_data = completion_data[in_region]
    return region_data, region_unis

In [4]:
def make_years(completions, unis, start, end):
    """Pair up the institution and completions tables for a span of years.

    Args:
        completions: mapping of year -> completions table.
        unis: mapping of year -> institution table.
        start: first year to include.
        end: last year to include (inclusive).

    Returns:
        dict: ``{year: [unis[year], completions[year]]}`` for every year in
        ``start..end``. Note the order inside each pair: institutions first,
        completions second.
    """
    return {
        year: [unis[year], completions[year]]
        for year in range(start, end + 1)
    }

In [5]:
# `years` maps each year to its [unis, data] pair; `data` is expected to be already filtered (boolean-masked) down to the rows of interest.
def groupTotals(years):
    """Build per-institution CS and overall totals for every year and stack them.

    Args:
        years: mapping of year -> ``[unis, data]`` pair, as produced by
            ``make_years``.

    Returns:
        tuple: ``(allStateCsData, allStateNonCsData)`` -- the yearly outputs of
        ``csVsNonCs`` concatenated into two frames, each tagged with a
        ``year`` column.
    """
    cs_frames = []
    non_cs_frames = []
    for year, (unis, data) in years.items():
        cs_part, non_cs_part = csVsNonCs(data, unis)
        # Tag each yearly result so rows stay attributable after stacking.
        cs_frames.append(cs_part.assign(year=year))
        non_cs_frames.append(non_cs_part.assign(year=year))
    return pd.concat(cs_frames), pd.concat(non_cs_frames)
    
        
        
def csVsNonCs(data, unis):
    csBoolmap = data['CIPCODE'].apply(lambda x : x < 12 and x >= 11)
    cs = data[csBoolmap]
    csSum = cs.groupby('UNITID').sum()
    csSum = csSum.reset_index()
    uniqueIds = set(csSum['UNITID'].unique())
    nonCs = data[data.apply(lambda x: x['UNITID'] in uniqueIds, axis = 1)]
    nonCsSum = nonCs.groupby('UNITID').sum().reset_index()
    cols = ['UNITID', 'CTOTALW', 'CBKAAT', 'CHISPT', 'CTOTALT']
    csMerge = pd.merge(csSum[cols], unis[['UNITID', 'LONGITUD', 'LATITUDE']], on = 'UNITID', how='left')
    nonCsMerge = pd.merge(nonCsSum[cols], unis[['UNITID', 'LONGITUD', 'LATITUDE']], on = 'UNITID', how='left')
    return csMerge, nonCsMerge

In [6]:
def make_percents(cs, nonCs):
    """Express every count column as a fraction of ``CTOTALT``.

    Args:
        cs: frame with ``UNITID``, ``year``, ``LONGITUD``, ``LATITUDE``,
            ``CTOTALT`` and further count columns.
        nonCs: frame with the same layout as ``cs``.

    Returns:
        tuple: ``(cs, nonCs)`` with each non-identifier column divided by the
        row's ``CTOTALT`` (so ``CTOTALT`` itself becomes 1.0). Rows whose
        ``CTOTALT`` is 0 produce ``inf``/``NaN`` values.
    """
    id_cols = ['UNITID', 'year', 'LONGITUD', 'LATITUDE']

    def _as_share(frame):
        # Park the identifiers in the index so that only counts are divided.
        counts = frame.set_index(id_cols)
        # One vectorised division replaces a Python-level call per row; the
        # raw array is used so that no index alignment takes place.
        shares = counts.div(counts['CTOTALT'].to_numpy(), axis=0)
        return shares.reset_index()

    return _as_share(cs), _as_share(nonCs)

In [7]:
def make_groups(cs, nonCs):
    """Collapse both frames to one row per ``year`` by summing every column.

    Args:
        cs: frame with a ``year`` column.
        nonCs: frame with a ``year`` column.

    Returns:
        tuple: ``(cs_by_year, nonCs_by_year)`` with ``year`` restored as an
        ordinary column.
    """
    cs_by_year, non_cs_by_year = (
        frame.groupby('year').sum().reset_index() for frame in (cs, nonCs)
    )
    return cs_by_year, non_cs_by_year


In [8]:
def cs_nonCs_difference(cs, nonCs):
    """Subtract the comparison values from the CS values, row by row.

    Rows are matched on ``(LONGITUD, LATITUDE, UNITID, year)``.

    Args:
        cs: frame with the four key columns plus value columns.
        nonCs: frame with the same layout as ``cs``.

    Returns:
        pandas.DataFrame: the key columns followed by one ``d_<column>``
        column per value column, holding ``cs - nonCs``.
    """
    keys = ['LONGITUD', 'LATITUDE', 'UNITID', 'year']
    gap = cs.set_index(keys).sub(nonCs.set_index(keys))
    # Prefix the value columns so they are recognisable as differences.
    return gap.add_prefix('d_').reset_index()

In [9]:
def get_avg(data):
    """Average every column per institution.

    Args:
        data: frame with a ``UNITID`` column.

    Returns:
        pandas.DataFrame: one row per ``UNITID`` holding the mean of each
        remaining column, with ``UNITID`` as an ordinary column.
    """
    per_institution = data.groupby('UNITID').mean()
    per_institution = per_institution.reset_index()
    return per_institution

In [10]:
def make_correlation_df(data, unis, demo):
    """Correlate one difference column with the numeric institution attributes.

    Args:
        data: frame with ``UNITID``, ``d_CTOTALW``, ``d_CBKAAT`` and
            ``d_CHISPT`` columns.
        unis: institution table with a ``UNITID`` column plus attributes.
        demo: name of the difference column to correlate against
            (one of ``d_CTOTALW``, ``d_CBKAAT``, ``d_CHISPT``).

    Returns:
        pandas.Series: correlation of ``demo`` with every numeric or boolean
        column of the merged table other than the difference columns;
        undefined (NaN) correlations are dropped.
    """
    diff_cols = ['d_CTOTALW', 'd_CBKAAT', 'd_CHISPT']
    merged = pd.merge(data[['UNITID'] + diff_cols], unis, on='UNITID')

    # Correlation is only defined for numeric data. Text columns carried over
    # from `unis` cannot be converted to floats and make corrwith() raise on
    # recent pandas releases, so keep numeric/boolean columns only.
    candidates = merged.drop(columns=diff_cols).select_dtypes(include=['number', 'bool'])
    return candidates.corrwith(merged[demo]).dropna()
    


In [31]:
def csGraph(cs, cols, colors):
    """Build a Bokeh line chart of the yearly CS totals per demographic column.

    Args:
        cs: frame with a ``year`` column and the columns named in ``cols``.
        cols: column names; the first entry is skipped and the remaining
            ones are plotted in order.
        colors: line colours, consumed in the same order as ``cols[1:]``.

    Returns:
        The configured Bokeh figure (not yet shown).
    """
    # Legend labels, positionally matched to cols[1:].
    demos = ['Women', 'Black', 'Hispanic']
    p = figure(width=800, height=400, title='Absolute Counts of Minorities in CS')
    p.xaxis.axis_label = 'Year'
    # The title and the caller (run() passes yearly sums) both say these are
    # counts, so the axis must not be labelled as a percentage.
    p.yaxis.axis_label = 'Number of completions'
    for i, demo in enumerate(cols[1:]):
        p.line(cs['year'], cs[demo], legend_label=demos[i], line_color=colors[i])
    p.legend.location = 'center'
    # Move the legend outside the plot area, to the right of the axes.
    p.add_layout(p.legend[0], 'right')
    return p
    

In [32]:
def nonCsGraph(nonCs, cols, colors):
    """Build a Bokeh line chart of the yearly comparison totals per demographic column.

    Args:
        nonCs: frame with a ``year`` column and the columns named in ``cols``.
        cols: column names; the first entry is skipped and the remaining
            ones are plotted in order.
        colors: line colours, consumed in the same order as ``cols[1:]``.

    Returns:
        The configured Bokeh figure (not yet shown).
    """
    # Legend labels, positionally matched to cols[1:].
    demos = ['Women', 'Black', 'Hispanic']
    p = figure(width=800, height=400, title='Absolute Counts of Minorities in Non-CS')
    # Label both axes so the figure can be read on its own.
    p.xaxis.axis_label = 'Year'
    p.yaxis.axis_label = 'Number of completions'
    for i, demo in enumerate(cols[1:]):
        p.line(nonCs['year'], nonCs[demo], legend_label=demos[i], line_color=colors[i])
    p.legend.location = 'center'
    # Move the legend outside the plot area, to the right of the axes.
    p.add_layout(p.legend[0], 'right')
    return p

In [33]:
def run():
    """Run the whole pipeline and display the CS and non-CS charts.

    Loads the inputs via ``startup``, pairs them up for 2011-2021, totals
    them per institution and per year, then builds and shows both figures.
    """
    unis_by_year, completions_by_year, cols, colors = startup()
    yearly_inputs = make_years(completions_by_year, unis_by_year, 2011, 2021)

    # Per-institution totals first, then collapsed to one row per year.
    cs_by_year, non_cs_by_year = make_groups(*groupTotals(yearly_inputs))

    # Build both figures before displaying either of them.
    figures = (
        csGraph(cs_by_year, cols, colors),
        nonCsGraph(non_cs_by_year, cols, colors),
    )
    for fig in figures:
        show(fig)
    
    

In [34]:
# Execute the full pipeline; this reads the IPEDS CSV files and shows both charts.
run()