In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from glob import glob
from scipy.stats import mannwhitneyu, ttest_ind
from tqdm import tqdm
import numpy as np 
import math
%matplotlib inline

In [3]:
# Reading meta-data files
unis = pd.read_csv('../data/IPEDS/hd2020.csv', encoding='cp1252')


## Preprocessing meta-data files
### Updating names to match in rankings
# Each statement overwrites INSTNM on the rows selected by its mask. The first
# one matches on the alias column (IALIAS); the others match on the current
# INSTNM, optionally narrowed by state (STABBR) or city (CITY).
unis.loc[unis['IALIAS'] == 'Virginia Tech', 'INSTNM'] = 'Virginia Tech'
unis.loc[unis['INSTNM'] == 'University of Illinois Urbana-Champaign', 'INSTNM'] = 'University of Illinois at Urbana-Champaign'
unis.loc[(unis['INSTNM'] == "St. John's College") & (unis['STABBR'] == 'MD'), 'INSTNM'] = "St. John's College - MD"
unis.loc[(unis['INSTNM'] == "St. John's College") & (unis['STABBR'] == 'NM'), 'INSTNM'] = "St. John's College - NM"
unis.loc[unis['INSTNM'] == "William & Mary", 'INSTNM'] = "College of William and Mary"
unis.loc[(unis['INSTNM'] == "Brigham Young University") & (unis['CITY'] == 'Provo'), 'INSTNM'] = "Brigham Young University-Provo"
unis.loc[(unis['INSTNM'] == "The University of the South") & (unis['CITY'] == 'Sewanee'), 'INSTNM'] = "Sewanee - The University of the South"

### Labeling
# C18BASIC codes kept by this analysis, grouped by the label they receive in 'Y'.
RESEARCH_CODES = [15, 16, 17]
LIBERAL_ARTS_CODES = [21, 22]

# Keep only institutions whose C18BASIC code is in one of the two groups.
# .copy() detaches the filtered frame from the full table so that adding the
# 'Y' column below is a plain assignment (no SettingWithCopyWarning).
unis = unis[unis['C18BASIC'].isin(RESEARCH_CODES + LIBERAL_ARTS_CODES)].copy()

# After the filter every remaining row is either liberal-arts or research.
unis['Y'] = unis['C18BASIC'].isin(LIBERAL_ARTS_CODES).map({True: 'liberal_arts', False: 'research'})

# TODO: make sure all institutes are captured in the ranking (check not implemented yet)


In [13]:
# Display the institution names that remain in `unis`.
# Only the last expression of a cell is echoed, so a preceding bare
# `unis.head()` produced no output and has been dropped.
unis['INSTNM']

1                     University of Alabama at Birmingham
3                     University of Alabama in Huntsville
6                               The University of Alabama
8                                 Athens State University
10                                      Auburn University
                              ...                        
5960    Northwest University-College of Adult and Prof...
5987             Huntington University of Health Sciences
6055                             Purdue University Global
6141               University of Wisconsin-Milwaukee Flex
6393                    The Pennsylvania State University
Name: INSTNM, Length: 940, dtype: object

In [76]:
# Load the IPEDS `c2020_a` table, using its first column as the row index,
# then preview the first five rows.
data = pd.read_csv(filepath_or_buffer='../data/IPEDS/c2020_a.csv', index_col=0)
data.head(n=5)

Unnamed: 0_level_0,CIPCODE,MAJORNUM,AWLEVEL,XCTOTALT,CTOTALT,XCTOTALM,CTOTALM,XCTOTALW,CTOTALW,XCAIANT,...,XCUNKNM,CUNKNM,XCUNKNW,CUNKNW,XCNRALT,CNRALT,XCNRALM,CNRALM,XCNRALW,CNRALW
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100654,1.0999,1,5,R,3,Z,0,R,3,Z,...,Z,0,R,1,Z,0,Z,0,Z,0
100654,1.1001,1,5,R,7,R,2,R,5,Z,...,Z,0,Z,0,Z,0,Z,0,Z,0
100654,1.1001,1,7,R,4,R,0,R,4,Z,...,Z,0,Z,0,R,2,R,0,R,2
100654,1.1001,1,17,R,2,R,2,R,0,Z,...,Z,0,Z,0,R,1,R,1,R,0
100654,1.9999,1,5,R,6,R,4,R,2,Z,...,R,1,R,0,Z,0,Z,0,Z,0


In [83]:
# Source columns for the per-institution summary.
# In the preview of `data`, the X-prefixed columns (XCTOTALW, XCAIANT, ...)
# hold letter codes such as 'R'/'Z', while the un-prefixed column next to each
# one holds the number. The counts are therefore expected in:
#   CTOTALW = total women
#   CAIANT  = total native    (assumed to mirror XCAIANT -- confirm in the IPEDS data dictionary)
#   CBKAAT  = total black     (same assumption -- column not visible in the preview)
#   CHISPT  = hispanic total  (same assumption -- column not visible in the preview)

# making the base dataframe: one zero-filled row per distinct UNITID in `data`,
# in ascending UNITID order
df = pd.DataFrame(
    0,
    index=sorted(data.index.unique()),
    columns=['Women', 'Black', 'Native', 'Hispanic'],
)
df.head()

Unnamed: 0,Women,Black,Native,Hispanic
100654,0,0,0,0
100663,0,0,0,0
100690,0,0,0,0
100706,0,0,0,0
100724,0,0,0,0


In [93]:
# Get the rows whose CIPCODE lies strictly between 11 and 12 (e.g. 11.0101).
# Both bounds are exclusive: a row coded exactly 11 or exactly 12 is dropped.
# NOTE(review): if a bare 11.0 code should count as part of the family, use >= 11.
moreThan = data.loc[data['CIPCODE'] > 11]          # intermediate: everything above 11
middle = moreThan.loc[moreThan['CIPCODE'] < 12]    # ...narrowed to below 12

# Preview the filtered rows so the cell's displayed table comes from this code.
middle.head()


Unnamed: 0_level_0,CIPCODE,MAJORNUM,AWLEVEL,XCTOTALT,CTOTALT,XCTOTALM,CTOTALM,XCTOTALW,CTOTALW,XCAIANT,...,XCUNKNM,CUNKNM,XCUNKNW,CUNKNW,XCNRALT,CNRALT,XCNRALM,CNRALM,XCNRALW,CNRALW
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100654,11.0101,1,5,R,20,R,10,R,10,Z,...,R,1,R,1,R,1,R,1,R,0
100654,11.0101,1,7,R,9,R,6,R,3,Z,...,R,1,R,1,R,2,R,1,R,1
100663,11.0101,1,5,R,71,R,58,R,13,R,...,R,1,R,0,R,1,R,1,R,0
100663,11.0101,1,7,R,28,R,18,R,10,R,...,R,0,R,0,R,24,R,15,R,9
100663,11.0101,1,17,R,3,R,3,R,0,R,...,R,0,R,0,R,2,R,2,R,0


In [None]:
#now we get the totals and put them in the dataframe
