In [1]:
import pandas as pd
import numpy as np

years = ['2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
files = ['gr{}_rv.csv', 'ef{}a_rv.csv', 'hd{}.csv']
#files = ['adm{}_rv.csv', 'c{}_a_rv.csv', 'ef{}a_rv.csv', 'gr{}_rv.csv', 'hd{}.csv', 's{}_oc_rv.csv']

In [2]:
# Get all the dataframes into a dictionary
dataframes = {'university_data': [], 'enrollment_data': [], 'graduation_data': []}

for f in files:
    for y in years:
        data = pd.read_csv("C:\\project\\{}\\{}".format(y, f.format(y)), encoding = 'ISO-8859-1')
        data['year'] = int(y)
        
        if f[0] == 'h':
            dataframes['university_data'].append(data)
        elif f[0] == 'e':
            dataframes['enrollment_data'].append(data)
        elif f[0] == 'g':
            dataframes['graduation_data'].append(data)

In [3]:
# Merge the data frames into one dataframe per category
unv_df = dataframes['university_data'][0]
enr_df = dataframes['enrollment_data'][0]
grad_df = dataframes['graduation_data'][0]

for key in dataframes.keys():
    df_list = dataframes[key]
    count = 0

    for l_item in df_list:

        if count == 0:
            count += 1
            continue
        else:
            if key == 'university_data':
                unv_df = pd.concat([unv_df, l_item])
            elif key == 'enrollment_data':
                enr_df = pd.concat([enr_df, l_item])
            elif key == 'graduation_data':
                grad_df = pd.concat([grad_df, l_item])
        
        count += 1 

In [4]:
#school_data = pd.read_csv("C:\\project\\2014\\adm2014.csv", encoding = 'ISO-8859-1')
# Grab Washington Schools only
wa_unv_df = unv_df[(unv_df['STABBR'] == 'WA') & (unv_df['ICLEVEL'] == 1) & (unv_df['HLOFFER'].isin([5,6,7,8,9]))][['UNITID', 'INSTNM', 'STABBR', 'ICLEVEL', 'year']]
wa_unv_df

Unnamed: 0,UNITID,INSTNM,STABBR,ICLEVEL,year
4331,234492,The Art Institute of Seattle,WA,1,2010
4333,234669,Bellevue College,WA,1,2010
4339,234827,Central Washington University,WA,1,2010
4341,234915,City University of Seattle,WA,1,2010
4345,234979,Columbia Basin College,WA,1,2010
...,...,...,...,...,...
4707,443410,DigiPen Institute of Technology,WA,1,2020
5122,455406,Pacific Northwest University of Health Sciences,WA,1,2020
5496,475200,Whitworth University-Adult Degree Programs,WA,1,2020
5960,487603,Northwest University-College of Adult and Prof...,WA,1,2020


In [5]:
wa_enr_df = enr_df[enr_df['UNITID'].isin(wa_unv_df['UNITID'])].copy()
wa_cohort_df = enr_df[enr_df['UNITID'].isin(wa_unv_df['UNITID'])].copy()

In [6]:
# rename variables to something that we can understand better
wa_enr_df = wa_enr_df.rename(columns={'UNITID': 'institute_id',
                       'EFALEVEL': 'student_lvl',
                       'LINE': 'student_lvl_og',
                       'SECTION': 'attend_status',
                       'LSTUDY': 'student_lvl_2',
                       'EFTOTLT': 'e_total',
                       'EFTOTLM': 'e_total_m',
                       'EFTOTLW': 'e_total_w',
                       'EFAIANT': 'e_AmerInd_AlaskNtv',
                       'EFAIANM': 'e_AmerInd_AlaskNtv_m',
                       'EFAIANW': 'e_AmerInd_AlaskNtv_w',
                       'EFASIAT': 'e_Asian',
                       'EFASIAM': 'e_Asian_m',
                       'EFASIAW': 'e_Asian_w',
                       'EFBKAAT': 'e_AfrAmer',
                       'EFBKAAM': 'e_AfrAmer_m',
                       'EFBKAAW': 'e_AfrAmer_w',
                       'EFHISPT': 'e_Hispanic',
                       'EFHISPM': 'e_Hispanic_m',
                       'EFHISPW': 'e_Hispanic_w',
                       'EFNHPIT': 'e_Hi_PacIsland',
                       'EFNHPIM': 'e_Hi_PacIsland_m',
                       'EFNHPIW': 'e_Hi_PacIsland_w',
                       'EFWHITT': 'e_White',
                       'EFWHITM': 'e_White_m',
                       'EFWHITW': 'e_White_w',
                       'EF2MORT': 'e_2orMore',
                       'EF2MORM': 'e_2orMore_m',
                       'EF2MORW': 'e_2orMore_w',
                       'EFUNKNT': 'e_unknown',
                       'EFUNKNM': 'e_unknown_m',
                       'EFUNKNW': 'e_unknown_w',
                       'EFNRALT': 'e_nonresAlien',
                       'EFNRALM': 'e_nonresAlien_m',
                       'EFNRALW ': 'e_nonresAlien_w'
                       })


In [7]:
wa_enr_df = wa_enr_df[wa_enr_df['institute_id'].isin(wa_unv_df['UNITID'])]
df_enrollment = wa_enr_df[wa_enr_df['student_lvl'] == 1][['institute_id', 'e_total', 'e_AmerInd_AlaskNtv', 'e_Asian', 'e_AfrAmer', 'e_Hispanic', 'e_Hi_PacIsland', 'e_White', 'e_2orMore', 'e_unknown', 'e_nonresAlien', 'year']].groupby(['institute_id', 'year']).sum()

In [8]:
# rename variables to something that we can understand better
wa_cohort_df = wa_cohort_df.rename(columns={'UNITID': 'institute_id',
                       'EFALEVEL': 'student_lvl',
                       'LINE': 'student_lvl_og',
                       'SECTION': 'attend_status',
                       'LSTUDY': 'student_lvl_2',
                       'EFTOTLT': 'co_total',
                       'EFTOTLM': 'co_total_m',
                       'EFTOTLW': 'co_total_w',
                       'EFAIANT': 'co_AmerInd_AlaskNtv',
                       'EFAIANM': 'co_AmerInd_AlaskNtv_m',
                       'EFAIANW': 'co_AmerInd_AlaskNtv_w',
                       'EFASIAT': 'co_Asian',
                       'EFASIAM': 'co_Asian_m',
                       'EFASIAW': 'co_Asian_w',
                       'EFBKAAT': 'co_AfrAmer',
                       'EFBKAAM': 'co_AfrAmer_m',
                       'EFBKAAW': 'co_AfrAmer_w',
                       'EFHISPT': 'co_Hispanic',
                       'EFHISPM': 'co_Hispanic_m',
                       'EFHISPW': 'co_Hispanic_w',
                       'EFNHPIT': 'co_Hi_PacIsland',
                       'EFNHPIM': 'co_Hi_PacIsland_m',
                       'EFNHPIW': 'co_Hi_PacIsland_w',
                       'EFWHITT': 'co_White',
                       'EFWHITM': 'co_White_m',
                       'EFWHITW': 'co_White_w',
                       'EF2MORT': 'co_2orMore',
                       'EF2MORM': 'co_2orMore_m',
                       'EF2MORW': 'co_2orMore_w',
                       'EFUNKNT': 'co_unknown',
                       'EFUNKNM': 'co_unknown_m',
                       'EFUNKNW': 'co_unknown_w',
                       'EFNRALT': 'co_nonresAlien',
                       'EFNRALM': 'co_nonresAlien_m',
                       'EFNRALW ': 'co_nonresAlien_w'
                       }).copy()

In [9]:
wa_cohort_df = wa_cohort_df[wa_cohort_df['institute_id'].isin(wa_unv_df['UNITID'])]
df_cohort = wa_cohort_df[wa_cohort_df['student_lvl'] == 24][['institute_id', 'co_total', 'co_AmerInd_AlaskNtv', 'co_Asian', 'co_AfrAmer', 'co_Hispanic', 'co_Hi_PacIsland', 'co_White', 'co_2orMore', 'co_unknown', 'co_nonresAlien', 'year']].groupby(['institute_id', 'year']).sum()
df_cohort

Unnamed: 0_level_0,Unnamed: 1_level_0,co_total,co_AmerInd_AlaskNtv,co_Asian,co_AfrAmer,co_Hispanic,co_Hi_PacIsland,co_White,co_2orMore,co_unknown,co_nonresAlien
institute_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
102845,2010,1658,109,40,102,440,39,639,19,175,95
102845,2011,871,45,15,24,267,17,323,16,124,40
102845,2012,918,54,63,42,282,10,387,45,35,0
102845,2013,1053,49,16,57,316,73,451,68,23,0
102845,2014,437,34,3,25,156,25,158,16,20,0
...,...,...,...,...,...,...,...,...,...,...,...
488448,2016,6,0,0,0,1,0,5,0,0,0
488448,2017,9,0,0,1,1,0,7,0,0,0
488448,2018,7,0,0,0,1,0,6,0,0,0
488448,2019,8,0,0,1,2,0,5,0,0,0


In [10]:
wa_grad_df = grad_df[grad_df['UNITID'].isin(wa_unv_df['UNITID'])]
wa_grad_df

Unnamed: 0,UNITID,GRTYPE,CHRTSTAT,SECTION,COHORT,XGRNRALM,GRNRALM,XGRNRALW,GRNRALW,XGRRAC03,...,DVGRHSW,XDVGRWHT,DVGRWHT,XDVGRWHM,DVGRWHM,XDVGRWHW,DVGRWHW,year,LINE,GRNRALW.1
636,102845,2,12,1,1,Z,0.0,Z,0.0,R,...,3.0,R,13.0,R,8.0,R,5,2010,,
637,102845,3,13,1,1,Z,0.0,Z,0.0,R,...,3.0,R,13.0,R,8.0,R,5,2010,,
638,102845,6,10,2,2,R,0.0,R,0.0,R,...,0.0,R,3.0,R,3.0,R,0,2010,,
639,102845,8,12,2,2,R,0.0,R,0.0,R,...,0.0,R,3.0,R,3.0,R,0,2010,,
640,102845,9,13,2,2,R,0.0,R,0.0,R,...,0.0,R,3.0,R,3.0,R,0,2010,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49371,488448,20,12,3,3,R,0.0,R,,,...,,,,,,,,2020,50,0.0
49372,488448,21,13,3,3,R,0.0,R,,,...,,,,,,,,2020,29A,0.0
49373,488448,22,14,3,3,R,0.0,R,,,...,,,,,,,,2020,11A,0.0
49374,488448,42,32,1,1,R,0.0,R,,,...,,,,,,,,2020,999,0.0


In [11]:
# rename variables to something that we can understand better
wa_grad_df = wa_grad_df.rename(columns={'UNITID': 'institute_id',
                       'COHORT':  'cohort',
                       'GRTOTLT': 'gr_total',
                       'GRAIANT': 'gr_AmerInd_AlaskNtv',
                       'GRASIAT': 'gr_Asian',
                       'GRBKAAT': 'gr_AfrAmer',
                       'GRHISPT': 'gr_Hispanic',
                       'GRNHPIT': 'gr_Hi_PacIsland',
                       'GRWHITT': 'gr_White',
                       'GR2MORT': 'gr_2orMore',
                       'GRUNKNT': 'gr_unknown',
                       'GRNRALT': 'gr_nonresAlien'
                       })
wa_grad_df

Unnamed: 0,institute_id,GRTYPE,CHRTSTAT,SECTION,cohort,XGRNRALM,GRNRALM,XGRNRALW,GRNRALW,XGRRAC03,...,DVGRHSW,XDVGRWHT,DVGRWHT,XDVGRWHM,DVGRWHM,XDVGRWHW,DVGRWHW,year,LINE,GRNRALW.1
636,102845,2,12,1,1,Z,0.0,Z,0.0,R,...,3.0,R,13.0,R,8.0,R,5,2010,,
637,102845,3,13,1,1,Z,0.0,Z,0.0,R,...,3.0,R,13.0,R,8.0,R,5,2010,,
638,102845,6,10,2,2,R,0.0,R,0.0,R,...,0.0,R,3.0,R,3.0,R,0,2010,,
639,102845,8,12,2,2,R,0.0,R,0.0,R,...,0.0,R,3.0,R,3.0,R,0,2010,,
640,102845,9,13,2,2,R,0.0,R,0.0,R,...,0.0,R,3.0,R,3.0,R,0,2010,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49371,488448,20,12,3,3,R,0.0,R,,,...,,,,,,,,2020,50,0.0
49372,488448,21,13,3,3,R,0.0,R,,,...,,,,,,,,2020,29A,0.0
49373,488448,22,14,3,3,R,0.0,R,,,...,,,,,,,,2020,11A,0.0
49374,488448,42,32,1,1,R,0.0,R,,,...,,,,,,,,2020,999,0.0


In [12]:
df_graduation = wa_grad_df[['institute_id', 'gr_total', 'gr_AmerInd_AlaskNtv', 'gr_Asian', 'gr_AfrAmer', 'gr_Hispanic', 'gr_Hi_PacIsland', 'gr_White', 'gr_2orMore', 'gr_unknown', 'gr_nonresAlien', 'year']].groupby(['institute_id', 'year']).sum()
df_graduation

Unnamed: 0_level_0,Unnamed: 1_level_0,gr_total,gr_AmerInd_AlaskNtv,gr_Asian,gr_AfrAmer,gr_Hispanic,gr_Hi_PacIsland,gr_White,gr_2orMore,gr_unknown,gr_nonresAlien
institute_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
102845,2010,167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102845,2011,191,12.0,12.0,6.0,0.0,0.0,149.0,6.0,0.0,6.0
102845,2012,1490,217.0,227.0,132.0,46.0,0.0,836.0,17.0,15.0,0.0
102845,2013,4482,727.0,666.0,443.0,149.0,25.0,2365.0,61.0,46.0,0.0
102845,2014,3677,397.0,191.0,283.0,692.0,112.0,1568.0,110.0,324.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
488448,2016,24,0.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0
488448,2017,57,0.0,0.0,6.0,0.0,0.0,51.0,0.0,0.0,0.0
488448,2018,60,0.0,0.0,6.0,18.0,0.0,36.0,0.0,0.0,0.0
488448,2019,41,0.0,0.0,0.0,11.0,0.0,30.0,0.0,0.0,0.0


In [13]:
df = pd.concat([df_enrollment, df_cohort], axis = 1)
df = pd.concat([df, df_cohort], axis = 1)

In [14]:
df.to_csv("C:\\project\\university_data_with_gradrate_and_cohort.csv")

In [15]:
df.shape

(766, 30)

In [16]:
df.info

<bound method DataFrame.info of                    e_total  e_AmerInd_AlaskNtv  e_Asian  e_AfrAmer  \
institute_id year                                                    
102845       2010     1764                 121       47        111   
             2011     1651                  89       38         66   
             2012     1259                  67       76         62   
             2013     2191                  92       43         82   
             2014     3267                 132       40        198   
...                    ...                 ...      ...        ...   
488448       2016       71                   0        4          3   
             2017      113                   1        4          8   
             2018      100                   2        6         14   
             2019      106                   3        7         14   
             2020       96                   5        7          9   

                   e_Hispanic  e_Hi_PacIsland  e_White  e