In [1]:
import pandas as pd
import numpy as np

# Survey years to load, kept as strings because they are substituted into
# directory and file names.
years = [str(year) for year in range(2010, 2021)]

# File-name templates; '{}' is replaced by the year.
files = ['gr{}_rv.csv', 'ef{}a_rv.csv', 'hd{}.csv']
# Wider template list kept for reference (not loaded):
# 'adm{}_rv.csv', 'c{}_a_rv.csv', 'ef{}a_rv.csv', 'gr{}_rv.csv', 'hd{}.csv', 's{}_oc_rv.csv'

In [2]:
# Read every (template, year) CSV and collect the frames per category.
dataframes = {'university_data': [], 'enrollment_data': [], 'graduation_data': []}

# The first letter of a file-name template decides which category it feeds.
category_by_prefix = {
    'h': 'university_data',
    'e': 'enrollment_data',
    'g': 'graduation_data',
}

for template in files:
    # Templates with any other first letter are still read below, but the
    # resulting frames are not stored anywhere.
    category = category_by_prefix.get(template[0])

    for year in years:
        csv_path = "C:\\project\\{}\\{}".format(year, template.format(year))
        frame = pd.read_csv(csv_path, encoding='ISO-8859-1')
        # Tag each row with the year of the file it came from.
        frame['year'] = int(year)

        if category is not None:
            dataframes[category].append(frame)

In [3]:
# Merge the per-year data frames into one dataframe per category.
# Each list is concatenated in a single pd.concat call: growing a frame with
# pairwise concat inside a loop re-copies the accumulated rows on every pass.
# Row order, original row indexes (not reset) and the union of columns are
# the same as concatenating the frames one after another.
unv_df = pd.concat(dataframes['university_data'])
enr_df = pd.concat(dataframes['enrollment_data'])
grad_df = pd.concat(dataframes['graduation_data'])

In [5]:
# Give the institution columns used below readable names.
unv_column_names = {
    'UNITID': 'institute_id',
    'INSTNM': 'institute_name',
    'STABBR': 'state',
    'ICLEVEL': 'university_level',
    'HLOFFER': 'highest_level_offered',
    'CITY': 'city',
    'ZIP': 'zip',
}
unv_df = unv_df.rename(columns=unv_column_names)

# Washington schools only, restricted to 4-year-or-more institutions
# (university_level 1) that award bachelor's or higher degrees
# (highest_level_offered 5-9).
in_washington = unv_df['state'] == 'WA'
is_level_one = unv_df['university_level'] == 1
offers_bachelors_or_higher = unv_df['highest_level_offered'].isin([5, 6, 7, 8, 9])

unv_keep_columns = [
    'institute_id', 'institute_name', 'state', 'university_level',
    'highest_level_offered', 'city', 'zip', 'year',
]
wa_unv_df = unv_df.loc[in_washington & is_level_one & offers_bachelors_or_higher, unv_keep_columns]

In [6]:
# Enrollment rows whose UNITID belongs to one of the selected Washington
# institutions; copied so later edits do not touch enr_df.
enr_in_washington = enr_df['UNITID'].isin(wa_unv_df['institute_id'])
wa_enr_df = enr_df[enr_in_washington].copy()

In [7]:
# Readable names for the enrollment columns.
# Descriptor columns are listed explicitly; the EF* count columns all follow
# the same 'EF' + code -> 'e_' + label pattern.
enr_column_names = {
    'UNITID': 'institute_id',
    'EFALEVEL': 'student_lvl',
    'LINE': 'student_lvl_og',
    'SECTION': 'attend_status',
    'LSTUDY': 'student_lvl_2',
}
enr_count_labels = {
    'TOTLT': 'total',
    'AIANT': 'AmerInd_AlaskNtv',
    'ASIAT': 'Asian',
    'BKAAT': 'AfrAmer',
    'HISPT': 'Hispanic',
    'NHPIT': 'Hi_PacIsland',
    'WHITT': 'White',
    '2MORT': '2orMore',
    'UNKNT': 'unknown',
    'NRALT': 'nonresAlien',
}
enr_column_names.update(
    {'EF' + code: 'e_' + label for code, label in enr_count_labels.items()}
)
wa_enr_df = wa_enr_df.rename(columns=enr_column_names)


In [8]:
# Keep only rows for the selected Washington institutions.
# NOTE(review): wa_enr_df appears to be built with this same filter (on
# UNITID, before the rename), so this is probably a no-op -- confirm.
enr_is_selected = wa_enr_df['institute_id'].isin(wa_unv_df['institute_id'])
wa_enr_df = wa_enr_df[enr_is_selected]

# Sum the enrollment counts of the student_lvl == 1 rows per institution and year.
enrollment_columns = [
    'institute_id',
    'e_total', 'e_AmerInd_AlaskNtv', 'e_Asian', 'e_AfrAmer', 'e_Hispanic',
    'e_Hi_PacIsland', 'e_White', 'e_2orMore', 'e_unknown', 'e_nonresAlien',
    'year',
]
level_one_rows = wa_enr_df['student_lvl'] == 1
df_enrollment = (
    wa_enr_df.loc[level_one_rows, enrollment_columns]
    .groupby(['institute_id', 'year'])
    .sum()
)

In [11]:
# Graduation-survey rows for the selected Washington institutions.
# The same selection is copied twice so the two frames can be renamed and
# filtered independently.
wa_grad_rows = grad_df[grad_df['UNITID'].isin(wa_unv_df['institute_id'])]
wa_grad_df = wa_grad_rows.copy()
wa_cohort_df = wa_grad_rows.copy()

In [12]:
# rename variables to something that we can understand better
wa_grad_df = wa_grad_df.rename(columns={'UNITID': 'institute_id',
                       'GRTOTLT': 'gr_total',
                       'GRAIANT': 'gr_AmerInd_AlaskNtv',
                       'GRASIAT': 'gr_Asian',
                       'GRBKAAT': 'gr_AfrAmer',
                       'GRHISPT': 'gr_Hispanic',
                       'GRNHPIT': 'gr_Hi_PacIsland',
                       'GRWHITT': 'gr_White',
                       'GR2MORT': 'gr_2orMore',
                       'GRUNKNT': 'gr_unknown',
                       'GRNRALT': 'gr_nonresAlien'
                       })
wa_grad_df

Unnamed: 0,institute_id,GRTYPE,CHRTSTAT,SECTION,COHORT,XGRNRALM,GRNRALM,XGRNRALW,GRNRALW,XGRRAC03,...,DVGRHSW,XDVGRWHT,DVGRWHT,XDVGRWHM,DVGRWHM,XDVGRWHW,DVGRWHW,year,LINE,GRNRALW.1
636,102845,2,12,1,1,Z,0.0,Z,0.0,R,...,3.0,R,13.0,R,8.0,R,5,2010,,
637,102845,3,13,1,1,Z,0.0,Z,0.0,R,...,3.0,R,13.0,R,8.0,R,5,2010,,
638,102845,6,10,2,2,R,0.0,R,0.0,R,...,0.0,R,3.0,R,3.0,R,0,2010,,
639,102845,8,12,2,2,R,0.0,R,0.0,R,...,0.0,R,3.0,R,3.0,R,0,2010,,
640,102845,9,13,2,2,R,0.0,R,0.0,R,...,0.0,R,3.0,R,3.0,R,0,2010,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49371,488448,20,12,3,3,R,0.0,R,,,...,,,,,,,,2020,50,0.0
49372,488448,21,13,3,3,R,0.0,R,,,...,,,,,,,,2020,29A,0.0
49373,488448,22,14,3,3,R,0.0,R,,,...,,,,,,,,2020,11A,0.0
49374,488448,42,32,1,1,R,0.0,R,,,...,,,,,,,,2020,999,0.0


In [13]:
# rename variables to something that we can understand better
wa_cohort_df = wa_cohort_df.rename(columns={'UNITID': 'institute_id',
                       'GRTOTLT': 'co_total',
                       'GRAIANT': 'co_AmerInd_AlaskNtv',
                       'GRASIAT': 'co_Asian',
                       'GRBKAAT': 'co_AfrAmer',
                       'GRHISPT': 'co_Hispanic',
                       'GRNHPIT': 'co_Hi_PacIsland',
                       'GRWHITT': 'co_White',
                       'GR2MORT': 'co_2orMore',
                       'GRUNKNT': 'co_unknown',
                       'GRNRALT': 'co_nonresAlien'
                       })
wa_cohort_df

Unnamed: 0,institute_id,GRTYPE,CHRTSTAT,SECTION,COHORT,XGRNRALM,GRNRALM,XGRNRALW,GRNRALW,XGRRAC03,...,DVGRHSW,XDVGRWHT,DVGRWHT,XDVGRWHM,DVGRWHM,XDVGRWHW,DVGRWHW,year,LINE,GRNRALW.1
636,102845,2,12,1,1,Z,0.0,Z,0.0,R,...,3.0,R,13.0,R,8.0,R,5,2010,,
637,102845,3,13,1,1,Z,0.0,Z,0.0,R,...,3.0,R,13.0,R,8.0,R,5,2010,,
638,102845,6,10,2,2,R,0.0,R,0.0,R,...,0.0,R,3.0,R,3.0,R,0,2010,,
639,102845,8,12,2,2,R,0.0,R,0.0,R,...,0.0,R,3.0,R,3.0,R,0,2010,,
640,102845,9,13,2,2,R,0.0,R,0.0,R,...,0.0,R,3.0,R,3.0,R,0,2010,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49371,488448,20,12,3,3,R,0.0,R,,,...,,,,,,,,2020,50,0.0
49372,488448,21,13,3,3,R,0.0,R,,,...,,,,,,,,2020,29A,0.0
49373,488448,22,14,3,3,R,0.0,R,,,...,,,,,,,,2020,11A,0.0
49374,488448,42,32,1,1,R,0.0,R,,,...,,,,,,,,2020,999,0.0


In [14]:
# Sum the 'gr_' counts of the GRTYPE == 2, SECTION == 1 rows per institution and year.
# NOTE(review): in the displayed outputs the GRTYPE == 2 totals are always
# >= the GRTYPE == 3 totals used for df_cohort (e.g. 277 vs 93 for 2012),
# which suggests GRTYPE 2 is the cohort and GRTYPE 3 the completers, i.e. the
# gr_/co_ labels may be swapped -- verify against the IPEDS data dictionary.
# NOTE(review): sum() counts missing values as 0, so a group with no reported
# breakdown shows 0.0 (see the 2010 row: total 28, all categories 0.0).
grad_columns = [
    'institute_id',
    'gr_total', 'gr_AmerInd_AlaskNtv', 'gr_Asian', 'gr_AfrAmer', 'gr_Hispanic',
    'gr_Hi_PacIsland', 'gr_White', 'gr_2orMore', 'gr_unknown', 'gr_nonresAlien',
    'year',
]
grad_rows = (wa_grad_df['GRTYPE'] == 2) & (wa_grad_df['SECTION'] == 1)
df_grad = (
    wa_grad_df.loc[grad_rows, grad_columns]
    .groupby(['institute_id', 'year'])
    .sum()
)
df_grad.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,gr_total,gr_AmerInd_AlaskNtv,gr_Asian,gr_AfrAmer,gr_Hispanic,gr_Hi_PacIsland,gr_White,gr_2orMore,gr_unknown,gr_nonresAlien
institute_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
102845,2010,28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102845,2011,32,2.0,2.0,1.0,0.0,0.0,25.0,1.0,0.0,1.0
102845,2012,277,41.0,41.0,25.0,9.0,0.0,155.0,3.0,3.0,0.0
102845,2013,832,136.0,125.0,82.0,28.0,5.0,437.0,11.0,8.0,0.0
102845,2014,670,75.0,36.0,53.0,124.0,21.0,283.0,20.0,58.0,0.0


In [15]:
# Sum the 'co_' counts of the GRTYPE == 3, SECTION == 1 rows per institution and year.
# NOTE(review): in the displayed outputs these GRTYPE == 3 totals are always
# <= the GRTYPE == 2 totals used for df_grad (e.g. 93 vs 277 for 2012), which
# suggests GRTYPE 3 is the completers and GRTYPE 2 the cohort, i.e. the
# gr_/co_ labels may be swapped -- verify against the IPEDS data dictionary.
# NOTE(review): sum() counts missing values as 0, so a group with no reported
# breakdown shows 0.0 (see the 2010 row: total 25, all categories 0.0).
cohort_columns = [
    'institute_id',
    'co_total', 'co_AmerInd_AlaskNtv', 'co_Asian', 'co_AfrAmer', 'co_Hispanic',
    'co_Hi_PacIsland', 'co_White', 'co_2orMore', 'co_unknown', 'co_nonresAlien',
    'year',
]
cohort_rows = (wa_cohort_df['GRTYPE'] == 3) & (wa_cohort_df['SECTION'] == 1)
df_cohort = (
    wa_cohort_df.loc[cohort_rows, cohort_columns]
    .groupby(['institute_id', 'year'])
    .sum()
)
df_cohort.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,co_total,co_AmerInd_AlaskNtv,co_Asian,co_AfrAmer,co_Hispanic,co_Hi_PacIsland,co_White,co_2orMore,co_unknown,co_nonresAlien
institute_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
102845,2010,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102845,2011,30,2.0,2.0,1.0,0.0,0.0,23.0,1.0,0.0,1.0
102845,2012,93,12.0,19.0,6.0,1.0,0.0,53.0,2.0,0.0,0.0
102845,2013,294,42.0,38.0,29.0,8.0,0.0,167.0,5.0,5.0,0.0
102845,2014,307,22.0,11.0,14.0,72.0,7.0,139.0,10.0,32.0,0.0


In [23]:
# Place the enrollment, graduation and cohort sums side by side, aligned on
# their shared (institute_id, year) index; index entries missing from a
# frame leave NaN in that frame's columns.
df = pd.concat([df_enrollment, df_grad, df_cohort], axis=1)

In [24]:
# Save the combined enrollment / graduation / cohort table.
combined_csv_path = "C:\\project\\university_data_enr_grad_and_cohort.csv"
df.to_csv(combined_csv_path)

In [28]:
# Graduation and cohort sums only, side by side on the shared
# (institute_id, year) index.
df2 = pd.concat(objs=[df_grad, df_cohort], axis='columns')

In [29]:
# Save the graduation / cohort table.
grad_cohort_csv_path = "C:\\project\\university_data_grad_and_cohort.csv"
df2.to_csv(grad_cohort_csv_path)