In [1]:
import pandas as pd
import numpy as np

# Survey years to load, as strings (they are substituted into paths and file names).
years = [str(surveyYear) for surveyYear in range(2013, 2021)]

# File-name templates whose '{}' placeholder is filled with the survey year.
files = [
    'gr{}_rv.csv',
    'ef{}a_rv.csv',
    'hd{}.csv',
    's{}_oc_rv.csv',
]

# File-name templates whose '{}' placeholder is filled with the fiscal-year tag.
fiscalYearFiles = [
    'f{}_f1a_rv.csv',
    'f{}_f2_rv.csv',
    'sfa{}_rv.csv',
]
#files = ['adm{}_rv.csv', 'c{}_a_rv.csv', 'ef{}a_rv.csv', 'gr{}_rv.csv', 'hd{}.csv', 's{}_oc_rv.csv']

In [2]:
def GetFiscalYear(year):
    """Return the four-character fiscal-year tag for a survey year.

    The tag is the two-digit previous year followed by the two-digit given
    year, e.g. '2013' -> '1213'. Both halves are zero-padded and wrap around
    the century, so '2010' -> '0910' and '2000' -> '9900' (the unpadded form
    would give '910' and '-100').

    Parameters
    ----------
    year : str or int
        Survey year such as '2013'; only its last two digits are used.

    Returns
    -------
    str
        Tag of the form 'SSEE' (start year, end year), always four characters.

    Raises
    ------
    ValueError
        If `year` cannot be converted to an integer.
    """
    endYear = int(year) % 100
    # Modulo keeps the start year in 0..99 so year '..00' maps to 99, not -1.
    startYear = (endYear - 1) % 100
    return '{:02d}{:02d}'.format(startYear, endYear)

In [3]:
# Read every yearly CSV and collect the resulting frames per category.
dataframes = {
    categoryName: []
    for categoryName in ('university_data', 'enrollment_data', 'graduation_data', 'staff_data',
                         'finance1_data', 'finance2_data', 'finaid_data')
}

# Survey-year files are told apart by the first character of their template.
categoryByInitial = {
    'h': 'university_data',
    'e': 'enrollment_data',
    'g': 'graduation_data',
    's': 'staff_data',
}

# Fiscal-year files are told apart by a marker inside the template;
# markers are tried in this order and the first match wins.
categoryByMarker = (
    ('f1a', 'finance1_data'),
    ('f2', 'finance2_data'),
    ('sfa', 'finaid_data'),
)

for surveyYear in years:
    for template in files:
        csvPath = "C:\\project\\{}\\{}".format(surveyYear, template.format(surveyYear))
        frame = pd.read_csv(csvPath, encoding='ISO-8859-1')
        frame['year'] = int(surveyYear)

        categoryName = categoryByInitial.get(template[0])
        if categoryName is not None:
            dataframes[categoryName].append(frame)

    # Finance and financial-aid files are named after the fiscal-year tag,
    # but still live in the survey year's folder and get that year's label.
    fiscalTag = GetFiscalYear(surveyYear)
    for template in fiscalYearFiles:
        csvPath = "C:\\project\\{}\\{}".format(surveyYear, template.format(fiscalTag))
        frame = pd.read_csv(csvPath, encoding='ISO-8859-1')
        frame['year'] = int(surveyYear)

        for marker, categoryName in categoryByMarker:
            if marker in template:
                dataframes[categoryName].append(frame)
                break

In [4]:
# Merge the data frames into one dataframe per category.
#
# Each category's list of yearly frames is stacked with a single pd.concat
# call. Concatenating inside a loop re-copies the accumulated rows on every
# iteration; one call gives the same rows, in the same order, with the same
# (per-file, therefore repeating) index labels.
df_unv = pd.concat(dataframes['university_data'])
df_enr = pd.concat(dataframes['enrollment_data'])
df_grad = pd.concat(dataframes['graduation_data'])
df_staff = pd.concat(dataframes['staff_data'])
df_finance1 = pd.concat(dataframes['finance1_data'])
df_finance2 = pd.concat(dataframes['finance2_data'])
df_finaid = pd.concat(dataframes['finaid_data'])

In [5]:
# Replace the coded column names with descriptive ones.
unvColumnNames = {
    'UNITID': 'institute_id',
    'INSTNM': 'institute_name',
    'STABBR': 'state',
    'ICLEVEL': 'university_level',
    'HLOFFER': 'highest_level_offered',
    'CITY': 'city',
    'ZIP': 'zip',
}
df_unv = df_unv.rename(columns=unvColumnNames)


In [6]:
# Replace the coded enrollment column names with descriptive ones
# (the count columns all get an 'e_' prefix).
enrColumnNames = {
    'UNITID': 'institute_id',
    'EFALEVEL': 'student_lvl',
    'LINE': 'student_lvl_og',
    'SECTION': 'attend_status',
    'LSTUDY': 'student_lvl_2',
    'EFTOTLT': 'e_total',
    'EFAIANT': 'e_AmerInd_AlaskNtv',
    'EFASIAT': 'e_Asian',
    'EFBKAAT': 'e_AfrAmer',
    'EFHISPT': 'e_Hispanic',
    'EFNHPIT': 'e_Hi_PacIsland',
    'EFWHITT': 'e_White',
    'EF2MORT': 'e_2orMore',
    'EFUNKNT': 'e_unknown',
    'EFNRALT': 'e_nonresAlien',
}
df_enr = df_enr.rename(columns=enrColumnNames)


In [7]:
# Keep only the rows with student_lvl == 1, and only the id, count and year columns.
# NOTE: this drops 'student_lvl', so the cell cannot be re-run without re-running the rename above.
enrKeptColumns = [
    'institute_id',
    'e_total',
    'e_AmerInd_AlaskNtv',
    'e_Asian',
    'e_AfrAmer',
    'e_Hispanic',
    'e_Hi_PacIsland',
    'e_White',
    'e_2orMore',
    'e_unknown',
    'e_nonresAlien',
    'year',
]
df_enr = df_enr.loc[df_enr['student_lvl'] == 1, enrKeptColumns]

In [8]:
# Give the id column and the F1H02 column descriptive names.
finance1ColumnNames = {'UNITID': 'institute_id', 'F1H02': 'public_endowment'}
df_finance1 = df_finance1.rename(columns=finance1ColumnNames)

In [9]:
# Give the id column and the F2H02 column descriptive names.
finance2ColumnNames = {'UNITID': 'institute_id', 'F2H02': 'private_endowment'}
df_finance2 = df_finance2.rename(columns=finance2ColumnNames)

In [32]:
# Use the same id column name as the other frames so they can be merged on it.
df_finaid = df_finaid.rename({'UNITID': 'institute_id'}, axis='columns')

In [10]:
# Take an independent copy of the graduation data before df_grad is renamed
# and filtered; df_cohort is later built from different rows of the same table.
df_cohort = df_grad.copy(deep=True)

In [11]:
# Replace the coded graduation column names with descriptive ones
# (the count columns all get a 'gr_' prefix).
gradColumnNames = {
    'UNITID': 'institute_id',
    'GRTOTLT': 'gr_total',
    'GRAIANT': 'gr_AmerInd_AlaskNtv',
    'GRASIAT': 'gr_Asian',
    'GRBKAAT': 'gr_AfrAmer',
    'GRHISPT': 'gr_Hispanic',
    'GRNHPIT': 'gr_Hi_PacIsland',
    'GRWHITT': 'gr_White',
    'GR2MORT': 'gr_2orMore',
    'GRUNKNT': 'gr_unknown',
    'GRNRALT': 'gr_nonresAlien',
}
df_grad = df_grad.rename(columns=gradColumnNames)
df_grad.head()

Unnamed: 0,institute_id,GRTYPE,CHRTSTAT,SECTION,COHORT,LINE,XGRTOTLT,gr_total,XGRTOTLM,GRTOTLM,...,XGRUNKNW,GRUNKNW,XGRNRALT,gr_nonresAlien,XGRNRALM,GRNRALM,XGRNRALW,GRNRALW,year,GRNRALW.1
0,100654,2,12,1,1,999,R,882,R,407.0,...,R,0.0,R,5.0,R,2.0,R,3.0,2013,
1,100654,3,13,1,1,999,R,257,R,93.0,...,Z,0.0,R,2.0,Z,0.0,R,2.0,2013,
2,100654,4,20,1,1,999,R,197,R,100.0,...,Z,0.0,R,1.0,R,0.0,R,1.0,2013,
3,100654,6,10,2,2,10,R,882,R,407.0,...,R,0.0,R,5.0,R,2.0,R,3.0,2013,
4,100654,8,12,2,2,50,R,882,R,407.0,...,R,0.0,R,5.0,R,2.0,R,3.0,2013,


In [12]:
# Same source columns as the graduation rename, but with a 'co_' prefix
# so the two sets of counts stay distinct after merging.
cohortColumnNames = {
    'UNITID': 'institute_id',
    'GRTOTLT': 'co_total',
    'GRAIANT': 'co_AmerInd_AlaskNtv',
    'GRASIAT': 'co_Asian',
    'GRBKAAT': 'co_AfrAmer',
    'GRHISPT': 'co_Hispanic',
    'GRNHPIT': 'co_Hi_PacIsland',
    'GRWHITT': 'co_White',
    'GR2MORT': 'co_2orMore',
    'GRUNKNT': 'co_unknown',
    'GRNRALT': 'co_nonresAlien',
}
df_cohort = df_cohort.rename(columns=cohortColumnNames)
df_cohort.head()

Unnamed: 0,institute_id,GRTYPE,CHRTSTAT,SECTION,COHORT,LINE,XGRTOTLT,co_total,XGRTOTLM,GRTOTLM,...,XGRUNKNW,GRUNKNW,XGRNRALT,co_nonresAlien,XGRNRALM,GRNRALM,XGRNRALW,GRNRALW,year,GRNRALW.1
0,100654,2,12,1,1,999,R,882,R,407.0,...,R,0.0,R,5.0,R,2.0,R,3.0,2013,
1,100654,3,13,1,1,999,R,257,R,93.0,...,Z,0.0,R,2.0,Z,0.0,R,2.0,2013,
2,100654,4,20,1,1,999,R,197,R,100.0,...,Z,0.0,R,1.0,R,0.0,R,1.0,2013,
3,100654,6,10,2,2,10,R,882,R,407.0,...,R,0.0,R,5.0,R,2.0,R,3.0,2013,
4,100654,8,12,2,2,50,R,882,R,407.0,...,R,0.0,R,5.0,R,2.0,R,3.0,2013,


In [13]:
# Keep only the identifying and location columns plus the year.
unvKeptColumns = ['institute_id', 'institute_name', 'state', 'city', 'zip', 'year']
df_unv = df_unv.loc[:, unvKeptColumns]

In [14]:
# Keep the rows with GRTYPE == 3 and SECTION == 1, and only the id, count and year columns.
gradKeptColumns = [
    'institute_id',
    'gr_total',
    'gr_AmerInd_AlaskNtv',
    'gr_Asian',
    'gr_AfrAmer',
    'gr_Hispanic',
    'gr_Hi_PacIsland',
    'gr_White',
    'gr_2orMore',
    'gr_unknown',
    'gr_nonresAlien',
    'year',
]
gradRowMask = (df_grad['GRTYPE'] == 3) & (df_grad['SECTION'] == 1)
df_grad = df_grad.loc[gradRowMask, gradKeptColumns]

In [15]:
# Keep the rows with GRTYPE == 2 and SECTION == 1, and only the id, count and year columns.
cohortKeptColumns = [
    'institute_id',
    'co_total',
    'co_AmerInd_AlaskNtv',
    'co_Asian',
    'co_AfrAmer',
    'co_Hispanic',
    'co_Hi_PacIsland',
    'co_White',
    'co_2orMore',
    'co_unknown',
    'co_nonresAlien',
    'year',
]
cohortRowMask = (df_cohort['GRTYPE'] == 2) & (df_cohort['SECTION'] == 1)
df_cohort = df_cohort.loc[cohortRowMask, cohortKeptColumns]

In [16]:
# Replace the coded staff column names with descriptive ones
# (the count columns all get an 'hr_' prefix).
staffColumnNames = {
    'UNITID': 'institute_id',
    'HRTOTLT': 'hr_total',
    'HRAIANT': 'hr_AmerInd_AlaskNtv',
    'HRASIAT': 'hr_Asian',
    'HRBKAAT': 'hr_AfrAmer',
    'HRHISPT': 'hr_Hispanic',
    'HRNHPIT': 'hr_Hi_PacIsland',
    'HRWHITT': 'hr_White',
    'HR2MORT': 'hr_2orMore',
    'HRUNKNT': 'hr_unknown',
    'HRNRALT': 'hr_nonresAlien',
}
df_staff = df_staff.rename(columns=staffColumnNames)

In [17]:
# Keep the rows with STAFFCAT == 1210, and only the id, count and year columns.
staffKeptColumns = [
    'institute_id',
    'hr_total',
    'hr_AmerInd_AlaskNtv',
    'hr_Asian',
    'hr_AfrAmer',
    'hr_Hispanic',
    'hr_Hi_PacIsland',
    'hr_White',
    'hr_2orMore',
    'hr_unknown',
    'hr_nonresAlien',
    'year',
]
df_staff = df_staff.loc[df_staff['STAFFCAT'] == 1210, staffKeptColumns]

In [18]:
# Keep only the merge keys and the endowment value.
df_finance1 = df_finance1.loc[:, ['institute_id', 'public_endowment', 'year']]

In [19]:
# Keep only the merge keys and the endowment value.
df_finance2 = df_finance2.loc[:, ['institute_id', 'private_endowment', 'year']]

Unnamed: 0,institute_id,private_endowment,year
0,100690,174805.0,2013
1,100937,50042970.0,2013
2,101073,,2013
3,101189,18015943.0,2013
4,101435,44721770.0,2013


In [40]:
# Outer-join graduation, cohort, staff and enrollment counts on institute and year,
# so an institute/year present in any one of the frames is kept.
studentMergeKeys = ['institute_id', 'year']
df = (
    df_grad
    .merge(df_cohort, how='outer', on=studentMergeKeys)
    .merge(df_staff, how='outer', on=studentMergeKeys)
    .merge(df_enr, how='outer', on=studentMergeKeys)
)

In [21]:
# Preview the first five rows of the merged student frame.
df.head(n=5)

Unnamed: 0,institute_id,gr_total,gr_AmerInd_AlaskNtv,gr_Asian,gr_AfrAmer,gr_Hispanic,gr_Hi_PacIsland,gr_White,gr_2orMore,gr_unknown,...,e_total,e_AmerInd_AlaskNtv,e_Asian,e_AfrAmer,e_Hispanic,e_Hi_PacIsland,e_White,e_2orMore,e_unknown,e_nonresAlien
0,100654,257.0,0.0,0.0,252.0,1.0,0.0,2.0,0.0,0.0,...,5020.0,8.0,32.0,4630.0,49.0,4.0,256.0,0.0,40.0,1.0
1,100663,741.0,2.0,51.0,181.0,17.0,0.0,441.0,19.0,15.0,...,18568.0,48.0,900.0,3929.0,488.0,14.0,11853.0,524.0,237.0,575.0
2,100690,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,631.0,1.0,2.0,251.0,7.0,1.0,192.0,0.0,177.0,0.0
3,100706,367.0,10.0,13.0,50.0,6.0,0.0,264.0,7.0,2.0,...,7376.0,94.0,276.0,863.0,224.0,1.0,5117.0,107.0,244.0,450.0
4,100724,340.0,0.0,0.0,332.0,1.0,0.0,1.0,2.0,4.0,...,6075.0,9.0,14.0,5530.0,71.0,4.0,206.0,38.0,83.0,120.0


In [41]:
# Write the merged student frame to disk (the row index is written as the first column).
df.to_csv(path_or_buf="C:\\project\\student_data.csv")

In [35]:
# Outer-join university info with both endowment frames and the financial-aid
# frame on institute and year.
universityMergeKeys = ['institute_id', 'year']
df_university = (
    df_unv
    .merge(df_finance1, how='outer', on=universityMergeKeys)
    .merge(df_finance2, how='outer', on=universityMergeKeys)
    .merge(df_finaid, how='outer', on=universityMergeKeys)
)

In [42]:
# Same join as df_university but without the financial-aid frame.
endowmentMergeKeys = ['institute_id', 'year']
df_endowment_only = (
    df_unv
    .merge(df_finance1, how='outer', on=endowmentMergeKeys)
    .merge(df_finance2, how='outer', on=endowmentMergeKeys)
)

In [43]:
# Write the university + endowment + financial-aid frame to disk
# (the row index is written as the first column).
df_university.to_csv(path_or_buf="C:\\project\\endowment_and_finaid_data.csv")

In [45]:
# Write the university + endowment frame to disk
# (the row index is written as the first column).
df_endowment_only.to_csv(path_or_buf="C:\\project\\endowment_data.csv")