In [1]:
import pandas as pd
import numpy as np

# Survey years to load. They are kept as strings because they are substituted
# directly into directory and file names.
years = [str(year) for year in range(2014, 2021)]

# File-name templates; the '{}' placeholder is filled in with the year.
files = [
    'c{}_b_rv.csv',
    'ef{}a_rv.csv',
    'hd{}.csv',
]
# Wider template list, currently disabled:
#files = ['adm{}_rv.csv', 'c{}_a_rv.csv', 'ef{}a_rv.csv', 'gr{}_rv.csv', 'hd{}.csv', 's{}_oc_rv.csv']

In [2]:
# Read every (file template, year) combination and collect the resulting
# frames in a dictionary of lists, one list per category.
dataframes = {'university_data': [], 'enrollment_data': [], 'completion_data': []}

# The category is chosen from the first letter of the file template.
category_by_first_letter = {
    'h': 'university_data',
    'e': 'enrollment_data',
    'c': 'completion_data',
}

for template in files:
    for year in years:
        csv_path = "C:\\project\\{}\\{}".format(year, template.format(year))
        frame = pd.read_csv(csv_path, encoding = 'ISO-8859-1')
        frame['year'] = int(year)  # tag every row with the year it was read for

        # A template whose first letter is not in the table is still read,
        # but its frame is not stored anywhere.
        category = category_by_first_letter.get(template[0])
        if category is not None:
            dataframes[category].append(frame)

In [3]:
# Merge the per-year frames into one dataframe per category.
#
# Each category is stacked with a single pd.concat call over its whole list,
# rather than growing the result pairwise inside a loop (which re-copies the
# accumulated frame on every iteration). Frames are stacked in list order and
# keep their original row labels, so index values repeat across years.
#
# pd.concat raises ValueError if a category's list is empty, i.e. if no file
# of that category was loaded.
unv_df = pd.concat(dataframes['university_data'])
enr_df = pd.concat(dataframes['enrollment_data'])
cmpl_df = pd.concat(dataframes['completion_data'])

In [4]:
# Keep only Washington rows (STABBR == 'WA') with ICLEVEL code 1, and only
# the columns needed to identify an institution in a given year.
institution_columns = ['UNITID', 'INSTNM', 'STABBR', 'ICLEVEL', 'year']

is_washington = unv_df['STABBR'] == 'WA'
is_level_one = unv_df['ICLEVEL'] == 1

wa_unv_df = unv_df[is_washington & is_level_one][institution_columns]
wa_unv_df

Unnamed: 0,UNITID,INSTNM,STABBR,ICLEVEL,year
4119,234492,The Art Institute of Seattle,WA,1,2014
4121,234669,Bellevue College,WA,1,2014
4126,234827,Central Washington University,WA,1,2014
4127,234845,Centralia College,WA,1,2014
4128,234915,City University of Seattle,WA,1,2014
...,...,...,...,...,...
4707,443410,DigiPen Institute of Technology,WA,1,2020
5122,455406,Pacific Northwest University of Health Sciences,WA,1,2020
5496,475200,Whitworth University-Adult Degree Programs,WA,1,2020
5960,487603,Northwest University-College of Adult and Prof...,WA,1,2020


In [5]:
# Enrollment rows whose UNITID appears among the selected Washington
# institutions (matched on UNITID only, regardless of year).
selected_unit_ids = wa_unv_df['UNITID']
is_selected_institution = enr_df['UNITID'].isin(selected_unit_ids)
wa_enr_df = enr_df[is_selected_institution]

In [6]:
# Rename the enrollment columns to something that we can understand better.
# Suffix convention used below: no suffix = total, '_m' = men, '_w' = women.
enrollment_column_names = {
    'UNITID': 'institute_id',
    'EFALEVEL': 'student_lvl',
    'LINE': 'student_lvl_og',
    'SECTION': 'attend_status',
    'LSTUDY': 'student_lvl_2',
    'EFTOTLT': 'e_total',
    'EFTOTLM': 'e_total_m',
    'EFTOTLW': 'e_total_w',
    'EFAIANT': 'e_AmerInd_AlaskNtv',
    'EFAIANM': 'e_AmerInd_AlaskNtv_m',
    'EFAIANW': 'e_AmerInd_AlaskNtv_w',
    'EFASIAT': 'e_Asian',
    'EFASIAM': 'e_Asian_m',
    'EFASIAW': 'e_Asian_w',
    'EFBKAAT': 'e_AfrAmer',
    'EFBKAAM': 'e_AfrAmer_m',
    'EFBKAAW': 'e_AfrAmer_w',
    'EFHISPT': 'e_Hispanic',
    'EFHISPM': 'e_Hispanic_m',
    'EFHISPW': 'e_Hispanic_w',
    'EFNHPIT': 'e_Hi_PacIsland',
    'EFNHPIM': 'e_Hi_PacIsland_m',
    'EFNHPIW': 'e_Hi_PacIsland_w',
    'EFWHITT': 'e_White',
    'EFWHITM': 'e_White_m',
    'EFWHITW': 'e_White_w',
    'EF2MORT': 'e_2orMore',
    'EF2MORM': 'e_2orMore_m',
    'EF2MORW': 'e_2orMore_w',
    'EFUNKNT': 'e_unknown',
    'EFUNKNM': 'e_unknown_m',
    'EFUNKNW': 'e_unknown_w',
    'EFNRALT': 'e_nonresAlien',
    'EFNRALM': 'e_nonresAlien_m',
    'EFNRALW': 'e_nonresAlien_w',
}


def _enrollment_name(column):
    """Return the readable name for a raw column header, or the header unchanged.

    Headers are matched after stripping surrounding whitespace. The mapping
    previously used the key 'EFNRALW ' (with a trailing space), which
    DataFrame.rename silently skips when the header is spelled 'EFNRALW'.
    Matching on the stripped header renames the column under either spelling.
    Columns without a mapping entry are returned exactly as they were.
    """
    key = column.strip() if isinstance(column, str) else column
    return enrollment_column_names.get(key, column)


wa_enr_df = wa_enr_df.rename(columns=_enrollment_name)

In [7]:
# Total enrollment per institution and year, using only rows with student_lvl == 1.
enrollment_columns = [
    'institute_id',
    'e_total',
    'e_AmerInd_AlaskNtv',
    'e_Asian',
    'e_AfrAmer',
    'e_Hispanic',
    'e_Hi_PacIsland',
    'e_White',
    'e_2orMore',
    'e_unknown',
    'e_nonresAlien',
    'year',
]

level_one_rows = wa_enr_df[wa_enr_df['student_lvl'] == 1]
df_enrollment = (
    level_one_rows[enrollment_columns]
    .groupby(['institute_id', 'year'])
    .sum()
)
df_enrollment

Unnamed: 0_level_0,Unnamed: 1_level_0,e_total,e_AmerInd_AlaskNtv,e_Asian,e_AfrAmer,e_Hispanic,e_Hi_PacIsland,e_White,e_2orMore,e_unknown,e_nonresAlien
institute_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
102845,2014,3267,132,40,198,1007,188,1430,142,130,0
102845,2015,1945,84,93,104,588,6,841,92,137,0
102845,2016,2676,246,6,162,746,151,1097,133,135,0
102845,2017,2664,334,13,148,662,143,1070,128,166,0
102845,2018,2853,305,7,187,706,171,1098,190,189,0
...,...,...,...,...,...,...,...,...,...,...,...
488448,2016,71,0,4,3,7,0,57,0,0,0
488448,2017,113,1,4,8,8,2,83,3,0,4
488448,2018,100,2,6,14,3,1,67,2,0,5
488448,2019,106,3,7,14,11,1,63,4,0,3


In [8]:
# Completion rows whose UNITID appears among the selected Washington
# institutions (matched on UNITID only, regardless of year).
washington_unit_ids = wa_unv_df['UNITID']
is_washington_institution = cmpl_df['UNITID'].isin(washington_unit_ids)
wa_cmpl_df = cmpl_df[is_washington_institution]
wa_cmpl_df

Unnamed: 0,UNITID,XCSTOTLT,CSTOTLT,XCSTOTLM,CSTOTLM,XCSTOTLW,CSTOTLW,XCSAIANT,CSAIANT,XCSAIANM,...,CSUNKNM,XCSUNKNW,CSUNKNW,XCSNRALT,CSNRALT,XCSNRALM,CSNRALM,XCSNRALW,CSNRALW,year
67,102845,R,1613,R,431,R,1182,R,53,R,...,13,R,38,Z,0,Z,0,Z,0,2014
4026,234492,R,312,R,149,R,163,R,3,R,...,82,R,94,R,0,R,0,R,0,2014
4028,234669,R,2495,R,991,R,1504,R,13,R,...,74,R,123,R,252,R,114,R,138,2014
4029,234696,R,714,R,367,R,347,R,4,R,...,132,R,73,Z,0,Z,0,Z,0,2014
4033,234827,R,2694,R,1293,R,1401,R,10,R,...,95,R,113,R,62,R,42,R,20,2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4539,443410,R,226,R,168,R,58,R,3,R,...,56,R,15,R,52,R,39,R,13,2020
4933,455406,R,141,R,73,R,68,R,0,R,...,2,R,3,R,1,R,0,R,1,2020
5298,475200,R,87,R,27,R,60,R,1,R,...,1,R,3,R,0,R,0,R,0,2020
5738,487603,R,238,R,107,R,131,R,1,R,...,2,R,2,Z,0,Z,0,Z,0,2020


In [9]:
# Give the completion total columns readable, 'c_'-prefixed names.
# Only the columns listed here are renamed; all others keep their raw headers.
completion_column_names = {
    'UNITID': 'institute_id',
    'CSTOTLT': 'c_total',
    'CSAIANT': 'c_AmerInd_AlaskNtv',
    'CSASIAT': 'c_Asian',
    'CSBKAAT': 'c_AfrAmer',
    'CSHISPT': 'c_Hispanic',
    'CSNHPIT': 'c_Hi_PacIsland',
    'CSWHITT': 'c_White',
    'CS2MORT': 'c_2orMore',
    'CSUNKNT': 'c_unknown',
    'CSNRALT': 'c_nonresAlien',
}

wa_cmpl_df = wa_cmpl_df.rename(columns=completion_column_names)
wa_cmpl_df

Unnamed: 0,institute_id,XCSTOTLT,c_total,XCSTOTLM,CSTOTLM,XCSTOTLW,CSTOTLW,XCSAIANT,c_AmerInd_AlaskNtv,XCSAIANM,...,CSUNKNM,XCSUNKNW,CSUNKNW,XCSNRALT,c_nonresAlien,XCSNRALM,CSNRALM,XCSNRALW,CSNRALW,year
67,102845,R,1613,R,431,R,1182,R,53,R,...,13,R,38,Z,0,Z,0,Z,0,2014
4026,234492,R,312,R,149,R,163,R,3,R,...,82,R,94,R,0,R,0,R,0,2014
4028,234669,R,2495,R,991,R,1504,R,13,R,...,74,R,123,R,252,R,114,R,138,2014
4029,234696,R,714,R,367,R,347,R,4,R,...,132,R,73,Z,0,Z,0,Z,0,2014
4033,234827,R,2694,R,1293,R,1401,R,10,R,...,95,R,113,R,62,R,42,R,20,2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4539,443410,R,226,R,168,R,58,R,3,R,...,56,R,15,R,52,R,39,R,13,2020
4933,455406,R,141,R,73,R,68,R,0,R,...,2,R,3,R,1,R,0,R,1,2020
5298,475200,R,87,R,27,R,60,R,1,R,...,1,R,3,R,0,R,0,R,0,2020
5738,487603,R,238,R,107,R,131,R,1,R,...,2,R,2,Z,0,Z,0,Z,0,2020


In [10]:
# Total completions per institution and year.
completion_columns = [
    'institute_id',
    'c_total',
    'c_AmerInd_AlaskNtv',
    'c_Asian',
    'c_AfrAmer',
    'c_Hispanic',
    'c_Hi_PacIsland',
    'c_White',
    'c_2orMore',
    'c_unknown',
    'c_nonresAlien',
    'year',
]

df_completion = (
    wa_cmpl_df[completion_columns]
    .groupby(['institute_id', 'year'])
    .sum()
)
df_completion

Unnamed: 0_level_0,Unnamed: 1_level_0,c_total,c_AmerInd_AlaskNtv,c_Asian,c_AfrAmer,c_Hispanic,c_Hi_PacIsland,c_White,c_2orMore,c_unknown,c_nonresAlien
institute_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
102845,2014,1613,53,48,67,572,61,685,76,51,0
102845,2015,1656,43,28,67,544,72,788,51,63,0
102845,2016,1735,62,89,88,505,4,790,78,119,0
102845,2017,1576,81,4,82,445,71,736,70,87,0
102845,2018,1464,162,7,64,368,73,650,63,77,0
...,...,...,...,...,...,...,...,...,...,...,...
488448,2016,38,0,2,4,1,0,31,0,0,0
488448,2017,36,0,4,1,4,0,26,0,0,1
488448,2018,50,0,2,3,5,0,38,0,0,2
488448,2019,47,1,3,2,2,1,30,5,0,3


In [11]:
# Place the enrollment and completion totals side by side, aligned on the
# shared (institute_id, year) index. An (institute, year) pair that appears
# in only one of the two tables is kept, with NaN in the other table's columns.
yearly_tables = [df_enrollment, df_completion]
df = pd.concat(yearly_tables, axis='columns')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,e_total,e_AmerInd_AlaskNtv,e_Asian,e_AfrAmer,e_Hispanic,e_Hi_PacIsland,e_White,e_2orMore,e_unknown,e_nonresAlien,c_total,c_AmerInd_AlaskNtv,c_Asian,c_AfrAmer,c_Hispanic,c_Hi_PacIsland,c_White,c_2orMore,c_unknown,c_nonresAlien
institute_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
102845,2014,3267.0,132.0,40.0,198.0,1007.0,188.0,1430.0,142.0,130.0,0.0,1613,53,48,67,572,61,685,76,51,0
102845,2015,1945.0,84.0,93.0,104.0,588.0,6.0,841.0,92.0,137.0,0.0,1656,43,28,67,544,72,788,51,63,0
102845,2016,2676.0,246.0,6.0,162.0,746.0,151.0,1097.0,133.0,135.0,0.0,1735,62,89,88,505,4,790,78,119,0
102845,2017,2664.0,334.0,13.0,148.0,662.0,143.0,1070.0,128.0,166.0,0.0,1576,81,4,82,445,71,736,70,87,0
102845,2018,2853.0,305.0,7.0,187.0,706.0,171.0,1098.0,190.0,189.0,0.0,1464,162,7,64,368,73,650,63,77,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488448,2019,106.0,3.0,7.0,14.0,11.0,1.0,63.0,4.0,0.0,3.0,47,1,3,2,2,1,30,5,0,3
488448,2020,96.0,5.0,7.0,9.0,8.0,0.0,62.0,5.0,0.0,0.0,55,0,3,4,6,1,36,1,0,4
234492,2018,,,,,,,,,,,191,4,15,9,19,1,87,1,25,30
439057,2018,,,,,,,,,,,19,1,2,1,0,0,15,0,0,0


In [12]:
# Write the combined table, index included, to a CSV file.
output_path = "C:\\project\\university_data2.csv"
df.to_csv(output_path)