In [1]:
import pandas as pd
import scipeds

scipeds.download_db()

from scipeds.data.completions import CompletionsQueryEngine
from scipeds.data.queries import (
    FieldTaxonomy,
    QueryFilters, 
)

engine = CompletionsQueryEngine()



Database already downloaded to /Users/kevinphan981/Library/Caches/.scipeds/scipeds_0_0_8.duckdb. To re-download and overwrite the existing file, re-run with `overwrite` set to `True`(from within Python) or add --overwrite to your CLI command.


In [2]:
from scipeds.constants import COMPLETIONS_TABLE
#  majornum,
completions = engine.get_df_from_query(f"""
    SELECT year, unitid, cipcode, awlevel, cip2020, race_ethnicity, gender, n_awards  
    FROM {COMPLETIONS_TABLE}
    WHERE YEAR BETWEEN 2010 AND 2019
    AND regexp_matches(CAST(CIPCODE AS VARCHAR), '^12\\.04.*$')
    ;
""")

completions_total = engine.get_df_from_query(f"""
    SELECT year, unitid, cipcode, awlevel, cip2020, race_ethnicity, gender, n_awards as n_awards_total 
    FROM {COMPLETIONS_TABLE}
    WHERE YEAR BETWEEN 2010 AND 2019
    ;
""")
print(completions.head())
print(completions.shape)

   year  unitid  cipcode                                            awlevel  \
0  2010  100760  12.0401  Award of at least 1 but less than 2 academic y...   
1  2010  101240  12.0401                 Award of less than 1 academic year   
2  2010  101240  12.0401                 Award of less than 1 academic year   
3  2010  101240  12.0401                 Award of less than 1 academic year   
4  2010  101240  12.0401                 Award of less than 1 academic year   

   cip2020             race_ethnicity gender  n_awards  
0  12.0401  Black or African American  women         3  
1  12.0401                    Unknown  women         1  
2  12.0401  Black or African American    men         1  
3  12.0401  Black or African American  women         2  
4  12.0401         Hispanic or Latino  women         1  
(146367, 8)


In [3]:
# to see the tables available

query = "SHOW TABLES"

engine.get_df_from_query(query) # this package doesn't offer that much, but it does at least offer the completions data

Unnamed: 0,name
0,cip_info
1,ipeds_completions_a
2,ipeds_directory_info


In [4]:
institution_fip = engine.get_institutions_table() # reads in all institutions and their features (we need state)
print(institution_fip.shape)

(9469, 102)


In [None]:
# creating index of cosmetology institutions
# cosmetology_schools = pd.merge(institution_fip, completions, how = "left", on = "unitid")
# wrong idea here, we need the unique schools from 'completions'

cosmetology_schools = completions['unitid'].unique()
print(cosmetology_schools.shape) # we cut down the amount by nearly a fourth


(2474,)


## Switching Gears to "pypeds" library

This is another library to help read in the data from IPEDS en masse. Whereas with the scipeds we could really only access the completions survey, we are able to access the other ones that we want.

I would like: 
1. Enrollment data
2. Retention rates
3. Verifying past queries on completions using this library's version of completions
4. Graduation rates (if possible)

However, this is for cosmetology schools that are dedicated only to cosmetology, a university that has a cosmetology major would not count for this. I will do this by using the graduation / enrollment ratio. If it is above a threshold of .9, then it is likely a cosmetology school. I will vary the cutoff to see if anything changes. I could also just make it one and call it a day.


In [None]:
import pypeds
from pypeds import ipeds

timeframe = range(2010, 2019, 1)


     unitid  efcstate  line xefres01  efres01 xefres02 efres02  survey_year  \
0  100654.0       1.0   1.0        R    720.0        R     646       2010.0   
1  100654.0       2.0   2.0        R      2.0        R       1       2010.0   
2  100654.0       4.0   4.0        R      2.0        R       2       2010.0   
3  100654.0       5.0   5.0        R      1.0        R       1       2010.0   
4  100654.0       6.0   6.0        R      7.0        R       5       2010.0   

   fall_year  
0     2010.0  
1     2010.0  
2     2010.0  
3     2010.0  
4     2010.0  
(526136, 9)
       unitid  efcstate  line xefres01  efres01 xefres02 efres02  survey_year  \
185  100760.0       1.0   1.0        R    678.0        R     441       2010.0   
186  100760.0       6.0   6.0        R      1.0        Z       0       2010.0   
187  100760.0      12.0  12.0        R      2.0        R       2       2010.0   
188  100760.0      13.0  13.0        R      3.0        R       3       2010.0   
189  100760.0     

In [21]:
# completions in pypeds

completions = ipeds.C_A(years = timeframe)
completions.extract()
completions_df = completions.load()
completions_df.head()

  survey_file = pd.read_csv(path, encoding='ISO-8859-1')


Unnamed: 0,unitid,cipcode,majornum,awlevel,xcnralm,cnralm,xcnralw,cnralw,xcrace03,crace03,...,dvchsw,xdvcwht,dvcwht,xdvcwhm,dvcwhm,xdvcwhw,dvcwhw,survey_year,fall_year,cdistedp
0,100636.0,9.0999,1.0,3.0,R,0.0,R,0.0,R,3.0,...,0.0,R,47.0,R,31.0,R,16.0,2010.0,2009.0,
1,100636.0,10.0105,1.0,3.0,R,0.0,R,0.0,R,43.0,...,0.0,R,802.0,R,577.0,R,225.0,2010.0,2009.0,
2,100636.0,11.0101,1.0,3.0,R,0.0,R,0.0,R,3.0,...,0.0,R,57.0,R,54.0,R,3.0,2010.0,2009.0,
3,100636.0,11.0401,1.0,3.0,R,0.0,R,0.0,R,107.0,...,0.0,R,671.0,R,554.0,R,117.0,2010.0,2009.0,
4,100636.0,13.0499,1.0,3.0,R,0.0,R,0.0,R,32.0,...,0.0,R,166.0,R,110.0,R,56.0,2010.0,2009.0,


In [None]:

enrollment = ipeds.EFC(years = timeframe)
enrollment.extract()

enrollment_df = enrollment.load()
print(enrollment_df.head())
print(enrollment_df.shape)

enrollment_cosme = enrollment_df[enrollment_df['unitid'].isin(cosmetology_schools)]
#enrollment_cosme = enrollment_cosme[['unitid', 'fall_year', 'xgrcohrt',  'grcohrt', 'xugenter',  'ugentern', 'xpgrcohr',  'pgrcohrt', 'xrrftct']]
print(enrollment_cosme.head())
print(enrollment_cosme.shape)

In [19]:
# attempting to query an index.

institutions = ipeds.IC()
institutions.extract()

institutions_df = institutions.load()
print(institutions_df.head())

directory = ipeds.HD(years = timeframe)
directory.extract()

directory_df = directory.load()
print(directory_df.head())

     unitid  peo1istr  peo2istr  peo3istr  peo4istr  peo5istr  peo6istr  \
0  100654.0       0.0       1.0       0.0       0.0       0.0       0.0   
1  100663.0       0.0       1.0       1.0       0.0       0.0       0.0   
2  100690.0       0.0       1.0       0.0       0.0       0.0       0.0   
3  100706.0       0.0       1.0       1.0       1.0       0.0       0.0   
4  100724.0       1.0       1.0       0.0       0.0       0.0       1.0   

   cntlaffi  pubprime  pubsecon  ...  xactcm75  actcm75  xacten25  acten25  \
0       1.0       2.0       0.0  ...         R     20.0         R     14.0   
1       1.0       2.0       0.0  ...         R     30.0         R     22.0   
2       4.0      -2.0      -2.0  ...       NaN      NaN       NaN      NaN   
3       1.0       2.0       0.0  ...         R     31.0         R     24.0   
4       1.0       2.0       0.0  ...         R     20.0         R     14.0   

   xacten75  acten75  xactmt25  actmt25  xactmt75  actmt75   
0         R     20

In [11]:
print(enrollment_cosme[enrollment_cosme['unitid'] == 100760])

          unitid  efcstate   line xefres01  efres01 xefres02 efres02  \
185     100760.0       1.0    1.0        R    678.0        R     441   
186     100760.0       6.0    6.0        R      1.0        Z       0   
187     100760.0      12.0   12.0        R      2.0        R       2   
188     100760.0      13.0   13.0        R      3.0        R       3   
189     100760.0      47.0   47.0        R      2.0        R       2   
...          ...       ...    ...      ...      ...      ...     ...   
455897  100760.0      33.0   33.0        R      1.0        R     1.0   
455898  100760.0      42.0   42.0        R      1.0        R     1.0   
455899  100760.0      54.0   54.0        R      1.0        R     1.0   
455900  100760.0      58.0  999.0        R    372.0        R   305.0   
455901  100760.0      99.0   99.0        R    372.0        R   305.0   

        survey_year  fall_year  
185          2010.0     2010.0  
186          2010.0     2010.0  
187          2010.0     2010.0  
188

In [12]:
# full_cosme_df = pd.merge(enrollment_cosme, completions, how = 'inner', on = 'unitid')
# full_cosme_df.shape
# TODO: Aggregate completions to year and institution, same with enrollment, regardless of demographic data.

completions_agg = completions.groupby(['unitid', 'year']).sum('n_awards').reset_index() #major num means nothing here.
completions_total_agg = completions_total.groupby(['unitid', 'year']).sum('n_awards_total').reset_index()

# not really necessary, but there.
enrollment_agg = enrollment_cosme.groupby(['unitid', 'fall_year']).sum('efres01').reset_index()
enrollment_agg['year'] = enrollment_agg['fall_year']

In [28]:
full_cosme_agg = pd.merge(completions_agg, completions_total_agg, how = 'inner', on = ['unitid', 'year'])
full_cosme_agg.head()


full_cosme_agg['cosme_ratio'] = full_cosme_agg['n_awards']/full_cosme_agg['n_awards_total']

# conditional to filter

full_cosme_agg = full_cosme_agg[full_cosme_agg['cosme_ratio'] > .5] # > 1 should not be possible.
print(full_cosme_agg.shape)

full_cosme_agg_index = full_cosme_agg['unitid'].unique()
print(full_cosme_agg_index.shape) # realistic figures



(12962, 5)
(1783,)


In [None]:
isinstance(full_cosme_agg, pd.DataFrame) # it is indeed a dataframe.

True

In [None]:
full_cosme_agg.to_csv('raw-data/cosmetology-completions.csv', index=False)