In [1]:
# Set up the environment
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# The goal
What are common features among colleges and universities that have closed in the last decade?
Using these features, is it possible to predict which schools are in danger of closing in the near future?
Are there steps that schools can take to mitigate the features which may lead to their closing?

# Data collection
The data come from the US Department of Education College Scorecard, which is available on Kaggle and from the US Department of Education at https://collegescorecard.ed.gov/data/.

In [2]:
# Load the most recent scorecard into a DataFrame for examination.
# Only the fields below are read; they were chosen from previous exploration
# and the data dictionary. The loaded columns keep the file's own column order,
# not the order of this list.
fields = [
    # identifiers and institution descriptors
    'Id', 'OPEID', 'INSTNM', 'CITY', 'ZIP', 'sch_deg', 'main', 'NUMBRANCH', 'PREDDEG',
    'HIGHDEG', 'CONTROL', 'st_fips', 'region', 'LOCALE', 'CCUGPROF', 'RELAFFIL', 'ADM_RATE',
    'ACTCMMID', 'SAT_AVG', 'UGDS', 'UG', 'CURROPER', 'NPT4_PUB', 'NPT4_PRIV', 'COSTT4_A',
    'COSTT4_P', 'TUITFTE', 'INEXPFTE', 'AVGFACSAL', 'PFTFAC',
    # completion, retention and student-body measures
    'C150_4', 'C150_L4', 'RET_FT4', 'RET_FTL4', 'RET_PT4', 'RET_PTL4', 'PCTFLOAN', 'UG25abv',
    'COMP_ORIG_YR4_RT', 'WDRAW_ORIG_YR4_RT', 'ENRL_ORIG_YR4_RT', 'INC_PCT_LO', 'DEP_STAT_PCT_IND',
    'IND_INC_PCT_LO', 'DEP_INC_PCT_LO', 'PAR_ED_PCT_1STGEN', 'DEP_INC_AVG', 'IND_INC_AVG',
    # debt and demographic measures
    'DEBT_MDN', 'GRAD_DEBT_MDN', 'WDRAW_DEBT_MDN', 'loan_ever', 'age_entry', 'female',
    'married', 'dependent', 'veteran', 'faminc', 'md_faminc', 'median_hh_inc',
]

data = pd.read_csv(
    "data/raw/Scorecard.csv",
    usecols=fields,
    low_memory=False,
)
data.shape

(124699, 60)

## Data organization
Create a file structure (and add work to GitHub repo)

In [3]:
# Create directories for the various components of the project.
# os.makedirs(..., exist_ok=True) also creates the parent "data" folder and does
# nothing when a directory already exists, so this cell can stay active and be
# re-run without raising FileExistsError.
project_dirs = [
    "data/raw",
    "data/interim",
    "data/processed",
    "data/external",
    "figures",
    "models",
]
for directory in project_dirs:
    os.makedirs(directory, exist_ok=True)

In [4]:
# Look at basic info about the dataframe, starting with the names of the loaded columns
data.columns

Index(['Id', 'OPEID', 'INSTNM', 'CITY', 'ZIP', 'sch_deg', 'main', 'NUMBRANCH',
       'PREDDEG', 'HIGHDEG', 'CONTROL', 'st_fips', 'region', 'LOCALE',
       'CCUGPROF', 'RELAFFIL', 'ADM_RATE', 'ACTCMMID', 'SAT_AVG', 'UGDS', 'UG',
       'CURROPER', 'NPT4_PUB', 'NPT4_PRIV', 'COSTT4_A', 'COSTT4_P', 'TUITFTE',
       'INEXPFTE', 'AVGFACSAL', 'PFTFAC', 'C150_4', 'C150_L4', 'RET_FT4',
       'RET_FTL4', 'RET_PT4', 'RET_PTL4', 'PCTFLOAN', 'UG25abv',
       'COMP_ORIG_YR4_RT', 'WDRAW_ORIG_YR4_RT', 'ENRL_ORIG_YR4_RT',
       'INC_PCT_LO', 'DEP_STAT_PCT_IND', 'DEP_INC_PCT_LO', 'IND_INC_PCT_LO',
       'PAR_ED_PCT_1STGEN', 'DEP_INC_AVG', 'IND_INC_AVG', 'DEBT_MDN',
       'GRAD_DEBT_MDN', 'WDRAW_DEBT_MDN', 'loan_ever', 'age_entry', 'female',
       'married', 'dependent', 'veteran', 'faminc', 'md_faminc',
       'median_hh_inc'],
      dtype='object')

In [5]:
# Show each column's non-null count and dtype
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124699 entries, 0 to 124698
Data columns (total 60 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Id                 124699 non-null  int64  
 1   OPEID              124699 non-null  object 
 2   INSTNM             124699 non-null  object 
 3   CITY               124699 non-null  object 
 4   ZIP                124699 non-null  object 
 5   sch_deg            101334 non-null  float64
 6   main               124699 non-null  object 
 7   NUMBRANCH          124699 non-null  int64  
 8   PREDDEG            124699 non-null  object 
 9   HIGHDEG            124699 non-null  object 
 10  CONTROL            124676 non-null  object 
 11  st_fips            124669 non-null  object 
 12  region             124669 non-null  object 
 13  LOCALE             7380 non-null    object 
 14  CCUGPROF           3559 non-null    object 
 15  RELAFFIL           917 non-null     object 
 16  AD

In [6]:
# Count the distinct values in each column
data.nunique()

Id                   124699
OPEID                 11511
INSTNM                20601
CITY                   5494
ZIP                   16527
sch_deg                   3
main                      2
NUMBRANCH                58
PREDDEG                   5
HIGHDEG                   5
CONTROL                   3
st_fips                  59
region                   10
LOCALE                   12
CCUGPROF                 14
RELAFFIL                 59
ADM_RATE               7717
ACTCMMID                 32
SAT_AVG                 784
UGDS                  13472
UG                     2804
CURROPER                  2
NPT4_PUB               6887
NPT4_PRIV             13237
COSTT4_A              15353
COSTT4_P               9594
TUITFTE               23870
INEXPFTE              19612
AVGFACSAL              9870
PFTFAC                 8688
C150_4                 7985
C150_L4                8989
RET_FT4                5065
RET_FTL4               5289
RET_PT4                1841
RET_PTL4            

Many of the columns that I expected to be numerical data were actually classified as strings. I tried to convert them to numbers, but got an error message, so I explored the content of the columns and the data dictionary to see what the problem was.

For privacy reasons, some values in these columns are coded as "PrivacySuppressed" instead of a number.

In [7]:
# Inspect one column to decide on its data type:
# first the share (in %) of rows taken by each distinct value, then the share (in %) of nulls.
completion_rate = data["COMP_ORIG_YR4_RT"]
n_rows = len(completion_rate)
print((completion_rate.value_counts() / n_rows) * 100)
print((completion_rate.isnull().sum() / n_rows) * 100)

PrivacySuppressed    11.696966
0                     3.828419
1                     0.334405
0.5                   0.288695
0.666666666667        0.158782
                       ...    
0.282692307692        0.000802
0.283106796117        0.000802
0.39514978602         0.000802
0.281588447653        0.000802
0.545112781955        0.000802
Name: COMP_ORIG_YR4_RT, Length: 32741, dtype: float64
31.17186184331871


In [8]:
# Replace the "PrivacySuppressed" placeholder with real null values (np.nan).
# Writing the *string* "NaN" would leave those cells non-null, so isnull()-based
# summaries and thresholds would not count the suppressed values as missing.
for col in data.columns:
    if data[col].dtype == 'object':
        # Exact-value replacement: only cells equal to the placeholder are changed
        data[col] = data[col].replace("PrivacySuppressed", np.nan)

In [9]:
# Share of null values per column after the PrivacySuppressed recoding step.
# Note: the 'percent' column holds fractions between 0 and 1, sorted from most to least missing.
null_fraction = data.isnull().sum().sort_values(ascending=False) / len(data)
nas = pd.DataFrame(null_fraction, columns=['percent'])
pos = nas['percent'] > 0
# Show only the columns that have at least one null value
nas.loc[pos]

Unnamed: 0,percent
RELAFFIL,0.992646
CCUGPROF,0.971459
UG,0.950737
LOCALE,0.940817
NPT4_PUB,0.926575
COSTT4_P,0.901715
RET_PT4,0.876687
ACTCMMID,0.872726
NPT4_PRIV,0.851162
SAT_AVG,0.850857


In [10]:
# Collect the columns in which more than 90% of the values are null
null_share = data.isnull().sum() / len(data)
to_drop = null_share[null_share > 0.90].index.tolist()

print("Columns to drop include: ", to_drop)

Columns to drop include:  ['LOCALE', 'CCUGPROF', 'RELAFFIL', 'UG', 'NPT4_PUB', 'COSTT4_P']


In [11]:
# Drop the mostly-null columns listed in to_drop, then preview the first rows
data = data.drop(columns=to_drop)
data.head()

Unnamed: 0,Id,OPEID,INSTNM,CITY,ZIP,sch_deg,main,NUMBRANCH,PREDDEG,HIGHDEG,...,WDRAW_DEBT_MDN,loan_ever,age_entry,female,married,dependent,veteran,faminc,md_faminc,median_hh_inc
0,1,1230800,COMMUNITY COLLEGE OF THE AIR FORCE,MONTGOMERY,361126613,,Main campus,1,Predominantly associate's-degree granting,Associate degree,...,,,,,,,,,,
1,2,100200,ALABAMA A & M UNIVERSITY,NORMAL,35762,,Main campus,1,Predominantly bachelor's-degree granting,Graduate degree,...,,,,,,,,,,
2,3,105200,UNIVERSITY OF ALABAMA AT BIRMINGHAM,BIRMINGHAM,352940110,,Main campus,2,Predominantly bachelor's-degree granting,Graduate degree,...,,,,,,,,,,
3,4,574900,ALABAMA AVIATION AND TECHNICAL COLLEGE,OZARK,36360,,Main campus,1,Predominantly certificate-degree granting,Associate degree,...,,,,,,,,,,
4,5,2503400,SOUTHERN CHRISTIAN UNIVERSITY,MONTGOMERY,361173553,,Main campus,1,Predominantly bachelor's-degree granting,Graduate degree,...,,,,,,,,,,


In [12]:
# Convert column types, based on the data dictionary.
# CCUGPROF, RELAFFIL and LOCALE are not converted because they were dropped earlier.
categorical_columns = ['PREDDEG', 'HIGHDEG', 'CONTROL', 'st_fips', 'region']
float_columns = [
    'COMP_ORIG_YR4_RT', 'WDRAW_ORIG_YR4_RT', 'ENRL_ORIG_YR4_RT', 'INC_PCT_LO',
    'DEP_STAT_PCT_IND', 'DEP_INC_PCT_LO', 'IND_INC_PCT_LO', 'PAR_ED_PCT_1STGEN',
    'DEP_INC_AVG', 'IND_INC_AVG', 'DEBT_MDN', 'GRAD_DEBT_MDN', 'WDRAW_DEBT_MDN',
    'loan_ever', 'age_entry', 'female', 'married', 'dependent', 'veteran',
    'faminc', 'md_faminc', 'median_hh_inc',
]

# One dtype map instead of one assignment per column
dtype_by_column = dict.fromkeys(categorical_columns, 'category')
dtype_by_column.update(dict.fromkeys(float_columns, 'float'))
data = data.astype(dtype_by_column)

# 'main' holds text labels (e.g. "Main campus"). astype('bool') would turn every
# non-empty string into True, so compare against the label instead.
# The dtype guard keeps the cell safe to re-run once the column is already Boolean.
if not pd.api.types.is_bool_dtype(data['main']):
    data['main'] = data['main'].eq('Main campus')

In [13]:
# Rename columns to be more descriptive.
# The mapping is keyed by the original column name, so each label is attached to
# the intended column regardless of column order or of which columns were dropped.
# (Assigning a positional list to data.columns silently mislabels columns if the
# order ever differs.)
column_labels = {
    'Id': 'Id',
    'OPEID': 'OPEID',
    'INSTNM': 'Name',
    'CITY': 'City',
    'ZIP': 'Zip',
    'sch_deg': 'Predominant degree',
    'main': 'Main branch',
    'NUMBRANCH': 'NumBranches',
    'PREDDEG': 'Primary degree type',
    'HIGHDEG': 'High degree',
    'CONTROL': 'Control',
    'st_fips': 'StateCode',
    'region': 'Region',
    'ADM_RATE': 'AdmissionRate',
    'ACTCMMID': 'ACTMidpoint',
    'SAT_AVG': 'SATAvg',
    'UGDS': 'Enrollment-DegreeSeeking',
    'CURROPER': 'CURROPER',
    'NPT4_PRIV': 'NetPrice-Private',
    'COSTT4_A': 'AvgCost-AY',
    'TUITFTE': 'TuitionRevenue/FTE',
    'INEXPFTE': 'Expenditure/FTE',
    'AVGFACSAL': 'AvgFaculty Salary',
    'PFTFAC': 'Percent Fulltime Faculty',
    'C150_4': 'Completion-4yr',
    'C150_L4': 'Completion<4yr',
    'RET_FT4': 'RetentionFT-4yr',
    'RET_FTL4': 'RetentionFT-<4yr',
    'RET_PT4': 'RetentionPT-4yr',
    'RET_PTL4': 'RetentionPT-<4yr',
    'PCTFLOAN': 'PercentFedLoan',
    'UG25abv': 'PercentAge>25',
    'COMP_ORIG_YR4_RT': 'PercentCompleted4yr',
    'WDRAW_ORIG_YR4_RT': 'PercentWithdraw4yr',
    'ENRL_ORIG_YR4_RT': 'PercentEnrolled4yr',
    'INC_PCT_LO': 'PercentAidedLowIncome',
    'DEP_STAT_PCT_IND': 'PercentIndependent',
    'DEP_INC_PCT_LO': 'PercentDependentLowIncome',
    # (sic) spelling kept so existing references to this label keep working
    'IND_INC_PCT_LO': 'PercentIndepentdentLowIncome',
    'PAR_ED_PCT_1STGEN': 'PercentFirstGen',
    'DEP_INC_AVG': 'AvgIncomeDependent',
    'IND_INC_AVG': 'AvgIncomeIndep',
    'DEBT_MDN': 'MedianDebt',
    'GRAD_DEBT_MDN': 'MedianDebtCompleters',
    'WDRAW_DEBT_MDN': 'MedianDebtNotComplete',
    'loan_ever': 'PercentReceiveFedLoan',
    'age_entry': 'AvgAgeEntry',
    'female': 'PercentFemale',
    'married': 'PercentMarried',
    'dependent': 'PercentDependent',
    'veteran': 'PercentVeteran',
    'faminc': 'AvgFamilyIncome',
    'md_faminc': 'MedianFamilyIncome',
    'median_hh_inc': 'MedianHouseholdIncome',
}
# Kept for backward compatibility: the descriptive labels as a list, in mapping order
col_names = list(column_labels.values())
data = data.rename(columns=column_labels)
# Preview with Id as the index (display only; data itself keeps Id as a column)
data.set_index('Id').head()

Unnamed: 0_level_0,OPEID,Name,City,Zip,Predominant degree,Main branch,NumBranches,Primary degree type,High degree,Control,...,MedianDebtNotComplete,PercentReceiveFedLoan,AvgAgeEntry,PercentFemale,PercentMarried,PercentDependent,PercentVeteran,AvgFamilyIncome,MedianFamilyIncome,MedianHouseholdIncome
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1230800,COMMUNITY COLLEGE OF THE AIR FORCE,MONTGOMERY,361126613,,True,1,Predominantly associate's-degree granting,Associate degree,Public,...,,,,,,,,,,
2,100200,ALABAMA A & M UNIVERSITY,NORMAL,35762,,True,1,Predominantly bachelor's-degree granting,Graduate degree,Public,...,,,,,,,,,,
3,105200,UNIVERSITY OF ALABAMA AT BIRMINGHAM,BIRMINGHAM,352940110,,True,2,Predominantly bachelor's-degree granting,Graduate degree,Public,...,,,,,,,,,,
4,574900,ALABAMA AVIATION AND TECHNICAL COLLEGE,OZARK,36360,,True,1,Predominantly certificate-degree granting,Associate degree,Public,...,,,,,,,,,,
5,2503400,SOUTHERN CHRISTIAN UNIVERSITY,MONTGOMERY,361173553,,True,1,Predominantly bachelor's-degree granting,Graduate degree,Private nonprofit,...,,,,,,,,,,


In [14]:
# Collect any rows that are exact repeats of an earlier row
is_repeat = data.duplicated()
duplicateRowsDF = data.loc[is_repeat]
duplicateRowsDF

Unnamed: 0,Id,OPEID,Name,City,Zip,Predominant degree,Main branch,NumBranches,Primary degree type,High degree,...,MedianDebtNotComplete,PercentReceiveFedLoan,AvgAgeEntry,PercentFemale,PercentMarried,PercentDependent,PercentVeteran,AvgFamilyIncome,MedianFamilyIncome,MedianHouseholdIncome


In [15]:
# Write the complete dataframe to the interim folder as CSV
# (to_csv also writes the row index as the first column by default).
data.to_csv('data/interim/college.filtered.csv')

In [16]:
# Recode the CURROPER text labels as 1/0 and cast the result to Boolean
curroper = {
    "Currently certified as operating": 1,
    "Not currently certified as an operating institution": 0,
}
data['CURROPER'] = data['CURROPER'].replace(curroper).astype('bool')
print(data['CURROPER'])

# Write the complete dataframe, now with the Boolean flag, to the processed folder
data.to_csv('data/processed/college.filtered.csv')

# Split the rows into closed and operational schools using the Boolean flag
is_operating = data['CURROPER']
closed = data[~is_operating]
operational = data[is_operating]
for label, frame in (
    ("Data: ", data),
    ("Open schools: ", operational),
    ("Closed schools: ", closed),
):
    print(label, frame.shape)

0          True
1          True
2          True
3         False
4          True
          ...  
124694     True
124695     True
124696     True
124697     True
124698     True
Name: CURROPER, Length: 124699, dtype: bool
Data:  (124699, 54)
Open schools:  (101121, 54)
Closed schools:  (23578, 54)


In [17]:
# Number of distinct OPEID values among the closed-school rows
closed_opeids = closed["OPEID"].unique()
print(len(closed_opeids))

3826


In [18]:
# Save the closed and operational subsets to the interim folder
for frame, csv_path in (
    (closed, 'data/interim/closed.filtered.csv'),
    (operational, 'data/interim/operational.filtered.csv'),
):
    frame.to_csv(csv_path)

In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the closed schools, transposed so each column is a row
closed.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,23578.0,49811.77233,33055.783377,4.0,20497.25,45606.5,75408.25,124692.0
Predominant degree,18503.0,1.789007,0.826354,1.0,1.0,2.0,3.0,3.0
NumBranches,23578.0,4.849563,12.537724,1.0,1.0,1.0,3.0,141.0
AdmissionRate,4080.0,0.693199,0.286237,0.0,0.49805,0.7,0.92905,10.0
ACTMidpoint,322.0,20.773292,2.951556,9.0,19.0,21.0,23.0,32.0
SATAvg,464.0,1001.06681,113.435312,560.0,937.0,990.0,1056.0,1405.0
Enrollment-DegreeSeeking,20555.0,499.965604,2667.07841,0.0,55.0,155.0,446.5,241832.0
NetPrice-Private,1906.0,17725.508919,7009.961774,-5029.0,13507.0,18237.5,22505.25,44842.0
AvgCost-AY,1323.0,23399.270597,7171.311205,3368.0,19692.0,24593.0,27280.0,48361.0
TuitionRevenue/FTE,17997.0,21438.137912,262530.265546,0.0,2548.0,6158.0,10495.0,26670160.0


In [20]:
# Show each column's non-null count and dtype for the closed schools
closed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23578 entries, 3 to 124691
Data columns (total 54 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   Id                            23578 non-null  int64   
 1   OPEID                         23578 non-null  object  
 2   Name                          23578 non-null  object  
 3   City                          23578 non-null  object  
 4   Zip                           23578 non-null  object  
 5   Predominant degree            18503 non-null  float64 
 6   Main branch                   23578 non-null  bool    
 7   NumBranches                   23578 non-null  int64   
 8   Primary degree type           23578 non-null  category
 9   High degree                   23578 non-null  category
 10  Control                       23577 non-null  category
 11  StateCode                     23577 non-null  category
 12  Region                        23577 non-null 

In [21]:
# Share of null values in each column of the closed-school rows.
# Note: the 'percent' column holds fractions between 0 and 1, sorted from most to least missing.
closed_null_fraction = closed.isnull().sum().sort_values(ascending=False) / len(closed)
nas = pd.DataFrame(closed_null_fraction, columns=['percent'])
pos = nas['percent'] > 0
# Show only the columns that have at least one null value
nas.loc[pos]

Unnamed: 0,percent
ACTMidpoint,0.986343
SATAvg,0.980321
RetentionPT-4yr,0.956697
AvgCost-AY,0.943888
RetentionFT-4yr,0.925693
NetPrice-Private,0.919162
Completion-4yr,0.89698
RetentionPT-<4yr,0.871236
PercentFedLoan,0.853974
AdmissionRate,0.826957


In [22]:
# Summary statistics for the numeric columns of the operational schools,
# transposed so each column is a row
operational.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,101121.0,65273.490986,36027.85,1.0,34585.0,66740.0,96875.0,124699.0
Predominant degree,82831.0,2.012423,0.8819505,1.0,1.0,2.0,3.0,3.0
NumBranches,101121.0,3.478387,12.62927,1.0,1.0,1.0,1.0,141.0
AdmissionRate,30076.0,0.697474,0.2140628,0.0,0.564375,0.7223,0.8552,3.6877
ACTMidpoint,15549.0,22.763329,3.259005,2.0,21.0,22.0,24.0,35.0
SATAvg,18134.0,1054.022223,126.9459,514.0,971.0,1036.0,1117.0,1599.0
Enrollment-DegreeSeeking,89788.0,2529.405333,5015.666,0.0,146.0,705.0,2556.0,253594.0
NetPrice-Private,16654.0,18011.659481,7511.786,-103168.0,13080.5,18162.0,22483.5,87570.0
AvgCost-AY,19181.0,22974.776654,11818.76,2200.0,13062.0,20940.0,29762.0,74473.0
TuitionRevenue/FTE,88058.0,11785.763179,94751.36,0.0,2479.0,5682.0,10655.0,11932460.0


In [23]:
# Show each column's non-null count and dtype for the operational schools
operational.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101121 entries, 0 to 124698
Data columns (total 54 columns):
 #   Column                        Non-Null Count   Dtype   
---  ------                        --------------   -----   
 0   Id                            101121 non-null  int64   
 1   OPEID                         101121 non-null  object  
 2   Name                          101121 non-null  object  
 3   City                          101121 non-null  object  
 4   Zip                           101121 non-null  object  
 5   Predominant degree            82831 non-null   float64 
 6   Main branch                   101121 non-null  bool    
 7   NumBranches                   101121 non-null  int64   
 8   Primary degree type           101121 non-null  category
 9   High degree                   101121 non-null  category
 10  Control                       101099 non-null  category
 11  StateCode                     101092 non-null  category
 12  Region                        

In [24]:
# Share of null values in each column of the operational-school rows.
# Note: the 'percent' column holds fractions between 0 and 1, sorted from most to least missing.
operational_null_fraction = operational.isnull().sum().sort_values(ascending=False) / len(operational)
nas = pd.DataFrame(operational_null_fraction, columns=['percent'])
pos = nas['percent'] > 0
# Show only the columns that have at least one null value
nas.loc[pos]

Unnamed: 0,percent
RetentionPT-4yr,0.858031
ACTMidpoint,0.846234
NetPrice-Private,0.835306
SATAvg,0.82067
AvgCost-AY,0.810316
RetentionPT-<4yr,0.807646
RetentionFT-4yr,0.790874
MedianHouseholdIncome,0.760792
PercentReceiveFedLoan,0.759842
AvgFamilyIncome,0.759842
