In [1]:
# Set up the environment
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
%matplotlib inline

In [2]:
# Upload the data
# data will contain everything, names and locations
# df will contain the features used in the modeling
data = pd.read_csv('../data/processed/DecisionTreeData2010.csv', index_col=0)
#data = data.loc[data.Year > 2009]
data.sort_index(inplace=True)
data.head()

Unnamed: 0,OPEID,INSTNM,CITY,NUMBRANCH,PREDDEG,HIGHDEG,CONTROL,region,ADM_RATE,UGDS,...,Cost,PREDDEG_N,Governance,EnrollmentBins,TuitionBins,ExpenditureBins,FacultyBins,Complete,RetentionFT,RetentionPT
0,100200,alabama a & m university,Normal,1,3,4,1,5.0,0.5129,4930.0,...,13762.0,Bachelor,Public,xl,high,high,high,0.3306,0.7457,0.5
1,105200,university of alabama at birmingham,Birmingham,1,3,4,1,5.0,0.8243,10661.0,...,18003.0,Bachelor,Public,xl,high,highest,highest,0.4108,0.7997,0.7
2,2503400,amridge university,Montgomery,1,3,4,2,5.0,0.7143,370.0,...,20700.0,Bachelor,Private nonprofit,medium,highest,med,low,0.2308,0.8,
3,105500,university of alabama at huntsville,Huntsville,1,3,4,1,5.0,0.6614,5828.0,...,17064.0,Bachelor,Public,xl,high,high,highest,0.4364,0.7545,0.5385
4,100500,alabama state university,Montgomery,1,3,4,1,5.0,0.4372,4882.0,...,14390.0,Bachelor,Public,xl,high,high,high,0.2399,0.6352,0.4167


In [3]:
identity = data[['OPEID','INSTNM', 'CITY']]
identity.shape

(30682, 3)

In [4]:
# Separate open and closed schoools
closed = data.loc[data.CURROPER == False]
closed.shape

(2759, 33)

In [5]:
closed_degree = closed.pivot_table(index='PREDDEG_N', columns=['CONTROL'], aggfunc='size')
print(closed_degree)

CONTROL           1    2     3
PREDDEG_N                     
Associates       42   26   257
Bachelor         13   94   134
Certificate     117  147  1577
Graduate          1   40    58
Not classified   27   35   191


In [6]:
closed.PREDDEG_N.value_counts()

Certificate       1841
Associates         325
Not classified     253
Bachelor           241
Graduate            99
Name: PREDDEG_N, dtype: int64

In [7]:
# Resample the minority class
from sklearn.utils import resample
closed_upsample = resample(closed, replace=True, n_samples=closed.shape[0]*2, random_state=123)

In [8]:
open_df = data.loc[data.CURROPER==True]
open_df.shape

(27923, 33)

In [9]:
current = open_df.loc[data.Year > 2012]
current.shape

(7440, 33)

In [10]:
current_degree = current.pivot_table(index='PREDDEG_N', columns=['CONTROL','CURROPER'], aggfunc='size')
print(current_degree)

CONTROL           1     2     3
CURROPER       True  True  True
PREDDEG_N                      
Associates      780   163   545
Bachelor        588  1241   271
Certificate     594   199  2300
Graduate         15   236    35
Not classified   75    89   309


In [11]:
current.PREDDEG_N.value_counts()

Certificate       3093
Bachelor          2100
Associates        1488
Not classified     473
Graduate           286
Name: PREDDEG_N, dtype: int64

In [12]:
data = pd.concat([current, closed_upsample], axis = 0)
data.shape

(12958, 33)

In [13]:
data.CURROPER.value_counts()

True     7440
False    5518
Name: CURROPER, dtype: int64

In [14]:
data.head()

Unnamed: 0,OPEID,INSTNM,CITY,NUMBRANCH,PREDDEG,HIGHDEG,CONTROL,region,ADM_RATE,UGDS,...,Cost,PREDDEG_N,Governance,EnrollmentBins,TuitionBins,ExpenditureBins,FacultyBins,Complete,RetentionFT,RetentionPT
22882,00100200,alabama a & m university,Normal,1,3,4,1,5.0,0.8989,4051.0,...,18888.0,Bachelor,Public,xl,high,high,high,0.2914,0.6314,0.5
22883,00105200,university of alabama at birmingham,Birmingham,1,3,4,1,5.0,0.8673,11200.0,...,19990.0,Bachelor,Public,xl,high,highest,highest,0.5377,0.8016,0.5
22884,02503400,amridge university,Montgomery,1,3,4,2,5.0,,322.0,...,12300.0,Bachelor,Private nonprofit,medium,highest,med,low,0.6667,0.375,0.3333
22885,Other,university of alabama in huntsville,Huntsville,1,3,4,1,5.0,0.8062,5525.0,...,20306.0,Bachelor,Public,xl,high,high,highest,0.4835,0.8098,0.4444
22886,00100500,alabama state university,Montgomery,1,3,4,1,5.0,0.5125,5354.0,...,17400.0,Bachelor,Public,xl,high,high,high,0.2517,0.6219,0.3023


## Simple EDA with the data subset used for modeling

In [15]:
data['PREDDEG_N'].value_counts()

Certificate       6762
Bachelor          2594
Associates        2126
Not classified     979
Graduate           497
Name: PREDDEG_N, dtype: int64

In [16]:
print(data.CONTROL_N.value_counts())
#print(data.Governance.value_counts())

AttributeError: 'DataFrame' object has no attribute 'CONTROL_N'

In [None]:
control_degree = data.pivot_table(index='PREDDEG_N', columns=['CONTROL','CURROPER'], aggfunc='size')
print(control_degree)
#sns.heatmap(control_degree, cmap='YlGnBu')
#plt.savefig('../figures/PreddegControlHeatmap.png',bbox_inches='tight')

## Drop columns not needed for analysis

In [None]:
#df contains just the feature to train the model on
to_drop = ['OPEID', 'INSTNM', 'CITY', 'HIGHDEG', 'ADM_RATE', 'NetPrice', 'region','EnrollmentBins','CONTROL_N',\
           'TuitionBins','ExpenditureBins','FacultyBins', 'AVGFACSAL', 'RetentionPT', 'PCTFLOAN', 'Year', 'PREDDEG_N']
df = data.drop(to_drop, axis=1)
df.head()

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
y = df.CURROPER
X = df.drop('CURROPER', axis=1)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21, stratify=y)

In [None]:
X_test.shape

# Decision Tree Model
Basis for comparison

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier

## Explore just the criterion and max_depth features

Code modified from https://towardsdatascience.com/decision-tree-build-prune-and-visualize-it-using-python-12ceee9af752

In [None]:
from sklearn import metrics
max_depth = []
acc_gini = []
acc_entropy = []
for i in range(1, 31):
    gtree = DecisionTreeClassifier(criterion='gini', max_depth=i, random_state=21)
    gtree.fit(X_train, y_train)
    gpredict = gtree.predict(X_test)
    acc_gini.append(metrics.accuracy_score(y_test, gpredict))
    ##
    etree = DecisionTreeClassifier(criterion='entropy', max_depth=i, random_state=21)
    etree.fit(X_train, y_train)
    epredict = etree.predict(X_test)
    acc_entropy.append(metrics.accuracy_score(y_test, epredict))
    ##
    max_depth.append(i)

In [None]:
gini_max = max(acc_gini)
gini_idx = acc_gini.index(gini_max)
entropy_max = max(acc_entropy)
entropy_idx = acc_entropy.index(entropy_max)

if gini_max > entropy_max:
    depth = max_depth[gini_idx]
    crit = 'gini'
else:
    depth = max_depth[entropy_idx]
    crit='entropy'
    
print(depth)

In [None]:
trees = pd.DataFrame({'acc_gini':pd.Series(acc_gini),
                     'acc_entropy':pd.Series(acc_entropy),
                     'max_depth':pd.Series(max_depth)})

plt.plot('max_depth', 'acc_gini', data=trees, label='gini')
plt.plot('max_depth', 'acc_entropy', data=trees, label='entropy')
#plt.vlines(6, 0.895, 0.915)
plt.xlabel('max_depth')
plt.ylabel('accuracy')
plt.legend()
plt.show()

In [None]:
# Train the decision tree using the identified hyperparameters
modelDT = DecisionTreeClassifier(criterion=crit, max_depth=depth, random_state=21)
modelDT.fit(X_train, y_train)

In [None]:
# Use the trained tree to predict the testing data
dt_pred = modelDT.predict(X_test)
dt_pred_prob = modelDT.predict_proba(X_test)
#print(type(dt_pred_prob))

In [None]:
# Run this block for model evaluation 
#from sklearn import metrics
print("Model Metrics")
print("Accuracy:", metrics.accuracy_score(y_test, dt_pred))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test, dt_pred))
print('Precision score for "Yes"' , metrics.precision_score(y_test, dt_pred, pos_label = 1))
print('Recall score for "No"' , metrics.recall_score(y_test, dt_pred, pos_label = 0))

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, dt_pred))

In [None]:
print(classification_report(y_test, dt_pred))

In [None]:
importances = modelDT.feature_importances_
#std = np.std([tree.feature_importances_ for tree in modelDT.estimators_],axis=0)
indices = np.argsort(importances)[::-1]
names = [X_train.columns[i] for i in indices]

print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %s (%f)" % (f + 1, names[f], importances[indices[f]]))

In [None]:
# Plot the impurity-based feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
        color="r", align="center") #yerr=std[indices]
plt.xticks(range(X.shape[1]), names)#indices)
plt.xlim([-1, X.shape[1]])
plt.show()

CONTROL and UGDS are the most prominent features - that predict open schools. (Control is categorical, so I'm not exactly sure how to interpret it). More enrollment increases prediction of being open.

## Decision Tree Model with various thresholds

In [None]:
threshold = []
accurate=[]
balance_acc = []
pred_proba = []

for thresh in range(50, 100, 5):
    new_pred = []
    for i in range(0, len(dt_pred_prob)):
        # print(len(dt_pred_prob))
        new_pred.append(dt_pred_prob[i, 1] > (thresh / 100.0))
    # print(new_pred)
    accurate.append(metrics.accuracy_score(y_test, new_pred))
    balance_acc.append(metrics.balanced_accuracy_score(y_test, new_pred))
    threshold.append(thresh)
    pred_proba.append(new_pred)

In [None]:
rate = pd.DataFrame({'threshold':pd.Series(threshold),
                     'accuracy':pd.Series(accurate),
                     'balance_acc':pd.Series(balance_acc)})

plt.plot('threshold', 'accuracy', data=rate, label='accuracy')
plt.plot('threshold', 'balance_acc', data=rate, label='balanced accuracy')
#plt.vlines(6, 0.895, 0.915)
plt.xlabel('Threshold')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
print("Balance Accuracy: ", max(balance_acc))
idx = balance_acc.index(max(balance_acc))
print("Threshold: ", threshold[idx])

## AdaBoost model

In [None]:
ABmax_depth = []
score_a = []
score_b = []

for i in range(105, 131, 1):
    atree = AdaBoostClassifier(n_estimators=i, random_state=21)
    atree.fit(X_train, y_train)
    apredict = gtree.predict(X_test)
    score_a.append(metrics.accuracy_score(y_test, apredict))
    ##
    btree = AdaBoostClassifier(modelDT, n_estimators=i, random_state=21)
    btree.fit(X_train, y_train)
    bpredict = btree.predict(X_test)
    score_b.append(metrics.accuracy_score(y_test, bpredict))
    ##
    ABmax_depth.append(i)

In [None]:
adatrees = pd.DataFrame({'score_a':pd.Series(score_a),
                     'score_b':pd.Series(score_b),
                     'n_estimators':pd.Series(ABmax_depth)})

plt.plot('n_estimators', 'score_a', data=adatrees, label='No_base')
plt.plot('n_estimators', 'score_b', data=adatrees, label='Base')
#plt.vlines(6, 0.895, 0.915)
plt.xlabel('n_estimators')
plt.ylabel('accuracy')
plt.legend()
plt.show()

In [None]:
ideal_estimators = max(score_b)
idx = score_b.index(ideal_estimators)
print(ABmax_depth[idx])

In [None]:
modelAB = AdaBoostClassifier(modelDT, n_estimators=ABmax_depth[idx], random_state=21)
modelAB.fit(X_train, y_train)
AB_pred = modelAB.predict(X_test)
print(metrics.accuracy_score(y_test, AB_pred))

In [None]:
print(confusion_matrix(y_test, AB_pred))

In [None]:
print(classification_report(y_test, AB_pred))

In [None]:
importances = modelAB.feature_importances_
std = np.std([tree.feature_importances_ for tree in modelAB.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
names = [X_train.columns[i] for i in indices]

print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %s (%f)" % (f + 1, names[f], importances[indices[f]]))

In [None]:
# Plot the impurity-based feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

### Explore different threshold values for AB Model

In [None]:
AB_pred_prob = modelAB.predict_proba(X_test)

ABthreshold = []
ABaccurate=[]
ABbalance_acc = []
ABpred_proba = []

for thresh in range(50, 100, 5):
    new_pred = []
    for i in range(0, len(AB_pred_prob)):
        # print(len(dt_pred_prob))
        new_pred.append(AB_pred_prob[i, 1] > (thresh / 100.0))
    # print(new_pred)
    ABaccurate.append(metrics.accuracy_score(y_test, new_pred))
    ABbalance_acc.append(metrics.balanced_accuracy_score(y_test, new_pred))
    ABthreshold.append(thresh)
    ABpred_proba.append(new_pred)

In [None]:
ABrate = pd.DataFrame({'threshold':pd.Series(ABthreshold),
                     'accuracy':pd.Series(ABaccurate),
                     'balance_acc':pd.Series(ABbalance_acc)})

plt.plot('threshold', 'accuracy', data=ABrate, label='accuracy')
plt.plot('threshold', 'balance_acc', data=ABrate, label='balanced accuracy')
#plt.vlines(6, 0.895, 0.915)
plt.xlabel('Threshold')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

## Exploration of misclassified schools

We are interested in predicting schools that might be in danger of closing. This would be schools that are currently open, but that the model predicts are closed. (These would be the "false negative" cases.) There are 224 schools in this class that could be explored further.

In [None]:
result_dict = {'CURROPER':y_test, 'Predicted':AB_pred}
results = pd.DataFrame(result_dict)
results.head()

In [None]:
predicted = pd.concat([X_test, results], axis=1)
predicted.head()

In [None]:
false_pos = predicted.loc[(predicted.Predicted == True) & (predicted.CURROPER == False)]
false_pos.shape

In [None]:
false_neg = predicted.loc[(predicted.Predicted==False) & (predicted.CURROPER==True)]
false_neg.shape

## Generate graphs for the report

In [None]:
predicted['Result'] = np.nan
predicted.loc[(predicted.Predicted==True)&(predicted.CURROPER==True), 'Result'] = 'TP'
predicted.loc[(predicted.Predicted==False)&(predicted.CURROPER==False), 'Result'] = 'TN'
predicted.loc[(predicted.Predicted==True)&(predicted.CURROPER==False), 'Result'] = 'FP'
predicted.loc[(predicted.Predicted==False)&(predicted.CURROPER==True), 'Result'] = 'FN'
print(predicted.Result)

In [None]:
to_graph1 = pd.merge(predicted, data)
to_graph = pd.merge(identity, to_graph1)
to_graph.head()

In [None]:
to_graph.CONTROL = to_graph.CONTROL.astype('str')
to_graph.CONTROL.loc[to_graph.CONTROL == '1.0'] = 'Public'
to_graph.CONTROL.loc[to_graph.CONTROL == '2.0'] = 'NonProfit'
to_graph.CONTROL.loc[to_graph.CONTROL == '3.0'] = 'ForProfit'

to_graph.PREDDEG = to_graph.PREDDEG.astype('str')
to_graph.drop(['EnrollmentBins', 'TuitionBins', 'ExpenditureBins', 'FacultyBins'], axis=1, inplace=True)
to_graph.info()

In [None]:
to_graph.duplicated().sum()

In [None]:
to_graph.drop_duplicates(inplace=True)
to_graph.shape

In [None]:
print(confusion_matrix(to_graph['CURROPER'], to_graph['Predicted']))

In [None]:
print(classification_report(to_graph['CURROPER'], to_graph['Predicted']))

In [None]:
degreeControl = to_graph.pivot_table(index='PREDDEG', columns=['CONTROL','CURROPER'], aggfunc='size')
print(degreeControl)

In [None]:
predControl = to_graph.pivot_table(index='PREDDEG', columns=['CONTROL','Result'], aggfunc='size')
print(predControl)

In [None]:
controlPivot = to_graph.pivot_table(index='CONTROL_N', columns = 'Result', aggfunc='size')
print(controlPivot)

In [None]:
false_neg = to_graph.loc[to_graph.Result == 'FN']
false_neg.head()

## UGDS - Undergraduate enrollment

In [None]:
fig=plt.figure(figsize=(10,4), dpi= 100, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
sns.boxplot('CONTROL_N', 'UGDS', data = to_graph, hue = 'Result')
plt.ylim(0, 20000)
plt.title('Enrollment by Control')

plt.subplot(1, 2, 2)
sns.boxplot('PREDDEG', 'UGDS', data = to_graph, hue = 'Result')
plt.ylim(0, 10000)
plt.title('Enrollment by Degree Type')
plt.tight_layout()

In [None]:
fig=plt.figure(figsize=(10,5), dpi= 100, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
sns.scatterplot('TUITFTE', 'UGDS', data=to_graph.loc[to_graph.Predicted==True], hue = 'CONTROL_N',style='Result', alpha=0.5 )
plt.title('Predicted Positive')
plt.xlim(0,100000)
plt.ylim(0,20000)

plt.subplot(1, 2, 2)
sns.scatterplot('TUITFTE', 'UGDS', data=to_graph.loc[to_graph.Predicted==False], hue = 'CONTROL_N', style='Result', alpha=0.5)
plt.title('Predicted Negative')
plt.xlim(0,100000)
plt.ylim(0,20000)
plt.tight_layout()
#plt.show()
plt.savefig('../figures/TuitionEnrollmentResultsControl.png',bbox_inches='tight')

In [None]:
fig=plt.figure(figsize=(10,5), dpi= 100, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
sns.scatterplot('TUITFTE', 'UGDS', data=to_graph.loc[to_graph.Predicted==True], hue='Result',style='CONTROL_N', alpha=0.5 )
plt.title('Predicted Positive')
plt.xlim(0,100000)
plt.ylim(0,20000)

plt.subplot(1, 2, 2)
sns.scatterplot('TUITFTE', 'UGDS', data=to_graph.loc[to_graph.Predicted==False], hue='Result', style='CONTROL_N',size='Result', sizes={20,40}, alpha=0.5)
plt.title('Predicted Negative')
plt.xlim(0,100000)
plt.ylim(0,20000)
plt.tight_layout()
#plt.show()
plt.savefig('../figures/TuitionEnrollmentResults.png',bbox_inches='tight')

## Tuition and Cost

In [None]:
fig=plt.figure(figsize=(10,5), dpi= 100, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
sns.scatterplot('TUITFTE', 'Cost', data=to_graph.loc[to_graph.Predicted==True], hue ='CONTROL_N', style='Result', alpha=0.5)
plt.xlim(0, 80000)
plt.ylim(0, 80000)
plt.title('Predicted Positive')

plt.subplot(1, 2, 2)
sns.scatterplot('TUITFTE', 'Cost', data=to_graph.loc[to_graph.Predicted==False], hue = 'CONTROL_N', style='Result', alpha=0.5)
plt.xlim(0, 80000)
plt.ylim(0, 80000)
plt.title('Predicted Negative')
plt.tight_layout()
plt.savefig('../figures/TuitionCostResultsControl.png',bbox_inches='tight')

In [None]:
fig=plt.figure(figsize=(10,5), dpi= 100, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
sns.scatterplot('TUITFTE', 'Cost', data=to_graph.loc[to_graph.Predicted==True], hue ='Result', style='CONTROL_N', alpha=0.5)
plt.xlim(0, 80000)
plt.ylim(0, 80000)
plt.title('Predicted Positive')

plt.subplot(1, 2, 2)
sns.scatterplot('TUITFTE', 'Cost', data=to_graph.loc[to_graph.Predicted==False], hue = 'Result', style='CONTROL_N',size='Result', sizes={20,40}, alpha=0.5)
plt.xlim(0, 80000)
plt.ylim(0, 80000)
plt.title('Predicted Negative')
plt.tight_layout()
plt.savefig('../figures/TuitionCostResults.png',bbox_inches='tight')

## Cost and Debt

In [None]:
fig=plt.figure(figsize=(10,5), dpi= 100, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
sns.scatterplot('Cost', 'DEBT_MDN', data=to_graph.loc[to_graph.Predicted==True], hue='CONTROL_N', style='CURROPER', alpha=0.5)
plt.title('Predicted Positive')
plt.ylim(0,30000)

plt.subplot(1, 2, 2)
sns.scatterplot('Cost', 'DEBT_MDN', data=to_graph.loc[to_graph.Predicted==False], hue='CONTROL_N', style='CURROPER',size='Result', sizes={20,40}, alpha=0.5) #, label='True')
plt.title('Predicted Negative')
plt.ylim(0,30000)
plt.tight_layout()

#plt.show()
plt.savefig('../figures/CostDebtResultsControl.png',bbox_inches='tight')

In [None]:
fig=plt.figure(figsize=(10,5), dpi= 100, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
sns.scatterplot('Cost', 'DEBT_MDN', data=to_graph.loc[to_graph.Predicted==True], hue='CONTROL_N', style='Result', alpha=0.5)
plt.title('Predicted Positive')
plt.ylim(0,30000)
plt.xlim(0,60000)

plt.subplot(1, 2, 2)
sns.scatterplot('Cost', 'DEBT_MDN', data=to_graph.loc[to_graph.Predicted==True], hue='Result', style='CONTROL_N',size='Result', sizes={20,40}, alpha=0.5)
plt.title('PredictedPostive')
plt.ylim(0,30000)
plt.xlim(0,60000)
plt.tight_layout()
#plt.show()
plt.savefig('../figures/TuitionCostResultsPositive.png',bbox_inches='tight')

In [None]:
fig=plt.figure(figsize=(10,5), dpi= 100, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
sns.scatterplot('Cost', 'DEBT_MDN', data=to_graph.loc[to_graph.Predicted==False], hue='CONTROL_N', style='Result', alpha=0.5)
plt.title('Predicted Negative')
plt.ylim(0,30000)
plt.xlim(0,60000)

plt.subplot(1, 2, 2)
sns.scatterplot('Cost', 'DEBT_MDN', data=to_graph.loc[to_graph.Predicted==False], hue='Result', style='CONTROL_N', size='Result', sizes={20,40}, alpha=0.5)
plt.title('Predicted Negative')
plt.ylim(0,30000)
plt.xlim(0,60000)
plt.tight_layout()
plt.show()
plt.savefig('../figures/TuitionCostResultsNegative.png',bbox_inches='tight')

## NumBranch

In [None]:
to_graph.NUMBRANCH.hist(bins=25)

In [None]:
fig=plt.figure(figsize=(10,4), dpi= 100, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
sns.boxplot('CONTROL_N', 'NUMBRANCH', data = to_graph, hue = 'Result')
#plt.ylim(0, 100)
plt.title('Number of Branches by Control')

plt.subplot(1, 2, 2)
sns.boxplot('PREDDEG', 'NUMBRANCH', data = to_graph, hue = 'Result')
#plt.ylim(0, 100)
plt.title('Number of Branches by Degree Type')
plt.tight_layout()
plt.show()

In [None]:
fig=plt.figure(figsize=(10,5), dpi= 100, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
sns.scatterplot('NUMBRANCH', 'UGDS', data=to_graph.loc[to_graph.Predicted==True], hue='Result', style='CONTROL_N', alpha=0.5)
plt.title('Predicted Positive')

plt.subplot(1, 2, 2)
sns.scatterplot('NUMBRANCH', 'UGDS', data=to_graph.loc[to_graph.Predicted==False], hue='Result', style='CONTROL_N',size='Result', sizes={20,40}, alpha=0.5)
plt.title('Predicted Negative')
plt.tight_layout()
plt.show()
plt.savefig('../figures/NumBranchUGDSResults.png',bbox_inches='tight')

## Instructional Expenditure

In [None]:
fig=plt.figure(figsize=(10,4), dpi= 100, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
sns.boxplot('CONTROL_N', 'INEXPFTE', data = to_graph, hue = 'Result')
plt.ylim(0, 100000)
plt.title('Instructional Expenditure by Control')

plt.subplot(1, 2, 2)
sns.boxplot('PREDDEG', 'INEXPFTE', data = to_graph, hue = 'Result')
plt.ylim(0, 100000)
plt.title('Instructional Expenditure by Degree Type')
plt.tight_layout()
plt.show()

In [None]:
fig=plt.figure(figsize=(10,5), dpi= 100, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
#sns.scatterplot('', 'UGDS', data=to_graph, hue = 'Result')
sns.scatterplot('INEXPFTE', 'UGDS', data=to_graph.loc[to_graph.Predicted==True], hue='Result', style='CONTROL_N', alpha=0.5)
plt.xlim(0, 100000)
plt.ylim(0, 12000)
plt.title('Predicted Positive')

plt.subplot(1, 2, 2)
#sns.scatterplot('INEXPFTE', 'UGDS', data=to_graph.loc[to_graph.Result=='FN'], hue = 'CONTROL')
sns.scatterplot('INEXPFTE', 'UGDS', data=to_graph.loc[to_graph.Predicted==False], hue='Result', style='CONTROL_N',size='Result', sizes={20,40}, alpha=0.5)
plt.xlim(0, 100000)
plt.ylim(0, 12000)
plt.title('Predicted Negative')
plt.tight_layout()
#plt.show()
plt.savefig('../figures/ExpenditureUGDSResults.png',bbox_inches='tight')

In [None]:
fig=plt.figure(figsize=(10,5), dpi= 100, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
sns.scatterplot('INEXPFTE', 'Cost', data=to_graph.loc[to_graph.Predicted==True], hue='Result', style='CONTROL_N',size='Result', sizes={20,40})#, alpha=0.5)
plt.ylim(0, 50000)
plt.title('Predicted Positive')

plt.subplot(1, 2, 2)
sns.scatterplot('INEXPFTE', 'Cost', data=to_graph.loc[to_graph.Predicted==False], hue = 'Result', style='CONTROL_N', size='Result', sizes={20,40})
plt.ylim(0, 50000)
plt.title('Predicted Negative')
plt.tight_layout()
#plt.show()
plt.savefig('../figures/ExpenditureCostResultsPositive.png',bbox_inches='tight')

In [None]:
## Graphs of just 4yr bachelors institutions

In [None]:
bach = to_graph.loc[to_graph['PREDDEG']=='3']
bach.shape

In [None]:
bach.columns

In [None]:
fig=plt.figure(figsize=(10,5), dpi= 100, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
sns.scatterplot('Cost', 'DEBT_MDN', data=bach[bach.Predicted==False], hue='CONTROL_N', style='Result', alpha=0.5)
plt.title('Predicted Negative')
plt.ylim(0,30000)
plt.xlim(0,60000)

plt.subplot(1, 2, 2)
sns.scatterplot('Cost', 'DEBT_MDN', data=bach.loc[bach.Predicted==False], hue='Result', style='CONTROL_N', size='Result', sizes={20,40}, alpha=0.5)
plt.title('Predicted Negative')
plt.ylim(0,30000)
plt.xlim(0,60000)
plt.tight_layout()
plt.show()
plt.savefig('../figures/BachTuitionCostResultsNegative.png',bbox_inches='tight')

In [None]:
fig=plt.figure(figsize=(10,5), dpi= 100, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
sns.scatterplot('Cost', 'DEBT_MDN', data=bach.loc[bach.Predicted==True], hue='CONTROL_N', style='Result', alpha=0.5)
plt.title('Predicted Positive')
plt.ylim(0,30000)
plt.xlim(0,60000)

plt.subplot(1, 2, 2)
sns.scatterplot('Cost', 'DEBT_MDN', data=bach.loc[bach.Predicted==True], hue='Result', style='CONTROL_N',size='Result', sizes={20,40}, alpha=0.5)
plt.title('PredictedPostive')
plt.ylim(0,30000)
plt.xlim(0,60000)
plt.tight_layout()
#plt.show()
plt.savefig('../figures/BachTuitionCostResultsPositive.png',bbox_inches='tight')

In [None]:
fig=plt.figure(figsize=(10,5), dpi= 100, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
sns.scatterplot('Cost', 'DEBT_MDN', data=bach.loc[bach.Predicted==True], hue='CONTROL_N', style='CURROPER', alpha=0.5)
plt.title('Predicted Positive')
plt.ylim(0,30000)

plt.subplot(1, 2, 2)
sns.scatterplot('Cost', 'DEBT_MDN', data=bach.loc[bach.Predicted==False], hue='CONTROL_N', style='CURROPER',size='Result', sizes={20,40}, alpha=0.5) #, label='True')
plt.title('Predicted Negative')
plt.ylim(0,30000)
plt.tight_layout()

#plt.show()
plt.savefig('../figures/BachCostDebtResultsControl.png',bbox_inches='tight')

In [None]:
fig=plt.figure(figsize=(10,5), dpi= 100, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
sns.scatterplot('TUITFTE', 'Cost', data=bach.loc[bach.Predicted==True], hue ='Result', style='CONTROL_N', alpha=0.5)
plt.xlim(0, 80000)
plt.ylim(0, 80000)
plt.title('Predicted Positive')

plt.subplot(1, 2, 2)
sns.scatterplot('TUITFTE', 'Cost', data=bach.loc[bach.Predicted==False], hue = 'Result', style='CONTROL_N',size='Result', sizes={20,40}, alpha=0.5)
plt.xlim(0, 80000)
plt.ylim(0, 80000)
plt.title('Predicted Negative')
plt.tight_layout()
plt.savefig('../figures/BachTuitionCostResults.png',bbox_inches='tight')

In [None]:
fig=plt.figure(figsize=(10,5), dpi= 100, facecolor='w', edgecolor='k')
plt.subplot(1, 2, 1)
sns.scatterplot('TUITFTE', 'UGDS', data=bach.loc[bach.Predicted==True], hue = 'CONTROL_N',style='Result', alpha=0.5 )
plt.title('Predicted Positive')
plt.xlim(0,100000)
plt.ylim(0,20000)

plt.subplot(1, 2, 2)
sns.scatterplot('TUITFTE', 'UGDS', data=bach.loc[bach.Predicted==False], hue = 'CONTROL_N', style='Result', alpha=0.5)
plt.title('Predicted Negative')
plt.xlim(0,100000)
plt.ylim(0,20000)
plt.tight_layout()
#plt.show()
plt.savefig('../figures/BachTuitionEnrollmentResultsControl.png',bbox_inches='tight')

## Explore predictions of various schools

In [None]:
bethel = to_graph.loc[to_graph['INSTNM'].str.startswith('bethel')]
bethel.shape

In [None]:
bethel.CURROPER.value_counts()

In [None]:
bethel.head()

In [None]:
taylorIN = data.loc[data['INSTNM'].str.startswith('taylor')]
taylorIN.shape

In [None]:
taylorIN.head(7)

In [None]:
indiana = data.loc[data['INSTNM'].str.startswith('indiana university')]
indiana.shape

In [None]:
indiana.head()

In [None]:
dangerBachPrivate = bach.loc[(bach['CONTROL']=='2') & (bach['Result']=='FN')]
dangerBachPrivate.shape

In [None]:
dangerBachPrivate.head(7)

In [None]:
# Naropa still in operation (at least they still have an active website)
# Thomas More still in operation
# Williamson Christian still in operation
# Ecclesia still open
# Messenger still open
# LIU = Long island university - Brentwood still open, Riverhead still open