#### Project by William Trevino, Alex Salamah, Christopher Graves, Hannah Kosinovsky

# Load Data
### Here we load our cleaned up dataset after Lab1 wrangling and exploratory analysis 

In [56]:
## pre work
import pandas as pd
import numpy as np
import time

pd.set_option('mode.chained_assignment', None)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Pull in the consolidated and cleaned data
df = pd.read_csv('minilab_data.csv')
# Drop empty columns
df = df.drop(columns=['Unnamed: 0'])
# Print some summary information to confirm data loaded properly
df.describe()
df.info()

Unnamed: 0,student_num,crime_per_c_num,advance_dgr_pct,_1yr_tchr_trnovr_pct,lea_sat_avg_score_num,lea_federal_perpupil_num,lea_local_perpupil_num,lea_state_perpupil_num,lea_salary_expense_pct,lea_supplies_expense_pct,lea_ap_participation_pct,lea_sat_participation_pct,MinorityFemalePct,MinorityMalePct,lea_avg_daily_attend_pct,total_perpupil_num,sat_above_average
count,9731.0,9731.0,9731.0,9731.0,9731.0,9731.0,9731.0,9731.0,9731.0,9731.0,9731.0,9731.0,9731.0,9731.0,9731.0,9731.0,9731.0
mean,557.901181,0.734068,0.28806,0.136827,996.441968,1043.617976,2052.783094,5773.449874,0.673819,0.079906,0.128199,0.475252,0.243913,0.255945,0.950031,8869.850944,0.200082
std,310.423995,1.861805,0.107781,0.078527,70.266033,278.821002,570.770132,609.756321,0.09548,0.015282,0.074229,0.122627,0.133914,0.142572,0.00968,786.106848,0.400082
min,1.0,0.0,0.0,0.0,771.0,424.73,740.52,4882.55,0.547,0.034,0.0,0.115,0.0,0.0,0.911,7282.19,0.0
25%,346.0,0.0,0.214,0.082,954.0,887.22,1670.75,5349.09,0.613,0.069,0.08,0.39,0.131897,0.135066,0.944,8352.28,0.0
50%,520.0,0.25,0.286,0.13,998.0,1029.04,2007.35,5638.31,0.628,0.08,0.12,0.48,0.231405,0.24147,0.95,8741.79,0.0
75%,708.0,0.84,0.356,0.182,1042.0,1180.42,2352.27,6028.1,0.763,0.091,0.19,0.57,0.349959,0.366397,0.955,9286.27,0.0
max,1725.0,36.36,0.625,0.4,1205.0,2020.79,4025.31,8108.71,0.878,0.127,0.33,0.82,0.649682,0.692308,0.999,12155.97,1.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9731 entries, 0 to 9730
Data columns (total 17 columns):
student_num                  9731 non-null float64
crime_per_c_num              9731 non-null float64
advance_dgr_pct              9731 non-null float64
_1yr_tchr_trnovr_pct         9731 non-null float64
lea_sat_avg_score_num        9731 non-null float64
lea_federal_perpupil_num     9731 non-null float64
lea_local_perpupil_num       9731 non-null float64
lea_state_perpupil_num       9731 non-null float64
lea_salary_expense_pct       9731 non-null float64
lea_supplies_expense_pct     9731 non-null float64
lea_ap_participation_pct     9731 non-null float64
lea_sat_participation_pct    9731 non-null float64
MinorityFemalePct            9731 non-null float64
MinorityMalePct              9731 non-null float64
lea_avg_daily_attend_pct     9731 non-null float64
total_perpupil_num           9731 non-null float64
sat_above_average            9731 non-null int64
dtypes: float64(16), int64(1

# Create Models

###### Create our x (predictors) and y (response) datasets and split them into test and train.  Because we wish later to interpret the coefficient weights of our trained models, we shall standardize our features such that we are comparing apples to apples rather than apples to oranges.

In [57]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics as mt
from sklearn.preprocessing import StandardScaler

# dummy data frame from which we create our X values
df_foo = df
# drop our response values from the X data set
df_foo = df_foo.drop(columns=['lea_sat_avg_score_num', 'sat_above_average'])
X = df_foo.values
# create our y values (response) from the original full data frame
y = df['sat_above_average'].values
# Create a test and train set with 80/20 split per rubric
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# scale attributes by the training set
scl_obj = StandardScaler()
scl_obj.fit(X_train) # find scalings for each column that make this zero mean and unit std
# the line of code above only looks at training data to get mean and std and we can use it 
# to transform new feature data

X_train_scaled = scl_obj.transform(X_train) # apply to training
X_test_scaled = scl_obj.transform(X_test)
# apply those means and std to the test set (without snooping at the test set values)

StandardScaler(copy=True, with_mean=True, with_std=True)

#### Logistic regression - Ridge
######  Here we create a Logistic regression model using the Limited-memory BFGS ( Broyden–Fletcher–Goldfarb–Shanno) algorithm

In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics as mt
# Log start time to track execution time
start = time.time()
# Build a model object with penalty of 12
lr_l2 = LogisticRegression(penalty='l2', C=1.0, class_weight=None, solver='lbfgs', max_iter=200) # get object
lr_l2.fit(X_train_scaled,y_train)  # train object
y_hat_l2 = lr_l2.predict(X_test_scaled) # get test set precitions
acc = mt.accuracy_score(y_test,y_hat_l2) # calculate the accuracy of our predictions
conf = mt.confusion_matrix(y_test,y_hat_l2) # create a confusion matrix
print("accuracy", acc )
print("confusion matrix\n",conf)
print("Exec time:", round(time.time() - start,4)) # Elapsed time since we grabbed the start time at top of cell

results = [(acc, round(time.time() - start,4), conf[0,0], conf[0,1], conf[1,0], conf[1,1])]
perf_compare = pd.DataFrame(results, columns = ['Accuracy', 'Execution Time', 'ConfusionUL', 'ConfusionUR', 'ConfusionLL', 'ConfusionLR'], index = ['Logistic LBFGS'])
perf_compare.head()



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

accuracy 0.9255264509501797
confusion matrix
 [[1497   53]
 [  92  305]]
Exec time: 0.0298


Unnamed: 0,Accuracy,Execution Time,ConfusionUL,ConfusionUR,ConfusionLL,ConfusionLR
Logistic LBFGS,0.925526,0.0299,1497,53,92,305


###### Logistric regression - Lasso
###### Here we build a Logistic Regression using  -- liblinear which does some stuff that William needs to help me interpret.  We have also adjusted the penalty factor to 11

In [59]:
start = time.time()
lr_l1 = LogisticRegression(penalty='l1', C=1.0, class_weight=None, solver='liblinear', max_iter=300) # get object

lr_l1.fit(X_train_scaled,y_train)  # train object
y_hat_l1 = lr_l1.predict(X_test_scaled) # get test set predictions
acc = mt.accuracy_score(y_test,y_hat_l1) # score the accuracy of predictions
conf = mt.confusion_matrix(y_test,y_hat_l1) # build a confusion matrix



print("accuracy", acc )
print("confusion matrix\n",conf)
print("ul",conf[0,0])
print("Exec time:", round(time.time() - start,4)) # Elapsed time since we grabbed the start time at top of cell

perf_compare.loc['Logistic LibLinear'] = [acc, round(time.time() - start,4), conf[0,0], conf[0,1], conf[1,0], conf[1,1]]
perf_compare.head()


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=300, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

accuracy 0.9265536723163842
confusion matrix
 [[1498   52]
 [  91  306]]
ul 1498
Exec time: 0.1797


Unnamed: 0,Accuracy,Execution Time,ConfusionUL,ConfusionUR,ConfusionLL,ConfusionLR
Logistic LBFGS,0.925526,0.0299,1497.0,53.0,92.0,305.0
Logistic LibLinear,0.926554,0.1797,1498.0,52.0,91.0,306.0


###### Support Vector Machine (SVM) Model
###### Here we creat an SVM model with the Radial Basis Function  (RBF) algorithm

In [60]:
from sklearn.svm import SVC

start = time.time()
svm_rbf = SVC(C=0.5, kernel='rbf', gamma='auto') # get object
svm_rbf.fit(X_train_scaled, y_train)  # train object

y_hat_rbf = svm_rbf.predict(X_test_scaled) # get test set precitions

acc = mt.accuracy_score(y_test,y_hat_rbf) # score the accuracy of predictions
conf = mt.confusion_matrix(y_test,y_hat_rbf) # build a confusion matrix
print('accuracy:', acc )
print(conf)
print("Exec time:", round(time.time() - start,4)) # Elapsed time since we grabbed the start time at top of cell

perf_compare.loc['SVM RBF'] = [acc, round(time.time() - start,4), conf[0,0], conf[0,1], conf[1,0], conf[1,1]]
perf_compare.head()

SVC(C=0.5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

accuracy: 0.975346687211094
[[1537   13]
 [  35  362]]
Exec time: 0.3879


Unnamed: 0,Accuracy,Execution Time,ConfusionUL,ConfusionUR,ConfusionLL,ConfusionLR
Logistic LBFGS,0.925526,0.0299,1497.0,53.0,92.0,305.0
Logistic LibLinear,0.926554,0.1797,1498.0,52.0,91.0,306.0
SVM RBF,0.975347,0.388,1537.0,13.0,35.0,362.0


###### SVM
###### Here we creat an SVM model with the Linear kernel algorithm - Why is this slower than the others (not incremental?)

In [61]:
start = time.time()
svm_linear = SVC(C=0.5, kernel='linear', gamma='auto') # get object
svm_linear.fit(X_train_scaled, y_train)  # train object

y_hat_linear = svm_linear.predict(X_test_scaled) # get test set precitions

acc = mt.accuracy_score(y_test,y_hat_linear) # score the accuracy of predictions
conf = mt.confusion_matrix(y_test,y_hat_linear) # build a confusion matrix
print('accuracy:', acc )
print(conf)
print("Exec time:", round(time.time() - start,4)) # Elapsed time since we grabbed the start time at top of cell

perf_compare.loc['SVM Linear'] = [acc, round(time.time() - start,4), conf[0,0], conf[0,1], conf[1,0], conf[1,1]]
perf_compare.head()

SVC(C=0.5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

accuracy: 0.9363122752953261
[[1489   61]
 [  63  334]]
Exec time: 0.5037


Unnamed: 0,Accuracy,Execution Time,ConfusionUL,ConfusionUR,ConfusionLL,ConfusionLR
Logistic LBFGS,0.925526,0.0299,1497.0,53.0,92.0,305.0
Logistic LibLinear,0.926554,0.1797,1498.0,52.0,91.0,306.0
SVM RBF,0.975347,0.388,1537.0,13.0,35.0,362.0
SVM Linear,0.936312,0.5038,1489.0,61.0,63.0,334.0


# Model Advantages

#### We find that SVM using the RBF Algorithm provides that best overall accuracy of prediction at a slight efficiency decrease (execution time increase) versus the Logistic regression models. 

#### We additionally captured the confusion matrix for each model in order to explore how often the model correctly identifies the relatively sparse occurence of above SAT average performant schools.  

#### Notably, our winning SVM RBF model presents the fewest number of schools improperly classified as average or below when they are indeed above average

#### The Upper Left (UL) Cell represents average or below SAT schools which were correctly predicted
#### The Upper Right (UR) Cell represents average or below SAT schools which were incorectly predicted above average
#### The Lower Left (LL) Cell above average SAT schools which were incorrectly predicted as average or below
#### The Lower Right(LR) Cell represents above average SAT schools which were correctly predicted

#### Logistic Regression using lbfgs, l2 -- fastest run time, decent overall accuracy, however, misclassified 92 above average schools. 
#### Logistic regression using liblinear, l1 -- second fastest run time, better accuracy versus LBFGS , but more or less on par.
#### SVM using rbf -- third fastest run time, best accuracy.  
#### SVM using linear -- very slow, second best accuracy

###### Note that execution times will be dependent on the underlying hardware platform.

In [9]:
perf_compare.head() #standardization

Unnamed: 0,Accuracy,Execution Time,ConfusionUL,ConfusionUR,ConfusionLL,ConfusionLR
Logistic LBFGS,0.925526,0.3822,1497.0,53.0,92.0,305.0
Logistic LibLinear,0.926554,0.1741,1498.0,52.0,91.0,306.0
SVM RBF,0.975347,0.5743,1537.0,13.0,35.0,362.0
SVM Linear,0.936312,0.883,1489.0,61.0,63.0,334.0


# Interpret Feature Importance

In [105]:
zip_vars = zip(df_foo.columns,lr_l2.coef_.T) # combine attributes
zip_vars = sorted(zip_vars)
compare = pd.DataFrame(zip_vars, columns=['Feature', 'weight'])
compare.loc[:, 'Log LBFGS Coefficient'] = compare.weight.map(lambda x: x[0]) 
compare = compare.drop(columns=['weight'])

zip_vars = zip(df_foo.columns,lr_l1.coef_.T) # combine attributes
zip_vars = sorted(zip_vars)
foo = pd.DataFrame(zip_vars, columns=['Feature', 'weight'])
foo.loc[:, 'Log LibLin Coefficient'] = foo.weight.map(lambda x: x[0]) 
foo = foo.drop(columns=['weight'])
compare = pd.merge(compare,foo, on='Feature')

zip_vars = zip(df_foo.columns,svm_linear.coef_.T) # combine attributes
zip_vars = sorted(zip_vars)
foo = pd.DataFrame(zip_vars, columns=['Feature', 'weight'])
foo.loc[:, 'SVM Lin Coefficient'] = foo.weight.map(lambda x: x[0]) 
foo = foo.drop(columns=['weight'])
compare = pd.merge(compare,foo, on='Feature')
compare.sort_values('Log LBFGS Coefficient')

Unnamed: 0,Feature,Log LBFGS Coefficient,Log LibLin Coefficient,SVM Lin Coefficient
7,lea_federal_perpupil_num,-2.325312,-2.448435,-1.38767
12,lea_supplies_expense_pct,-1.106658,-1.104092,-0.780264
10,lea_sat_participation_pct,-0.554906,-0.553131,-0.165829
1,MinorityMalePct,-0.198723,-0.196652,-0.049775
14,total_perpupil_num,-0.170022,0.150159,-0.118947
2,_1yr_tchr_trnovr_pct,-0.143678,-0.141936,-0.074103
0,MinorityFemalePct,-0.107748,-0.10613,-0.058089
4,crime_per_c_num,0.013176,0.010475,0.01368
13,student_num,0.061631,0.057584,0.030059
6,lea_avg_daily_attend_pct,0.123324,0.117817,0.119812


##### Because we have standardized our features , interpreting the coefficients is realatively straight forward.  Further, our Linear SVM gives us coefficients on the base features and can also be interpreted in line with the logistic regression.  

### Negative Associations
##### Federal Per Pupil Spend:  Since the greater part of school funds typically comes from local taxes, federal subsidy is likely associated with disadvantaged areas and perhaps this is indcative of lower SAT performance
##### Supplies Expense Percentage:  This negative association may be attributable to underspending elsewhere or simply indicative of the fact that salaries drive higher performance in SAT than supplies
##### SAT Participation:  A counterintuitive result.  This may be the fact that ill prepared pupils or those not college bound due to economic disadvantage tend to be those that do not particpate.  Thus, higher participation school wide might lead to lower performance. 
##### Minority Male Percentage:  The magnitude is quite small.  However, minority representation might be associated with under funded districts and/or economic disadvantage
##### Total Per Pupil Spend:  Our Log Lib Linear model gives contradictory indications but all provide relatively small coefficients.  The takeaway might be that the spend by student (regardless of source) is not a dramtic factor in SAT performance
##### One Year Turnover:  No surprise here.  Higher turnover among teachers is associated with lower SAT performance
##### Minority Femaile Percentage:  The magnitude is quite small.  However, minority representation might be associated with under funded districts and/or economic disadvantage
### Positive Associations
##### Crime Rate: Crime rate while positive accross the models does not appear to be a meaningful factor in SAT performance
##### School Size (number of students):  While it would appear larger schools perform better this isn't a large magnitude coefficient
##### Attendence:  Unsurprisingly, higher attendence predicts better SAT performance
##### Teachers with Advance Degrees:  While the positive association is not surprising.  The magnitude is.  Of the predictors in our model, this appears to be the most impactful in terms of predicting SAT outcomes.


# Interpret Support Vectors

##### While we were able to interpret support vectors for our linear model, the number of features in the RBF model makes interpretation impractical.  While the prediction accuracy was highest, it is less useful for interpretation or drawing out actionable information.

In [18]:
print(svm_rbf.support_vectors_)
print(svm_rbf.support_)

[[8.05000000e+02 1.12000000e+00 1.84000000e-01 ... 3.05418719e-01
  9.37000000e-01 9.27564000e+03]
 [3.03000000e+02 0.00000000e+00 4.00000000e-01 ... 6.53594771e-02
  9.46000000e-01 8.16005000e+03]
 [7.05000000e+02 0.00000000e+00 3.41000000e-01 ... 1.65945166e-01
  9.50000000e-01 9.01786000e+03]
 ...
 [3.61000000e+02 0.00000000e+00 4.23000000e-01 ... 6.33608815e-02
  9.54000000e-01 9.71132000e+03]
 [5.70000000e+01 2.28100000e+01 4.78000000e-01 ... 2.57577364e-01
  9.52000000e-01 8.36290000e+03]
 [3.77000000e+02 0.00000000e+00 3.44000000e-01 ... 2.75510204e-01
  9.49000000e-01 8.70505878e+03]]
[   0    1    4 ... 7758 7766 7772]
