In [2]:
import pandas as pd
from sklearn.model_selection  import train_test_split
from sklearn import metrics
from sklearn import tree
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier 
from sklearn.svm import SVC 


college = pd.read_csv("College.csv", sep=',', header=0).drop(['Unnamed: 0'], axis=1)
college

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,No,2197,1515,543,4,26,3089,2029,6797,3900,500,1200,60,60,21.0,14,4469,40
773,Yes,1959,1805,695,24,47,2849,1107,11520,4960,600,1250,73,75,13.3,31,9189,83
774,Yes,2097,1915,695,34,61,2793,166,6900,4200,617,781,67,75,14.4,20,8323,49
775,Yes,10705,2453,1317,95,99,5217,83,19840,6510,630,2115,96,96,5.8,49,40386,99


In [3]:
# 1-1
# check scale of variables
college.var().sort_values()

S.F.Ratio      1.566853e+01
perc.alumni    1.535567e+02
Terminal       2.167478e+02
PhD            2.666086e+02
Grad.Rate      2.950737e+02
Top10perc      3.111825e+02
Top25perc      3.922292e+02
Books          2.725978e+04
Personal       4.584258e+05
Enroll         8.633684e+05
Room.Board     1.202743e+06
P.Undergrad    2.317799e+06
Accept         6.007960e+06
Apps           1.497846e+07
Outstate       1.618466e+07
F.Undergrad    2.352658e+07
Expend         2.726687e+07
dtype: float64

In [4]:
# 1-2
# split data into training/test set

y=college['Private'].astype('category').cat.codes
X=college.loc[:, college.columns != 'Private']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=109)  


In [5]:
# 1-3
# normalize
scaler = StandardScaler()  
print(scaler.fit_transform(X_train).std(axis=0))
X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test) 
print(scaler.fit_transform(X_train).mean(axis=0))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[-2.45353155e-17  1.63568770e-17 -2.78066909e-17  1.30855016e-17
  1.63568770e-17  9.81412618e-18  3.59851293e-17  2.28996278e-17
 -9.81412618e-18 -5.72490694e-17 -1.14498139e-17  0.00000000e+00
 -1.30855016e-17 -8.17843849e-18  0.00000000e+00 -4.90706309e-18
 -2.61710032e-17]


In [17]:
# 2-1
"""
Try to fit a NN model on the training data, and evaluate its performance 
on the test data. Let’s try to fit a NN with one hidden layer with 10 
nodes and use logistic for an activation function. 
What is the test error rate? What is the test error rate if we always 
predict “Yes”? What is precision? What is recall?  What is prediction 
accuracy among the cases that are predicted to be private? How would you 
compare this result with overall rate of private college?

"""
## nn model
mlp = MLPClassifier(activation='logistic',hidden_layer_sizes=(10), max_iter=1000)  
mlp.fit(X_train, y_train)  

  
## predict test set 
y_pred = mlp.predict(X_test)  

## confusion matrix
print(metrics.confusion_matrix(y_test, y_pred)) 
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred))
print('Precision: ', metrics.precision_score(y_test, y_pred))
print('Recall: ', metrics.recall_score(y_test, y_pred))


[[ 68   8]
 [  5 153]]
Accuracy:  0.9444444444444444
Precision:  0.9503105590062112
Recall:  0.9683544303797469


In [16]:
print(college['Private'].value_counts())
print(565/(565+212))

Yes    565
No     212
Name: Private, dtype: int64
0.7271557271557272


In [13]:
# 2-2
"""
Let’s try to fit a better NN model with different number of layers and 
hidden nodes (Do some experiments on the number of layers and number of 
nodes). Also, you can try to use other activation function, e.g. 
‘identity’, ‘tanh’, ‘relu’. 
What is the test error rate? What is the test error rate if we always 
predict “Yes”? What is precision? What is recall?  What is prediction 
accuracy among the cases that are predicted to be private? How would you 
compare this result with overall rate of private college?
"""

## nn model - identity
mlp = MLPClassifier(activation = 'identity',hidden_layer_sizes=(10,10,10), max_iter=1000)  
mlp.fit(X_train, y_train)  

  
## predict test set 
y_pred = mlp.predict(X_test)  

## confusion matrix
print(metrics.confusion_matrix(y_test, y_pred)) 
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred))
print('Precision: ', metrics.precision_score(y_test, y_pred))
print('Recall: ', metrics.recall_score(y_test, y_pred))


[[ 69   7]
 [  4 154]]
Accuracy:  0.9529914529914529
Precision:  0.9565217391304348
Recall:  0.9746835443037974


In [15]:
## nn model - relu
mlp = MLPClassifier(activation = 'relu',hidden_layer_sizes=(10,10,10), max_iter=1000)  
mlp.fit(X_train, y_train)  

  
## predict test set 
y_pred = mlp.predict(X_test)  

## confusion matrix
print(metrics.confusion_matrix(y_test, y_pred)) 
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred))
print('Precision: ', metrics.precision_score(y_test, y_pred))
print('Recall: ', metrics.recall_score(y_test, y_pred))

[[ 65  11]
 [  8 150]]
Accuracy:  0.9188034188034188
Precision:  0.9316770186335404
Recall:  0.9493670886075949


In [18]:
# 3-1
# kernel: linear
svclassifier = SVC(kernel='linear')    	## Linear SVM
svclassifier.fit(X_train, y_train)  
y_pred = svclassifier.predict(X_test)  	## predict test set
print(metrics.confusion_matrix(y_test, y_pred)) 
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred))
print('Precision: ', metrics.precision_score(y_test, y_pred))
print('Recall: ', metrics.recall_score(y_test, y_pred))

[[ 66  10]
 [  3 155]]
Accuracy:  0.9444444444444444
Precision:  0.9393939393939394
Recall:  0.9810126582278481


In [25]:
# 3-2
# kernel: rbf
svclassifier = SVC(kernel='rbf')    	## Linear SVM
svclassifier.fit(X_train, y_train)  
y_pred = svclassifier.predict(X_test)  	## predict test set
print(metrics.confusion_matrix(y_test, y_pred)) 
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred)*100,'%')
print('Precision: ', metrics.precision_score(y_test, y_pred)*100,'%')
print('Recall: ', metrics.recall_score(y_test, y_pred)*100 ,'%')

[[ 63  13]
 [  3 155]]
Accuracy:  93.16239316239316 %
Precision:  92.26190476190477 %
Recall:  98.10126582278481 %
