In [23]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Import dataset
# NOTE: absolute, machine-specific path -- point DATA_PATH at your local copy
# of Dataset.csv before running on another machine.
DATA_PATH = "/Users/hannahkim/Desktop/CSCI Final Project/Dataset.csv"

# The CSV header is UTF-8 encoded: decoding it as latin-1 garbles the micro
# sign in the unit suffixes ("µm" is shown as "Âµm"), so read it as UTF-8.
data = pd.read_csv(DATA_PATH, index_col=None, header=0, encoding='utf-8')

# Drop the 'Lizard ID' and 'Class 1' columns. The later cells index the
# remaining frame by position: column 0 is used as the label, columns 1-12
# as the features.
data1 = data.drop(columns=['Lizard ID', 'Class 1'])
data1.head()

Unnamed: 0,Class 2,Bin #,Bin Length (Âµm),Body Weight,Snout to Vent Length,Retinal Surface Area,Occular Average (Âµm),Macula Diameter,Total Foveal Region length (Âµm),GCL,INL,ONL,GCL+INL+ONL
0,Hch Mut,51,38.745,0.111,16.0,444328.0,2068.0,774.9,774.901752,17,79,7,103
1,Hch Mut,60,38.745,0.111,16.0,444328.0,2068.0,774.9,774.901752,7,41,8,56
2,Hch Mut,52,38.745,0.111,16.0,444328.0,2068.0,774.9,774.901752,14,80,8,102
3,Hch Mut,53,38.745,0.111,16.0,444328.0,2068.0,774.9,774.901752,17,65,9,91
4,Hch Mut,54,38.745,0.111,16.0,444328.0,2068.0,774.9,774.901752,14,59,7,80


In [24]:
# Separate the frame into the feature matrix and the label vector.
# Column 0 of data1 is the label; columns 1 through 12 are the features.
y = data1.iloc[:, 0].to_numpy()
X = data1.iloc[:, 1:13].to_numpy()

# Preview the first three rows of each.
print('The independent features set: ', X[:3, :], sep='\n')
print('The dependent variable: ', y[:3], sep='\n')

The independent features set: 
[[5.10000000e+01 3.87450000e+01 1.11000000e-01 1.60000000e+01
  4.44328000e+05 2.06800000e+03 7.74900000e+02 7.74901752e+02
  1.70000000e+01 7.90000000e+01 7.00000000e+00 1.03000000e+02]
 [6.00000000e+01 3.87450000e+01 1.11000000e-01 1.60000000e+01
  4.44328000e+05 2.06800000e+03 7.74900000e+02 7.74901752e+02
  7.00000000e+00 4.10000000e+01 8.00000000e+00 5.60000000e+01]
 [5.20000000e+01 3.87450000e+01 1.11000000e-01 1.60000000e+01
  4.44328000e+05 2.06800000e+03 7.74900000e+02 7.74901752e+02
  1.40000000e+01 8.00000000e+01 8.00000000e+00 1.02000000e+02]]
The dependent variable: 
['Hch Mut' 'Hch Mut' 'Hch Mut']


In [25]:
# Hold out 20% of the rows as the test set; random_state is fixed so the
# shuffle is repeatable between runs.
# NOTE(review): the data preview shows consecutive rows sharing identical body
# measurements, so rows look like several bins measured on the same animal.
# A row-wise random split may then place bins from one animal in both sets --
# consider a group-wise split on 'Lizard ID'; confirm against the dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Row counts of the two splits (train first, then test).
print(len(X_train), len(X_test), sep='\n')

3692
923


In [26]:
# Standardize the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [27]:
# Fit one support-vector classifier per kernel on the scaled training split.
# `svm` is imported in the notebook's top import cell.
# Every model shares C=1 and decision_function_shape='ovo'; only the
# kernel-specific arguments (gamma for RBF, degree for polynomial) differ.
svc_common = dict(C=1, decision_function_shape='ovo')

linear = svm.SVC(kernel='linear', **svc_common).fit(X_train, y_train)
rbf = svm.SVC(kernel='rbf', gamma=1, **svc_common).fit(X_train, y_train)
poly = svm.SVC(kernel='poly', degree=3, **svc_common).fit(X_train, y_train)
sig = svm.SVC(kernel='sigmoid', **svc_common).fit(X_train, y_train)

In [28]:
# Step size of the grid.
h = 0.01

# create the mesh
# Bounding box of the first two columns of X, padded by 1 on every side.
# NOTE(review): this uses the unscaled X, whereas the classifiers are fitted on
# the scaled 12-column X_train; no cell shown here consumes xx / yy / titles --
# confirm whether a decision-surface plot is still intended.
first_col = X[:, 0]
second_col = X[:, 1]
x_min = first_col.min() - 1
x_max = first_col.max() + 1
y_min = second_col.min() - 1
y_max = second_col.max() + 1

xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h),
)

titles = [
    'Linear kernel',
    'RBF kernel',
    'Polynomial kernel',
    'Sigmoid kernel',
]

In [29]:
# Predict the test-set labels once per kernel; the predictions are reused
# below for the accuracies and by the later evaluation cells.
linear_pred = linear.predict(X_test)
poly_pred = poly.predict(X_test)
rbf_pred = rbf.predict(X_test)
sig_pred = sig.predict(X_test)

# retrieve the accuracy for all 4 kernel functions
# estimator.score(X_test, y_test) would run predict() a second time for each
# model; accuracy_score on the stored predictions yields the same number
# without repeating the inference.
accuracy_lin = accuracy_score(y_test, linear_pred)
accuracy_poly = accuracy_score(y_test, poly_pred)
accuracy_rbf = accuracy_score(y_test, rbf_pred)
accuracy_sig = accuracy_score(y_test, sig_pred)

print(accuracy_lin)
print(accuracy_poly)
print(accuracy_rbf)
print(accuracy_sig)

0.7388949079089924
0.8060671722643553
0.9761646803900325
0.5666305525460456


In [21]:
# Confusion matrix for each kernel, built from y_test and the predictions
# made in the prediction cell (which therefore has to run first).
cm_lin = confusion_matrix(y_test, linear_pred)
cm_poly = confusion_matrix(y_test, poly_pred)
cm_rbf = confusion_matrix(y_test, rbf_pred)
cm_sig = confusion_matrix(y_test, sig_pred)

# Printed in the order: linear, polynomial, RBF, sigmoid.
print(cm_lin)
print(cm_poly)
print(cm_rbf)
print(cm_sig)

# Making the Confusion Matrix
# Labelled cross-tabulation of actual vs. predicted class for the RBF model.
confusionmatrix_rbf = pd.crosstab(y_test, rbf_pred, rownames=['Actual Class'], colnames=['Predicted Class'])
# Display the table that was just built (the previous name, confusionmatrix_lin,
# was never defined). Show it in full: .head() keeps only five rows and would
# hide one class whenever there are more than five classes.
confusionmatrix_rbf


[[198  55   5   0   0   0]
 [ 40 115  10  46   0   0]
 [  8  15 119  54   0   0]
 [  0  54  38  92   0   0]
 [  0   0   0   0  82   0]
 [  0   0   0   0   0 223]]
[[242  15   1   0   0   0]
 [ 76 135   0   0   0   0]
 [ 22  20 135  19   0   0]
 [ 12  42  16 114   0   0]
 [  0   0   0   0  82   0]
 [  0   0   0   0   0 223]]
[[246   9   3   0   0   0]
 [  1 206   4   0   0   0]
 [  0   1 194   1   0   0]
 [  0   6   2 176   0   0]
 [  0   0   1   0  81   0]
 [  0   0   3   0   0 220]]
[[171  37  19   0   0  31]
 [ 50  85  21  50   0   5]
 [ 19  29 100  47   0   1]
 [ 12  36  51  85   0   0]
 [  2   0   3   0  48  29]
 [ 20   6   9   0   2 186]]


In [22]:
# Per-class precision / recall / F1 for the RBF-kernel predictions.
# `classification_report` is imported in the notebook's top import cell.
# Run this after the prediction cell so the report reflects the current
# y_test / rbf_pred.
print(classification_report(y_test, rbf_pred))

              precision    recall  f1-score   support

      2mo WT       1.00      0.95      0.97       258
      4mo WT       0.93      0.98      0.95       211
      6mo WT       0.94      0.99      0.96       196
      Adt WT       0.99      0.96      0.98       184
     Hch Mut       1.00      0.99      0.99        82
      Hch WT       1.00      0.99      0.99       223

    accuracy                           0.97      1154
   macro avg       0.98      0.98      0.98      1154
weighted avg       0.97      0.97      0.97      1154

