In [18]:
#Importing Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

#Import dataset
data = pd.read_csv("/Users/hannahkim/Desktop/CSCI Final Project/Dataset.csv", index_col=None, header=0, encoding='latin-1')
data1 = data.drop(['Lizard ID', 'Class 1'], axis=1)
data1.head()

Unnamed: 0,Class 2,Bin #,Bin Length (Âµm),Body Weight,Snout to Vent Length,Retinal Surface Area,Occular Average (Âµm),Macula Diameter,Total Foveal Region length (Âµm),GCL,INL,ONL,GCL+INL+ONL
0,Hch Mut,51,38.745,0.111,16.0,444328.0,2068.0,774.9,774.901752,17,79,7,103
1,Hch Mut,60,38.745,0.111,16.0,444328.0,2068.0,774.9,774.901752,7,41,8,56
2,Hch Mut,52,38.745,0.111,16.0,444328.0,2068.0,774.9,774.901752,14,80,8,102
3,Hch Mut,53,38.745,0.111,16.0,444328.0,2068.0,774.9,774.901752,17,65,9,91
4,Hch Mut,54,38.745,0.111,16.0,444328.0,2068.0,774.9,774.901752,14,59,7,80


In [19]:
#Creating the dependent variable class
factor = pd.factorize(data1['Class 2'])
data1.species = factor[0]
definitions = factor[1]
print(factor)

(array([0, 0, 0, ..., 5, 5, 5]), Index(['Hch Mut', 'Hch WT', '2mo WT', '4mo WT', '6mo WT', 'Adt WT'], dtype='object'))


  This is separate from the ipykernel package so we can avoid doing imports until


In [20]:
#Splitting the data into independent and dependent variables
X = data1.iloc[:,1:13].values
y = data1.iloc[:,0].values
print('The independent features set: ')
print(X[:3,:])
print('The dependent variable: ')
print(y[:3])

The independent features set: 
[[5.10000000e+01 3.87450000e+01 1.11000000e-01 1.60000000e+01
  4.44328000e+05 2.06800000e+03 7.74900000e+02 7.74901752e+02
  1.70000000e+01 7.90000000e+01 7.00000000e+00 1.03000000e+02]
 [6.00000000e+01 3.87450000e+01 1.11000000e-01 1.60000000e+01
  4.44328000e+05 2.06800000e+03 7.74900000e+02 7.74901752e+02
  7.00000000e+00 4.10000000e+01 8.00000000e+00 5.60000000e+01]
 [5.20000000e+01 3.87450000e+01 1.11000000e-01 1.60000000e+01
  4.44328000e+05 2.06800000e+03 7.74900000e+02 7.74901752e+02
  1.40000000e+01 8.00000000e+01 8.00000000e+00 1.02000000e+02]]
The dependent variable: 
['Hch Mut' 'Hch Mut' 'Hch Mut']


In [21]:
# Creating the Training and Test set from data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=42)

print(len(X_train))
print(len(X_test))

3692
923


In [22]:
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, max_features = 3, criterion = 'entropy', bootstrap = False)

classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features=3,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [23]:
from sklearn.model_selection import GridSearchCV

param_grid = [
{'n_estimators': [2, 10, 50], 'max_features': [2, 6, 11]},
{'bootstrap': [False], 'n_estimators': [2, 10, 50], 'max_features': [3, 6, 11]},
]

grid_search = GridSearchCV(classifier, param_grid, cv=5, 
                           scoring='neg_mean_squared_error', return_train_score=True)

y_train2 = pd.factorize(y_train)
y_train_factorized = y_train2[0]
grid_search.fit(X_train, y_train_factorized)

grid_search.best_params_

{'bootstrap': False, 'max_features': 3, 'n_estimators': 10}

In [24]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
confusionmatrix = pd.crosstab(y_test, y_pred, rownames=['Actual Class'], colnames=['Predicted Class'])
print(confusionmatrix)

Predicted Class  2mo WT  4mo WT  6mo WT  Adt WT  Hch Mut  Hch WT
Actual Class                                                    
2mo WT              202       0       0       0        0       0
4mo WT                1     167       0       0        0       0
6mo WT                0       0     153       0        0       0
Adt WT                0       0       0     147        0       0
Hch Mut               0       0       0       0       71       0
Hch WT                0       0       0       0        0     182


In [14]:
# Accuracy score
accuracy = accuracy_score(y_test, y_pred)*100
print(accuracy)

99.78331527627302


In [15]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      2mo WT       1.00      1.00      1.00       202
      4mo WT       0.99      0.99      0.99       168
      6mo WT       1.00      0.99      1.00       153
      Adt WT       1.00      1.00      1.00       147
     Hch Mut       1.00      1.00      1.00        71
      Hch WT       1.00      1.00      1.00       182

    accuracy                           1.00       923
   macro avg       1.00      1.00      1.00       923
weighted avg       1.00      1.00      1.00       923

