<a href="https://colab.research.google.com/github/im-vne/Snakes-or-No-Snakes/blob/main/Time_800_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model building and evaluation

This notebook contains the two main methods: a CNN and logistic regression. For side analyses, see their respective notebooks.

In [None]:
# Mount Google Drive at /content/drive/ (Colab-only); the CSV read later in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import os # work with system
import cv2 # work with images
import numpy as np # arrays and numerical analysis
import matplotlib.pyplot as plt # for data plots
import sklearn.linear_model # linear models
from sklearn.model_selection import train_test_split # data splitting
from sklearn import metrics # model evaluation
import tensorflow.keras as keras
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
import random # for random sampling
import pandas as pd

# Display the current working directory as the cell output.
os.getcwd()
### set working directory in data folder
# (disabled: the chdir below is commented out, so the working directory is left unchanged)
#os.chdir("drive/Shareddrives/Capstone 2023/Data/MicrosoftSnakeAlgorithmProject")

'/content'

In [None]:
# List the GPU devices TensorFlow can see; an empty list means no GPU is visible.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
# Shell out to nvidia-smi to show the GPU model, driver version and memory usage of this runtime.
!nvidia-smi

Mon Jul 24 19:10:04 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8     9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Import the dataframe holding one row per image (path, label and cumulative counts).
df = pd.read_csv("/content/drive/Shareddrives/Capstone 2023/Data/Time_Data/cumulative_organized_Data_Path.csv")
# Drop the first CSV column (presumably a saved index column - TODO confirm).
first_column = df.columns[0]
df = df.drop([first_column], axis=1)
print(len(df))

# Remove the known faulty image in case it is still listed.
# The Path column stores absolute paths (/content/drive/...), so an exact
# comparison against this relative path can never match; compare by suffix instead.
# NOTE(review): assumes the stored absolute path ends with this relative path - confirm.
empty_img = "MicrosoftSnakeAlgorithmProject/Snake_Images/FL_Snakes/WF8/IMG_7087.JPG"
is_faulty = df["Path"].astype(str).str.endswith(empty_img)
# .copy() gives an independent frame, so later column assignments do not
# trigger pandas' SettingWithCopyWarning.
df = df[~is_faulty].copy()
print(len(df))
print(df.head(3))
print(df.tail(3))

23275
23275
   Snake                 Date location camera  \
0      1  2016-03-08 07:36:30      LPS   TSP1   
1      1  2016-03-08 07:37:00      LPS   TSP1   
2      1  2016-03-08 07:37:30      LPS   TSP1   

                                                Path  cumsum_snakes  \
0  /content/drive/Shareddrives/Capstone 2023/Data...              1   
1  /content/drive/Shareddrives/Capstone 2023/Data...              2   
2  /content/drive/Shareddrives/Capstone 2023/Data...              3   

   cumsum_snakes_total  cumsum_snakes_location  cumsum_snakes_camera  
0                    1                       1                     1  
1                    2                       2                     2  
2                    3                       3                     3  
       Snake                 Date location  camera  \
23272      0  2019-10-13 14:55:00      LPS  APTR12   
23273      0  2019-10-13 14:55:30      LPS  APTR12   
23274      0  2019-10-13 14:56:00      LPS  APTR12   

     

In [None]:
random.seed(123)

#convert dataframe columns to correct types
df['Date'] = pd.to_datetime(df['Date'])
df = df.astype({"cumsum_snakes":"int","cumsum_snakes_total":"int","cumsum_snakes_location":"int","cumsum_snakes_camera":"int", "Path": "string"})

#sort dates in ascending order
df = df.sort_values(by='Date')

#get the first 400 instances of snakes (printed for inspection only; the split below uses 800)
filtered_df = df.loc[(df['cumsum_snakes_total'] <= 400)]
print(filtered_df)

# A stray `df.toframe()` call used to sit here. DataFrame has no such method,
# so it raised AttributeError and stopped the cell; it has been removed.

###
# Time-based split on the running snake count:
#   train = every row up to and including the 800th snake
#   test  = every later row (a validation sample is carved out of it below)
train = df.loc[(df['cumsum_snakes_total'] <= 800)]
test = df.loc[(df['cumsum_snakes_total'] > 800)]
print("-------------")
print("training set")
print(train)
print("length of training:")
print(len(train))

print("-------------")
print("untouched testing set")
print("length of testing:")
print(len(test))

####
# Validation set: a random sample of the held-out rows.
# random.seed() above does not control pandas sampling, so the seed is passed
# explicitly to make the split reproducible.
# NOTE(review): 15% of 23275 is about 3491, not 3372 - confirm the intended size.
print("-------------")
print("validation set")
val = test.sample(n=3372, random_state=123)
print(val.head(5))
print("length of val:")
print(len(val))

print("------------")
print("testing set")
test = test.drop(val.index)
print(test)
print("length of final testing:")
print(len(test))

# The three splits must partition the full dataframe.
assert len(train) + len(val) + len(test) == len(df)



0        /content/drive/Shareddrives/Capstone 2023/Data...
1        /content/drive/Shareddrives/Capstone 2023/Data...
2        /content/drive/Shareddrives/Capstone 2023/Data...
3        /content/drive/Shareddrives/Capstone 2023/Data...
4        /content/drive/Shareddrives/Capstone 2023/Data...
                               ...                        
23270    /content/drive/Shareddrives/Capstone 2023/Data...
23271    /content/drive/Shareddrives/Capstone 2023/Data...
23272    /content/drive/Shareddrives/Capstone 2023/Data...
23273    /content/drive/Shareddrives/Capstone 2023/Data...
23274    /content/drive/Shareddrives/Capstone 2023/Data...
Name: Path, Length: 23275, dtype: object
      Snake                Date location camera  \
0         1 2016-03-08 07:36:30      LPS   TSP1   
1         1 2016-03-08 07:37:00      LPS   TSP1   
2         1 2016-03-08 07:37:30      LPS   TSP1   
3         1 2016-03-08 07:38:00      LPS   TSP1   
4         1 2016-03-13 14:46:00      LPS   TSP1   
... 

In [None]:
def print_camera_summary(split_name, split_df):
  """Print camera coverage statistics for one data split.

  Args:
    split_name: label used in the printed header (e.g. "training").
    split_df: dataframe for the split; must have a `camera` column.

  Prints the number of distinct cameras, the largest and smallest number of
  images (rows) contributed by a single camera, and the total row count.
  """
  # Rows per camera. The previous code counted values of `cumsum_snakes_camera`,
  # which is a running snake count, not a camera identifier.
  images_per_camera = split_df.camera.value_counts()

  print(" ")
  print("Total unique cameras in " + split_name + ":")
  print(len(split_df.camera.unique()))
  print("max images per camera:")
  print(images_per_camera.max())
  print("min images per camera:")
  print(images_per_camera.min())
  print("total observations:")
  print(len(split_df))
  print(" ")


print_camera_summary("training", train)
print("--------------------")
print_camera_summary("val", val)
print("--------------------")
print_camera_summary("testing", test)

 
Total unique cameras in training:
23
max images per camera:
880
min images per camera:
1
total observations:
1871
 
--------------------
 
Total unique cameras in val:
65
max images per camera:
429
min images per camera:
1
total observations:
3492
 
--------------------
 
Total unique cameras in testing:
65
max images per camera:
2225
min images per camera:
1
total observations:
17912
 


In [None]:
# Split each partition into labels (y_*: the 'Snake' column) and
# inputs (x_*: a one-column frame holding the image 'Path').

#training set
print("Training")
y_train = train['Snake']
x_train = train[['Path']]

print(y_train)
print(x_train)
print("-------")

#val set will be the very last instances in which 400, 800 and 1200 don't touch
print("Validation")
y_val = val['Snake']
# Must come from `val`: this previously read from `train`, which paired
# validation labels with training images.
x_val = val[['Path']]
print("-------")

#testing set
print("")
y_test = test['Snake']
x_test = test[['Path']]
print(y_test)
print(x_test)
print("-------")

Training
0       1
1       1
2       1
3       1
4       1
       ..
1866    0
1867    0
1868    1
1869    1
1870    1
Name: Snake, Length: 1871, dtype: int64
                                                   Path
0     /content/drive/Shareddrives/Capstone 2023/Data...
1     /content/drive/Shareddrives/Capstone 2023/Data...
2     /content/drive/Shareddrives/Capstone 2023/Data...
3     /content/drive/Shareddrives/Capstone 2023/Data...
4     /content/drive/Shareddrives/Capstone 2023/Data...
...                                                 ...
1866  /content/drive/Shareddrives/Capstone 2023/Data...
1867  /content/drive/Shareddrives/Capstone 2023/Data...
1868  /content/drive/Shareddrives/Capstone 2023/Data...
1869  /content/drive/Shareddrives/Capstone 2023/Data...
1870  /content/drive/Shareddrives/Capstone 2023/Data...

[1871 rows x 1 columns]
-------
Validation
-------

1871     1
1872     1
1873     1
1874     1
1875     1
        ..
23270    0
23271    0
23272    0
23273    0
23274 

In [None]:
################ Look Here
# Replace each one-column 'Path' frame with a NumPy array of its file paths.
# The y_* label series are not converted here (their .tolist() conversions are disabled).
x_train, x_val, x_test = (
    np.array(path_frame['Path'].tolist())
    for path_frame in (x_train, x_val, x_test)
)


In [None]:
class data_generator(keras.utils.Sequence) :
  """Keras Sequence that reads grayscale images from disk one batch at a time.

  Args:
    image_filenames: sliceable sequence of image file paths.
    labels: sequence of labels, positionally aligned with `image_filenames`.
    batch_size: number of images per batch (the final batch may be smaller).
    log: if truthy, images are resized to 100x60 (width x height) for
      logistic regression; otherwise to 512x384 for the CNN.

  Each item is a tuple `(x, y)` where `x` has shape
  (batch, height, width, 1) and `y` has shape (batch, 1).
  """

  def __init__(self, image_filenames, labels, batch_size, log) :
    # Let the Keras base class set up its own state where it has any.
    super().__init__()
    self.image_filenames = image_filenames
    # Keep labels as a plain array so batch slicing is always positional,
    # whatever index a pandas Series passed in may carry.
    self.labels = np.asarray(labels)
    self.batch_size = batch_size
    self.log = log

  def __len__(self) :
    # Number of batches per epoch, counting a final partial batch.
    return int(np.ceil(len(self.image_filenames) / float(self.batch_size)))

  def _load_image(self, file_name, size) :
    # Read one image as grayscale and resize it to `size` (width, height).
    image = cv2.imread(file_name, cv2.IMREAD_GRAYSCALE)
    if image is None:
      # cv2.imread signals failure by returning None; name the file instead of
      # letting cv2.resize fail with an opaque assertion error.
      raise FileNotFoundError("could not read image: " + str(file_name))
    return cv2.resize(image, size, interpolation = cv2.INTER_NEAREST)

  def __getitem__(self, idx) :
    if not 0 <= idx < len(self):
      raise IndexError("batch index out of range: " + str(idx))

    start = idx * self.batch_size
    stop = start + self.batch_size
    batch_x = self.image_filenames[start:stop]
    batch_y = self.labels[start:stop]

    # different resizing for logistic regression to reduce trainable parameters
    size = (100, 60) if self.log else (512, 384)
    x_arr = np.array([self._load_image(file_name, size) for file_name in batch_x])

    # add a trailing channel axis: (batch, height, width) -> (batch, height, width, 1)
    x_arr_reshaped = x_arr.reshape([x_arr.shape[0], x_arr.shape[1], x_arr.shape[2], 1])

    y_arr = np.array(batch_y)
    y_arr_reshaped = y_arr.reshape([x_arr_reshaped.shape[0], 1])

    return x_arr_reshaped, y_arr_reshaped

In [None]:
def create_generators(x_train, y_train, x_val, y_val, x_test, y_test, batch_size = 32, log = False):
  """Build the train, validation and test generators with shared settings.

  Each (paths, labels) pair is wrapped in a `data_generator` using the same
  `batch_size` and `log` flag.

  Returns:
    A 3-tuple: (train_generator, validation_generator, test_generator).
  """
  splits = ((x_train, y_train), (x_val, y_val), (x_test, y_test))
  return tuple(
      data_generator(paths, labels, batch_size, log)
      for paths, labels in splits
  )


In [None]:
#numpy array under the hood tbh

<pandas.core.indexing._iLocIndexer object at 0x7a26b4e20ae0>


### **CNN**

In [None]:
# Build the three batch generators for the CNN (log = False selects the CNN image size).
batch_size = 32
train_generator, validation_generator, test_generator = create_generators(
    x_train, y_train,
    x_val, y_val,
    x_test, y_test,
    batch_size = batch_size,
    log = False,
)

In [None]:
# create model
def build_model():
    """Create and compile the CNN used for binary classification.

    Architecture: three Conv2D (3x3, ReLU) + 2x2 MaxPooling stages with 16, 32
    and 64 filters, then Flatten, Dense(128, ReLU) and a single sigmoid unit.
    Input images have shape (384, 512, 1).

    Returns:
        The model compiled with the Adam optimizer, binary cross-entropy loss
        and the accuracy metric.
    """
    model = models.Sequential([
        # convolutional feature extractor
        layers.Conv2D(16, (3, 3), activation='relu', input_shape=(384, 512, 1)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(32, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        # classifier head
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation = 'sigmoid'),
    ])

    model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics=['accuracy'])
    return model

In [None]:
# Build a new, compiled CNN; no training has happened at this point.
cnn_model = build_model()

In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
cnn_model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_9 (Conv2D)           (None, 382, 510, 16)      160       
                                                                 
 max_pooling2d_9 (MaxPooling  (None, 191, 255, 16)     0         
 2D)                                                             
                                                                 
 conv2d_10 (Conv2D)          (None, 189, 253, 32)      4640      
                                                                 
 max_pooling2d_10 (MaxPoolin  (None, 94, 126, 32)      0         
 g2D)                                                            
                                                                 
 conv2d_11 (Conv2D)          (None, 92, 124, 64)       18496     
                                                                 
 max_pooling2d_11 (MaxPoolin  (None, 46, 62, 64)      

In [None]:
#attempt 1
#filename_test = x_train.values.tolist()
#print(type(filename_test[0]))

#filename_test01 = filename_test[0]
#print(filename_test01)
#cv2.imread(filename_test01, cv2.IMREAD_GRAYSCALE)



In [None]:
# Steps use floor division, so a final partial batch is not counted per epoch.
# (Quick-run alternatives tried earlier: steps_per_epoch = 10, validation_steps = 3.)
train_steps = int(x_train.shape[0] // batch_size)
val_steps = int(x_val.shape[0] // batch_size)

# Train for 3 epochs, evaluating on the validation generator after each epoch.
cnn_fit = cnn_model.fit(
    train_generator,
    epochs = 3,
    steps_per_epoch = train_steps,
    validation_data = validation_generator,
    validation_steps = val_steps,
    verbose = 1,
)

Epoch 1/3

In [None]:
# can load saved (see google sheet for model descriptions)
# cnn_model = tf.keras.models.load_model("../Snakes-or-No-Snakes/cnn_balanced_seed1_3e")

In [None]:
# Run the CNN over every batch of the test generator.
model_predict = cnn_model.predict(test_generator)

In [None]:
# Threshold the model outputs at 0.5, then flatten to one boolean per test image.
n_test_images = x_test.shape[0]
predictions_cnn = (model_predict > 0.5).reshape(n_test_images)
print(predictions_cnn.shape)

In [None]:
#set(list(predictions_cnn.reshape(1262)))

In [None]:
# Confusion matrix of true test labels against thresholded CNN predictions.
metrics.ConfusionMatrixDisplay.from_predictions(
    y_true = y_test,
    y_pred = predictions_cnn,
    cmap = "GnBu",
)

In [None]:
### model evaluation metrics
# Uses the `metrics` module imported at the top (same as the confusion-matrix cell).

# (true positive + true negative)/total
accuracy = metrics.accuracy_score(y_test, predictions_cnn)
print("accuracy: " + str(accuracy))

# true positive/(true positive + false positive)
precision = metrics.precision_score(y_test, predictions_cnn)
print("precision: " + str(precision))

# true positive/(true positive + false negative)
sensitivity = metrics.recall_score(y_test, predictions_cnn)
print("sensitivity: " + str(sensitivity))

# true negative/(true negative + false positive)
specificity = metrics.recall_score(y_test, predictions_cnn, pos_label = 0)
print("specificity: " + str(specificity))

# 2 * (precision*recall)/(precision+recall)
# Computed by scikit-learn rather than by hand: the manual formula divides by
# zero when precision + recall is 0 (e.g. the model predicts no positives).
f1_score = metrics.f1_score(y_test, predictions_cnn)
print("f1_score: " + str(f1_score))

# no false positives means precision and specificity will be 100%

In [None]:
# save model
#cnn_model.save('../Snakes-or-No-Snakes/cnn_final_seed1_3e')

In [None]:
# Training vs. validation accuracy per epoch, read from the fit history.
history = cnn_fit.history
for series_name in ('accuracy', 'val_accuracy'):
    plt.plot(history[series_name], label=series_name)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
# y-axis is clipped to [0.5, 1]; values below 0.5 fall outside the plot
plt.ylim([0.5, 1])
plt.legend(loc='lower right')