In [1]:
#libraries
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
import pandas as pd
import os
import datetime
from sklearn.utils.class_weight import compute_class_weight

#Source Files
from lib.partition import split_by_day
import lib.file_utilities as util

# Parsing Files

Given the path to each species' recording directory:
- retrieve up to `use_onlyN` feature files for each species
- parse the metadata (site, label, start time, features) of each file
- group each species' recordings into a dictionary keyed by day

In [2]:
# Absolute paths of the per-species click folders (Gg first, then Lo),
# resolved against the current working directory.
ggdir, lodir = (
    os.path.abspath(relative_dir)
    for relative_dir in ("./features/Gg", "./features/Lo/A")
)

In [3]:
# Put matplotlib into interactive mode.
plt.ion()

# Number of files requested from util.get_files for each species.
# Swap in the line below to request every file instead.
#use_onlyN = np.Inf
use_onlyN = 10

# Collect the ".czcc" click files of each species (Gg first, then Lo).
ggfiles, lofiles = (
    util.get_files(species_dir, ".czcc", use_onlyN)
    for species_dir in (ggdir, lodir)
)

In [4]:
# Parse every file into a metadata tuple (.site, .label, .start, .features);
# the result is one list of tuples per species.
ggmeta_data, lometa_data = util.parse_files(ggfiles), util.parse_files(lofiles)
test_meta_data = ggmeta_data[0]

# Group the recordings of each species by day:
# key = day (taken from .start), value = list of that day's metadata tuples.
gg_day_dict, lo_day_dict = split_by_day(ggmeta_data), split_by_day(lometa_data)

# Days that have at least one recording, per species.
gg_keys = [*gg_day_dict.keys()]
lo_keys = [*lo_day_dict.keys()]

Extracting information about files and loading features for  10 recordings.
Reading file 0/10
Extracting information about files and loading features for  10 recordings.
Reading file 0/10


# Splitting Days


Using the dictionaries created:
- split the keyed days into training and testing days

In [5]:
# Hold out 33% of each species' days for testing; the fixed random_state keeps
# the split repeatable. For a single input list train_test_split returns
# [train_days, test_days], i.e. index 0 is train and index 1 is test.
gg_train_test_days = train_test_split(gg_keys, test_size=0.33, random_state=42)
lo_train_test_days = train_test_split(lo_keys, test_size=0.33, random_state=42)

# Unpack the two halves of each split into their own names.
gg_train_days, gg_test_days = gg_train_test_days
lo_train_days, lo_test_days = lo_train_test_days

# Create Training Data

Given a list of training days for each species:
- create a large tensor of examples of shape (x, 20)
- create a large vector of labels of size (x, 2)
- labels are one hot encoded for each species
- Gg: [1,0]
- Lo: [0,1]

- Class Weights
    - uses `compute_class_weight` from scikit-learn to compute per-class weights that alleviate class imbalance
    - `compute_class_weight` does not work with one-hot encoded labels, so a second, one-dimensional
    label array `one_d_labels` (0 for Gg, 1 for Lo) is built alongside the one-hot labels
    - this array is used to create the weight dictionary that is passed to the `class_weight` parameter of `fit()`


In [6]:
def create_train_data(train_days_gg, train_days_lo):
    """Build the training examples, one-hot labels and class weights.

    Every recording stored under the given days in the notebook-level
    ``gg_day_dict`` / ``lo_day_dict`` contributes all of its feature rows.

    Args:
        train_days_gg: iterable of keys into ``gg_day_dict`` (Gg training days).
        train_days_lo: iterable of keys into ``lo_day_dict`` (Lo training days).

    Returns:
        A tuple ``(examples, labels, class_weights)`` where
        - ``examples`` is the 2-D array obtained by stacking the feature rows
          of every selected recording,
        - ``labels`` has shape ``(n_examples, 2)`` and is one-hot encoded
          (Gg -> [1, 0], Lo -> [0, 1]),
        - ``class_weights`` maps each class index that is present
          (0 = Gg, 1 = Lo) to its 'balanced' weight.

    Raises:
        ValueError: if the selected days contain no recordings at all.
    """
    # (class index, training days, day -> recordings dictionary) per species
    species_sources = (
        (0, train_days_gg, gg_day_dict),
        (1, train_days_lo, lo_day_dict),
    )

    train_tensor_examples = []
    one_d_labels = []
    for class_index, days, day_dict in species_sources:
        for day in days:
            for recording in day_dict[day]:
                n_rows = recording.features.shape[0]
                train_tensor_examples.append(recording.features)
                # one integer class label per feature row
                one_d_labels.append(np.full(n_rows, class_index))

    if not train_tensor_examples:
        raise ValueError("no training recordings found for the given days")

    # stack all recordings on top of one another
    big_train_examples = np.concatenate(train_tensor_examples, axis=0)
    one_d_labels = np.concatenate(one_d_labels, axis=0)

    # Derive the one-hot labels from the integer labels: row k of the lookup
    # table is the code of class k. This also keeps the result 2-D for
    # recordings that contribute zero rows.
    one_hot_codes = np.array([[1, 0], [0, 1]])
    big_train_labels = one_hot_codes[one_d_labels]

    classes = np.unique(one_d_labels)
    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=one_d_labels)
    # Key each weight by its actual class value rather than by position, so a
    # missing class cannot shift the remaining weight onto the wrong index.
    class_weights_dict = {int(c): float(w) for c, w in zip(classes, class_weights)}

    return big_train_examples, big_train_labels, class_weights_dict

In [7]:
# Assemble the training set, then report the example shape, the label shape
# and the class-weight dictionary (one per line, in that order).
X_train, Y_train, class_weights = create_train_data(gg_train_days, lo_train_days)
for summary_item in (X_train.shape, Y_train.shape, class_weights):
    print(summary_item)

(24792, 20)
(24792, 2)
{0: 0.8976754290679991, 1: 1.1286533733952473}


# Create Testing Data

Given a list of test days for each species:
- Examples
    - create a list of batches of features
    - each batch is of shape (100, 20)
    - each click in a batch comes from the same file

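- Labels
    - create one label per batch (not one per click)
    - each label is the integer class index of the file the batch came from: 0 for Gg, 1 for Lo
    - every click in a batch comes from the same file, so a single label describes the whole batch
    - unlike the training labels these are not one hot encoded; the integer form is what `confusion_matrix` compares against the predicted class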

In [28]:
# Similar to building the training data, but the clicks stay grouped in
# fixed-size batches so that each batch can be classified on its own.
def create_test_data(test_days_gg, test_days_lo):
    """Build the batched test examples and their per-batch labels.

    Recordings are looked up in the notebook-level ``gg_day_dict`` /
    ``lo_day_dict``; each one with at least 100 feature rows is cut into
    batches by ``hunnit_group``. Shorter recordings are skipped.

    Args:
        test_days_gg: iterable of keys into ``gg_day_dict`` (Gg test days).
        test_days_lo: iterable of keys into ``lo_day_dict`` (Lo test days).

    Returns:
        A tuple ``(examples, labels)``: ``examples`` stacks every batch along
        axis 0 (one entry per batch), ``labels`` holds the matching class
        label of each batch as produced by ``hunnit_group``.

    Raises:
        ValueError: if no recording on the given days is long enough to
            yield a batch.
    """
    # hunnit_group only emits full 100-click batches, so anything shorter
    # would contribute nothing
    min_clicks = 100

    test_tensor_examples = []
    test_tensor_labels = []
    # Gg days are processed first, then Lo days
    for days, day_dict in ((test_days_gg, gg_day_dict), (test_days_lo, lo_day_dict)):
        for day in days:
            for recording in day_dict[day]:
                if len(recording.features) < min_clicks:
                    continue
                groups, groups_labels = hunnit_group(recording)
                test_tensor_examples.append(groups)
                test_tensor_labels.append(groups_labels)

    if not test_tensor_examples:
        # np.concatenate would fail on an empty list with an unhelpful message
        raise ValueError(
            "no test recording with at least {} clicks found for the given days".format(min_clicks)
        )

    big_test_examples = np.concatenate(test_tensor_examples, axis=0)
    big_test_labels = np.concatenate(test_tensor_labels, axis=0)

    return big_test_examples, big_test_labels

In [29]:
# helper function to create batches, takes a metadata tuple
def hunnit_group(recording, batch_size=100):
    """Cut one recording into consecutive, full batches of clicks.

    Args:
        recording: metadata tuple; ``recording.features`` holds one row per
            click and ``recording.label`` the species label.
        batch_size: number of clicks per batch (default 100). Trailing clicks
            that do not fill a whole batch are dropped.

    Returns:
        A tuple ``(batches, labels)``: ``batches`` is a list of consecutive
        ``batch_size``-row slices of ``recording.features``; ``labels`` is a
        list of the same length holding the class index of the recording as a
        0-d array (0 when the label is "Gg", 1 otherwise).

    Raises:
        ValueError: if ``batch_size`` is not a positive integer.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    recording_tensor = recording.features
    n_rows = len(recording_tensor)
    print("Number of rows in features {}".format(n_rows))
    if recording.label == "Gg":
        print("Label is Gg: [1,0]\n")
        label = 0
    else:
        print("Label is Lo: [0,1]\n")
        label = 1

    # only whole batches are kept; floor division drops the incomplete tail
    n_batches = n_rows // batch_size
    hunnit_batches = [
        recording_tensor[start:start + batch_size]
        for start in range(0, n_batches * batch_size, batch_size)
    ]
    # one scalar class index per batch (not a per-click label vector)
    label_batches = [np.array(label)] * n_batches
    return hunnit_batches, label_batches

In [32]:
X_test, Y_test = create_test_data(gg_test_days, lo_test_days)

print("Number of batches to go through: {}\n".format(len(X_test)))
# Inspect the last batch. Index -1 is used instead of a hard-coded position so
# the cell keeps working when the number of batches changes (e.g. after
# changing use_onlyN).
print(X_test[-1].shape)
print(len(X_test[-1]))
print(X_test[-1])
# one class index per batch
print(Y_test)


Number of rows in features 120
Label is Gg: [1,0]

Number of rows in features 360
Label is Gg: [1,0]

Number of rows in features 1180
Label is Gg: [1,0]

Number of rows in features 4770
Label is Gg: [1,0]

Number of rows in features 108
Label is Lo: [0,1]

Number of rows in features 5158
Label is Lo: [0,1]

Number of rows in features 5294
Label is Lo: [0,1]

Number of rows in features 10098
Label is Lo: [0,1]

Number of batches to go through: 266

(100, 20)
100
[[ 2.55555313e+02 -2.51788692e+01 -3.47140808e+01 ... -2.55522311e-01
   5.52612782e-01  1.32973158e+00]
 [ 5.70930290e+01  1.44229145e+01 -2.53474312e+01 ...  1.12609797e+01
   7.46982479e+00  6.09665573e-01]
 [ 7.37710114e+01  8.85740471e+00 -5.24337540e+01 ...  1.45737171e+01
   4.26936102e+00 -2.67292809e+00]
 ...
 [ 6.83872833e+01  1.31987944e+01 -2.66798344e+01 ...  1.09381065e+01
   5.23020029e+00 -6.38920498e+00]
 [ 7.55746841e+01  2.49358845e+01 -2.42202511e+01 ...  1.67088585e+01
   4.40210676e+00 -3.58579278e+00]
 [ 2

# Create Model

In [11]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras import regularizers

In [12]:
# Fully connected classifier: 20 input features -> three L2-regularised ReLU
# layers of 100 units -> 2-way softmax (one output per species).
model = Sequential([
    Dense(100, input_dim=20, activation='relu', kernel_regularizer='l2'),
    Dense(100, activation='relu', kernel_regularizer='l2'),
    Dense(100, activation='relu', kernel_regularizer='l2'),
    Dense(2, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train Model

In [13]:
# Train on the stacked clicks; class_weights offsets the Gg/Lo imbalance.
# The value returned by fit() is kept in nn_model.
nn_model = model.fit(
    X_train,
    Y_train,
    epochs=5,
    batch_size=16,
    class_weight=class_weights,
)

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Test Model

In [35]:
# Quick check: run the trained model on the first test batch only.
# `out` is reassigned for every batch by the prediction loop further down.
out = model.predict(X_test[0])

In [36]:
from sklearn.metrics import confusion_matrix

In [37]:
# Classify every test batch as a whole: sum the log-probabilities of its
# clicks per class (axis 0) and pick the class with the larger total.
pred_classes = []
# Floor applied before log(): a softmax output that underflows to exactly 0
# would otherwise give log(0) = -inf (plus a RuntimeWarning) and could make
# both class totals -inf, in which case argmax silently returns class 0.
log_floor = np.finfo(np.float32).tiny
for x in X_test:
    out = model.predict(x)
    sum_prob = np.sum(np.log(np.clip(out, log_floor, 1.0)), axis=0)
    pred_classes.append(np.argmax(sum_prob))

In [38]:
# Compare the true per-batch class indices with the predicted ones.
confoos = confusion_matrix(y_true=Y_test, y_pred=pred_classes)

In [39]:
# Display the confusion matrix computed from Y_test and pred_classes.
confoos

array([[ 62,   0],
       [  3, 201]], dtype=int64)