In [54]:
#libraries
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
import pandas as pd
import os
import datetime

#Source Files
from lib.partition import split_by_day
import lib.file_utilities as util

# Parsing Files

Given the path to each species' recording directory:
- retrieve up to `use_onlyN` files for each species
- parse the metadata (site, label, start time, features) of each file
- group each species' files into a dictionary keyed by recording day

In [55]:
# Absolute paths of the click-feature folders, one per species (Gg first, then Lo)
ggdir, lodir = (
    os.path.abspath(relative_dir)
    for relative_dir in ("./features/Gg", "./features/Lo/A")
)

In [56]:
# Switch matplotlib to interactive mode.
plt.ion()

# Number of files requested per species (passed straight to util.get_files).
# To fetch every file instead, set: use_onlyN = np.inf
use_onlyN = 10

# List the ".czcc" click files of each species, Gg first, then Lo.
ggfiles, lofiles = [
    util.get_files(species_dir, ".czcc", use_onlyN)
    for species_dir in (ggdir, lodir)
]

In [67]:
# Parse the click files of each species into metadata records; each record is
# a tuple exposing (.site, .label, .start, .features).
ggmeta_data, lometa_data = util.parse_files(ggfiles), util.parse_files(lofiles)
# First Gg record (presumably kept for ad-hoc inspection - confirm before removing).
test_meta_data = ggmeta_data[0]

# Group the records by day:
# key = day taken from .start, value = list of that day's records.
gg_day_dict, lo_day_dict = split_by_day(ggmeta_data), split_by_day(lometa_data)

# Days available for each species, as plain lists.
gg_keys = [*gg_day_dict.keys()]
lo_keys = [*lo_day_dict.keys()]

Extracting information about files and loading features for  10 recordings.
Reading file 0/10
Extracting information about files and loading features for  10 recordings.
Reading file 0/10


# Splitting Days


Using the per-day dictionaries created above:
- split each species' days into training days (67%) and test days (33%)

In [59]:
# Hold out 33% of each species' days for testing; the fixed random_state makes
# the split repeatable. Each result is a two-element list: [train days, test days].
gg_train_test_days = train_test_split(gg_keys, test_size=0.33, random_state=42)
lo_train_test_days = train_test_split(lo_keys, test_size=0.33, random_state=42)

# Unpack the pairs: element 0 is the training days, element 1 the test days.
gg_train_days, gg_test_days = gg_train_test_days
lo_train_days, lo_test_days = lo_train_test_days

# Uncomment to inspect the split:
#print("GG Train Days: {}\n {} \n".format(len(gg_train_days), gg_train_days))
#print("GG Test Days: {}\n {} \n".format(len(gg_test_days), gg_test_days))
#print("LO Train Days: {}\n {} \n".format(len(lo_train_days), lo_train_days))
#print("LO Test Days: {}\n {} \n".format(len(lo_test_days), lo_test_days))

# Create Training Data

Given a list of training days for each species:
- stack every click into one array of examples of shape (x, 20), where x is the total number of training clicks
- create a matching array of labels of shape (x, 2)
- labels are one-hot encoded for each species
- Gg: [1,0]
- Lo: [0,1]


In [60]:
def create_train_data(train_days_gg, train_days_lo):
    """Stack every click from the training days into one examples/labels pair.

    Recordings are looked up in the notebook-level ``gg_day_dict`` and
    ``lo_day_dict`` (day -> list of recordings). Each recording must expose a
    ``.features`` array with one row per click.

    Args:
        train_days_gg: iterable of ``gg_day_dict`` keys to train on.
        train_days_lo: iterable of ``lo_day_dict`` keys to train on.

    Returns:
        Tuple ``(examples, labels)``. ``examples`` holds all Gg rows followed
        by all Lo rows, shape (n_clicks, n_features). ``labels`` is the
        matching one-hot array of shape (n_clicks, 2): Gg = [1, 0], Lo = [0, 1].

    Raises:
        ValueError: if the requested days contain no recordings at all.
    """
    examples = []
    labels = []

    # (day dictionary, requested days, one-hot label) for each species;
    # Gg is processed first so its rows come first in the output.
    species_sources = (
        (gg_day_dict, train_days_gg, (1, 0)),
        (lo_day_dict, train_days_lo, (0, 1)),
    )
    for day_dict, days, one_hot in species_sources:
        for day in days:
            for recording in day_dict[day]:
                features = recording.features
                examples.append(features)
                # One label row per click. np.tile keeps the block 2-D
                # (n_rows, 2) even when a recording has zero clicks, so the
                # final concatenate never sees mismatched dimensions.
                labels.append(np.tile(one_hot, (features.shape[0], 1)))

    if not examples:
        raise ValueError("no recordings found for the requested training days")

    # Stack all recordings on top of one another.
    big_train_examples = np.concatenate(examples, axis=0)
    big_train_labels = np.concatenate(labels, axis=0)
    return big_train_examples, big_train_labels

In [61]:
# Build the stacked training set, then show the shape of examples and labels.
X_train, Y_train = create_train_data(gg_train_days, lo_train_days)
print(X_train.shape, Y_train.shape, sep="\n")

(24792, 20)
(24792, 2)


# Create Testing Data

Given a list of test days for each species:
- Examples
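    - create batches of features, 100 consecutive clicks per batch
    - each batch is of shape (100, 20)
    - each click in a batch comes from the same file; files with fewer than 100 clicks are skipped, and leftover clicks that do not fill a whole batch are dropped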

- Labels
    - create a list of batches of labels
    - each batch is of shape (100, 2)
    - each label in a batch comes from the same file
    - labels are still one hot encoded, refer to training data

In [110]:
# Counterpart of the training-data builder: clicks are first cut into
# 100-click batches per recording (via hunnit_group), and the batches of all
# recordings are then stacked along a new leading "batch" axis.
def create_test_data(test_days_gg, test_days_lo):
    """Build batched test examples and labels from the given test days.

    Recordings come from the notebook-level ``gg_day_dict`` / ``lo_day_dict``.
    Recordings with fewer than 100 clicks are skipped.

    Returns:
        (examples, labels): the per-recording batches produced by
        ``hunnit_group`` concatenated along axis 0, Gg recordings first.
    """
    example_groups = []
    label_groups = []

    def _add_recording(rec):
        # Only recordings that can fill at least one whole batch contribute.
        if len(rec.features) >= 100:
            batches, batch_labels = hunnit_group(rec)
            example_groups.append(batches)
            label_groups.append(batch_labels)

    # Gg test days first ...
    for gg_day in test_days_gg:
        for gg_recording in gg_day_dict[gg_day]:
            _add_recording(gg_recording)

    # ... then Lo test days.
    for lo_day in test_days_lo:
        for lo_recording in lo_day_dict[lo_day]:
            _add_recording(lo_recording)

    stacked_examples = np.concatenate(example_groups, axis=0)
    stacked_labels = np.concatenate(label_groups, axis=0)
    return stacked_examples, stacked_labels

In [111]:
# helper function to create batches, takes a metadata tuple
def hunnit_group(recording, batch_size=100, verbose=True):
    """Cut one recording's clicks into full, fixed-size batches with labels.

    Args:
        recording: metadata record exposing ``.features`` (sliceable, one row
            per click) and ``.label``.
        batch_size: number of clicks per batch (default 100).
        verbose: when True (default), print the row count and chosen label.

    Returns:
        Tuple ``(feature_batches, label_batches)`` of two equal-length lists.
        ``feature_batches[i]`` is a slice of ``batch_size`` consecutive rows
        of ``recording.features``. ``label_batches[i]`` is an independent
        (batch_size, 2) one-hot array. Trailing clicks that do not fill a
        whole batch are dropped; both lists are empty when the recording has
        fewer than ``batch_size`` clicks.

    Raises:
        ValueError: if ``batch_size`` is not a positive integer.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    recording_tensor = recording.features
    n_clicks = len(recording_tensor)
    if verbose:
        print("Number of rows in features {}".format(n_clicks))

    # "Gg" maps to [1,0]; every other label is treated as Lo ([0,1]).
    if recording.label == "Gg":
        if verbose:
            print("Label is Gg: [1,0]\n")
        label = [1, 0]
    else:
        if verbose:
            print("Label is Lo: [0,1]\n")
        label = [0, 1]

    # Stop early enough that every slice is a complete batch.
    batch_starts = range(0, n_clicks - batch_size + 1, batch_size)
    hunnit_batches = [recording_tensor[start:start + batch_size] for start in batch_starts]

    # Build a fresh label array per batch; repeating one array with `* k`
    # would make all batches share (and co-mutate) the same object.
    label_batches = [np.array([label] * batch_size) for _ in hunnit_batches]
    return hunnit_batches, label_batches

In [117]:
X_test, Y_test = create_test_data(gg_test_days, lo_test_days)

# Sanity-check the batched test set. Every batch has the same shape, so the
# first one is inspected: index 0 always exists once create_test_data has
# returned, whereas a hard-coded later index fails on small data sets.
print("Number of batches to go through: {}\n".format(len(X_test)))
print(X_test[0].shape)
print(len(Y_test[0]))
print(Y_test[0].shape)


Number of rows in features 120
Label is Gg: [1,0]

Number of rows in features 360
Label is Gg: [1,0]

Number of rows in features 1180
Label is Gg: [1,0]

Number of rows in features 4770
Label is Gg: [1,0]

Number of rows in features 108
Label is Lo: [0,1]

Number of rows in features 5158
Label is Lo: [0,1]

Number of rows in features 5294
Label is Lo: [0,1]

Number of rows in features 10098
Label is Lo: [0,1]

Number of batches to go through: 266

(100, 20)
100
(100, 2)


# Create Model

In [27]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras import regularizers

In [41]:
# Fully-connected classifier: 20 click features -> three hidden layers of 100
# ReLU units (L2 weight regularisation) -> softmax over the two species.
model = Sequential([
    Dense(100, input_dim=20, activation='relu', kernel_regularizer='l2'),
    Dense(100, activation='relu', kernel_regularizer='l2'),
    Dense(100, activation='relu', kernel_regularizer='l2'),
    Dense(2, activation='softmax'),
])

# The output is a softmax over mutually exclusive classes and the labels are
# one-hot encoded, so categorical cross-entropy is the matching loss.
# binary_crossentropy is meant for independent sigmoid outputs.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train Model

In [42]:
# Train for 5 epochs with mini-batches of 16 clicks. Despite its name,
# nn_model receives the value returned by fit() (Keras' training history),
# not the network itself - the trained network is still `model`.
nn_model = model.fit(x=X_train, y=Y_train, epochs=5, batch_size=16)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Test Model