In [23]:
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
from lib.partition import split_by_day
import lib.file_utilities as util
import os
import datetime
from sklearn.model_selection import train_test_split 
import pandas as pd

In [2]:
# Absolute paths of the per-species click folders (Gg and Lo), resolved
# against the current working directory.
ggdir, lodir = (
    os.path.abspath(os.path.join(".", "features", species_code))
    for species_code in ("Gg", "Lo")
)

In [3]:
# Turn on matplotlib's interactive mode.
plt.ion()

# Limit handed to util.get_files for each species. Per the original note, an
# infinite limit (np.inf) is the way to request all files -- presumably the
# value caps how many files are returned; confirm against util.get_files.
use_onlyN = 10

# List the ".czcc" click files for each species (Gg first, then Lo).
ggfiles, lofiles = [
    util.get_files(species_dir, ".czcc", use_onlyN)
    for species_dir in (ggdir, lodir)
]

In [37]:
# Parse the click files of each species. Per the original notes, each parsed
# record exposes .site, .label, .start and .features (only .features is read
# later in this notebook).
ggmeta_data, lometa_data = util.parse_files(ggfiles), util.parse_files(lofiles)

# Group the parsed records by day: each key is a day and each value is the
# list of records that fall on that day.
gg_day_dict, lo_day_dict = split_by_day(ggmeta_data), split_by_day(lometa_data)

# Materialise the day keys as plain lists so they can be split into
# train/test groups.
gg_keys, lo_keys = [*gg_day_dict.keys()], [*lo_day_dict.keys()]

Extracting information about files and loading features for  10 recordings.
Reading file 0/10
Extracting information about files and loading features for  11 recordings.
Reading file 0/11
10
[[ 21.22266197  18.08026123 -55.92998505 ...  10.63014317 -20.54516792
    0.75878745]
 [ 44.82302856  -0.89646739 -48.98347092 ...   7.28026104  -2.92683625
   -6.40061665]
 [ 60.1114502   20.60423088 -35.70820999 ...   3.87498951  -3.25583935
   -8.86636543]
 ...
 [123.2456131  -37.61224365 -94.31462097 ...   1.51197267  -4.94524336
   -6.63756227]
 [ 95.93180847 -18.74723434 -58.40883255 ...   4.69485521  -2.48706627
    7.36611462]
 [ 74.69039917   3.21624899 -47.29899979 ...   7.31964016  -4.19823313
   -3.24057126]]
4070


In [5]:
# Split each species' list of days into training days and test days.
# Each result is a two-element list:
#   <species>_train_test_days[0] -> training days
#   <species>_train_test_days[1] -> test days
# Both species share the same options (33% of days held out, fixed seed).
day_split_options = {"test_size": 0.33, "random_state": 42}
gg_train_test_days = train_test_split(gg_keys, **day_split_options)
lo_train_test_days = train_test_split(lo_keys, **day_split_options)

In [6]:
def _collect_features(day_dict, train_test_days):
    """Gather the feature matrices for the train and test days of one species.

    Prints a progress trace while iterating: the split index (0 = train,
    1 = test), every day visited, and the number of records on that day.

    Args:
        day_dict: mapping from a day to the list of records for that day;
            every record must expose a ``.features`` attribute.
        train_test_days: two-element sequence ``[train_days, test_days]``.

    Returns:
        Tuple ``(train_features, test_features)``: two lists holding the
        ``.features`` value of every record, in the order the days are listed.
    """
    train_features = []
    test_features = []
    # Pair each split (index 0 = train, index 1 = test) with its target list.
    buckets = (train_features, test_features)
    for split_index, (days, bucket) in enumerate(zip(train_test_days, buckets)):
        print("\nIn {} data\n".format(split_index))
        for day in days:
            print("Curr Date is {} \n".format(day))
            day_records = day_dict[day]
            print("Number of features in this date {} \n".format(len(day_records)))
            bucket.extend(record.features for record in day_records)
    return train_features, test_features


print("Number of days in gg Train data {} \n".format(len(gg_train_test_days[0])))
print("Number of days in gg Test data {}\n".format(len(gg_train_test_days[1])))

# Feature matrices for the Gg species, split by train/test day.
print("This is for GG features")
gg_train_features, gg_test_features = _collect_features(gg_day_dict, gg_train_test_days)

# Feature matrices for the Lo species, split by train/test day.
print()
print("This is for LO features")
lo_train_features, lo_test_features = _collect_features(lo_day_dict, lo_train_test_days)

print("GG Training Features \n{}".format(len(gg_train_features)))
print("GG Testing Features \n{}".format(len(gg_test_features)))

# The training count was previously printed under a "Testing" label.
print("LO Training Features \n{}".format(len(lo_train_features)))
print("LO Testing Features \n{}".format(len(lo_test_features)))
            

            
        

Number of days in gg Train data
3
Number of days in gg Test data
2

[[datetime.date(2006, 8, 15), datetime.date(2009, 1, 25), datetime.date(2006, 8, 17)], [datetime.date(2009, 2, 23), datetime.date(2006, 8, 19)]]
2

This is for GG features

In 0 data

Curr Date is 2006-08-15 

Number of features in this date 2 

Curr Date is 2009-01-25 

Number of features in this date 2 

Curr Date is 2006-08-17 

Number of features in this date 1 


In 1 data

Curr Date is 2009-02-23 

Number of features in this date 3 

Curr Date is 2006-08-19 

Number of features in this date 2 


This is for LO features

In 0 data

Curr Date is 2007-02-20 

Number of features in this date 3 

Curr Date is 2007-02-27 

Number of features in this date 1 

Curr Date is 2007-02-24 

Number of features in this date 1 

Curr Date is 2006-10-18 

Number of features in this date 1 


In 1 data

Curr Date is 2009-02-19 

Number of features in this date 1 

Curr Date is 2007-01-24 

Number of features in this date 3 

Curr 

20
20
[[ 21.22266197  18.08026123 -55.92998505 ...  10.63014317 -20.54516792
    0.75878745]
 [ 44.82302856  -0.89646739 -48.98347092 ...   7.28026104  -2.92683625
   -6.40061665]
 [ 60.1114502   20.60423088 -35.70820999 ...   3.87498951  -3.25583935
   -8.86636543]
 ...
 [123.2456131  -37.61224365 -94.31462097 ...   1.51197267  -4.94524336
   -6.63756227]
 [ 95.93180847 -18.74723434 -58.40883255 ...   4.69485521  -2.48706627
    7.36611462]
 [ 74.69039917   3.21624899 -47.29899979 ...   7.31964016  -4.19823313
   -3.24057126]]
