In [1]:
import pandas as pd
import altair as alt
import os
import numpy as np

In [2]:
# Working directory for every relative path used below ('./Data', './scratch').
# Set the CCB_PROJECT_DIR environment variable to run this notebook on another
# machine; when it is unset the original author's location is used unchanged.
PROJECT_DIR = os.environ.get('CCB_PROJECT_DIR',
                             '/Users/cristiana/Documents/Duke/MP/Python')
os.chdir(PROJECT_DIR)

In [3]:
# Load the CCB acoustic deployment metadata.
# The deploy/recovery columns are parsed to datetimes on read so that the
# date arithmetic in later cells works without further conversion.
ccb_metadata_csv = './Data/CCB Metadata.csv'
ccb = pd.read_csv(
    ccb_metadata_csv,
    sep=',',
    encoding='utf-8',
    parse_dates=['deployDate', 'recoveryDate'],
)

ccb.head()

Unnamed: 0,c_recordOnDays,c_uniqueUnitID,deployDate,deploymentDepth_Meters,dutyCycle_Flag,latitudeDeployed_DecDeg,longitudeDeployed_DecDeg,recoveryDate,samplingRate_Hz
0,71,2011_BRP_CCB_S1016_Dep20_20110217_PU0205_FD020...,2011-02-17,37.2,N,41.9412,-70.288,2011-05-08,5000
1,80,2011_BRP_CCB_S1016_Dep20_20110217_PU0206_FD020...,2011-02-17,30.5,N,41.8771,-70.254,2011-05-08,5000
2,80,2011_BRP_CCB_S1016_Dep20_20110217_PU0207_FD020...,2011-02-17,30.9,N,41.8948,-70.4439,2011-05-08,5000
3,80,2011_BRP_CCB_S1016_Dep20_20110217_PU0208_FD020...,2011-02-17,32.1,N,41.9334,-70.1859,2011-05-08,5000
4,80,2011_BRP_CCB_S1016_Dep20_20110217_PU0209_FD020...,2011-02-17,42.1,N,41.9508,-70.3901,2011-05-08,5000


In [4]:
# Tag every record with a constant 1 (so counts can be taken with a sum)
# and with the calendar year of its deployDate.
ccb = ccb.assign(
    ones=1,
    depYear=ccb['deployDate'].dt.year,
)
ccb.head()

Unnamed: 0,c_recordOnDays,c_uniqueUnitID,deployDate,deploymentDepth_Meters,dutyCycle_Flag,latitudeDeployed_DecDeg,longitudeDeployed_DecDeg,recoveryDate,samplingRate_Hz,ones,depYear
0,71,2011_BRP_CCB_S1016_Dep20_20110217_PU0205_FD020...,2011-02-17,37.2,N,41.9412,-70.288,2011-05-08,5000,1,2011
1,80,2011_BRP_CCB_S1016_Dep20_20110217_PU0206_FD020...,2011-02-17,30.5,N,41.8771,-70.254,2011-05-08,5000,1,2011
2,80,2011_BRP_CCB_S1016_Dep20_20110217_PU0207_FD020...,2011-02-17,30.9,N,41.8948,-70.4439,2011-05-08,5000,1,2011
3,80,2011_BRP_CCB_S1016_Dep20_20110217_PU0208_FD020...,2011-02-17,32.1,N,41.9334,-70.1859,2011-05-08,5000,1,2011
4,80,2011_BRP_CCB_S1016_Dep20_20110217_PU0209_FD020...,2011-02-17,42.1,N,41.9508,-70.3901,2011-05-08,5000,1,2011


In [5]:
# Number of records per deployment year: sum the helper column of ones
# within each depYear group.
ccb_group = ccb.groupby(['depYear'])[['ones']].sum()
ccb_group

Unnamed: 0_level_0,ones
depYear,Unnamed: 1_level_1
2011,8
2013,9
2014,5
2015,4
2016,4
2017,4
2018,4


In [6]:
# Group by deployment year but keep the full record (all columns):
# for each depYear, look up the row label holding the earliest deployDate
# and pull those rows out of ccb.
# NOTE(review): this keeps a single hydrophone per deployment year -- the
# original author was not sure this is the desired selection.
earliest_row_labels = ccb.groupby("depYear")["deployDate"].idxmin()
ccb_group2 = ccb.loc[earliest_row_labels]
ccb_group2

Unnamed: 0,c_recordOnDays,c_uniqueUnitID,deployDate,deploymentDepth_Meters,dutyCycle_Flag,latitudeDeployed_DecDeg,longitudeDeployed_DecDeg,recoveryDate,samplingRate_Hz,ones,depYear
0,71,2011_BRP_CCB_S1016_Dep20_20110217_PU0205_FD020...,2011-02-17,37.2,N,41.9412,-70.288,2011-05-08,5000,1,2011
8,100,2013_BRP_CCB_S1011_Dep22_20130126_PU0016_FD02130,2013-01-28,37.2,N,41.94075,-70.287483,2013-05-06,5000,1,2013
21,172,2014_BRP_CCB_S1038_Dep23_20140131_AM0269_AM00000,2014-02-07,30.0,N,41.9563,-70.231333,2014-05-17,64000,1,2014
22,93,2015_BRP_CCB_S1059_Dep24_20150217_PU0148_FD02640,2015-02-17,38.0,N,41.9678,-70.255667,2015-05-21,5000,1,2015
26,111,2016_BRP_CCB_S1060_Dep25_20160214_PU0163_FD02810,2016-02-15,34.1,N,41.957367,-70.230967,2016-05-25,5000,1,2016
30,106,2017_BRP_CCB_S1073_Dep26_20170219_PU0100_FD029...,2017-02-21,37.5,N,41.967545,-70.254935,2017-05-16,5000,1,2017
34,106,2018_BRP_CCB_S1074_Dep27_20180213_PU0202_FD03186,2018-02-13,,N,41.95665,-70.23315,2018-05-30,5000,1,2018


In [7]:
def _deployment_days(row):
    """Return the daily dates from the row's deployDate through its recoveryDate."""
    return pd.date_range(row['deployDate'], row['recoveryDate'], freq='D')


# Attach the per-deployment list of days as 'between_days', then explode so
# that each deployment contributes one row per day (the other columns repeat).
ccb_group2 = ccb_group2.assign(
    between_days=ccb_group2.apply(_deployment_days, axis=1)
)
ccb_days = ccb_group2.explode('between_days')
ccb_days

Unnamed: 0,c_recordOnDays,c_uniqueUnitID,deployDate,deploymentDepth_Meters,dutyCycle_Flag,latitudeDeployed_DecDeg,longitudeDeployed_DecDeg,recoveryDate,samplingRate_Hz,ones,depYear,between_days
0,71,2011_BRP_CCB_S1016_Dep20_20110217_PU0205_FD020...,2011-02-17,37.2,N,41.94120,-70.28800,2011-05-08,5000,1,2011,2011-02-17
0,71,2011_BRP_CCB_S1016_Dep20_20110217_PU0205_FD020...,2011-02-17,37.2,N,41.94120,-70.28800,2011-05-08,5000,1,2011,2011-02-18
0,71,2011_BRP_CCB_S1016_Dep20_20110217_PU0205_FD020...,2011-02-17,37.2,N,41.94120,-70.28800,2011-05-08,5000,1,2011,2011-02-19
0,71,2011_BRP_CCB_S1016_Dep20_20110217_PU0205_FD020...,2011-02-17,37.2,N,41.94120,-70.28800,2011-05-08,5000,1,2011,2011-02-20
0,71,2011_BRP_CCB_S1016_Dep20_20110217_PU0205_FD020...,2011-02-17,37.2,N,41.94120,-70.28800,2011-05-08,5000,1,2011,2011-02-21
...,...,...,...,...,...,...,...,...,...,...,...,...
34,106,2018_BRP_CCB_S1074_Dep27_20180213_PU0202_FD03186,2018-02-13,,N,41.95665,-70.23315,2018-05-30,5000,1,2018,2018-05-26
34,106,2018_BRP_CCB_S1074_Dep27_20180213_PU0202_FD03186,2018-02-13,,N,41.95665,-70.23315,2018-05-30,5000,1,2018,2018-05-27
34,106,2018_BRP_CCB_S1074_Dep27_20180213_PU0202_FD03186,2018-02-13,,N,41.95665,-70.23315,2018-05-30,5000,1,2018,2018-05-28
34,106,2018_BRP_CCB_S1074_Dep27_20180213_PU0202_FD03186,2018-02-13,,N,41.95665,-70.23315,2018-05-30,5000,1,2018,2018-05-29


In [8]:
# Reduce the exploded acoustic table to the two columns that matter here:
# which unit was recording and on which day.
essential_columns = ['c_uniqueUnitID', 'between_days']
Acoustic_clean = ccb_days.loc[:, essential_columns]
Acoustic_clean

Unnamed: 0,c_uniqueUnitID,between_days
0,2011_BRP_CCB_S1016_Dep20_20110217_PU0205_FD020...,2011-02-17
0,2011_BRP_CCB_S1016_Dep20_20110217_PU0205_FD020...,2011-02-18
0,2011_BRP_CCB_S1016_Dep20_20110217_PU0205_FD020...,2011-02-19
0,2011_BRP_CCB_S1016_Dep20_20110217_PU0205_FD020...,2011-02-20
0,2011_BRP_CCB_S1016_Dep20_20110217_PU0205_FD020...,2011-02-21
...,...,...
34,2018_BRP_CCB_S1074_Dep27_20180213_PU0202_FD03186,2018-05-26
34,2018_BRP_CCB_S1074_Dep27_20180213_PU0202_FD03186,2018-05-27
34,2018_BRP_CCB_S1074_Dep27_20180213_PU0202_FD03186,2018-05-28
34,2018_BRP_CCB_S1074_Dep27_20180213_PU0202_FD03186,2018-05-29


In [9]:
# Master daily calendar for the whole study window
# (test range: 2011-01-01 through 2018-05-30, both ends included).
start_date = '2011-01-01'
end_date = '2018-05-30'

# One row per calendar day, stored in a column named 'between_days' so it can
# be joined against the per-day acoustic rows.
all_study_days = pd.date_range(start=start_date, end=end_date)
SevenYears = pd.DataFrame({'between_days': all_study_days})
SevenYears.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2707 entries, 0 to 2706
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   between_days  2707 non-null   datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 21.3 KB


In [10]:
# Left-join the per-day acoustic rows onto the master calendar: every calendar
# day is kept, and days without a matching acoustic row get NaN/NaT in the
# acoustic columns.
SevenYears_acou = pd.merge(
    SevenYears,
    ccb_days,
    how='left',
    on='between_days',
)
SevenYears_acou

Unnamed: 0,between_days,c_recordOnDays,c_uniqueUnitID,deployDate,deploymentDepth_Meters,dutyCycle_Flag,latitudeDeployed_DecDeg,longitudeDeployed_DecDeg,recoveryDate,samplingRate_Hz,ones,depYear
0,2011-01-01,,,NaT,,,,,NaT,,,
1,2011-01-02,,,NaT,,,,,NaT,,,
2,2011-01-03,,,NaT,,,,,NaT,,,
3,2011-01-04,,,NaT,,,,,NaT,,,
4,2011-01-05,,,NaT,,,,,NaT,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2702,2018-05-26,106.0,2018_BRP_CCB_S1074_Dep27_20180213_PU0202_FD03186,2018-02-13,,N,41.95665,-70.23315,2018-05-30,5000.0,1.0,2018.0
2703,2018-05-27,106.0,2018_BRP_CCB_S1074_Dep27_20180213_PU0202_FD03186,2018-02-13,,N,41.95665,-70.23315,2018-05-30,5000.0,1.0,2018.0
2704,2018-05-28,106.0,2018_BRP_CCB_S1074_Dep27_20180213_PU0202_FD03186,2018-02-13,,N,41.95665,-70.23315,2018-05-30,5000.0,1.0,2018.0
2705,2018-05-29,106.0,2018_BRP_CCB_S1074_Dep27_20180213_PU0202_FD03186,2018-02-13,,N,41.95665,-70.23315,2018-05-30,5000.0,1.0,2018.0


In [32]:
# export csv of SevenYears_acou
##SevenYears_acou.to_csv('./scratch/SevenYears_acou.csv')

In [13]:
# Restrict the calendar to the January-May season (month number <= 5) of each year.
is_jan_to_may = SevenYears_acou['between_days'].dt.month <= 5
SevenYears_seasonal = SevenYears_acou.loc[is_jan_to_may]
SevenYears_seasonal.head()

Unnamed: 0,between_days,c_recordOnDays,c_uniqueUnitID,deployDate,deploymentDepth_Meters,dutyCycle_Flag,latitudeDeployed_DecDeg,longitudeDeployed_DecDeg,recoveryDate,samplingRate_Hz,ones,depYear
0,2011-01-01,,,NaT,,,,,NaT,,,
1,2011-01-02,,,NaT,,,,,NaT,,,
2,2011-01-03,,,NaT,,,,,NaT,,,
3,2011-01-04,,,NaT,,,,,NaT,,,
4,2011-01-05,,,NaT,,,,,NaT,,,


In [45]:
#SevenYears_seasonal.to_csv('./scratch/SevenYears_seasonal.csv')

### Fake Data

In [16]:
# Synthetic ("fake") prey chunks used to prototype the prey/acoustic overlap.
# Each chunk is a constant Transect value over an inclusive date span.

# fake prey data -- chunk 1
p_count_1, p_start_1, p_end_1 = 45, '2011-01-20', '2013-03-20'

# fake prey data -- chunk 2
p_count_2, p_start_2, p_end_2 = 56, '2014-04-01', '2015-04-29'

# fake prey data -- chunk 3
p_count_3, p_start_3, p_end_3 = 100, '2016-02-15', '2018-05-05'


def _fake_prey_frame(transect, first_day, last_day):
    """Build one fake prey chunk: a row per day in the span, all sharing `transect`."""
    days = pd.date_range(first_day, last_day)
    return pd.DataFrame({'Transect': transect, 'between_days': days})


# create fake prey dataframes
prey1 = _fake_prey_frame(p_count_1, p_start_1, p_end_1)
prey2 = _fake_prey_frame(p_count_2, p_start_2, p_end_2)
prey3 = _fake_prey_frame(p_count_3, p_start_3, p_end_3)

# Trim to the Jan-May season so it lines up with the seasonal master frame.
# Only chunk 1 is trimmed here; prey2 and prey3 are left as full date spans.
prey1_seasonal = prey1.loc[prey1['between_days'].dt.month <= 5]

In [21]:
# Stack the seasonal prey rows underneath the seasonal acoustic calendar.
# This is a row-wise concatenation, not a join: a day present in both frames
# appears twice (once per source), and columns missing from one frame are
# filled with NaN/NaT. sort=True orders the combined columns alphabetically.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; the result is the same as the former append call.
prey1_join = pd.concat([SevenYears_seasonal, prey1_seasonal], sort = True)
prey1_join

Unnamed: 0,Transect,between_days,c_recordOnDays,c_uniqueUnitID,depYear,deployDate,deploymentDepth_Meters,dutyCycle_Flag,latitudeDeployed_DecDeg,longitudeDeployed_DecDeg,ones,recoveryDate,samplingRate_Hz
0,,2011-01-01,,,,NaT,,,,,,NaT,
1,,2011-01-02,,,,NaT,,,,,,NaT,
2,,2011-01-03,,,,NaT,,,,,,NaT,
3,,2011-01-04,,,,NaT,,,,,,NaT,
4,,2011-01-05,,,,NaT,,,,,,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
786,45.0,2013-03-16,,,,NaT,,,,,,NaT,
787,45.0,2013-03-17,,,,NaT,,,,,,NaT,
788,45.0,2013-03-18,,,,NaT,,,,,,NaT,
789,45.0,2013-03-19,,,,NaT,,,,,,NaT,


In [35]:
# Load the hand-edited test file in which some prey values were manually placed
# into the same column as the acoustic data.
# c_recordOnDays is the field holding the overlapping data (2011, Feb-March).
# Only between_days is parsed as a date here.
overlap_test_csv = './scratch/Prey1_join_test.csv'
overlap_test = pd.read_csv(overlap_test_csv, parse_dates=['between_days'])
overlap_test

Unnamed: 0.1,Unnamed: 0,Transect,between_days,c_recordOnDays,c_uniqueUnitID,depYear,deployDate,deploymentDepth_Meters,dutyCycle_Flag,latitudeDeployed_DecDeg,longitudeDeployed_DecDeg,ones,recoveryDate,samplingRate_Hz
0,0,,2011-01-01,,,,,,,,,,,
1,1,,2011-01-02,,,,,,,,,,,
2,2,,2011-01-03,,,,,,,,,,,
3,3,,2011-01-04,,,,,,,,,,,
4,4,,2011-01-05,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1567,2702,,2018-05-26,106.0,2018_BRP_CCB_S1074_Dep27_20180213_PU0202_FD03186,2018.0,2018-02-13,,N,41.95665,-70.23315,1.0,2018-05-30,5000.0
1568,2703,,2018-05-27,106.0,2018_BRP_CCB_S1074_Dep27_20180213_PU0202_FD03186,2018.0,2018-02-13,,N,41.95665,-70.23315,1.0,2018-05-30,5000.0
1569,2704,,2018-05-28,106.0,2018_BRP_CCB_S1074_Dep27_20180213_PU0202_FD03186,2018.0,2018-02-13,,N,41.95665,-70.23315,1.0,2018-05-30,5000.0
1570,2705,,2018-05-29,106.0,2018_BRP_CCB_S1074_Dep27_20180213_PU0202_FD03186,2018.0,2018-02-13,,N,41.95665,-70.23315,1.0,2018-05-30,5000.0


In [54]:
# Load the small Jan-March overlap test file; between_days is parsed as a date.
small_overlap_csv = './scratch/JanFebMarch_overlapTest_small.csv'
small_overlap = pd.read_csv(small_overlap_csv, parse_dates=['between_days'])

small_overlap

Unnamed: 0,between_days,c_recordOnDays,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,2011-01-01,,,,,,
1,2011-01-02,,,,,,
2,2011-01-03,,,,,,
3,2011-01-04,,,,,,
4,2011-01-05,,,,,,
...,...,...,...,...,...,...,...
119,2011-03-11,71.0,,,,,
120,2011-03-11,45.0,,,,,
121,2011-03-12,71.0,,,,,
122,2011-03-12,45.0,,,,,


In [56]:
# Keep only the date and the value column of the small overlap table.
overlap_columns = ['between_days', 'c_recordOnDays']
shmall_overlap = small_overlap.loc[:, overlap_columns]
shmall_overlap

Unnamed: 0,between_days,c_recordOnDays
0,2011-01-01,
1,2011-01-02,
2,2011-01-03,
3,2011-01-04,
4,2011-01-05,
...,...,...
119,2011-03-11,71.0
120,2011-03-11,45.0
121,2011-03-12,71.0
122,2011-03-12,45.0


In [57]:
# Inspect the column dtypes of the trimmed overlap table before plotting.
shmall_overlap.dtypes

between_days      datetime64[ns]
c_recordOnDays           float64
dtype: object

In [22]:
#prey1_join.to_csv('./scratch/Prey1_join.csv')

In [None]:
# steps
    # make prey chunks seasonal (trim to just Jan-May)
    # join (/concatenate) seasonal prey chunk 1 to my SevenYears df -- see if it duplicates the days
    # test this visually - can I make both prey and acoustics show up as separate bars on the same days?

### Plot
< years of acoustic data>

In [36]:
# High-level year summary: one rect per day (month/date on x), one row per
# calendar year, coloured by deployment year.
# Known issue noted by the author: this view has a leap-year gap.
dep_year_colors = alt.Scale(
    domain=['null', '2011', '2013', '2014', '2015', '2016', '2017', '2018'],
    range=['white', 'red', 'orange', 'yellow', 'green', 'blue', 'indigo', 'violet'],
)

alt.Chart(SevenYears_seasonal).mark_rect().encode(
    x=alt.X('monthdate(between_days):T'),
    y=alt.Y('year(between_days):O'),
    color=alt.Color('depYear:N', scale=dep_year_colors),
).properties(width=700, height=100)


In [37]:
# Same idea as the year summary, but each year gets its own facet row and the
# deployment is drawn as a bar spanning deployDate -> recoveryDate.
alt.Chart(SevenYears_seasonal).mark_bar().encode(
    x=alt.X('monthdate(deployDate):T'),
    x2=alt.X2('monthdate(recoveryDate):T'),
).facet(row='year(between_days):O')

# TODO: how to show the 2012 gap?

In [38]:
# Similar plot, but with the year placed on the y axis of a single chart
# instead of faceting: bars span deployDate -> recoveryDate.
alt.Chart(SevenYears_seasonal).mark_bar().encode(
    x=alt.X('monthdate(deployDate):T'),
    x2=alt.X2('monthdate(recoveryDate):T'),
    y=alt.Y('year(between_days):T'),
).properties(width=900, height=300)

# TODO: how to start the x axis from Jan 1?

In [39]:
# Peek at the first rows of the manually edited prey/acoustic test table.
overlap_test.head()

Unnamed: 0.1,Unnamed: 0,Transect,between_days,c_recordOnDays,c_uniqueUnitID,depYear,deployDate,deploymentDepth_Meters,dutyCycle_Flag,latitudeDeployed_DecDeg,longitudeDeployed_DecDeg,ones,recoveryDate,samplingRate_Hz
0,0,,2011-01-01,,,,,,,,,,,
1,1,,2011-01-02,,,,,,,,,,,
2,2,,2011-01-03,,,,,,,,,,,
3,3,,2011-01-04,,,,,,,,,,,
4,4,,2011-01-05,,,,,,,,,,,


In [41]:
# Check the column dtypes of the test table before plotting it.
overlap_test.dtypes

Unnamed: 0                           int64
Transect                           float64
between_days                datetime64[ns]
c_recordOnDays                     float64
c_uniqueUnitID                      object
depYear                            float64
deployDate                          object
deploymentDepth_Meters             float64
dutyCycle_Flag                      object
latitudeDeployed_DecDeg            float64
longitudeDeployed_DecDeg           float64
ones                               float64
recoveryDate                        object
samplingRate_Hz                    float64
dtype: object

In [66]:
# Plot the overlap test data: c_recordOnDays (treated as a category) against
# month/date, coloured by the same field, with one facet row per year.
alt.Chart(overlap_test).mark_bar().encode(
    x=alt.X('monthdate(between_days):T'),
    y=alt.Y('c_recordOnDays:N'),
    color=alt.Color('c_recordOnDays:N'),
).facet(row='year(between_days):T')

In [65]:
# Small overlap subset as rects: one category row per c_recordOnDays value,
# month/date on x, coloured by c_recordOnDays.
alt.Chart(shmall_overlap).mark_rect().encode(
    x=alt.X('monthdate(between_days):T'),
    y=alt.Y('c_recordOnDays:N'),
    color=alt.Color('c_recordOnDays:N'),
)

In [61]:
# Small overlap subset again, this time with the month on y, month/date on x,
# colour by c_recordOnDays, and one facet row per year.
alt.Chart(shmall_overlap).mark_rect().encode(
    x=alt.X('monthdate(between_days):T'),
    y=alt.Y('month(between_days):O'),
    color=alt.Color('c_recordOnDays:N'),
).facet(row='year(between_days)')