This Script:
- brings in **acoustic data**
    - explodes each deployment into one row per day between deployment and recovery
- makes a **test date range** (Jan 1 - Feb 28 2011)
    - left joins (cleaned, fewer columns) acoustic data to test date range 
        --> **acou_JanFeb**
- creates **fake prey data**
    - left joins **fake prey data** to **acou_JanFeb** 
        --> **CCB_JanFeb**
- makes some plots of Jan/Feb
- attempts to create a new column that describes data availability/overlap

In [4]:
import pandas as pd
import altair as alt
import os
import numpy as np

In [5]:
# Make the project folder the working directory; the relative paths passed to
# read_csv later ('./Data/...', './scratch/...') are resolved against it.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
os.chdir('/Users/cristiana/Documents/Duke/MP/Python')

In [6]:
# Load the CCB acoustic deployment metadata. The deployment and recovery
# columns are parsed as datetimes so they can be passed to pd.date_range.
ccb = pd.read_csv(
    './Data/CCB Metadata.csv',
    sep=',',
    encoding='utf-8',
    parse_dates=['deployDate', 'recoveryDate'],
)

ccb.head()

Unnamed: 0,c_recordOnDays,c_uniqueUnitID,deployDate,deploymentDepth_Meters,dutyCycle_Flag,latitudeDeployed_DecDeg,longitudeDeployed_DecDeg,recoveryDate,samplingRate_Hz
0,71,2011_BRP_CCB_S1016_Dep20_20110217_PU0205_FD020...,2011-02-17,37.2,N,41.9412,-70.288,2011-05-08,5000
1,80,2011_BRP_CCB_S1016_Dep20_20110217_PU0206_FD020...,2011-02-17,30.5,N,41.8771,-70.254,2011-05-08,5000
2,80,2011_BRP_CCB_S1016_Dep20_20110217_PU0207_FD020...,2011-02-17,30.9,N,41.8948,-70.4439,2011-05-08,5000
3,80,2011_BRP_CCB_S1016_Dep20_20110217_PU0208_FD020...,2011-02-17,32.1,N,41.9334,-70.1859,2011-05-08,5000
4,80,2011_BRP_CCB_S1016_Dep20_20110217_PU0209_FD020...,2011-02-17,42.1,N,41.9508,-70.3901,2011-05-08,5000


In [7]:
# For every deployment, build the daily date range running from deployDate to
# recoveryDate, store it in 'between_days', then explode so each day gets its
# own row (the original row index is repeated for every day of a deployment).
ccb['between_days'] = ccb.apply(
    lambda deployment: pd.date_range(
        start=deployment['deployDate'],
        end=deployment['recoveryDate'],
        freq='D',
    ),
    axis=1,
)
ccb_days = ccb.explode(column='between_days')
ccb_days

Unnamed: 0,c_recordOnDays,c_uniqueUnitID,deployDate,deploymentDepth_Meters,dutyCycle_Flag,latitudeDeployed_DecDeg,longitudeDeployed_DecDeg,recoveryDate,samplingRate_Hz,between_days
0,71,2011_BRP_CCB_S1016_Dep20_20110217_PU0205_FD020...,2011-02-17,37.2,N,41.941200,-70.288000,2011-05-08,5000,2011-02-17
0,71,2011_BRP_CCB_S1016_Dep20_20110217_PU0205_FD020...,2011-02-17,37.2,N,41.941200,-70.288000,2011-05-08,5000,2011-02-18
0,71,2011_BRP_CCB_S1016_Dep20_20110217_PU0205_FD020...,2011-02-17,37.2,N,41.941200,-70.288000,2011-05-08,5000,2011-02-19
0,71,2011_BRP_CCB_S1016_Dep20_20110217_PU0205_FD020...,2011-02-17,37.2,N,41.941200,-70.288000,2011-05-08,5000,2011-02-20
0,71,2011_BRP_CCB_S1016_Dep20_20110217_PU0205_FD020...,2011-02-17,37.2,N,41.941200,-70.288000,2011-05-08,5000,2011-02-21
...,...,...,...,...,...,...,...,...,...,...
37,106,2018_BRP_CCB_S1074_Dep27_20180213_PU0222_FD03178,2018-02-13,,N,41.937667,-70.237983,2018-05-30,5000,2018-05-26
37,106,2018_BRP_CCB_S1074_Dep27_20180213_PU0222_FD03178,2018-02-13,,N,41.937667,-70.237983,2018-05-30,5000,2018-05-27
37,106,2018_BRP_CCB_S1074_Dep27_20180213_PU0222_FD03178,2018-02-13,,N,41.937667,-70.237983,2018-05-30,5000,2018-05-28
37,106,2018_BRP_CCB_S1074_Dep27_20180213_PU0222_FD03178,2018-02-13,,N,41.937667,-70.237983,2018-05-30,5000,2018-05-29


### Create new dataframe with test date range (Jan-Feb of 2011)

In [None]:
# Calendar of test days: one row per day from start_date to end_date
# (Jan-Feb 2011 for now; widen these bounds for the full time period).
start_date = '2011-01-01'
end_date = '2011-02-28'

JanFeb2011 = pd.date_range(
    start=start_date,
    end=end_date,
    freq='D',
).to_frame(index=False, name='between_days')
JanFeb2011.info()

In [None]:
# Left-join the exploded acoustic rows onto the Jan-Feb calendar, so every
# calendar day is kept whether or not a deployment covers it.
JanFeb_ccb = pd.merge(
    JanFeb2011,
    ccb_days,
    how='left',
    on='between_days',
)
JanFeb_ccb

In [None]:
#pd.DataFrame.to_csv(JanFeb_ccb, "./scratch/JanFeb_ccb.csv")

### Create example prey data

In [None]:
# Fake prey data for testing: one row per day from prey_start to prey_end
# (Jan 03 - Feb 03, 2011), each carrying the same constant Transect value.
prey_count = 40
prey_start = '2011-01-03'
prey_end = '2011-02-03'

prey = pd.date_range(
    start=prey_start,
    end=prey_end,
    freq='D',
).to_frame(index=False, name='between_days')
# put the constant column first so the column order is Transect, between_days
prey.insert(0, 'Transect', prey_count)
prey.head()

### Clean acoustic data

In [None]:
# preview the first rows of the exploded (one row per day) acoustic data
ccb_days.head()

In [None]:
# Keep only the recorder identifier and the per-day date; the remaining
# deployment metadata is not needed for the availability join.
Acoustic_clean = ccb_days.loc[:, ['c_uniqueUnitID', 'between_days']]
Acoustic_clean

In [None]:
# Left-join the trimmed acoustic rows onto the Jan-Feb calendar; days with no
# matching acoustic row are kept with a missing c_uniqueUnitID.
acou_JanFeb = pd.merge(
    JanFeb2011,
    Acoustic_clean,
    how='left',
    on='between_days',
)
acou_JanFeb

In [None]:
# Left-join the fake prey data onto acou_JanFeb by day; days outside the prey
# date range are kept with a missing Transect value.
CCB_JanFeb = pd.merge(
    acou_JanFeb,
    prey,
    how='left',
    on='between_days',
)
CCB_JanFeb

In [None]:
# check the dtype of each column in the joined table
CCB_JanFeb.dtypes

### Create fake 'data availability' column

In [None]:
# display the joined acoustic + prey table before adding an availability column
CCB_JanFeb

In [3]:
# Load the previously saved Jan/Feb table that already carries the fake
# 'DataAvail' column (the "twoyear" variant, going by the file name), parsing
# between_days as datetimes for plotting.
CCB_fake = pd.read_csv(
    './scratch/CCB_JanFeb_fakecolumn_twoyear.csv',
    sep=',',
    encoding='utf-8',
    parse_dates=['between_days'],
)

CCB_fake.head()

Unnamed: 0.1,Unnamed: 0,between_days,c_uniqueUnitID,Transect,DataAvail
0,0.0,2011-01-01,,,
1,1.0,2011-01-02,,,
2,2.0,2011-01-03,,40.0,Prey
3,3.0,2011-01-04,,40.0,Prey
4,4.0,2011-01-05,,40.0,Prey


In [None]:
# Fake availability flag used to try out the plots: January dates are marked
# 'No', every other date 'Yes'.
# The month of a datetime is an integer (1-12), so it is compared with 1.
# Comparing it with the string '01' is never true and labels every row 'Yes'.
CCB_JanFeb['DataAvail'] = np.where(CCB_JanFeb['between_days'].dt.month == 1, 'No', 'Yes')
CCB_JanFeb

In [None]:
#pd.DataFrame.to_csv(CCB_JanFeb, "./scratch/CCB_JanFeb_fakecolumn.csv")

In [None]:
# the acoustics column is a general type ('object') that has floats and strings..conditions aren't picking up NaN's
# NOTE: astype returns a new Series that is only displayed here; it is not
# assigned back, so CCB_JanFeb['c_uniqueUnitID'] itself is left unchanged.
CCB_JanFeb['c_uniqueUnitID'].astype('str')

In [None]:
# summarize columns, non-null counts and dtypes of the joined table
CCB_JanFeb.info()

In [None]:
# inspect the Python type of the c_uniqueUnitID value stored at index label 1
type(CCB_JanFeb['c_uniqueUnitID'][1])

### Try to list data availability/overlap in one column

In [None]:
# display the prey (Transect) column on its own
CCB_JanFeb['Transect']

In [None]:
# print every Transect value next to its Python type, to see how missing
# values are represented in this column
for x in CCB_JanFeb['Transect']:
    print(x, type(x))

In [None]:
# Flag whether prey data exist for each day: 'No' where Transect is missing,
# 'Yes' otherwise.
# Missing values in this column are float NaN, not the string 'nan', and NaN
# never compares equal to anything, so missingness is tested with isna().
CCB_JanFeb['DataAvail'] = np.where(CCB_JanFeb['Transect'].isna(), 'No', 'Yes')
CCB_JanFeb

In [None]:
# Describe, per row, which data sources are available.
# Missing entries are real NaN values, so presence is tested with notna().
# Comparing against the string 'NaN' never matches a missing value, which
# made every row satisfy the second condition and come out as 'Both'.
has_acoustic = CCB_JanFeb['c_uniqueUnitID'].notna()
has_prey = CCB_JanFeb['Transect'].notna()

# create conditions (same order as `values` below)
conditions = [
        ~has_acoustic & ~has_prey,
        has_acoustic & has_prey,
        ~has_acoustic & has_prey,
        has_acoustic & ~has_prey]

# create resulting values for conditions
values = ['No Data', 'Both', 'Prey', 'Acoustic']

# create column to fill
# The four conditions cover every row, so the default is never used; a string
# default is still passed because np.select's implicit default is the integer
# 0, which does not share a dtype with the string choices.
CCB_JanFeb['DataAvailability'] = np.select(conditions, values, default='Unknown')

CCB_JanFeb

In [None]:
# Reference example of the np.select pattern (tiering rows by 'likes_count').
# NOTE(review): `df` is not defined anywhere in the visible notebook, so this
# cell presumably raises NameError as written. If `df` did exist, the cell
# would rebind the `conditions` and `values` names used for CCB_JanFeb above.
# create a list of our conditions
conditions = [
    (df['likes_count'] <= 2),
    (df['likes_count'] > 2) & (df['likes_count'] <= 9),
    (df['likes_count'] > 9) & (df['likes_count'] <= 15),
    (df['likes_count'] > 15)
    ]

# create a list of the values we want to assign for each condition
values = ['tier_4', 'tier_3', 'tier_2', 'tier_1']

# create a new column and use np.select to assign values to it using our lists as arguments
df['tier'] = np.select(conditions, values)

# display updated DataFrame
df.head()

### Plots

In [7]:
# plot prey and acoustic data availability in Jan and Feb:
# one rectangle per day, coloured by the DataAvail category, with one facet
# row per calendar month
alt.Chart(CCB_fake).mark_rect().encode(
    x = alt.X('between_days:T', timeUnit = 'monthdate'), # 'monthdate' time unit: position by month + day of month
    color = 'DataAvail:N'
).properties(
    width = 500,
    height = 50
).facet(
    row = 'month(between_days)')



In [8]:
# plot of available months and years of data:
# x is the month + day of month, y and colour are the DataAvail category,
# and each year of between_days gets its own facet row
alt.Chart(CCB_fake).mark_rect().encode(
    y = alt.Y('DataAvail:N'),
    x = alt.X('between_days:T', timeUnit = 'monthdate'),
    color = 'DataAvail:N'
).facet(
    row = 'year(between_days)'
)

In [14]:
# plot data availability by day of month:
# x is the day-of-month of between_days treated as an ordinal axis, y and
# colour are the DataAvail category, with one facet row per year
alt.Chart(CCB_fake).mark_rect().encode(
    x = alt.X('date(between_days):O'), # 'date' time unit: day of the month only
    y = alt.Y('DataAvail:N'),
    color = 'DataAvail:N'
).properties(
    width = 500,
    height = 50
).facet(
    row = 'year(between_days)')

