### Overlap Framework

This script does the following
- brings in 3 dataframes (acoustic, whale, and zooplankton data)
    - formats each piece for combination into single df
- combines pieces into single df

... next need to figure out how to generate overlap report

In [28]:
import pandas as pd
import numpy as np

`Acoustic Data`

In [2]:
# read in acoustic deployment CSV (received from Enrico on 3/25/21)
ccb = pd.read_csv('../InputsForEric_OverlapFramework/Acoustic_deployments_2008-2018.csv',
                 sep = ',',
                 encoding = 'utf-8',
                 parse_dates = ['deployDate','recoveryDate'])
ccb

Unnamed: 0,deployDate,recoveryDate
0,2008-02-23,2008-05-20
1,2009-02-17,2009-05-16
2,2010-03-08,2010-05-05
3,2011-02-17,2011-05-08
4,2013-01-28,2013-05-06
5,2014-03-01,2014-05-21
6,2015-02-17,2015-05-21
7,2016-02-15,2016-05-25
8,2017-02-21,2017-05-16
9,2018-02-13,2018-05-30


In [3]:
# create between_days column that fills in days between deployDate and recoveryDate
ccb['between_days'] = ccb.apply(lambda row: pd.date_range(row['deployDate'],row['recoveryDate'],freq='D'), axis=1)

# explode out between_days so there is one record per day
ccb_explode = ccb.explode('between_days')

ccb_explode

Unnamed: 0,deployDate,recoveryDate,between_days
0,2008-02-23,2008-05-20,2008-02-23
0,2008-02-23,2008-05-20,2008-02-24
0,2008-02-23,2008-05-20,2008-02-25
0,2008-02-23,2008-05-20,2008-02-26
0,2008-02-23,2008-05-20,2008-02-27
...,...,...,...
9,2018-02-13,2018-05-30,2018-05-26
9,2018-02-13,2018-05-30,2018-05-27
9,2018-02-13,2018-05-30,2018-05-28
9,2018-02-13,2018-05-30,2018-05-29


In [11]:
# create clean acoustic CSV for plotting
acoustic_clean = ccb_explode[['between_days']]

# create data type column
acoustic_clean['DataType'] = 'Acoustic' 

# rename date column
acoustic_clean = acoustic_clean.rename(columns = {'between_days':'Date'})

acoustic_clean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,Date,DataType
0,2008-02-23,Acoustic
0,2008-02-24,Acoustic
0,2008-02-25,Acoustic
0,2008-02-26,Acoustic
0,2008-02-27,Acoustic
...,...,...
9,2018-05-26,Acoustic
9,2018-05-27,Acoustic
9,2018-05-28,Acoustic
9,2018-05-29,Acoustic


`Whale Data`

In [5]:
# read in NARWC sightings data (sent from Enrico; date column created in R)
sightings_date = pd.read_csv('../InputsForEric_OverlapFramework/NARWC_Sightings_DateColumn.csv',
                           sep = ',',
                           encoding = 'utf-8',
                           parse_dates = ['date'])
sightings_date

Unnamed: 0.1,Unnamed: 0,SightingId,MatchingStatusId,SightingEGNo,Age,AgeClassCode,IntermatchCode,SightingYear,SightingMonth,SightingDay,SightingTime,SightingLetter,Latitude,Longitude,ObserverCode,AreaCode,RegionCode,Behaviors,date
0,1,28,3,1001.0,A,A,,1986,2,16,1747,C,29.56500,-81.15333,NEA/A,FL,SEUS,W/CALF,1986-02-16
1,2,29,3,1001.0,A,A,,1986,2,17,1735,M,29.38167,-81.07000,NEA/A,FL,SEUS,W/CALF,1986-02-17
2,3,30,3,1001.0,A,A,,1986,2,19,1710,A,29.12333,-80.90167,NEA/A,FL,SEUS,W/CALF,1986-02-19
3,4,31,3,1001.0,A,A,,1986,2,20,0,?,29.05000,-80.81667,CALD,FL,SEUS,W/CALF,1986-02-20
4,5,32,3,1001.0,A,A,,1986,5,15,1630,?,41.38333,-69.08167,URI/A,GSC,GSC,SKM FD,1986-05-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84504,84505,108231,1,,,,,2019,8,3,924,B,47.77020,-64.10043,DFO/VL/C,GSL,NRTH,,2019-08-03
84505,84506,108232,1,,,,,2019,8,3,941,C,47.77625,-64.09368,DFO/VL/C,GSL,NRTH,,2019-08-03
84506,84507,108234,1,,,,,2019,8,3,1002,D,47.77622,-64.09387,DFO/VL/C,GSL,NRTH,,2019-08-03
84507,84508,108235,1,,,,,2019,8,3,1117,E,47.79905,-63.97385,DFO/VL/C,GSL,NRTH,,2019-08-03


In [6]:
# filter data to just include CCB sightings between 2008 and 2018 to match our other datasets
sightings_ccb = sightings_date.query('SightingYear >= 2008 & SightingYear <= 2018 & AreaCode == "CCB"')

sightings_ccb

Unnamed: 0.1,Unnamed: 0,SightingId,MatchingStatusId,SightingEGNo,Age,AgeClassCode,IntermatchCode,SightingYear,SightingMonth,SightingDay,SightingTime,SightingLetter,Latitude,Longitude,ObserverCode,AreaCode,RegionCode,Behaviors,date
42539,42540,62095,3,2645.0,12,A,,2008,1,12,1527,A,41.80000,-70.35000,CCS/A,CCB,NE,"ENTGL, FRST ENTGL, LIN TR, NOT FL",2008-01-12
43170,43171,62762,3,3530.0,4,J,,2008,3,27,1133,L,41.95833,-70.39167,CCS/A,CCB,NE,MOPN,2008-03-27
43491,43492,63922,3,1971.0,19,A,,2008,4,8,1751,#1,41.90500,-70.43333,CCS/A,CCB,NE,"CO FD, SUB FD",2008-04-08
43496,43497,63927,3,3040.0,A,A,,2008,4,9,1222,GG,41.86167,-70.38000,CCS/A,CCB,NE,"ECH, SKM FD",2008-04-09
43543,43544,63063,3,3710.0,1,J,2007CalfOf2460,2008,2,3,1112,A,41.96333,-70.08333,CCS/A,CCB,NE,"LIN TR, MOPN",2008-02-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81279,81280,104753,3,3860.0,10,A,,2018,4,27,936,E,42.04500,-70.57500,WHOI/DR,CCB,NE,"BLOW, SKM FD",2018-04-27
81280,81281,104754,3,2340.0,A,A,,2018,4,27,1033,F,42.05097,-70.57461,WHOI/DR,CCB,NE,"BLOW, CO FD, SKM FD",2018-04-27
81281,81282,104755,3,1307.0,A,A,,2018,4,27,1033,G,42.05097,-70.57461,WHOI/DR,CCB,NE,"CO FD, SKM FD",2018-04-27
81539,81540,105020,3,3260.0,A,A,,2018,3,29,1542,C,42.05000,-70.53330,WHOI/DR,CCB,NE,"SUB FD, UW EXH",2018-03-29


In [7]:
# create column to count number of sightings for each date (assuming each record in OG spreadsheet = 1 whale sighting)
sightings_ccb['sightings'] = 1
sightings_ccb.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0.1,Unnamed: 0,SightingId,MatchingStatusId,SightingEGNo,Age,AgeClassCode,IntermatchCode,SightingYear,SightingMonth,SightingDay,SightingTime,SightingLetter,Latitude,Longitude,ObserverCode,AreaCode,RegionCode,Behaviors,date,sightings
42539,42540,62095,3,2645.0,12,A,,2008,1,12,1527,A,41.8,-70.35,CCS/A,CCB,NE,"ENTGL, FRST ENTGL, LIN TR, NOT FL",2008-01-12,1
43170,43171,62762,3,3530.0,4,J,,2008,3,27,1133,L,41.95833,-70.39167,CCS/A,CCB,NE,MOPN,2008-03-27,1
43491,43492,63922,3,1971.0,19,A,,2008,4,8,1751,#1,41.905,-70.43333,CCS/A,CCB,NE,"CO FD, SUB FD",2008-04-08,1
43496,43497,63927,3,3040.0,A,A,,2008,4,9,1222,GG,41.86167,-70.38,CCS/A,CCB,NE,"ECH, SKM FD",2008-04-09,1
43543,43544,63063,3,3710.0,1,J,2007CalfOf2460,2008,2,3,1112,A,41.96333,-70.08333,CCS/A,CCB,NE,"LIN TR, MOPN",2008-02-03,1


In [8]:
# groupby survey date and sum sightings for each date
sightings_group = sightings_ccb.groupby(['date']).agg({'sightings':'sum'})
sightings_group

Unnamed: 0_level_0,sightings
date,Unnamed: 1_level_1
2008-01-12,1
2008-01-29,1
2008-02-03,2
2008-02-04,2
2008-02-21,1
...,...
2018-05-13,1
2018-12-11,9
2018-12-14,5
2018-12-19,8


In [15]:
sightings_group = sightings_group.reset_index()

In [16]:
sightings_group = sightings_group.rename(columns={'date':'Date'})
sightings_group['DataType'] = 'Whale'
sightings_group

Unnamed: 0,Date,sightings,DataType
0,2008-01-12,1,Whale
1,2008-01-29,1,Whale
2,2008-02-03,2,Whale
3,2008-02-04,2,Whale
4,2008-02-21,1,Whale
...,...,...,...
358,2018-05-13,1,Whale
359,2018-12-11,9,Whale
360,2018-12-14,5,Whale
361,2018-12-19,8,Whale


`Zooplankton`

In [19]:
# read in zooplank data (pre cleaned in google sheets) - spatial data?
zooplank = pd.read_csv('../InputsForEric_OverlapFramework/Zooplankton_clean.csv',
                    parse_dates = ['Date'])

zooplank['DataType'] = 'Zooplankton'

zooplank

Unnamed: 0,Date,DataType
0,2011-02-17,Zooplankton
1,2011-02-24,Zooplankton
2,2011-03-17,Zooplankton
3,2011-04-19,Zooplankton
4,2011-04-25,Zooplankton
...,...,...
64,2018-04-13,Zooplankton
65,2018-04-22,Zooplankton
66,2018-04-27,Zooplankton
67,2018-04-30,Zooplankton


In [20]:
# next:
    # concatenate
    # pivot to wide form
    # code overlaps?
        # new column value
            # for each date, fill in A, AW, AWP, AP, WP, P, W
    # flip back to long formb

In [21]:
# concatenate three dataframes into one
concat_full = pd.concat([acoustic_clean, sightings_group, zooplank])
concat_full

Unnamed: 0,Date,DataType,sightings
0,2008-02-23,Acoustic,
0,2008-02-24,Acoustic,
0,2008-02-25,Acoustic,
0,2008-02-26,Acoustic,
0,2008-02-27,Acoustic,
...,...,...,...
64,2018-04-13,Zooplankton,
65,2018-04-22,Zooplankton,
66,2018-04-27,Zooplankton,
67,2018-04-30,Zooplankton,


In [26]:
# how many unique dates in full concat?
x = concat_full.Date.unique()
len(x) # 987

# spit out value counts for each date
concat_vals = concat_full['Date'].value_counts()
concat_vals

# create value count column (able to flag triple vs double overlap)
concat_valcounts = pd.DataFrame(concat_vals)
concat_valcounts = concat_valcounts.reset_index()
concat_valcounts.columns = ['Date', 'DataCount']
concat_valcounts

Unnamed: 0,Date,DataCount
0,2011-02-24,3
1,2018-02-24,3
2,2013-03-18,3
3,2015-02-18,3
4,2018-05-06,3
...,...,...
982,2013-03-10,1
983,2014-05-01,1
984,2018-05-26,1
985,2008-04-13,1


In [29]:
# create column to flag triple overlap
concat_valcounts['Trifecta'] = np.where(concat_valcounts['DataCount'] == 3,1,0)
concat_valcounts

Unnamed: 0,Date,DataCount,Trifecta
0,2011-02-24,3,1
1,2018-02-24,3,1
2,2013-03-18,3,1
3,2015-02-18,3,1
4,2018-05-06,3,1
...,...,...,...
982,2013-03-10,1,0
983,2014-05-01,1,0
984,2018-05-26,1,0
985,2008-04-13,1,0


In [22]:
# pivot to wide format
AWZ_pivot = concat_full.pivot(index = 'Date', columns = 'DataType', values = 'DataType')
AWZ_pivot

DataType,Acoustic,Whale,Zooplankton
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2008-01-12,,Whale,
2008-01-29,,Whale,
2008-02-03,,Whale,
2008-02-04,,Whale,
2008-02-21,,Whale,
...,...,...,...
2018-05-30,Acoustic,,
2018-12-11,,Whale,
2018-12-14,,Whale,
2018-12-19,,Whale,


In [30]:
# join value count/trifecta information to wide data
AWZ_overlap = AWZ_pivot.merge(concat_valcounts, left_on=['Date'],
                             right_on=['Date'],
                             how='left')

AWZ_overlap

Unnamed: 0,Date,Acoustic,Whale,Zooplankton,DataCount,Trifecta
0,2008-01-12,,Whale,,1,0
1,2008-01-29,,Whale,,1,0
2,2008-02-03,,Whale,,1,0
3,2008-02-04,,Whale,,1,0
4,2008-02-21,,Whale,,1,0
...,...,...,...,...,...,...
982,2018-05-30,Acoustic,,,1,0
983,2018-12-11,,Whale,,1,0
984,2018-12-14,,Whale,,1,0
985,2018-12-19,,Whale,,1,0


In [41]:
#pd.DataFrame.to_csv(AWZ_overlap, '../ScratchData/AWZ_overlap2.csv')

In [38]:
# replace NaN values with blank string
AWZ_overlap = AWZ_overlap.replace(np.nan, '', regex=True)
AWZ_overlap

Unnamed: 0,Date,Acoustic,Whale,Zooplankton,DataCount,Trifecta
0,2008-01-12,,Whale,,1,0
1,2008-01-29,,Whale,,1,0
2,2008-02-03,,Whale,,1,0
3,2008-02-04,,Whale,,1,0
4,2008-02-21,,Whale,,1,0
...,...,...,...,...,...,...
982,2018-05-30,Acoustic,,,1,0
983,2018-12-11,,Whale,,1,0
984,2018-12-14,,Whale,,1,0
985,2018-12-19,,Whale,,1,0


In [40]:
# try to create new column that contains all present datatypes..
AWZ_overlap['Overlap'] = AWZ_overlap[['Acoustic','Whale','Zooplankton']].agg(''.join,axis=1)
AWZ_overlap

Unnamed: 0,Date,Acoustic,Whale,Zooplankton,DataCount,Trifecta,Overlap
0,2008-01-12,,Whale,,1,0,Whale
1,2008-01-29,,Whale,,1,0,Whale
2,2008-02-03,,Whale,,1,0,Whale
3,2008-02-04,,Whale,,1,0,Whale
4,2008-02-21,,Whale,,1,0,Whale
...,...,...,...,...,...,...,...
982,2018-05-30,Acoustic,,,1,0,Acoustic
983,2018-12-11,,Whale,,1,0,Whale
984,2018-12-14,,Whale,,1,0,Whale
985,2018-12-19,,Whale,,1,0,Whale


In [43]:
# melt AWZ wide data back to long form, bring along overlap info
AWZ_melt2 = pd.melt(AWZ_overlap, 
                  id_vars=['Date'],
                  value_vars=['Overlap'])

AWZ_melt2

Unnamed: 0,Date,variable,value
0,2008-01-12,Overlap,Whale
1,2008-01-29,Overlap,Whale
2,2008-02-03,Overlap,Whale
3,2008-02-04,Overlap,Whale
4,2008-02-21,Overlap,Whale
...,...,...,...
982,2018-05-30,Overlap,Acoustic
983,2018-12-11,Overlap,Whale
984,2018-12-14,Overlap,Whale
985,2018-12-19,Overlap,Whale


In [44]:
# melt AWZ wide data back to long form, bring along overlap info
AWZ_melt = pd.melt(AWZ_overlap, 
                  id_vars=['Date','Overlap'],
                  value_vars=['Acoustic','Whale','Zooplankton'])

AWZ_melt

Unnamed: 0,Date,Overlap,variable,value
0,2008-01-12,Whale,Acoustic,
1,2008-01-29,Whale,Acoustic,
2,2008-02-03,Whale,Acoustic,
3,2008-02-04,Whale,Acoustic,
4,2008-02-21,Whale,Acoustic,
...,...,...,...,...
2956,2018-05-30,Acoustic,Zooplankton,
2957,2018-12-11,Whale,Zooplankton,
2958,2018-12-14,Whale,Zooplankton,
2959,2018-12-19,Whale,Zooplankton,


In [46]:
# trim just overlap column
overlap_col = AWZ_overlap[['Date','Overlap']]
overlap_col

Unnamed: 0,Date,Overlap
0,2008-01-12,Whale
1,2008-01-29,Whale
2,2008-02-03,Whale
3,2008-02-04,Whale
4,2008-02-21,Whale
...,...,...
982,2018-05-30,Acoustic
983,2018-12-11,Whale
984,2018-12-14,Whale
985,2018-12-19,Whale


In [45]:
#pd.DataFrame.to_csv(AWZ_melt, '../ScratchData/AWZ_melt2.csv')

In [47]:
# join overlap info to original concat_full df
concat_overlap = concat_full.merge(overlap_col,
                                  left_on=['Date'],
                                  right_on=['Date'],
                                  how='left')

concat_overlap


Unnamed: 0,Date,DataType,sightings,Overlap
0,2008-02-23,Acoustic,,Acoustic
1,2008-02-24,Acoustic,,Acoustic
2,2008-02-25,Acoustic,,AcousticWhale
3,2008-02-26,Acoustic,,Acoustic
4,2008-02-27,Acoustic,,Acoustic
...,...,...,...,...
1312,2018-04-13,Zooplankton,,AcousticWhaleZooplankton
1313,2018-04-22,Zooplankton,,AcousticWhaleZooplankton
1314,2018-04-27,Zooplankton,,AcousticWhaleZooplankton
1315,2018-04-30,Zooplankton,,AcousticWhaleZooplankton


In [48]:
#pd.DataFrame.to_csv(concat_overlap, '../ScratchData/concat_overlap.csv')

`Plotting`

In [49]:
import altair as alt

In [54]:
# interactivity
interval = alt.selection(type='interval', encodings=['x']) 
hover = alt.selection_single(on='mouseover', nearest=True, empty='none')

# chart chunks
timeline_base = alt.Chart(concat_overlap).mark_rect().encode(
    y = alt.Y('DataType:O', axis=alt.Axis(title='Data Type')),
    color = 'DataType:N'
).properties(
    width = 600
)

timeline_overview = timeline_base.encode(
    x = alt.X(
        'Date:T', 
        timeUnit = 'yearmonthdate', 
        axis = alt.Axis(title='Date')
    )
).add_selection( # adding interactivity
    interval
).properties(
    height = 40
)

timeline_detail = timeline_base.encode(
    x = alt.X(
        'Date:T', 
        timeUnit='yearmonthdate',
        axis = alt.Axis(title=''),
        scale = alt.Scale(domain=interval) # using the interactive selection to show X range
    )
).properties(
    height = 100
)


# display chart
alt.vconcat(timeline_detail, timeline_overview)

In [53]:
# interactivity
interval = alt.selection(type='interval', encodings=['x']) 
hover = alt.selection_single(on='mouseover', nearest=True, empty='none')

# chart chunks
timeline_base = alt.Chart(concat_overlap).mark_rect().encode(
    y = alt.Y('DataType:O', axis=alt.Axis(title='Data Type')),
    color = 'DataType:N'
).properties(
    width = 600
)

timeline_overview = timeline_base.encode(
    x = alt.X(
        'Date:T', 
        timeUnit = 'yearmonthdate', 
        axis = alt.Axis(title='Date')
    )
).add_selection( # adding interactivity
    interval
).properties(
    height = 40
)

timeline_detail = timeline_base.encode(
    x = alt.X(
        'Date:T', 
        timeUnit='yearmonthdate',
        axis = alt.Axis(title=''),
        scale = alt.Scale(domain=interval) # using the interactive selection to show X range
    )
).properties(
    height = 100
)

text = timeline_detail.mark_text(dy=-5).encode(
    text = 'sightings:Q',
    opacity = alt.condition(hover, alt.value(1), alt.value(0))
)

# display chart
alt.vconcat(timeline_detail, timeline_overview, text)