### purpose 1
This script combines clean acoustic, tractline/aerial survey and zooplankton survey into one dataframe that is plotted temporally (each individual dataframe has two cols, Date and DataType)

### purpose 2
practiced / tested overlap on the small sample 

In [1]:
import pandas as pd
import altair as alt
import os
import numpy as np

### create small sample
- made small date range (Jan 1-5)
- made df with one prey record on Jan 3
- concatenated, successfully inserted prey into sample range

In [2]:
# read in small sample data
sample = pd.read_csv('../data/SmallSample_Jan2011.csv',
                    parse_dates = ['date'])

sample

Unnamed: 0,date,data
0,2011-01-01,acoustic
1,2011-01-02,acoustic
2,2011-01-03,acoustic
3,2011-01-04,acoustic
4,2011-01-05,acoustic


In [3]:
# create df with a prey record from 2011-01-03
start_date = '2011-01-03'
end_date = '2011-01-04'
sample_data = 'prey'


sample_prey = pd.DataFrame({'date':[start_date,end_date],
                           'data':sample_data})

sample_prey['date'] = pd.to_datetime(sample_prey['date'])
sample_prey

Unnamed: 0,date,data
0,2011-01-03,prey
1,2011-01-04,prey


In [4]:
# zooplank
z_date = '2011-01-03'
z_data = 'zooplank'

sample_zoo = pd.DataFrame({'date':[z_date],
                          'data':z_data})

sample_zoo['date'] = pd.to_datetime(sample_zoo['date'])
sample_zoo

Unnamed: 0,date,data
0,2011-01-03,zooplank


Just overlap between acoustic and prey

In [5]:
acou_prey = pd.concat([sample, sample_prey])
acou_prey

Unnamed: 0,date,data
0,2011-01-01,acoustic
1,2011-01-02,acoustic
2,2011-01-03,acoustic
3,2011-01-04,acoustic
4,2011-01-05,acoustic
0,2011-01-03,prey
1,2011-01-04,prey


In [18]:
acou_prey.set_index('date')

Unnamed: 0_level_0,data
date,Unnamed: 1_level_1
2011-01-01,acoustic
2011-01-02,acoustic
2011-01-03,acoustic
2011-01-04,acoustic
2011-01-05,acoustic
2011-01-03,prey
2011-01-04,prey


In [19]:
AP_flip = acou_prey.pivot(index='date',columns='data')
AP_flip

date
2011-01-01
2011-01-02
2011-01-03
2011-01-04
2011-01-05


In [6]:
# find duplicates (double)
dupdates = acou_prey[acou_prey.duplicated(subset=['date','date'], keep=False)]
dupdates

Unnamed: 0,date,data
2,2011-01-03,acoustic
3,2011-01-04,acoustic
0,2011-01-03,prey
1,2011-01-04,prey


Double overlap between acoustic and prey, triple overlap between acoustic, prey and zooplankton

In [7]:
acou_prey_zoo = pd.concat([sample, sample_prey, sample_zoo])
acou_prey_zoo

Unnamed: 0,date,data
0,2011-01-01,acoustic
1,2011-01-02,acoustic
2,2011-01-03,acoustic
3,2011-01-04,acoustic
4,2011-01-05,acoustic
0,2011-01-03,prey
1,2011-01-04,prey
0,2011-01-03,zooplank


### Pivot and Gap Identification

In [23]:
APZ_pivot = acou_prey_zoo.pivot(index = 'date', columns = 'data', values = 'data')
APZ_pivot

data,acoustic,prey,zooplank
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011-01-01,acoustic,,
2011-01-02,acoustic,,
2011-01-03,acoustic,prey,zooplank
2011-01-04,acoustic,prey,
2011-01-05,acoustic,,


In [35]:
# Identifies with T/F where we have overlap of all 3 (acou, prey, zoo)
    # True = all three columns have a value
    # False = at least one column has an NaN value (aka, data gap)
    
def my_function(row):
    return all(row[['acoustic', 'prey', 'zooplank']].notna())

In [36]:
APZ_pivot['overlap'] = APZ_pivot.apply(my_function, axis = 1)
APZ_pivot

data,acoustic,prey,zooplank,overlap
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011-01-01,acoustic,,,False
2011-01-02,acoustic,,,False
2011-01-03,acoustic,prey,zooplank,True
2011-01-04,acoustic,prey,,False
2011-01-05,acoustic,,,False


In [24]:
# val counts
m = acou_prey_zoo['date'].value_counts()
m

# data frame
df_vals = pd.DataFrame(m)
df_vals = df_vals.reset_index()
df_vals.columns = ['date', 'count']
df_vals

Unnamed: 0,date,count
0,2011-01-03,3
1,2011-01-04,2
2,2011-01-05,1
3,2011-01-02,1
4,2011-01-01,1


In [25]:
df_vals['trifecta'] = np.where(df_vals['count'] == 3,1,0)
df_vals

Unnamed: 0,date,count,trifecta
0,2011-01-03,3,1
1,2011-01-04,2,0
2,2011-01-05,1,0
3,2011-01-02,1,0
4,2011-01-01,1,0


### Merge available data into one timeline (acoustic, aerial, zooplankton)
- make sure acoustic data is set up same
    ** nulls or no nulls? -> no nulls, otherwise it adds a 'null' category in vis
- restructure tractline csv and zooplankton csv (date, data)
- bring in these two csvs
- concatenate the three

**concatenated df of acoustic, aerial, zooplankton**

`Acoustic`

In [6]:
# read in 'clean between days ccb data'
ccb = pd.read_csv('../data/Acoustic_betweendays_clean.csv',
                 parse_dates = ['Date'])

# fill in 'DataType' column, rename date column
ccb['DataType'] = 'acoustic'

# select desired columns
acoustic = ccb[['Date','DataType']]
acoustic

Unnamed: 0,Date,DataType
0,2011-02-17,acoustic
1,2011-02-18,acoustic
2,2011-02-19,acoustic
3,2011-02-20,acoustic
4,2011-02-21,acoustic
...,...,...
662,2018-05-26,acoustic
663,2018-05-27,acoustic
664,2018-05-28,acoustic
665,2018-05-29,acoustic


`Aerial`

In [7]:
# read in aerial data (precleaned in google sheets) - spatial data for transects available
aerial = pd.read_csv('/Users/cristiana/Documents/Duke/MP/Python/Data/Processed/Aerial_clean.csv',
                    parse_dates = ['Date'])

aerial

Unnamed: 0,Date,DataType
0,2011-02-17,aerial
1,2011-02-24,aerial
2,2011-03-04,aerial
3,2011-03-08,aerial
4,2011-03-13,aerial
...,...,...
151,2018-04-30,aerial
152,2018-05-04,aerial
153,2018-05-06,aerial
154,2018-05-09,aerial


`Zooplankton`

In [8]:
# read in zooplank data (pre cleaned in google sheets) - spatial data?
zooplank = pd.read_csv('/Users/cristiana/Documents/Duke/MP/Python/Data/Processed/Zooplankton_clean.csv',
                    parse_dates = ['Date'])

zooplank

Unnamed: 0,Date,DataType
0,2011-02-17,zooplankton
1,2011-02-24,zooplankton
2,2011-03-17,zooplankton
3,2011-04-19,zooplankton
4,2011-04-25,zooplankton
...,...,...
64,2018-04-13,zooplankton
65,2018-04-22,zooplankton
66,2018-04-27,zooplankton
67,2018-04-30,zooplankton


In [9]:
concat = pd.concat([acoustic, aerial, zooplank])
concat

Unnamed: 0,Date,DataType
0,2011-02-17,acoustic
1,2011-02-18,acoustic
2,2011-02-19,acoustic
3,2011-02-20,acoustic
4,2011-02-21,acoustic
...,...,...
64,2018-04-13,zooplankton
65,2018-04-22,zooplankton
66,2018-04-27,zooplankton
67,2018-04-30,zooplankton


In [11]:
#pd.DataFrame.to_csv(concat, '/Users/cristiana/Documents/Duke/MP/Python/Data/Final/Plotting_AcouAerialTL_March10.csv') 

In [10]:
# Acoustic/Aerial/Zooplankton plot -- best version
alt.Chart(concat).mark_rect().encode(
    y = alt.Y('DataType:N', title = 'Data Type'),
    x = alt.X('monthdate(Date):T', title = 'Date'),
    color = 'DataType:N'
).facet(
    row = 'year(Date)'
).properties(
    title = 'Data Availability By Type')

In [11]:
# Acoustic/Aerial/Zooplankton plot -- playing with parameters
alt.Chart(concat).mark_rect().encode(
    y = alt.Y('DataType:N', title = 'Data Type'),
    x = alt.X('monthdate(Date):O', title = 'Date'),
    color = 'DataType:N'
).facet(
    row = 'year(Date)'
).properties(
    title = 'Data Availability By Type')