Script Name: Scrapper.py
<br>Purpose: Scraping Chinook salmon and orca data for the dashboard
<br>Author: Zoe Liu
<br>Date: Oct 12th 2022

#### 1 Setup

1.1 Install selenium and chromedriver

In [1]:
#!apt update
#!apt install chromium-chromedriver
!pip install selenium
#!cp /usr/lib/chromium-browser/chromedriver /usr/bin

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting selenium
  Downloading selenium-4.8.3-py3-none-any.whl (6.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trio-websocket~=0.9
  Downloading trio_websocket-0.10.2-py3-none-any.whl (17 kB)
Collecting trio~=0.17
  Downloading trio-0.22.0-py3-none-any.whl (384 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m384.9/384.9 KB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
Collecting outcome
  Downloading outcome-1.2.0-py2.py3-none-any.whl (9.7 kB)
Collecting async-generator>=1.9
  Downloading async_generator-1.10-py3-none-any.whl (18 kB)
Collecting sniffio
  Downloading sniffio-1.3.0-py3-none-any.whl (10 kB)
Collecting wsproto>=0.14
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━

In [2]:
%%shell
# Install chromium and chromium-driver from the Debian buster repositories.
# Ubuntu no longer distributes chromium-browser outside of snap
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add the Debian buster, buster-updates and buster security repositories as
# apt sources; each entry names (signed-by=) the keyring file created below
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Fetch the three signing keys from the Ubuntu keyserver
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# Export each key into the keyring file referenced by the matching
# signed-by= entry in debian.list above
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only:
# priority 700 for chromium* from deb.debian.org, 300 for everything else
# from that origin, 500 for packages from release a=eoan
# Note the double-blank lines between entries
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
# (refresh the package index first so the newly added sources are visible)
apt-get update
apt-get install chromium chromium-driver

Executing: /tmp/apt-key-gpghome.f3XXQwOJRT/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
gpg: key DCC9EFBF77E11517: public key "Debian Stable Release Key (10/buster) <debian-release@lists.debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.aDNTpUUcU8/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
gpg: key DC30D7C23CBBABEE: public key "Debian Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.mM1xRHwLDN/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
gpg: key 4DFAB270CAA96DFA: public key "Debian Security Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Get:1 http://deb.debian.org/debian buster InRelease [122 kB]
Hit:2 http://archive.ubuntu.com/ubuntu 



1.2 Import libraries

In [3]:
import os
import re
import shutil

import requests
import selenium
from selenium import webdriver 
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By

from datetime import date
from datetime import datetime
from time import strptime
import time
from time import sleep

from google.colab import files
from google.colab import drive

import pandas as pd
pd.options.mode.chained_assignment = None #suppress chained assignment 
import numpy as np

1.3 Get today's date

In [9]:
# Reference dates and year ranges shared by the scraping and processing cells.
today = date.today()
# todaystr = str(today)
# NOTE(review): the snapshot date used in Acartia file names is pinned to a
# fixed day instead of being derived from `today` -- confirm before a fresh run.
todaystr = '2023-03-26'
curyr = today.year
lastyr, twoyr = curyr - 1, curyr - 2
ayl = list(range(1980, curyr + 1))  # year list for Albion
byl = list(range(1939, curyr + 1))  # year list for Bonneville Dam

1.4 Define data path

In [5]:
# Colab-local output folders for the Albion (FOS), Bonneville Dam and
# Acartia downloads.
fos_path, bon_path, acartia_path = (
    '/content/data/foschinook/',
    '/content/data/bonchinook/',
    '/content/data/acartia/',
)

1.5 Mount Google Drive

In [6]:
# Mount Google Drive unless MyDrive is already visible in this session.
_drive_ready = os.path.exists('/content/drive/MyDrive/')
if not _drive_ready:
  drive.mount('/content/drive')

Mounted at /content/drive


#### 2 Scrape web data

2.1 Scrape Chinook data by year

In [7]:
def scrap_fos(yrs='2023',spe='CHINOOK SALMON', fos_path=fos_path):
  """Scrape one year of the DFO FOS test-fishery report and archive it as CSV.

  Opens the FOS report-parameter page in headless Chrome, selects the year
  and species, runs the report, and keeps every table row of the report
  window that splits into exactly 12 space-separated fields (rows whose
  first field is 'Printed' are dropped).

  Args:
    yrs: Year to request; converted with str() and reused in the file name.
    spe: Species label typed into the species list box.
    fos_path: Folder that receives 'fos<yrs>.csv'; created when missing.

  Raises:
    RuntimeError: If the report window has still not opened after the
      initial 1 s pause plus up to 10 further one-second checks.
  """
  yrs=str(yrs)
  #Set up chrome driver
  options = webdriver.ChromeOptions()
  options.add_argument('--headless')
  options.add_argument('--no-sandbox')
  options.add_argument('--disable-dev-shm-usage')
  driver=webdriver.Chrome(options=options)
  try:
    #landing page of fos data
    url = 'https://www-ops2.pac.dfo-mpo.gc.ca/fos2_Internet/Testfish/rptcsbdparm.cfm?stat=CPTFM&fsub_id=242'
    driver.get(url)
    sleep(3)
    #select year and species and generate report
    driver.find_element('id','lboYears').send_keys(yrs)
    driver.find_element('id','lboSpecies').send_keys(spe)
    driver.find_element('name','cmdRunReport').click()
    sleep(1)
    #The report opens in a second window; allow it a few extra seconds
    #instead of failing with IndexError on a slow response
    for _ in range(10):
      if len(driver.window_handles)>1:
        break
      sleep(1)
    else:
      raise RuntimeError('FOS report window did not open')
    driver.switch_to.window(driver.window_handles[1])
    #read the row text while the browser session is still alive
    rows=[tr.text.split(sep=' ') for tr in driver.find_elements(By.XPATH,'.//tr')]
  finally:
    #always release the browser, even when a lookup above fails
    driver.quit()
  #process scraped data and convert to pandas df
  tabls=[t for t in rows if len(t)==12 and t[0]!='Printed']
  dat=pd.DataFrame(tabls, columns=['day','mon','year','netlen','catch1','sets1','effort1','cpue1', 'catch2','sets2','effort2','cpue2'])
  #archive data to csv
  os.makedirs(fos_path, exist_ok=True)
  dat.to_csv(os.path.join(fos_path,'fos'+yrs+'.csv'), index=False)

Use the following line to download the current year

In [None]:
scrap_fos(yrs=str(curyr),spe='CHINOOK SALMON', fos_path=fos_path)

Use the following lines to download all fos chinook data
<br>Note: commented out after running it once

In [None]:
'''
for y in ayl:
  scrap_fos(yrs=str(y),spe='CHINOOK SALMON', fos_path=fos_path)
'''

"\nfor y in ayl:\n  scrap_fos(yrs=str(y),spe='CHINOOK SALMON', fos_path=fos_path)\n"

2.2 Scrape Bonneville Dam Chinook daily counts

In [None]:
def scrap_bon(yrs='2023', bon_path=bon_path):
  """Download one year of DART adult daily counts and store it as CSV.

  Drives the Columbia Basin Research 'adult_daily' query form in headless
  Chrome, requests the CSV output for project 'BON' and the given year, then
  moves the downloaded file to '<bon_path>/bon<yrs>.csv'.

  Args:
    yrs: Year to request; converted with str() and reused in the file name.
    bon_path: Destination folder; created when missing.

  Raises:
    FileNotFoundError: If no new CSV shows up in the download folder within
      about 30 s of submitting the form.
  """
  yrs=str(yrs)
  #the original code looked for the download in /content, so the browser is
  #assumed to save there -- NOTE(review): confirm for other environments
  download_dir='/content'
  os.makedirs(bon_path, exist_ok=True)
  #remember CSVs that are already present so a stale file is never picked up
  before={x for x in os.listdir(download_dir) if x.endswith('.csv')}
  #Set up chrome driver
  options = webdriver.ChromeOptions()
  options.add_argument('--headless')
  options.add_argument('--no-sandbox')
  options.add_argument('--disable-dev-shm-usage')
  driver=webdriver.Chrome(options=options)
  new_files=[]
  try:
    #landing page of Columbia Basin Research
    url = 'https://www.cbr.washington.edu/dart/query/adult_daily'
    driver.get(url)
    sleep(3)
    #select year and generate report
    site='BON'
    driver.find_element('id','daily').click()
    driver.find_element('id','outputFormat2').click()
    driver.find_element('id','year-select').send_keys(yrs)
    driver.find_element('id','proj-select').send_keys(site)
    driver.find_element('id','calendar').click()
    driver.find_element('id','run1').click()
    driver.find_element(By.XPATH, ".//input[@type='submit']").click()
    #poll for the finished download rather than trusting a fixed pause
    for _ in range(30):
      sleep(1)
      new_files=sorted(x for x in os.listdir(download_dir)
                       if x.endswith('.csv') and x not in before)
      if new_files:
        break
  finally:
    #always release the browser, even when a lookup above fails
    driver.quit()
  if not new_files:
    raise FileNotFoundError('No CSV was downloaded to '+download_dir+' for year '+yrs)
  #shutil.move also works when bon_path is on another filesystem (e.g. Drive)
  shutil.move(os.path.join(download_dir, new_files[0]), os.path.join(bon_path,'bon'+yrs+'.csv'))

Use the following line to download the current year

In [None]:
scrap_bon(yrs=str(curyr), bon_path=bon_path)

Use the following lines to download all Bonneville Dam Chinook data
<br>Note: commented out after running it once

In [None]:
'''
for y in byl:
  scrap_bon(yrs=str(y), bon_path=bon_path)
'''

2.3 Scrape Acartia orca data

In [8]:
def scrap_acartia(acartia_path='/content/data/acartia/',write2drive=True):
  """Download all Acartia sightings and save them as a dated CSV.

  Reads the API token from Google Drive (mounting Drive first if needed),
  requests the sightings endpoint, keeps a fixed subset of columns, removes
  duplicate rows, sorts by 'created', and writes
  'acartia_<todaystr>.csv' (todaystr is the notebook-level date string).

  Args:
    acartia_path: Colab-local folder for the CSV; created when missing.
    write2drive: When true, also copy the CSV to the Acartia data folder on
      Google Drive.

  Raises:
    requests.HTTPError: If the API answers with an error status.
  """
  #Create local folder in Google Colab
  os.makedirs(acartia_path, exist_ok=True)
  #Mount Google Drive
  if not os.path.exists('/content/drive/MyDrive/'):
    drive.mount('/content/drive')
  #Read Acartia token; strip so a trailing newline never ends up in the header
  with open('/content/drive/MyDrive/Orcasound/salmon/acartia_token.txt', 'r') as f:
    atoken=f.read().strip()
  #Acartia webpages
  url='https://acartia.io/api/v1/sightings/'
  response = requests.get(url, headers={'Authorization': 'Bearer '+atoken,
                                     'Content-Type': 'application/json'},
                          timeout=(10, 300))
  #fail loudly on 4xx/5xx instead of building a table from an error payload
  response.raise_for_status()
  acartia=pd.DataFrame(response.json(),
                     columns=['type','created','profile','trusted','entry_id','latitude','longitude','photo_url','signature',
                              'ssemmi_id','no_sighted','submitter_did','data_source_id',
                              'data_source_name','ssemmi_date_added','data_source_entity', 'data_source_witness', 'data_source_comments'])
  acartia=acartia[['type','created','latitude','longitude','no_sighted','data_source_id','data_source_comments']]
  acartia=acartia.drop_duplicates()
  acartia=acartia.sort_values(by=['created'])
  #save acartia to the Colab local folder ***disappear after each session!!!
  local_csv=os.path.join(acartia_path,'acartia_'+todaystr+'.csv')
  acartia.to_csv(local_csv, index=False)
  #Upload to Google Drive if indicated in write2drive
  if write2drive:
    drive_dir='/content/drive/MyDrive/Orcasound/salmon/data/acartia'
    #make sure the target is a folder; otherwise copy() would create a file
    #named 'acartia' instead of placing the CSV inside it
    os.makedirs(drive_dir, exist_ok=True)
    shutil.copy(local_csv, drive_dir)

Run the following code to scrape Acartia data

In [9]:
scrap_acartia()

#### 3 Options for downloading and saving data

Option1: Download files to Google Colab local drive
<br>Note: Google Colab local files will be removed after each session.  

In [None]:
'''
fosfiles=[x for x in os.listdir('/content/data/foschinook/') if x[-4:]=='.csv']
for f in fosfiles:
  files.download('/content/data/foschinook/'+f)

bonfiles=[x for x in os.listdir('/content/data/bonchinook/') if x[-4:]=='.csv']
for f in bonfiles:
  files.download('/content/data/bonchinook/'+f)
'''

"\nfosfiles=[x for x in os.listdir('/content/data/foschinook/') if x[-4:]=='.csv']\nfor f in fosfiles:\n  files.download('/content/data/foschinook/'+f)\n\nbonfiles=[x for x in os.listdir('/content/data/bonchinook/') if x[-4:]=='.csv']\nfor f in bonfiles:\n  files.download('/content/data/bonchinook/'+f)\n"

Option 2: Upload all files to google drive
<br> Note: Run this the first time when you download the entire data 

In [None]:
'''
if os.path.exists('/content/drive/MyDrive/')==False:
  drive.mount('/content/drive')

fosfiles=[x for x in os.listdir('/content/data/foschinook/') if x[-4:]=='.csv']
for f in fosfiles:
  shutil.copy('/content/data/foschinook/'+f,'/content/drive/MyDrive/Orcasound/salmon/data/foschinook')

bonfiles=[x for x in os.listdir('/content/data/bonchinook/') if x[-4:]=='.csv']
for f in bonfiles:
  shutil.copy('/content/data/bonchinook/'+f,'/content/drive/MyDrive/Orcasound/salmon/data/bonchinook')
'''

"\nif os.path.exists('/content/drive/MyDrive/')==False:\n  drive.mount('/content/drive')\n\nfosfiles=[x for x in os.listdir('/content/data/foschinook/') if x[-4:]=='.csv']\nfor f in fosfiles:\n  shutil.copy('/content/data/foschinook/'+f,'/content/drive/MyDrive/Orcasound/salmon/data/foschinook')\n\nbonfiles=[x for x in os.listdir('/content/data/bonchinook/') if x[-4:]=='.csv']\nfor f in bonfiles:\n  shutil.copy('/content/data/bonchinook/'+f,'/content/drive/MyDrive/Orcasound/salmon/data/bonchinook')\n"

Option 3: Upload only current year files to google drive

In [None]:
# Copy only this year's Albion and Bonneville CSVs from Colab to Google Drive.
if not os.path.exists('/content/drive/MyDrive/'):
  drive.mount('/content/drive')
_fos_csv = '/content/data/foschinook/fos'+str(curyr)+'.csv'
_bon_csv = '/content/data/bonchinook/bon'+str(curyr)+'.csv'
shutil.copy(_fos_csv, '/content/drive/MyDrive/Orcasound/salmon/data/foschinook')
shutil.copy(_bon_csv, '/content/drive/MyDrive/Orcasound/salmon/data/bonchinook')

'/content/drive/MyDrive/Orcasound/salmon/data/bonchinook/bon2023.csv'

#### 4 Processing downloaded data

In [11]:
# Read the archived CSVs from Google Drive when it is mounted, otherwise from
# the Colab-local folders.
_on_drive = os.path.exists('/content/drive/MyDrive/')
fos_path = '/content/drive/MyDrive/Orcasound/salmon/data/foschinook/' if _on_drive else '/content/data/foschinook/'
bon_path = '/content/drive/MyDrive/Orcasound/salmon/data/bonchinook/' if _on_drive else '/content/data/bonchinook/'
acartia_path = '/content/drive/MyDrive/Orcasound/salmon/data/acartia/' if _on_drive else '/content/data/acartia/'

4.1 Processing downloaded data - Albion

In [None]:
# Build one wide Albion table keyed by (m, day): each archived year adds a
# 'cpue<year>' column, newest year first.
albion = pd.DataFrame(columns=['day', 'm'])
for yr in reversed(ayl):
    season = pd.read_csv(fos_path + 'fos' + str(yr) + '.csv',
                         usecols=['day', 'mon', 'cpue1'])
    # month abbreviation ('Apr') -> month number (4)
    season['m'] = season['mon'].map(lambda abbr: datetime.strptime(abbr, '%b').month)
    season = season.drop(columns='mon').rename(columns={'cpue1': 'cpue' + str(yr)})
    albion = albion.merge(season, how='outer', on=['m', 'day'], sort=True)
del season, yr

Calculate historical average

In [None]:
# Row-wise mean over every column from position 5 onward. With the
# newest-first merge order used to build `albion`, positions 0-4 hold day, m
# and the three most recent years, so only older years enter the average.
_older_cpue_cols = albion.columns[5:]
albion['cpue_hist'] = albion[_older_cpue_cols].mean(axis=1, skipna=True).round(2)

In [None]:
def _month_abbr(month_number):
    """Month number (e.g. 4) -> abbreviated month name (e.g. 'Apr')."""
    return datetime.strptime(str(month_number), '%m').strftime('%b')


def _dash_join(row):
    """Join the string forms of a row's values with '-'."""
    return '-'.join(row.values.astype(str))


# Human-readable labels such as 'Apr' and 'Apr-5'
albion['month'] = albion['m'].apply(_month_abbr)
albion['date'] = albion[['month', 'day']].apply(_dash_join, axis="columns")

Create Albion lag dataset

In [None]:
# Current-year Albion CPUE re-keyed to the calendar day `lag1` days later.
lag1 = 3
_albion_col = 'cpue' + str(curyr)
# '%b-%d' carries no year, so pandas fills in its default year here
_albion_shifted = pd.to_datetime(albion['date'], format='%b-%d') + pd.Timedelta(days=lag1)
albion_curyr_lagged = pd.DataFrame({
    _albion_col: albion[_albion_col],
    'm': _albion_shifted.dt.month,
    'day': _albion_shifted.dt.day,
})

4.2 Processing downloaded data - Bonneville Dam

In [None]:
# Build one wide Bonneville table keyed by (m, day): each archived year adds
# a 'chin<year>' column, newest year first.
bonnev = pd.DataFrame(columns=['day', 'm'])
for yr in reversed(byl):
    counts = pd.read_csv(bon_path + 'bon' + str(yr) + '.csv',
                         usecols=['Project', 'Date', 'Chin'])
    counts = counts[counts['Project'] == 'Bonneville']
    _stamps = pd.DatetimeIndex(counts['Date'])
    _year_col = 'chin' + str(yr)
    # negative counts are replaced by 0; missing values stay missing
    counts[_year_col] = counts['Chin'].mask(counts['Chin'] < 0, 0)
    counts['m'] = _stamps.month
    counts['day'] = _stamps.day
    counts = counts[[_year_col, 'm', 'day']]
    bonnev = bonnev.merge(counts, how='outer', on=['m', 'day'], sort=True)
del counts, yr

In [None]:
# Row-wise mean over every column from position 5 onward. With the
# newest-first merge order used to build `bonnev`, positions 0-4 hold day, m
# and the three most recent years, so only older years enter the average.
_older_chin_cols = bonnev.columns[5:]
bonnev['chin_hist'] = bonnev[_older_chin_cols].mean(axis=1, skipna=True).round(1)

In [None]:
def _month_abbr(month_number):
    """Month number (e.g. 4) -> abbreviated month name (e.g. 'Apr')."""
    return datetime.strptime(str(month_number), '%m').strftime('%b')


def _dash_join(row):
    """Join the string forms of a row's values with '-'."""
    return '-'.join(row.values.astype(str))


# Human-readable labels such as 'Apr' and 'Apr-5'
bonnev['month'] = bonnev['m'].apply(_month_abbr)
bonnev['date'] = bonnev[['month', 'day']].apply(_dash_join, axis="columns")

Create Bonneville Dam lagged data

In [None]:
# Current-year Bonneville counts re-keyed to the calendar day `lag2` days later.
lag2 = 8
_bonnev_col = 'chin' + str(curyr)
# 'Feb-29' rows are dropped before the dates are parsed with the current year
_bonnev_kept = bonnev.loc[bonnev['date'] != 'Feb-29', ['date', _bonnev_col]]
_bonnev_shifted = (
    pd.to_datetime(str(curyr) + '-' + _bonnev_kept['date'], format='%Y-%b-%d')
    + pd.Timedelta(days=lag2)
)
bonnev_curyr_lagged = pd.DataFrame({
    _bonnev_col: _bonnev_kept[_bonnev_col],
    'm': _bonnev_shifted.dt.month,
    'day': _bonnev_shifted.dt.day,
})

In [None]:
# Calendar of (date label, m, day) for the current year, without 'Feb-29'.
_cal1_dates = bonnev.loc[bonnev['date'] != 'Feb-29', 'date']
_cal1_stamps = pd.to_datetime(str(curyr) + '-' + _cal1_dates, format='%Y-%b-%d')
cal1 = pd.DataFrame({
    'date': _cal1_dates,
    'm': _cal1_stamps.dt.month,
    'day': _cal1_stamps.dt.day,
})

In [None]:
# Same calendar as cal1 but keeping every row: labels that are not valid
# dates in the current year are coerced to missing values instead of dropped.
_cal2_stamps = pd.to_datetime(str(curyr) + '-' + bonnev['date'],
                              errors='coerce', format='%Y-%b-%d')
cal2 = pd.DataFrame({
    'date': bonnev['date'],
    'm': _cal2_stamps.dt.month,
    'day': _cal2_stamps.dt.day,
})

In [None]:
# Attach the lagged Bonneville and Albion series to the cal1 calendar.
lagged = (
    cal1
    .merge(bonnev_curyr_lagged, how='left', on=['m', 'day'])
    .merge(albion_curyr_lagged, how='left', on=['m', 'day'])
)

4.3 Processing Acartia Data

Function to clean up dates format

In [10]:
def cleandates(s):
  """Flatten a 'T'-separated timestamp string.

  When `s` contains a 'T', every 'T' becomes a space, every 'Z' is removed
  and only the text before the first '.' is kept, e.g.
  '2023-03-26T12:34:56.789Z' -> '2023-03-26 12:34:56'.
  Strings without a 'T' are returned unchanged.
  """
  if 'T' not in s:
    return s
  flattened = s.replace('T', ' ').replace('Z', '')
  return flattened.split('.')[0]

In [31]:
def _month_abbr(month_number):
    """Month number (e.g. 4) -> abbreviated month name (e.g. 'Apr')."""
    return datetime.strptime(str(month_number), '%m').strftime('%b')


def _dash_join(row):
    """Join the string forms of a row's values with '-'."""
    return '-'.join(row.values.astype(str))


# Load the dated Acartia snapshot and derive calendar columns from 'created'.
# NOTE(review): the Drive folder is hard-coded here; reading from
# `acartia_path` was the commented-out alternative -- confirm which is wanted.
_acartia_csv = os.path.join('/content/drive/MyDrive/Orcasound/salmon/data/acartia', 'acartia_'+todaystr+'.csv')
acartia = pd.read_csv(_acartia_csv)
acartia = acartia[acartia['created'].notna()]
acartia['created'] = acartia['created'].map(cleandates)
# parse the timestamps once and reuse the result for every derived column
_created = pd.DatetimeIndex(acartia['created'])
acartia['m'] = _created.month
acartia['day'] = _created.day
acartia['year'] = _created.year
acartia['month'] = acartia['m'].apply(_month_abbr)
acartia['date'] = acartia[['month', 'day']].apply(_dash_join, axis="columns")
acartia['date_ymd'] = acartia[['year', 'm', 'day']].apply(_dash_join, axis="columns")
acartia['time'] = _created.time

# Keyword lists searched for (as substrings) in the sighting comments.
# Phrases that name two or three pods at once are written once here and
# reused in the list of every pod they mention.
_jk_pair_keys = [
    'j+k', 'k+j', 'j & k', 'k & j', 'j and k', 'k and j', 'jk pods', 'kj pods',
    'J+K', 'K+J', 'J & K', 'K & J', 'J and K', 'K and J', 'JK pods', 'KJ pods',
]
_jl_pair_keys = [
    'j+l', 'l+j', 'j & l', 'l & j', 'j and l', 'l and j', 'jl pods', 'lj pods',
    'J+L', 'L+J', 'J & L', 'L & J', 'J and L', 'L and J', 'JL pods', 'LJ pods',
]
_kl_pair_keys = [
    'k+l', 'l+k', 'k & l', 'l & k', 'k and l', 'l and k', 'lk pods', 'kl pods',
    'K+L', 'L+K', 'K & L', 'L & K', 'K and L', 'L and K', 'LK pods', 'KL pods',
]
_jkl_keys = [
    'j, k, l pod', 'j, k, and l pod', 'jkl',
    'J, K, L pod', 'J, K, and L pod', 'JKL',
]

srkw_keys = [
    'SRKW', 'srkw',
    'southern resident', 'Southern Resident', 'Southern resident', 'southern Resident',
]

jpod_keys = (
    ['J pod', 'Jpod', 'J ppd', 'J-pod', 'Js',
     'j pod', 'jpod', 'j ppd', 'j-pod']
    + _jk_pair_keys + _jl_pair_keys + _jkl_keys
    + ['j27', 'j38', 'j35', 'j40',
       'J27', 'J38', 'J35', 'J40']
)

kpod_keys = (
    ['K pod', 'Kpod', 'K-pod', 'Ks',
     'k pod', 'kpod', 'k-pod']
    + _jk_pair_keys + _kl_pair_keys + _jkl_keys
    + ['k37', 'K37']
)

# NOTE(review): unlike J and K there are no lowercase 'l pod' / 'lpod' /
# 'l-pod' variants here -- confirm whether that is intended.
lpod_keys = (
    ['L pod', 'Lpod', 'L-pod', 'Ls']
    + _jl_pair_keys + _kl_pair_keys + _jkl_keys
    + ['l12', 'l54', 'l-12', 'l82', 'l85', 'l87',
       'L12', 'L54', 'L-12', 'L82', 'L85', 'L87']
)

# NOTE(review): the lowercase ids contain 't46' while the uppercase ids
# contain 'T36' -- confirm whether both matrilines were meant in both cases.
biggs_keys = [
    'Bigg', 'bigg', 'Transient', 'transient', 'Ts',
    't99', 't137', 't46', 't10', 't2c', 't49',
    'T99', 'T137', 'T36', 'T10', 'T2C', 'T49',
]

def _flag_comments(predicate):
  """Return a 0/1 Series: 1 where predicate(str(comment)) is true."""
  return acartia['data_source_comments'].apply(lambda v: 1 if predicate(str(v)) else 0)


def _mentions_any(keys):
  """Build a case-sensitive 'text contains any of keys' predicate."""
  keys = tuple(keys)
  return lambda text: any(k in text for k in keys)


# J, K and L pod: case-sensitive substring search, because shorthand such as
# 'Js' / 'Ks' / 'Ls' would match ordinary words if the case were ignored
acartia['J'] = _flag_comments(_mentions_any(jpod_keys))
acartia['K'] = _flag_comments(_mentions_any(kpod_keys))
acartia['L'] = _flag_comments(_mentions_any(lpod_keys))

# Southern Residents
acartia['sum_jkl'] = acartia['J'] + acartia['K'] + acartia['L']
# The comment text is lower-cased before searching, so the keys are
# lower-cased as well; otherwise keys containing capitals can never match
_srkw_keys_ci = sorted({k.lower() for k in srkw_keys})
acartia['srkw_generic'] = _flag_comments(lambda text: any(k in text.lower() for k in _srkw_keys_ci))
acartia['srkw_type'] = acartia['type'].apply(lambda x: 1 if isinstance(x, str) and 'Southern Resident' in x else 0)
# a sighting counts as SRKW when any of the five indicators is set
acartia['srkw'] = acartia[['J', 'K', 'L', 'srkw_generic', 'srkw_type']].values.max(axis=1)

# Biggs: same case-insensitive search (this makes 'T36' matchable, which it
# was not while only the text was lower-cased). The shorthand 'Ts' stays
# case-sensitive: lower-cased it would hit any word containing 'ts'.
_biggs_keys_ci = sorted({k.lower() for k in biggs_keys if k != 'Ts'})
acartia['biggs'] = _flag_comments(
    lambda text: any(k in text.lower() for k in _biggs_keys_ci) or 'Ts' in text)

acartia['sum_srkw_biggs'] = acartia['srkw'] + acartia['biggs']

# direction
# Each flag is 1 when the comment mentions that travel direction, else 0.
# Spelled-out phrases are matched case-insensitively; the two-letter
# abbreviations (SE, SW, NE, NW) are matched case-sensitively.
# NOTE(review): the abbreviations are plain substring tests, so capitalised
# words containing them also match; likewise 'heading south' / 'heading
# north' also match 'heading southeast' etc. -- confirm whether that is wanted.
_dir_text = acartia['data_source_comments'].apply(str)
_dir_lower = _dir_text.str.lower()


def _has(series, needle):
  """Boolean Series: True where `needle` occurs literally in the text."""
  return series.str.contains(needle, regex=False)


acartia['south'] = (_has(_dir_lower, 'southbound') | _has(_dir_lower, 'heading south')).astype(int)
acartia['southeast'] = (_has(_dir_lower, 'southeast') | _has(_dir_text, 'SE')).astype(int)
acartia['southwest'] = (_has(_dir_lower, 'southwest') | _has(_dir_text, 'SW')).astype(int)
acartia['north'] = (_has(_dir_lower, 'northbound') | _has(_dir_lower, 'heading north')).astype(int)
acartia['northeast'] = (_has(_dir_lower, 'northeast') | _has(_dir_text, 'NE')).astype(int)
acartia['northwest'] = (_has(_dir_lower, 'northwest') | _has(_dir_text, 'NW')).astype(int)

# 'eastbound' / 'westbound' are substrings of BOTH compound forms, so both
# must be ruled out (previously only one compound per direction was excluded,
# which flagged 'northeastbound' as east and 'southwestbound' as west)
acartia['east'] = (
    (_has(_dir_lower, 'eastbound')
     & ~_has(_dir_lower, 'southeastbound')
     & ~_has(_dir_lower, 'northeastbound'))
    | _has(_dir_lower, 'heading east')
).astype(int)

acartia['west'] = (
    (_has(_dir_lower, 'westbound')
     & ~_has(_dir_lower, 'northwestbound')
     & ~_has(_dir_lower, 'southwestbound'))
    | _has(_dir_lower, 'heading west')
).astype(int)

# number of distinct direction flags raised per sighting
acartia['dir_sum'] = (
    acartia['south'] + acartia['southeast'] + acartia['southwest']
    + acartia['north'] + acartia['northeast'] + acartia['northwest']
    + acartia['east'] + acartia['west']
)

Save processed Acartia data to drive 

In [32]:
# Write one CSV of de-duplicated SRKW sightings per year (2018..curyr).
# Target: the Drive data folder when Drive is mounted, otherwise the
# Colab-local folder (***local files disappear after each session!!!).
if os.path.exists('/content/drive/MyDrive/'):
  _srkw_out_dir = '/content/drive/MyDrive/Orcasound/salmon/data/acartia/'
else:
  _srkw_out_dir = '/content/data/acartia/'
# create the folder up front (os.path has no mkdir; makedirs also builds parents)
os.makedirs(_srkw_out_dir, exist_ok=True)
for y in range(2018,curyr+1):
  act=acartia[(acartia['year']==y) & (acartia['srkw']==1)].drop_duplicates()
  act.to_csv(_srkw_out_dir+'srkw_'+str(y)+'.csv', index=False)