In [6]:
import datetime
import os

import numpy as np
import pandas as pd
from rpy2.robjects import pandas2ri, r
from sqlalchemy import create_engine
pandas2ri.activate()

# Importing data into Python
Here we select the required data. At the moment we are working with a very small subset (hummingbirds, Colorado, May-July, 2008-2014) while testing the code.

In [19]:
# Connect to the database. The connection URL (which embeds the password) can be
# supplied through the EBIRD_DB_URL environment variable so that credentials do not
# have to live in the notebook; the original local URL is kept only as a fallback.
# NOTE(review): the fallback still exposes a password - rotate it and remove the
# fallback before sharing this notebook.
engine = create_engine(os.environ.get(
    'EBIRD_DB_URL',
    'postgresql://postgres:password123.@localhost:5432/ebird_data'))

# Extract hummingbird (family Trochilidae) records for Colorado, May-July, from 2008
# onwards, restricted to the count types P21, P22 and P34. One row is returned per
# checklist/species combination.
humdat = pd.read_sql_query("""SELECT info.sampling_event_id, 
                           loc_id, 
                           latitude, 
                           longitude, 
                           year, 
                           month, 
                           day, 
                           time,
                           sppres.species
                           FROM ebird_checklist_info info
                           INNER JOIN ebird_checklist_species sppres
                               ON info.sampling_event_id = sppres.sampling_event_id
                           INNER JOIN ebird_species_info spinfo
                               ON sppres.species = spinfo.species
                           WHERE state_province = 'Colorado' 
                           AND family = 'Trochilidae' 
                           AND year >=2008
                           AND month IN (5, 6, 7) 
                           AND count_type IN ('P21', 'P22', 'P34')"""
                           , con=engine)

# check the data: (number of records, number of columns)
humdat.shape

(34898, 9)

Convert the year and day-of-year columns to a full observation date, and give each observation a value of 1 (used later as the detection indicator when building the input array).

In [8]:
# Build the full observation date from `year` and `day`; `day` is parsed with '%j',
# i.e. as the day of the year. The conversion is vectorised rather than calling
# strptime once per row, and the result is kept as 'YYYY-MM-DD' text as before.
humdat['obs_date'] = pd.to_datetime(
    humdat['year'].astype(str) + ' ' + humdat['day'].astype(str),
    format='%Y %j',
).dt.strftime('%Y-%m-%d')

# Every record is a detection; this flag becomes the cell value when the records are
# later pivoted into a species x replicate table.
humdat['value'] = 1

Here I reduce the data by counting the distinct observation dates (replicates) for each year/location combination and keeping all locations where at least 3 years have >= 3 replicates (3 replicates is the lowest threshold used by Kamp et al. 2016). An earlier draft of this note said "at least 5 years"; the code below uses 3. These thresholds are subject to change after discussion because the choices are essentially arbitrary.

In [12]:
# Pruning thresholds (essentially arbitrary, see the accompanying text).
# NOTE(review): the notebook text has mentioned a 5-year threshold while this code
# uses 3 - confirm which one is intended.
MIN_REPLICATES = 3  # distinct observation dates a location needs within a year
MIN_YEARS = 3       # number of such years a location needs in order to be kept

# Number of distinct observation dates (replicates) per location and year.
humdat_sml = (
    humdat[['obs_date', 'year', 'loc_id']]
    .drop_duplicates()
    .groupby(['loc_id', 'year'])
    .size()
    .reset_index(name='obs')
)

# Location x year table of replicate counts; years without any record count as 0.
humdat_month_location = humdat_sml.pivot(index='loc_id', columns='year', values='obs').fillna(value=0)

# Per location: in how many years was the replicate threshold reached?
humdat_location_obs = (humdat_month_location >= MIN_REPLICATES).sum(axis=1)

# Locations that reach the threshold in enough years, and every record from them
# (including records from their years that fall below the replicate threshold).
humdat_location = humdat_location_obs[humdat_location_obs >= MIN_YEARS].reset_index()
humdat_obs = humdat[humdat.loc_id.isin(humdat_location.loc_id)]

Number of locations

In [13]:
# One row per retained location, so the row count is the number of locations.
len(humdat_location)

160

Number of observations

In [14]:
# Number of records left after restricting to the retained locations.
len(humdat_obs)

13121

Output the locations before and after data pruning so that they can be plotted for comparison in R.


In [15]:
def save_frame_as_rda(frame, r_name, rda_path):
    """Copy a pandas DataFrame into the R session and save it to an .rda file.

    frame    -- DataFrame to convert with pandas2ri
    r_name   -- name the object is assigned to in R before it is saved
    rda_path -- destination path handed to R's save()

    Returns the converted R object.
    """
    r_frame = pandas2ri.py2ri(frame)
    r.assign(r_name, r_frame)
    r("save({}, file='{}')".format(r_name, rda_path))
    return r_frame


# Unique location coordinates after (sml) and before (full) pruning.
location_cols = ['loc_id', 'latitude', 'longitude']
locations_sml = humdat_obs[location_cols].drop_duplicates()
locations_full = humdat[location_cols].drop_duplicates()

# NOTE(review): the output directory is an absolute path on the author's machine.
r_locations_sml = save_frame_as_rda(
    locations_sml, "locations_sml", 'D:/eBird_trends/locations_sml.rda')
r_locations_full = save_frame_as_rda(
    locations_full, "locations_full", 'D:/eBird_trends/locations_full.rda')

rpy2.rinterface.NULL

Get information for input to the function - maximum number of replicates; unique locations, years and species.

In [16]:
# Distinct sampling dates (replicates) per location and year in the pruned data.
# Kept under its own name so that `max_rep` is only ever a number.
replicate_dates = humdat_obs[['obs_date', 'year', 'loc_id']].drop_duplicates()

# Largest number of replicates found at any location within a single year; this
# sets the length of the replicate axis of the model input array.
max_rep = replicate_dates.groupby(['loc_id', 'year']).size().max()

# Unique years, locations and species that define the other axes of the array.
year = humdat_obs.year.unique()
loc_id = humdat_obs.loc_id.unique()
species = pd.DataFrame(humdat_obs.species.unique(), columns = ['species'])

# Species list of the unpruned data, used to see which species the pruning removed.
species_full = pd.DataFrame(humdat.species.unique(), columns = ['species'])

Species lost by filtering the data:

In [18]:
# Flag each species of the unpruned data that still occurs after pruning, then keep
# the ones that do not.
species_retained = species_full['species'].isin(species['species'])
missing_species = species_full[~species_retained]
missing_species

Unnamed: 0,species
8,Eugenes_fulgens


Next we define a function that arranges the species presence/absence data for one year and location into a species x replicate matrix. We then set up a year x location x species x replicate array as input to the occupancy model and loop through the years and locations to fill it.

In [None]:
def data_juggle(in_dat, timestep, location, species, max_rep, verbose=True):
    """Build the species x replicate detection matrix for one year and location.

    Parameters
    ----------
    in_dat : pandas.DataFrame
        Records with the columns 'year', 'loc_id', 'species', 'obs_date' and 'value'.
    timestep
        Value matched against in_dat['year'].
    location
        Value matched against in_dat['loc_id'].
    species : pandas.DataFrame
        Frame with a single 'species' column listing every species; it fixes the
        number and order of the rows of the result.
    max_rep : int
        Number of replicate columns in the result.
    verbose : bool, optional
        Print a progress line when the year/location has records (default True,
        which matches the original behaviour).

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(species), max_rep). The replicates are the
        distinct observation dates of the selection in sorted order. Within the
        replicates that exist, a cell holds the record's 'value' where the species
        was recorded on that date and 0 where it was not; columns after the last
        existing replicate are NaN. If the selection has no records at all, the
        whole array is NaN.

    Raises
    ------
    ValueError
        If the selection has more distinct observation dates than max_rep.
    """
    # Start from "no replicate" everywhere; only existing replicates are filled in.
    out_dat = np.full((len(species), max_rep), np.nan)

    selected = in_dat[(in_dat['year'] == timestep) & (in_dat['loc_id'] == location)]
    dat_sub = selected[['species', 'obs_date', 'value']].drop_duplicates()
    if dat_sub.empty:
        return out_dat

    # Number the distinct observation dates 1..n in sorted order.
    obs_dates = sorted(dat_sub['obs_date'].unique())
    n_reps = len(obs_dates)
    if n_reps > max_rep:
        raise ValueError(
            'found {} replicates for {} {} but max_rep is {}'.format(
                n_reps, timestep, location, max_rep))
    replicate_of = {obs_date: rep for rep, obs_date in enumerate(obs_dates, start=1)}
    dat_sub = dat_sub.assign(replicate=dat_sub['obs_date'].map(replicate_of))

    # Species x replicate table of the recorded species, then expanded to the full
    # species list; anything not recorded in an existing replicate becomes 0.
    dat_wide = dat_sub.pivot(index='species', columns='replicate', values='value')
    detections = dat_wide.reindex(
        index=species['species'].values,
        columns=list(range(1, n_reps + 1)),
    ).fillna(0)
    out_dat[:, :n_reps] = detections.values

    if verbose:
        print(str(timestep) + ' ' + str(location) + ' done...')
    return out_dat
    
# Year x location x species x replicate array for the occupancy model; every
# [year, location] slice is overwritten below with that combination's matrix.
wide_dat = np.zeros((year.size, loc_id.size, species.size, max_rep))

for year_pos, year_value in enumerate(year):
    for loc_pos, loc_value in enumerate(loc_id):
        wide_dat[year_pos, loc_pos] = data_juggle(humdat_obs, year_value, loc_value, species, max_rep)