In [1]:
import pandas as pd
import numpy as np
import os
import re
import xarray as xr
import geopandas as gpd
from rasterio import features
from rasterio.enums import MergeAlg
from rasterio.plot import show
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from shapely.geometry import Point, LineString, MultiPoint, shape
from shapely.ops import split
from tqdm.notebook import tqdm
from sklearn.neighbors import BallTree
from lotek.gps import calc_ta, calc_dist
from lotek.conversion import shp2mask
%config Completer.use_jedi = False
%matplotlib widget

#from distributed import Client, LocalCluster
#cluster = LocalCluster(n_workers=5, threads_per_worker=2)
#client = Client(cluster)
#client

In [2]:
"""
Define files to be used
"""
# CSV of GPS collar data
#gps_f = 'C:/SPK_local/data/cattle_gps/Lotek/2017/TRMappended2017_GoodDays_TrackQ3.csv'
inDIR = 'T:/5-CattleCollars/3-Projects/CARM_PairedComps/TRM_Appended_Files'
outDIR = 'C:/SPK_local/for_others/Augustine_lotek'
gps_f_name = 'TRMappended_GoodDays_2018TrackQ3.csv'
gps_f = os.path.join(inDIR, gps_f_name)

# distance to fence raster to be used as 30 m template
tif_30m_f = 'C:/SPK_local/data/rasters/Masks/CPER_dist_to_fence_2017.tif'

# plant community map to be used as 1 m template
#veg_f = 'G:/neon_v18/neon_class_2017_v18.tif'
veg_f = 'T:/3-GIS/CPER/Layers/NEON/veg_maps_v18/neon_class_2017_v18.tif'

# pasture boundaries shapefile
cper_f = "C:/SPK_local/data/vectors/Pasture_Boundaries/Shapefiles/cper_pastures_2017_clip.shp"

In [3]:
"""
Read in GPS data and format
"""
# read data
df_gps = pd.read_csv(gps_f, engine='python')

In [4]:
# fix column names as needed
#col_remap_dict = {
#    'Date_Time_Fix': 'Date_Time',
#    'duration_check': 'duration'
#}
#df_gps = df_gps.rename(columns=col_remap_dict)

# convert datetime strings to dates
df_gps['Fix_DateTime'] = pd.to_datetime(df_gps['Fix_Date'].astype(str) + ' ' + df_gps['Fix_Time'])
df_gps['Fix_Date'] = pd.to_datetime(df_gps['Fix_Date'])

# sort data by fix time for each steer
df_gps = df_gps.groupby('Steer_ID').apply(lambda x: x.sort_values('Fix_DateTime')).reset_index(drop=True)

# calculate the actual duration between fixes
df_gps['Fix_Duration'] = df_gps.groupby('Steer_ID').apply(
    lambda x: (x.Fix_DateTime - x.Fix_DateTime.shift(1)).astype("timedelta64[s]")).reset_index()['Fix_DateTime']

# check pasture names in data
display(df_gps['Pasture'].unique())

array(['7NW', '17N', '20SE', '24W', '15', '26', '25SE', '19N', '31', '5'],
      dtype=object)

In [5]:
#df_test = pd.read_csv(os.path.join(outDIR, 'TRMappended_GoodDays_2017TrackQ3_flagged.csv'), engine='python')

In [6]:
#df_test[['Date_Time', 'Fix_Date', 'Fix_Time', 'Date_Time_Fix', 'duration', 'duration_check']]

In [7]:
# fix pasture names using a dictionary of form {'old':'new'}
past_remap_dict = {
    '31': '31E', 
    '19N': '19N',
    '25SE': '25SE',
    '26': '26E',
    '15': '15E',
    '24W': '24W',
    '20SE': '20SE',
    '17N': '17N',
    '7NW': '7NW',
    '5': '5E'
}
df_gps = df_gps.replace({'Pasture': past_remap_dict})

In [8]:
"""
Extract pasture name of each GPS fix to identify fixes outside assigned pastures
"""
# read in 1 m vegetation community raster as template
xr_veg = xr.open_rasterio(veg_f).squeeze('band')

# read in CPER pasture shapefile
df_cper = gpd.read_file(cper_f)

# dissolve by pasture to make sure each pasture is a single polygon
df_cper = df_cper.dissolve(by='Past_Name_').reset_index().rename(columns={'Past_Name_': 'Pasture'})

# convert pasture polygons to an xarray (1 m gridded raster) object with pasture name as value
cper_info = df_cper[['Pasture', 'geometry']].reset_index(drop=True).reset_index().rename(columns={'index': 'id'})
past_dict = {row.id+1: row.Pasture for _, row in cper_info.iterrows()}
past_dict[0] = 'UNK'
past_mask_shp = [(row.geometry, row.id+1) for _, row in cper_info.iterrows()]
past_mask_1m = shp2mask(shp=past_mask_shp, xr_object=xr_veg)
past_mask_1m.values = np.array([past_dict[i] for i in past_mask_1m.values.flatten()]).reshape(past_mask_1m.shape)

# convert GPS collar coordinates to xarray objects
target_lon = xr.DataArray(df_gps['UTM_X'], dims="points")
target_lat = xr.DataArray(df_gps['UTM_Y'], dims="points")

# get pasture name of the nearest pixel in the 1 m pasture mask and assign to GPS dataframe
past_tmp = past_mask_1m.sel(x=target_lon, y=target_lat, method='nearest')
df_gps['past_mask'] = past_tmp.values

In [9]:
"""
Clean up data based on distance from assigned pasture, grazing hrs per day and missing fixes per day
"""
# define threshold distance (m) from deployment pasture to remove GPS fixes from dataset
dist_thresh = 100

# define minimum and maximum grazing hours per day for removing days from GPS dataset
hrs_min = 6
hrs_max = 13

# define maximum number of missing fixes allowed per day
missing_max = 20

# loop through pastures and get coordinates of nearest location within the pasture boundary for each fix
for past_i in tqdm(df_gps['Pasture'].unique()):
    #print(past_i)
    xr_past_stacked = past_mask_1m.where(past_mask_1m == past_i, drop=True).stack(xy=['x', 'y'])
    x_coords = [i[0] for i in xr_past_stacked[xr_past_stacked.notnull()]['xy'].values]
    y_coords = [i[1] for i in xr_past_stacked[xr_past_stacked.notnull()]['xy'].values]

    # Create a BallTree object
    # borrowed from: https://stackoverflow.com/questions/58893719/find-nearest-point-in-other-dataframe-with-a-lot-of-data
    tree = BallTree(np.array(list(zip(x_coords, y_coords))), leaf_size=2)

    #Query the BallTree on all GPS coordinates from collars deployed in the pasture to find the distance
    # to the nearest pixel within the pasture and its id
    dist_tmp, id_tmp= tree.query(
        df_gps.loc[df_gps['Pasture'] == past_i, ['UTM_X', 'UTM_Y']].values, # The input array for the query
        k=1, # The number of nearest neighbors
    )
    df_gps.loc[df_gps['Pasture'] == past_i, 'distance_nearest'] = dist_tmp
    df_gps.loc[df_gps['Pasture'] == past_i, 'id_nearest'] = id_tmp
    df_gps.loc[df_gps['Pasture'] == past_i, 'UTM_X_nearest'] = np.array(x_coords)[id_tmp.squeeze()]
    df_gps.loc[df_gps['Pasture'] == past_i, 'UTM_Y_nearest'] = np.array(y_coords)[id_tmp.squeeze()]
    
    # print the number of locations more than 50 m from the pasture in which the collar was deployed
    #print(str(len(
    #    df_gps[(df_gps['Pasture'] == past_i) & 
    #           (df_gps['Pasture'] != df_gps['past_mask']) & 
    #           (df_gps['distance_nearest'] > dist_thresh)])) +
    #      ' locations found > ' + str(dist_thresh) + ' m from pasture. \n')

# remove all GPS locations more than 100 m from the pasture in which the collar was deployed
df_gps = df_gps[~((df_gps['Pasture'] != df_gps['past_mask']) & (df_gps['distance_nearest'] > dist_thresh))]

# create new final fix coordinates by correcting near-fence but outside coordinates
df_gps['UTM_X_fnl'] = df_gps['UTM_X']
df_gps['UTM_Y_fnl'] = df_gps['UTM_Y']
df_gps.loc[df_gps['Pasture'] != df_gps['past_mask'], 
           'UTM_X_fnl'] = df_gps[df_gps['Pasture'] != df_gps['past_mask']]['UTM_X_nearest']
df_gps.loc[df_gps['Pasture'] != df_gps['past_mask'], 
           'UTM_Y_fnl'] = df_gps[df_gps['Pasture'] != df_gps['past_mask']]['UTM_Y_nearest']

  0%|          | 0/10 [00:00<?, ?it/s]

In [10]:
"""
Calculate movment stats at each GPS fix
    turning angle (degrees): angle between previous, current and next fix, 
        converted to difference from straight line (0) with possible values ranging 0-180
    step length (m): distance from previous to current fix
    movement rate (m/min): distance (m) between previous and current fix divided by
        time (mins) between previous and current fix
"""

# calculate the turning angle and distance at each fix for each day and each steer
df_gps['steplength'] = np.nan
df_gps['turnangle'] = np.nan
for group in tqdm(df_gps.groupby(['Steer_ID', 'Fix_Date'])):
    group[1]['UTM_X_fnl_lag1'] = group[1]['UTM_X_fnl'].shift(1)
    group[1]['UTM_Y_fnl_lag1'] = group[1]['UTM_Y_fnl'].shift(1)
    group[1]['UTM_X_fnl_lead1'] = group[1]['UTM_X_fnl'].shift(-1)
    group[1]['UTM_Y_fnl_lead1'] = group[1]['UTM_Y_fnl'].shift(-1)
    a_list = list(group[1][['UTM_X_fnl_lag1', 'UTM_Y_fnl_lag1']].values)
    b_list = list(group[1][['UTM_X_fnl', 'UTM_Y_fnl']].values)
    c_list = list(group[1][['UTM_X_fnl_lead1', 'UTM_Y_fnl_lead1']].values)
    dist_mask = ~(np.any(np.isnan(a_list), axis=1) |  np.any(np.isnan(b_list), axis=1))
    sl_tmp = np.ones_like(dist_mask) * np.nan
    sl_tmp[dist_mask] = calc_dist(np.array(list(map(tuple, a_list)))[dist_mask], 
                                  np.array(list(map(tuple, b_list)))[dist_mask]).squeeze()
    df_gps.loc[(df_gps['Steer_ID'] == group[0][0]) & (df_gps['Fix_Date'] == group[0][1]),
               'steplength'] = sl_tmp
    df_gps.loc[(df_gps['Steer_ID'] == group[0][0]) & (df_gps['Fix_Date'] == group[0][1]),
               'turnangle'] = calc_ta(a_list, b_list, c_list)

# Calculate movement rate from distance and timestamp
df_gps['moverate'] = df_gps['steplength'] / df_gps['Fix_Duration']

  0%|          | 0/2142 [00:00<?, ?it/s]

  cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))
  angle = np.arccos(cosine_angle)


In [11]:
# flag all locations suspected as jumps (movement rate > 42 m/min and turnangle > 120 degrees)
jump_flag = (df_gps['moverate'] > 42) & (df_gps['turnangle'] > 120)
df_gps['jump_flag'] = jump_flag.astype(int)

# flag all locations with movement rate > 84 m/min while grazing
fast_flag = (df_gps['moverate'] > 84) & (df_gps['GrazingAct'] == 1)
df_gps['fast_flag'] = fast_flag.astype(int)

# flag all days with more than the maximum number of allowed missing fixes
missingfix_flag = df_gps.groupby(['Fix_Date', 'Steer_ID'])['GrazingAct'].transform('count') < (24 * (60 / 5) - missing_max)
df_gps['missingfix_flag'] = missingfix_flag.astype(int)

# combine the three masks above to flag any data that should not be included when calculating grazing hours
badfix_flag = jump_flag | fast_flag | missingfix_flag
df_gps['badfix_flag'] = badfix_flag.astype(int)

# calculate total time in hrs spent grazing daily
df_gps['grazing_hrs'] = df_gps[~badfix_flag].groupby(['Fix_Date', 'Steer_ID'])['GrazingAct'].transform('sum') * 5 / 60

# flag all days with less than 6 hrs and more than 13 hrs grazing
grazehrs_flag = (df_gps['grazing_hrs'] < hrs_min) | (df_gps['grazing_hrs'] > hrs_max)
df_gps['grazehrs_flag'] = grazehrs_flag.astype(int)

In [12]:
df_gps_flagged = df_gps.copy()
df_gps_flagged.to_csv(os.path.join(outDIR, re.sub('.csv', '_flagged.csv', gps_f_name)), index=False)

df_gps = df_gps[~(badfix_flag | grazehrs_flag)]

In [13]:
# change bouts when grazing activity changes, unless the two fixes before and two fixes after are the same
# in detail: change to new bout if:
# activity is not the same as the previous row
# AND activity is the same as one of the next two rows
# AND activitiy is not the same as one of the two rows after it
# NOTE: this only works for bouts of 4+ fixes. We will manually classify bouts <= 3 fixes as 'Transition' bout
df_gps['grazing_bout'] = df_gps.groupby(['Fix_Date', 'Steer_ID'])['GrazingAct'].apply(
    lambda x: (((x != x.shift(1)) &
                ((x == x.shift(-1)) | (x == x.shift(-2))) & 
                ((x != x.shift(2)) | (x != x.shift(3))))).cumsum())

# calculate duration of each bout in minutes
df_gps['bout_mins'] = df_gps.groupby(['Fix_Date', 'Steer_ID', 'grazing_bout'])['Fix_DateTime'].transform(lambda x: (x.max() - x.min()).seconds/60 + 5.0)

# calculate the majority grazing activity for each bout to calculate bout activity
df_gps['bout_maj'] = df_gps.groupby(['Fix_Date', 'Steer_ID', 'grazing_bout'])['GrazingAct'].transform(lambda x: x.value_counts().index[0])

# create a bout activity column
bout_act_dict = {0: 'Nongrazing',
                1: 'Grazing'}
df_gps['bout_act'] = df_gps['bout_maj'].apply(lambda x: bout_act_dict[x])
df_gps.loc[df_gps['bout_mins'] < 20, 'bout_act'] = 'Transition'

# calculate the number of bouts per day in each activity
df_gps['act_bout_ct_daily'] = df_gps.groupby(['Fix_Date', 'Steer_ID', 'bout_act'])['grazing_bout'].transform('nunique')

# calculate grazing activity budgets for each day and steer
df_gps['act_budget_daily'] = df_gps.groupby(['Fix_Date', 'Steer_ID', 'bout_act'])['bout_mins'].transform('sum') / df_gps.groupby(['Fix_Date', 'Steer_ID'])['bout_mins'].transform('sum')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [14]:
df_gps.to_csv(os.path.join(outDIR, re.sub('.csv', '_cleaned.csv', gps_f_name)), index=False)

In [15]:
sum(grazehrs_flag)/len(df_gps_flagged)

0.06083673255249662

In [16]:
len(df_gps_flagged)

726783

In [18]:
len(df_gps)

671158