## For each bee id, be able to provide the number of detections in the hive in the given timeslot.

In [4]:
%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import numpy as np
import pandas as pd
import psycopg2
import psycopg2.extras
from datetime import timedelta, datetime
import time

import bb_utils
import bb_utils.meta
import bb_utils.ids
import bb_backend
from bb_backend.api import FramePlotter, VideoPlotter
from bb_backend.api import get_plot_coordinates, transform_axis_coordinates, get_image_origin

bb_backend.api.server_adress = 'localhost:8000'
connect_str = """dbname='beesbook' user='reader' host='tonic.imp.fu-berlin.de' 
                 password='' application_name='mehmed'"""

  """)


In [5]:
#Parameters for loading data
num_hours = 1
datetime_start = datetime(2016, 8, 23)

#Parameters for presenting data
bin_size_in_hours = 1

#Hyperparameters for the data wrangling process
num_intervals_per_hour = 60
rolling_window_size = 7


print(datetime_start)
#(First detections are on 20.07.2016, last are 19.09.2016 (3 months duration))

2016-08-23 00:00:00


In [6]:
#Define bee IDs by getting a known forager group
meta = bb_utils.meta.BeeMetaInfo()

group_id = 20
bee_ids_from_group = map(lambda i: i.as_ferwar(), 
                list(map(bb_utils.ids.BeesbookID.from_dec_12, meta.get_foragergroup(group_id).dec12)))

bee_ids_from_group = list(bee_ids_from_group)
group = meta.get_foragergroup(group_id)
print(group.date)

2016-08-23 00:00:00


### Reading from saved CSV files

In [7]:
# def clean_detections(csv:
    

In [8]:
#for testing: a single csv with hardcoded name
#sample_df = pd.read_csv('2016-08-23_00:00:00.csv', parse_dates=['timestamp'])

location_prefix = "/mnt/storage/janek/" # or ""

#Loading first element before the loop, to have a table formatted nicely for appending
start_csv_name = (datetime_start).strftime("%Y-%m-%d_%H:%M:%S")+".csv"

print('Processing '+location_prefix+start_csv_name+' before the loop')
detections_df = pd.read_csv(location_prefix+start_csv_name, parse_dates=['timestamp'], usecols=['timestamp', 'bee_id'])

detections_df.head()

Processing /mnt/storage/janek/2016-08-23_00:00:00.csv before the loop


Unnamed: 0,timestamp,bee_id
0,2016-08-23 00:27:31.780472,3073
1,2016-08-23 00:27:32.116578,3073
2,2016-08-23 00:27:32.447851,3073
3,2016-08-23 00:27:32.783652,3073
4,2016-08-23 00:27:33.109483,3073


In [9]:
#read and concat a number of hour-long csvs (note: thekla memory crashes if >16)
for i in range(1, num_hours):
    csv_name = (datetime_start + timedelta(hours=i)).strftime("%Y-%m-%d_%H:%M:%S")+".csv"
    print('Processing '+csv_name)
    new_data = pd.read_csv(location_prefix+csv_name, parse_dates=['timestamp'], usecols=['timestamp', 'bee_id'])
    detections_df = pd.concat([detections_df, new_data])
    print('Num. rows after appending: '+str(detections_df.shape[0]))

In [10]:
print(detections_df.shape)

(80688851, 2)


In [10]:
#interval length is the total observation period divided by total number of intervals
total_num_intervals = (num_intervals_per_hour*num_hours)
interval_length = timedelta(hours=num_hours) // (num_intervals_per_hour*num_hours)

# prepare dataframe with zeros in the shape [bees x total_num_intervals]
# append bee_ids from the left
intervals = pd.DataFrame(data=np.zeros([len(bee_ids_from_group),total_num_intervals])) 
bee_ids = pd.DataFrame(data={'id': bee_ids_from_group})
presence_df = pd.concat([bee_ids, intervals], axis=1)

In [11]:
#Iterate over intervals and over detections
#If a bee from bee_ids is detected within a given interval, mark the cell for that bee and interval with a '1'

interval_starttime = datetime_start
# print("Processing intervals: ")
for interval in range(total_num_intervals): 
    #choose detections for interval
    interval_endtime = interval_starttime + interval_length
    before = detections_df['timestamp'] >= interval_starttime 
    after = detections_df['timestamp'] < interval_endtime
    interval_detections = detections_df[before & after]
    bee_row_number = 0
    for b in presence_df['id']:
        if b in interval_detections['bee_id'].unique():
            presence_df.set_value(bee_row_number, interval, 1)
        bee_row_number += 1 
    interval_starttime = interval_endtime
    #print(interval,", ", end='')

  from ipykernel import kernelapp as app


In [18]:
#Apply rolling median to filter noise
#clean up to get rid of the NaNs
#use diff to identify entries (with 1) and exits (with -1)
#(sum_of_abs / 2) gives us the presumed number of trips a bee takes 


#'clean up nans'
num_nans_to_clean = rolling_window_size/2

first_col = presence_df.iloc[:, 1:2]
last_col = presence_df.iloc[:, -2:-1]

# presence = presence_df.iloc[:, 1:]
for i in range(0,num_nans_to_clean):
    




rolled = presence_df.iloc[:, 1:].rolling(window=rolling_window_size,center=True,axis=1).median()



In [19]:
rolled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,
1,,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,,,,
2,,,,,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,
3,,,,,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,
4,,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,,,,


In [13]:



diffed = rolled.diff(axis=1)


#probably ununsed
presence_df_copy = presence_df
presence_df_copy.iloc[:, 1:] = rolled #now contains bee ids and the rolled + diffed presence 


trips_df = presence_df_copy['id'] #for combining results in one table

The history saving thread hit an unexpected error (OperationalError('database or disk is full',)).History will not be written to the database.


In [14]:
#NOTE: Patch the data to get rid of NaNs

In [15]:
rolled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2870,2871,2872,2873,2874,2875,2876,2877,2878,2879
0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
1,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,,,
2,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,,,
3,,,,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
4,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,


In [16]:
diffed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2870,2871,2872,2873,2874,2875,2876,2877,2878,2879
0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
1,,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,,
2,,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,,
3,,,,,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
4,,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,


In [15]:
presence_df_copy.head()

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,2870,2871,2872,2873,2874,2875,2876,2877,2878,2879
0,1799,,,,,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,,,,,,
1,1593,,,,,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,,,,,,
2,2106,,,,,,,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,,,,,,
3,1662,,,,,,,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,,,,,,
4,1180,,,,,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,,,,,,


In [16]:
#for loop config
bin_starttime = datetime_start
num_intervals_per_bin = num_intervals_per_hour*bin_size_in_hours
total_num_bins = int(num_hours / bin_size_in_hours)

print("num_intervals_per_bin: ", num_intervals_per_bin, "total_num_bins: ", total_num_bins)
for bin_nr in range(total_num_bins): 
    
    start_index = bin_nr*num_intervals_per_bin
    end_index = start_index + num_intervals_per_bin
    
    new_bin = diffed.iloc[:, start_index:end_index]
    
    #limit down to the right bin:
    #read num_intervals_per_hour*bin_size_per_hour columns (as each column represents one interval)
    
    summed = new_bin.abs().sum(axis=1) / 2
    trips_df = pd.concat([trips_df,summed],axis=1) #add this interval to the trips table
    #update loop index

num_intervals_per_bin:  120 total_num_bins:  24


In [17]:
print(trips_df.mean(axis=0))
trips_df

id    1724.454545
0        0.681818
0        0.409091
0        0.545455
0        0.545455
0        1.272727
0        1.500000
0        1.363636
0        1.863636
0        1.818182
0        1.409091
0        1.636364
0        2.000000
0        2.000000
0        1.727273
0        1.636364
0        2.590909
0        1.363636
0        2.272727
0        2.045455
0        1.954545
0        1.545455
0        2.000000
0        1.409091
0        1.409091
dtype: float64


Unnamed: 0,id,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.9,0.10,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18
0,1799,0.0,0.0,0.0,0.5,0.0,4.0,4.5,2.0,1.5,...,3.0,4.0,3.0,1.5,2.5,2.0,3.0,4.0,1.0,1.0
1,1593,2.5,0.5,0.0,0.0,2.0,2.0,0.0,2.0,0.0,...,0.0,0.0,0.5,2.5,1.0,2.0,2.0,2.5,2.0,0.5
2,2106,0.5,2.0,1.0,2.0,1.5,0.5,2.0,2.0,2.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.5,0.5,1.0
3,1662,1.0,1.0,0.0,0.0,2.0,2.0,1.0,1.0,1.0,...,2.0,1.0,0.0,3.0,2.0,1.0,0.0,1.0,0.0,3.0
4,1180,0.5,0.5,0.0,0.0,2.0,4.5,1.5,2.5,3.5,...,2.5,6.5,3.5,2.0,2.0,3.0,3.5,3.5,0.5,0.0
5,2984,0.5,0.5,1.0,0.0,0.5,0.5,1.0,1.5,1.5,...,2.5,3.0,1.5,4.5,4.0,3.0,0.0,4.0,2.5,1.5
6,1197,0.0,0.0,1.0,1.0,1.5,0.0,1.5,0.5,1.0,...,1.5,3.5,2.0,1.5,1.0,0.0,0.0,1.5,2.5,2.0
7,1714,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,...,3.5,4.5,2.0,0.5,4.0,3.0,2.0,1.5,1.5,0.5
8,1471,1.5,0.0,1.0,0.0,1.0,0.0,2.0,1.0,5.0,...,0.0,1.0,0.0,4.5,2.5,1.5,1.5,2.0,0.0,1.5
9,1232,0.0,0.0,0.0,0.5,3.5,2.0,0.0,2.0,2.0,...,2.0,3.0,1.0,4.0,1.0,1.5,1.5,0.0,2.0,2.0
