In [1]:
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

In [2]:
import sys
sys.path.append('../')
import smbus

# importing all source code
from src.features import build_features
from src.visualization import visualize
from src.data import make_dataset
from src.reports import make_report
from src.diagnostics import run_diagnostics

import pandas as pd
import numpy as np

ModuleNotFoundError: No module named 'smbus'

In [36]:
from datetime import datetime, timedelta

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

class beacon_statistics():

    def __init__(self):
        pass

    def get_percent_completeness(self, df, start_time, end_time, sensor='CO2', beacon_no=-1):
        '''
        Gets the percent completeness for all beacons in the dataframe
        
        Parameters:
        - df: dataframe holding the beacon data with one column titled "Beacon"
        - start_time: datetime for the start of the analysis period
        - end_time: datetime for the end of the analysis period
        - sensor: string corresponding to the sensor variable to check for completeness
        - beacon_no: integer corresponding to the beacon of choice - default is -1 (all)
        
        Returns:
        - aggregate_completeness: dictionary with beacon number as key and percent completeness as 
          value
        - hourly_completeness: dictionary with beacon number as key and list of hourly percent
          compeleteness as values
        '''

        # vars to return
        aggregate_completeness = {}
        hourly_completeness = {}

        if beacon_no == -1:
            # all beacons - default
            beacon_list = df['Beacon'].unique()
        else:
            # list of just one beacon - must specify
            beacon_list = [beacon_no]

        # getting percent complete through list of desired beacons
        for beacon_no in beacon_list:
            data_by_id = df[df['Beacon'] == beacon_no]
            data_by_id_by_time = data_by_id[start_time:end_time]

            data_counts = data_by_id_by_time.resample(timedelta(hours=1)).count()
            # hourly completeness
            data_percentages = data_counts / 12
            hourly_completeness[beacon_no] = data_percentages

            # aggregate completeness
            overall_percentage = np.nansum(data_counts[sensor])/(len(data_by_id_by_time))
            aggregate_completeness[beacon_no] = overall_percentage

        return aggregate_completeness, hourly_completeness

    def get_measurement_time(self, df, start_time, end_time, sensor='CO2', threshold=0, below=True, beacon_no=-1, measurement_interval=5):
        '''
        Determine the number of measurements above or below certain threshold

        Parameters:
        - df: dataframe holding the beacon data with one column titled "Beacon"
        - start_time: datetime for the start of the analysis period
        - end_time: datetime for the end of the analysis period
        - sensor: string corresponding to the sensor variable to check for completeness
        - beacon_no: integer corresponding to the beacon of choice - default is -1 (all)
        - measurement_interval: integer specifying the typical time between measurements
        - threshold: integer or float specifying the value to compare against - default
          is zero
        - below: boolean specifying to look for values below (True) or above (False) the
          specified threshold

        Returns:
        - time: dictionary with keys as beacon numbers and the amount of time spent
          above/below a certain threshold the value
        '''
        # vars to return
        time = {}

        if beacon_no == -1:
            # all beacons - default
            beacon_list = df['Beacon'].unique()
        else:
            # list of just one beacon - must specify
            beacon_list = [beacon_no]

        # getting measurement times through list of desired beacons
        for beacon_no in beacon_list:
            data_by_id = df[df['Beacon'] == beacon_no]
            data_by_id_by_time = data_by_id[start_time:end_time]
            
            # counting the number of values above/below the threshold
            measurements = data_by_id_by_time[sensor].values
            if below:
                count = sum(map(lambda x : x<threshold, measurements))
            else:
                count = sum(map(lambda x : x>threshold, measurements))

            # adding result to dictionary
            time[beacon_no] = count*measurement_interval

        return time
    
    def compare_temperature_readings(self, df):
        '''
        Compares temperature readings from the DGS and Sensirion sensors
        
        Parameters:
        - df: dataframe holding the beacon data with columns titled "T_NO2", "T_CO", and "Temperature [C]", "Beacon"
        
        Returns:
        - t_raw: dataframe holding the measured temperature values for all beacons
        - t_summary: dictionary with beacon numbers as keys and dataframe of statistical values for each t sensor
        '''
        
        t_raw = df[['T_NO2','T_CO','Temperature [C]','Beacon']]
        t_raw.columns = ['DGS1','DGS2','Sensirion','Beacon']
        def avg_dgs(x,y):
            if x < 0:
                return y
            elif y < 0:
                return x
            else:
                return (x+y)/2
            
        t_raw['DGS_AVG'] = t_raw.apply(lambda row: avg_dgs(row[0],row[1]),axis=1)
        t_raw['Difference'] = t_raw['Sensirion'] - t_raw['DGS_AVG']
        
        t_summary = {}
        for beacon in t_raw['Beacon'].unique():
            data_by_beacon = t_raw[t_raw['Beacon'] == beacon]
            means = [np.nanmean(data_by_beacon['DGS1']),np.nanmean(data_by_beacon['DGS2']),np.nanmean(data_by_beacon['Sensirion'])]
            mins = [np.nanmin(data_by_beacon['DGS1']),np.nanmin(data_by_beacon['DGS2']),np.nanmin(data_by_beacon['Sensirion'])]
            maxs = [np.nanmax(data_by_beacon['DGS1']),np.nanmax(data_by_beacon['DGS2']),np.nanmax(data_by_beacon['Sensirion'])]
            p25s = [np.nanpercentile(data_by_beacon['DGS1'],25),np.nanpercentile(data_by_beacon['DGS2'],25),np.nanpercentile(data_by_beacon['Sensirion'],25)]
            p75s = [np.nanpercentile(data_by_beacon['DGS1'],75),np.nanpercentile(data_by_beacon['DGS2'],75),np.nanpercentile(data_by_beacon['Sensirion'],75)]
        
            beacon_df = pd.DataFrame(data={'Min':mins,'Max':maxs,'Mean':means,'25th':p25s,'75th':p75s},
                  index=['DGS1', 'DGS2', 'Sensirion'])
            t_summary[beacon] = beacon_df
            
        return t_raw, t_summary

In [37]:
#data = pd.read_csv('../data/processed/bpeace2-beacon.csv',index_col='Timestamp',parse_dates=True)

In [38]:
featuring = beacon_statistics()

In [40]:
df1, dict1 = featuring.compare_temperature_readings(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [46]:
b7 = data[data['Beacon'] == 7]
np.nanpercentile(b7['CO2'].values,25)

1216.765734863281

In [41]:
df1

Unnamed: 0_level_0,DGS1,DGS2,Sensirion,Beacon,DGS_AVG,Difference
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-06-01 17:15:00,22.0,-100.00,23.340256,30,22.00,1.340256
2020-06-01 17:20:00,22.0,-100.00,24.447882,30,22.00,2.447882
2020-06-01 17:25:00,23.0,-100.00,24.798017,30,23.00,1.798017
2020-06-01 17:30:00,23.0,-100.00,24.937622,30,23.00,1.937622
2020-06-01 17:35:00,23.0,-100.00,24.997437,30,23.00,1.997437
...,...,...,...,...,...,...
2020-07-11 08:35:00,25.0,25.52,28.097687,7,25.26,2.837687
2020-07-11 08:40:00,25.0,26.00,27.940459,7,25.50,2.440459
2020-07-11 08:45:00,25.0,26.00,26.967072,7,25.50,1.467072
2020-07-11 08:50:00,25.0,25.04,26.667141,7,25.02,1.647141


In [42]:
dict1

{30:                   Min         Max        Mean  25th  75th
 DGS1        22.000000   29.000000   26.440917   NaN   NaN
 DGS2      -100.000000 -100.000000 -100.000000   NaN   NaN
 Sensirion   23.340256   32.318733   29.498940   NaN   NaN,
 1:                  Min        Max       Mean  25th  75th
 DGS1       22.000000  34.000000  26.837056   NaN   NaN
 DGS2       22.000000  33.000000  26.820551   NaN   NaN
 Sensirion  22.144455  36.111557  29.943940   NaN   NaN,
 21:                  Min       Max       Mean  25th  75th
 DGS1       23.000000  29.00000  25.294767   NaN   NaN
 DGS2       23.000000  29.00000  25.343167   NaN   NaN
 Sensirion  24.817886  31.54818  28.472912   NaN   NaN,
 34:                   Min         Max        Mean  25th  75th
 DGS1        22.000000   29.000000   25.565598   NaN   NaN
 DGS2      -100.000000 -100.000000 -100.000000   NaN   NaN
 Sensirion   23.241748   32.393822   27.963151   NaN   NaN,
 22:                  Min      Max       Mean  25th  75th
 DGS1  