# WiSenseOutdoorData, Cleaning of Data, Finding Error % in each node

# 1. Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 2. Reading WiSenseData

In [2]:
dff = pd.read_csv('WiSenseOutdoorData.csv' , header = None)  #reading the Outdoor dataset

In [3]:
#There are 14 columns
dff.columns

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], dtype='int64')

# 3. Renaming the Columns

In [4]:
df = dff.rename(columns={0: 'timeStamp', 1: 'nodeAddress' , 2: 'packteID', 3: 'nodeRSSI', 4: 'nodeVolt', 5: 'temperature1', 6: 'temperature2', 7: 'temperature3', 8: 'pressure', 9: 'luminosity' , 10: 'rainfall', 11: 'solarPanelVolt', 12: 'solarPanelBattVolt', 13: 'solarPanelCurr'})

In [5]:
# Getting some info about Dataset
df.info()     

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13107 entries, 0 to 13106
Data columns (total 14 columns):
timeStamp             13107 non-null object
nodeAddress           13107 non-null object
packteID              13107 non-null int64
nodeRSSI              13107 non-null int64
nodeVolt              13107 non-null float64
temperature1          13107 non-null float64
temperature2          0 non-null float64
temperature3          13107 non-null float64
pressure              13107 non-null int64
luminosity            13107 non-null int64
rainfall              13107 non-null int64
solarPanelVolt        13107 non-null float64
solarPanelBattVolt    13107 non-null float64
solarPanelCurr        13107 non-null float64
dtypes: float64(7), int64(5), object(2)
memory usage: 1.4+ MB


In [6]:
#Copying the original dataset ('df') into data1
data1 = df.copy()  

In [7]:
#Describing different features of DataSet
data1.describe()   

Unnamed: 0,packteID,nodeRSSI,nodeVolt,temperature1,temperature2,temperature3,pressure,luminosity,rainfall,solarPanelVolt,solarPanelBattVolt,solarPanelCurr
count,13107.0,13107.0,13107.0,13107.0,0.0,13107.0,13107.0,13107.0,13107.0,13107.0,13107.0,13107.0
mean,4944.839246,-75.089265,3.356035,27.251118,,29.100995,881.666361,14730.566644,2763.637293,3.101344,3.941621,-8.208438
std,4082.036564,23.564888,0.028765,3.768661,,28.889033,98.311192,23881.808964,7763.948991,3.700875,0.216309,58.909454
min,1.0,-105.0,2.88,22.5,,10.91,220.0,0.0,0.0,0.01,2.85,-109.8
25%,1606.0,-92.0,3.35,25.37,,21.59,895.0,0.0,0.0,0.06,3.76,-30.6
50%,4089.0,-89.0,3.36,26.5,,25.25,896.0,6.0,0.0,0.61,4.01,-30.1
75%,7691.5,-69.0,3.37,28.0,,29.02,897.0,19358.0,0.0,8.59,4.12,1.0
max,16631.0,-24.0,3.4,43.37,,220.0,901.0,65535.0,105233.0,8.74,4.2,572.2


In [8]:
#Getting total number of Unique Nodes from the DataSet
data1.groupby('nodeAddress').count()   

Unnamed: 0_level_0,timeStamp,packteID,nodeRSSI,nodeVolt,temperature1,temperature2,temperature3,pressure,luminosity,rainfall,solarPanelVolt,solarPanelBattVolt,solarPanelCurr
nodeAddress,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
fc:c2:3d:00:00:00:89:e8,1758,1758,1758,1758,1758,0,1758,1758,1758,1758,1758,1758,1758
fc:c2:3d:00:00:01:10:8e,7394,7394,7394,7394,7394,0,7394,7394,7394,7394,7394,7394,7394
fc:c2:3d:00:00:01:2f:2d,1038,1038,1038,1038,1038,0,1038,1038,1038,1038,1038,1038,1038
fc:c2:3d:00:00:01:33:2a,2917,2917,2917,2917,2917,0,2917,2917,2917,2917,2917,2917,2917


In [9]:
#Converting datatype of 'timeStamp' to datetime type
data1['timeStamp'] = pd.to_datetime(data1['timeStamp'])  

In [10]:
#Now we can see that the data type of 'timeStamp' is datetime
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13107 entries, 0 to 13106
Data columns (total 14 columns):
timeStamp             13107 non-null datetime64[ns]
nodeAddress           13107 non-null object
packteID              13107 non-null int64
nodeRSSI              13107 non-null int64
nodeVolt              13107 non-null float64
temperature1          13107 non-null float64
temperature2          0 non-null float64
temperature3          13107 non-null float64
pressure              13107 non-null int64
luminosity            13107 non-null int64
rainfall              13107 non-null int64
solarPanelVolt        13107 non-null float64
solarPanelBattVolt    13107 non-null float64
solarPanelCurr        13107 non-null float64
dtypes: datetime64[ns](1), float64(7), int64(5), object(1)
memory usage: 1.4+ MB


# To Demonstrate how we can get time interval between two rows

In [195]:
# This is how we can get the timeStamp of any row by Indexing
data1.timeStamp[0] 

Timestamp('2019-05-08 22:12:19')

In [196]:
#Getting timeinterval b/w first two rows
interval = (data1.timeStamp[1] - data1.timeStamp[0]) 

In [197]:
#Converting this timeinterval into minutes
minutes = interval.total_seconds() / 60

In [198]:
minutes

2.316666666666667

In [199]:
type(minutes)

float

# Functions to Clean the DataSet

In [85]:
'''
nodes = data1['nodeAddress'].unique()
def Temperature3_clean(df):
    for n in nodes:
        k = 0
        for i in range(k , df.shape[0]-1):
          if(df.loc[i, 'nodeAddress'] == n):
            val0 = float(df.loc[i,'temperature3'])
            time0 = df.loc[i,'timeStamp' ]
            for j in range(i+1, df.shape[0]-1):
              if(df.loc[j, 'nodeAddress'] == n):
                val1 = float(df.loc[j , 'temperature3'])
                time1 = df.loc[j , 'timeStamp']
                timedelta = time1 - time0
                minutes = timedelta.total_seconds() / 60
                
                if (abs(val1 - val0) > 10 and minutes < 30.0):
                  df.loc[j,'temperature3'] = val0
                  k = j
                  break
                elif(abs(val1 - val0) > 10 and minutes > 30.0):
                  df.loc[j,'temperature3'] = 'NaN'
                  k = j
                  break
                else:
                  k = j
                  break '''

In [11]:
# Now We will create two new columns in our Dataset namely, 'temp3_changed' and 'pressure_changed'
# These column will contain value '1' if temperature3 or pressure is changed else it will contain 0
data1['temp3_changed'] = 0
data1['pressure_changed'] = 0

In [12]:
# Grouping the DataSet by 'temp3_changed'
data1.groupby('temp3_changed').count()

Unnamed: 0_level_0,timeStamp,nodeAddress,packteID,nodeRSSI,nodeVolt,temperature1,temperature2,temperature3,pressure,luminosity,rainfall,solarPanelVolt,solarPanelBattVolt,solarPanelCurr,pressure_changed
temp3_changed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,13107,13107,13107,13107,13107,13107,0,13107,13107,13107,13107,13107,13107,13107,13107


In [13]:
#Checking Outliers
'''
# Following Scripts will deal with first value of each node if it is outlier

1. We'll just check if first value of each node for a particular column is outlier (i.e temperature > 100 or temperature < 0), if it is outlier then we'll change its value to next row value

'''


from tqdm import tqdm_notebook

nodes = data1['nodeAddress'].unique() # this line will create an array having total unique nodes


def _repair_first_reading(frame, node, column, low, high):
    """Repair the first reading of `node` in `column` when it is an outlier.

    A reading is an outlier when it is below `low` or above `high`.
    The replacement is the next reading of the SAME node that lies inside
    [low, high]; simply taking the next row of the frame could pick up a
    reading from a different node. The frame is modified in place.

    Parameters
    ----------
    frame : DataFrame with a 'nodeAddress' column and a unique index.
    node : node address whose first reading is checked.
    column : name of the numeric column to check.
    low, high : inclusive bounds of the accepted range.

    Returns
    -------
    The index label of the first reading when it was an outlier, otherwise
    None. When the node has no later in-range reading the outlier is
    reported but left unchanged.
    """
    node_rows = frame.index[frame['nodeAddress'] == node]
    if len(node_rows) == 0:
        return None

    first_row = node_rows[0]
    first_value = float(frame.loc[first_row, column])
    if not (first_value < low or first_value > high):
        return None

    # Look only at later readings of this node and keep the in-range ones.
    later = frame.loc[node_rows[1:], column]
    later_values = later.astype(float)
    usable = later[(later_values >= low) & (later_values <= high)]
    if not usable.empty:
        frame.loc[first_row, column] = usable.iloc[0]
    return first_row


print('Checking Outlier for temperature3')
for n in tqdm_notebook(nodes , desc = 'Processing records for Outlier'):
    outlier_row = _repair_first_reading(data1, n, 'temperature3', 0, 100)
    if outlier_row is not None:
        print('Outlier Found at', outlier_row , 'for node' , n)


print('Checking Outlier for pressure')
for n in tqdm_notebook(nodes , desc = 'Processing records for Outlier'):
    outlier_row = _repair_first_reading(data1, n, 'pressure', 750, 1000)
    if outlier_row is not None:
        print('Outlier Found at', outlier_row, 'for node' , n)

                                

Checking Outlier for temperature3


HBox(children=(IntProgress(value=0, description='Processing records for Outlier', max=4, style=ProgressStyle(d…


Checking Outlier for pressure


HBox(children=(IntProgress(value=0, description='Processing records for Outlier', max=4, style=ProgressStyle(d…




# Following is the function to clean 'temperature3' and 'pressure'

# Logic behind cleaning the data

Example: Cleaning temperature1

To clean 'temperature1', we iterate through the column and, for each node, compare every reading with that node's previous reading.
1. If their absolute difference is more than 10 °C and the time interval between them is less than 30 minutes, we replace the later value with the previous one.
2. If the later value (row) is an outlier and the time interval is more than 30 minutes, we replace it with NaN.

The same logic is implemented for 'temperature3' and 'pressure'.


In [33]:
nodes = data1['nodeAddress'].unique() # this line will create an array having total unique nodes

#Function to clean 'temperature3'

def _clean_sensor_column(df, column, flag_column, low, high,
                         max_jump=10, max_gap_minutes=30.0):
    """Clean one sensor column in place, node by node.

    For every node the readings are walked in row order and each reading is
    compared with the previous reading of the same node (using the previous
    reading's already-cleaned value):

    * jump larger than `max_jump` within less than `max_gap_minutes`:
      the reading is replaced by the previous one and `flag_column` is set
      to 1 for that row;
    * otherwise, reading outside [low, high] after a gap of at least
      `max_gap_minutes`: the reading is replaced by a real NaN.

    Parameters
    ----------
    df : DataFrame with 'timeStamp', 'nodeAddress', `column` and
        `flag_column`; its index must be unique.
    column : name of the numeric column to clean.
    flag_column : name of the 0/1 column recording repaired rows.
    low, high : inclusive bounds of the accepted value range.
    max_jump : largest accepted change between consecutive readings.
    max_gap_minutes : time gap separating "repair" from "discard".

    Returns
    -------
    None. `df` is modified in place.
    """
    # Accept both datetime and string time stamps.
    stamps = pd.to_datetime(df['timeStamp'])

    # Derive the nodes from the frame itself instead of a notebook global.
    for node in df['nodeAddress'].unique():
        node_rows = df.index[df['nodeAddress'] == node]
        if len(node_rows) < 2:
            continue

        prev_row = node_rows[0]
        # Every later reading is visited, including the last row of the frame.
        for row in node_rows[1:]:
            prev_value = float(df.loc[prev_row, column])
            value = float(df.loc[row, column])
            minutes = (stamps.loc[row] - stamps.loc[prev_row]).total_seconds() / 60

            if abs(value - prev_value) > max_jump and minutes < max_gap_minutes:
                # Copy the stored value so an integer column stays integer.
                df.loc[row, column] = df.loc[prev_row, column]
                df.loc[row, flag_column] = 1
            elif (value > high or value < low) and minutes >= max_gap_minutes:
                # Write a real NaN (not the string 'NaN') so the column stays
                # numeric; integer columns must become float to hold it.
                if not pd.api.types.is_float_dtype(df[column]):
                    df[column] = df[column].astype(float)
                df.loc[row, column] = np.nan
            # NOTE(review): an out-of-range reading that follows a NaN within
            # max_gap_minutes matches neither branch and is kept as is.

            prev_row = row


def Temperature3_clean(df):
    """Clean 'temperature3' in place (accepted range 0..100, flag 'temp3_changed')."""
    _clean_sensor_column(df, 'temperature3', 'temp3_changed', low=0, high=100)


# Function to clean 'pressure'

def pressure_clean(df):
    """Clean 'pressure' in place (accepted range 750..1000, flag 'pressure_changed')."""
    _clean_sensor_column(df, 'pressure', 'pressure_changed', low=750, high=1000)

In [34]:
#Call Above functions to clean the dataset
Temperature3_clean(data1)
pressure_clean(data1)

In [35]:
data1.describe() #Cleaned Data

Unnamed: 0,packteID,nodeRSSI,nodeVolt,temperature1,temperature2,temperature3,pressure,luminosity,rainfall,solarPanelVolt,solarPanelBattVolt,solarPanelCurr,temp3_changed,pressure_changed
count,13107.0,13107.0,13107.0,13107.0,0.0,13107.0,13107.0,13107.0,13107.0,13107.0,13107.0,13107.0,13107.0,13107.0
mean,4944.839246,-75.089265,3.356035,27.251118,,25.334563,896.251392,14730.566644,2763.637293,3.101344,3.941621,-8.208438,0.044556,0.021592
std,4082.036564,23.564888,0.028765,3.768661,,5.42663,1.723764,23881.808964,7763.948991,3.700875,0.216309,58.909454,0.206335,0.145351
min,1.0,-105.0,2.88,22.5,,10.91,892.0,0.0,0.0,0.01,2.85,-109.8,0.0,0.0
25%,1606.0,-92.0,3.35,25.37,,22.06,895.0,0.0,0.0,0.06,3.76,-30.6,0.0,0.0
50%,4089.0,-89.0,3.36,26.5,,25.4,896.0,6.0,0.0,0.61,4.01,-30.1,0.0,0.0
75%,7691.5,-69.0,3.37,28.0,,28.97,897.0,19358.0,0.0,8.59,4.12,1.0,0.0,0.0
max,16631.0,-24.0,3.4,43.37,,38.34,901.0,65535.0,105233.0,8.74,4.2,572.2,1.0,1.0


In [36]:
df.describe() #Raw Data

Unnamed: 0,packteID,nodeRSSI,nodeVolt,temperature1,temperature2,temperature3,pressure,luminosity,rainfall,solarPanelVolt,solarPanelBattVolt,solarPanelCurr
count,13107.0,13107.0,13107.0,13107.0,0.0,13107.0,13107.0,13107.0,13107.0,13107.0,13107.0,13107.0
mean,4944.839246,-75.089265,3.356035,27.251118,,29.100995,881.666361,14730.566644,2763.637293,3.101344,3.941621,-8.208438
std,4082.036564,23.564888,0.028765,3.768661,,28.889033,98.311192,23881.808964,7763.948991,3.700875,0.216309,58.909454
min,1.0,-105.0,2.88,22.5,,10.91,220.0,0.0,0.0,0.01,2.85,-109.8
25%,1606.0,-92.0,3.35,25.37,,21.59,895.0,0.0,0.0,0.06,3.76,-30.6
50%,4089.0,-89.0,3.36,26.5,,25.25,896.0,6.0,0.0,0.61,4.01,-30.1
75%,7691.5,-69.0,3.37,28.0,,29.02,897.0,19358.0,0.0,8.59,4.12,1.0
max,16631.0,-24.0,3.4,43.37,,220.0,901.0,65535.0,105233.0,8.74,4.2,572.2


# From here we'll see how we can convert all outliers to "NaN" and we'll observe what is the error percentage in each node

In [185]:
# Just Trying to Convert Wrong (Outliers) Values to NaN

nodes = data1['nodeAddress'].unique()
def Temperature3_clean(df):
    """Mark out-of-range 'temperature3' readings with the string 'NaN', in place.

    A reading is out of range when it is above 100 or below -1. Every row is
    checked exactly once, including the last one. The marker is deliberately
    the string 'NaN' (not a float NaN) because the following cell locates the
    marked rows with `temperature3 == 'NaN'`.

    Parameters
    ----------
    df : DataFrame with a 'temperature3' column whose entries can be
        converted to float (previously written 'NaN' markers are accepted).

    Returns
    -------
    None. `df` is modified in place; the column becomes object dtype when at
    least one reading is marked.
    """
    readings = df['temperature3'].astype(float)
    out_of_range = (readings > 100) | (readings < -1)
    if out_of_range.any():
        # A float column cannot hold a string marker, so switch to object first.
        df['temperature3'] = df['temperature3'].astype(object)
        df.loc[out_of_range, 'temperature3'] = 'NaN'

Temperature3_clean(data1)

In [187]:
#Getting indexes where temp3 contains NaN value
kaish = np.where(data1.temperature3  == 'NaN')  

In [188]:
#Converting kaish into numpy array
kaish = np.array(kaish)

In [191]:
#Getting how many outliers
print(kaish.shape)

(1, 283)


In [189]:
data1[900:950]

Unnamed: 0,timeStamp,nodeAddress,packteID,nodeRSSI,nodeVolt,temperature1,temperature2,temperature3,pressure,luminosity,rainfall,solarPanelVolt,solarPanelBattVolt,solarPanelCurr
900,2019-05-09 09:14:03,fc:c2:3d:00:00:01:10:8e,2196,-90,3.36,25.37,,28.76,896,56430,0,8.6,4.07,234.6
901,2019-05-09 09:14:22,fc:c2:3d:00:00:01:2f:2d,2197,-68,3.38,33.0,,35.31,895,65535,0,8.71,3.72,2.0
902,2019-05-09 09:14:52,fc:c2:3d:00:00:00:89:e8,2199,-68,3.31,26.75,,27.9,894,18198,0,8.6,4.02,144.0
903,2019-05-09 09:15:04,fc:c2:3d:00:00:01:10:8e,2200,-90,3.35,25.37,,28.91,896,56205,0,8.6,4.07,237.2
904,2019-05-09 09:16:05,fc:c2:3d:00:00:01:10:8e,2202,-90,3.36,25.37,,29.07,896,56810,0,8.6,4.08,238.8
905,2019-05-09 09:17:07,fc:c2:3d:00:00:01:10:8e,2204,-90,3.36,25.37,,29.24,896,58749,0,8.6,4.08,241.6
906,2019-05-09 09:18:07,fc:c2:3d:00:00:01:10:8e,2206,-90,3.36,25.37,,29.36,896,61067,0,8.6,4.08,245.0
907,2019-05-09 09:19:08,fc:c2:3d:00:00:01:10:8e,2208,-89,3.36,25.37,,29.47,896,62746,0,8.6,4.08,247.5
908,2019-05-09 09:19:23,fc:c2:3d:00:00:01:2f:2d,2209,-68,3.38,33.0,,220.0,220,65535,0,8.7,3.72,2.0
909,2019-05-09 09:19:53,fc:c2:3d:00:00:00:89:e8,2211,-68,3.31,26.75,,28.53,894,19567,0,8.59,4.02,148.9


In [117]:
data1[10225:10248]

Unnamed: 0,timeStamp,nodeAddress,packteID,nodeRSSI,nodeVolt,temperature1,temperature2,temperature3,pressure,luminosity,rainfall,solarPanelVolt,solarPanelBattVolt,solarPanelCurr
10225,2019-05-15 02:33:04,fc:c2:3d:00:00:00:89:e8,3503,-85,3.31,26.75,,24.77,897,0,0,0.03,3.95,-30.7
10226,2019-05-15 02:38:07,fc:c2:3d:00:00:00:89:e8,3514,-85,3.31,26.75,,24.77,897,0,0,0.02,3.94,-30.9
10227,2019-05-15 08:20:28,fc:c2:3d:00:00:01:10:8e,4159,-44,3.35,26.75,,22.24,898,0,0,0.01,3.64,-30.2
10228,2019-05-15 08:21:28,fc:c2:3d:00:00:01:10:8e,4162,-95,3.36,25.37,,22.42,900,40209,0,8.6,3.99,105.4
10229,2019-05-15 08:22:30,fc:c2:3d:00:00:01:10:8e,4165,-89,3.35,25.37,,22.62,900,42234,0,8.6,3.99,125.4
10230,2019-05-15 08:23:31,fc:c2:3d:00:00:01:10:8e,4168,-94,3.35,25.37,,22.78,900,40944,0,8.6,3.99,122.2
10231,2019-05-15 08:24:33,fc:c2:3d:00:00:01:10:8e,4171,-103,3.35,26.75,,22.28,898,0,0,0.02,3.64,-30.5
10232,2019-05-15 08:25:34,fc:c2:3d:00:00:01:10:8e,4174,-95,3.36,25.37,,23.08,900,34852,0,8.59,3.99,100.7
10233,2019-05-15 08:26:37,fc:c2:3d:00:00:01:10:8e,4177,-95,3.36,25.37,,23.19,900,35038,0,8.6,4.0,108.9
10234,2019-05-15 08:27:51,fc:c2:3d:00:00:01:10:8e,4180,-94,3.36,25.37,,23.41,900,36525,0,8.6,4.0,131.3


In [94]:
df[10225:10248]

Unnamed: 0,timeStamp,nodeAddress,packteID,nodeRSSI,nodeVolt,temperature1,temperature2,temperature3,pressure,luminosity,rainfall,solarPanelVolt,solarPanelBattVolt,solarPanelCurr
10225,2019-05-15 02:33:04,fc:c2:3d:00:00:00:89:e8,3503,-85,3.31,26.75,,13.15,897,0,0,0.03,3.95,-30.7
10226,2019-05-15 02:38:07,fc:c2:3d:00:00:00:89:e8,3514,-85,3.31,26.75,,13.17,897,0,0,0.02,3.94,-30.9
10227,2019-05-15 08:20:28,fc:c2:3d:00:00:01:10:8e,4159,-44,3.35,26.75,,22.24,898,0,0,0.01,3.64,-30.2
10228,2019-05-15 08:21:28,fc:c2:3d:00:00:01:10:8e,4162,-95,3.36,25.37,,22.42,900,40209,0,8.6,3.99,105.4
10229,2019-05-15 08:22:30,fc:c2:3d:00:00:01:10:8e,4165,-89,3.35,25.37,,22.62,900,42234,0,8.6,3.99,125.4
10230,2019-05-15 08:23:31,fc:c2:3d:00:00:01:10:8e,4168,-94,3.35,25.37,,22.78,900,40944,0,8.6,3.99,122.2
10231,2019-05-15 08:24:33,fc:c2:3d:00:00:01:10:8e,4171,-103,3.35,26.75,,22.28,898,0,0,0.02,3.64,-30.5
10232,2019-05-15 08:25:34,fc:c2:3d:00:00:01:10:8e,4174,-95,3.36,25.37,,23.08,900,34852,0,8.59,3.99,100.7
10233,2019-05-15 08:26:37,fc:c2:3d:00:00:01:10:8e,4177,-95,3.36,25.37,,23.19,900,35038,0,8.6,4.0,108.9
10234,2019-05-15 08:27:51,fc:c2:3d:00:00:01:10:8e,4180,-94,3.36,25.37,,23.41,900,36525,0,8.6,4.0,131.3


In [40]:
df

Unnamed: 0,timeStamp,nodeAddress,packteID,nodeRSSI,nodeVolt,temperature1,temperature2,temperature3,pressure,luminosity,rainfall,solarPanelVolt,solarPanelBattVolt,solarPanelCurr
0,2019-05-08 22:12:19,fc:c2:3d:00:00:00:89:e8,932,-67,3.31,26.75,,19.09,896,0,0,0.02,4.08,-29.1
1,2019-05-08 22:14:38,fc:c2:3d:00:00:01:10:8e,934,-91,3.35,25.37,,20.26,897,0,0,0.06,4.07,-30.4
2,2019-05-08 22:15:39,fc:c2:3d:00:00:01:10:8e,936,-91,3.35,25.37,,20.23,897,0,0,0.06,4.07,-30.3
3,2019-05-08 22:16:22,fc:c2:3d:00:00:01:2f:2d,937,-68,3.38,33.00,,15.65,897,0,0,0.03,3.58,-30.6
4,2019-05-08 22:16:40,fc:c2:3d:00:00:01:10:8e,939,-90,3.35,25.37,,20.21,897,0,0,0.06,4.07,-30.3
5,2019-05-08 22:17:21,fc:c2:3d:00:00:00:89:e8,940,-68,3.31,26.75,,18.98,896,0,0,0.02,4.07,-30.5
6,2019-05-08 22:17:41,fc:c2:3d:00:00:01:10:8e,942,-90,3.35,25.37,,20.22,897,0,0,0.06,4.07,-29.6
7,2019-05-08 22:18:42,fc:c2:3d:00:00:01:10:8e,944,-91,3.35,25.37,,20.19,897,0,0,0.06,4.07,-30.5
8,2019-05-08 22:19:43,fc:c2:3d:00:00:01:10:8e,946,-90,3.35,25.37,,20.18,897,0,0,0.06,4.07,-30.2
9,2019-05-08 22:20:44,fc:c2:3d:00:00:01:10:8e,948,-92,3.35,25.37,,20.19,897,0,0,0.06,4.07,-30.1


# Error %age in each node for whole dataset

In [137]:
#Getting total unique nodes in dataset
df.groupby('nodeAddress').count()

Unnamed: 0_level_0,timeStamp,packteID,nodeRSSI,nodeVolt,temperature1,temperature2,temperature3,pressure,luminosity,rainfall,solarPanelVolt,solarPanelBattVolt,solarPanelCurr
nodeAddress,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
fc:c2:3d:00:00:00:89:e8,1758,1758,1758,1758,1758,0,1758,1758,1758,1758,1758,1758,1758
fc:c2:3d:00:00:01:10:8e,7394,7394,7394,7394,7394,0,7394,7394,7394,7394,7394,7394,7394
fc:c2:3d:00:00:01:2f:2d,1038,1038,1038,1038,1038,0,1038,1038,1038,1038,1038,1038,1038
fc:c2:3d:00:00:01:33:2a,2917,2917,2917,2917,2917,0,2917,2917,2917,2917,2917,2917,2917


In [13]:
#Creating a new dataframe which contains 'timeStamp' when NaN(Outlier) occurs 
diagnostic_df = pd.DataFrame(columns=['temp3_89:e8' , 'temp3_10:8e' , 'temp3_2f:2d' , 'temp3_33:2a'] )

In [14]:
diagnostic_df

Unnamed: 0,temp3_89:e8,temp3_10:8e,temp3_2f:2d,temp3_33:2a


In [15]:
#Getting values of 'timestamp' in diagnostic_df when NaN occurs for each node

# Each node address mapped to the diagnostic_df column that collects the
# time stamps of its suspicious temperature3 readings.
node_columns = {
    'fc:c2:3d:00:00:00:89:e8': 'temp3_89:e8',
    'fc:c2:3d:00:00:01:10:8e': 'temp3_10:8e',
    'fc:c2:3d:00:00:01:2f:2d': 'temp3_2f:2d',
    'fc:c2:3d:00:00:01:33:2a': 'temp3_33:2a',
}

# NOTE(review): readings above 50 or below 0 are collected here, while the
# NaN-marking cells use > 100 / < -1 -- confirm which limits are intended.
for node, column in node_columns.items():
    # Select all rows of this node, including the last row of the frame.
    node_rows = df[df['nodeAddress'] == node]
    readings = node_rows['temperature3'].astype(float)
    suspicious = (readings > 50) | (readings < 0)
    # Keep the original row label so each time stamp can be traced back to df.
    for row, stamp in node_rows.loc[suspicious, 'timeStamp'].items():
        diagnostic_df.loc[row, column] = stamp

In [175]:
#diagnostic_df

In [16]:
#Following line will print whole diagnostic_df
print(diagnostic_df.to_string())

               temp3_89:e8          temp3_10:8e          temp3_2f:2d          temp3_33:2a
6244   2019-05-13 11:23:44                  NaN                  NaN                  NaN
6256   2019-05-13 11:28:45                  NaN                  NaN                  NaN
6267   2019-05-13 11:33:45                  NaN                  NaN                  NaN
6368   2019-05-13 12:18:59                  NaN                  NaN                  NaN
6291                   NaN  2019-05-13 11:44:21                  NaN                  NaN
6375                   NaN  2019-05-13 12:22:04                  NaN                  NaN
6380                   NaN  2019-05-13 12:24:07                  NaN                  NaN
6382                   NaN  2019-05-13 12:25:07                  NaN                  NaN
6387                   NaN  2019-05-13 12:27:10                  NaN                  NaN
6389                   NaN  2019-05-13 12:28:10                  NaN                  NaN
6392      

In [17]:
#Compressing the Diagnostic_df to get better look
diagnostic_df = pd.concat([diagnostic_df[x].dropna().reset_index(drop=True) for x in diagnostic_df], axis=1)


In [18]:
#Again printing whole diagnostic_df
print(diagnostic_df.to_string())

             temp3_89:e8          temp3_10:8e          temp3_2f:2d          temp3_33:2a
0    2019-05-13 11:23:44  2019-05-13 11:44:21  2019-05-09 09:19:23  2019-05-13 11:20:42
1    2019-05-13 11:28:45  2019-05-13 12:22:04  2019-05-09 09:24:24  2019-05-13 11:21:43
2    2019-05-13 11:33:45  2019-05-13 12:24:07  2019-05-09 09:29:26  2019-05-13 11:24:51
3    2019-05-13 12:18:59  2019-05-13 12:25:07  2019-05-09 09:34:26  2019-05-13 11:25:52
4                    NaN  2019-05-13 12:27:10  2019-05-09 09:39:27  2019-05-13 11:26:53
5                    NaN  2019-05-13 12:28:10  2019-05-09 09:44:28  2019-05-13 11:27:54
6                    NaN  2019-05-13 12:29:12  2019-05-09 09:49:29  2019-05-13 11:29:59
7                    NaN  2019-05-13 12:30:13  2019-05-09 09:54:30  2019-05-13 11:30:59
8                    NaN  2019-05-13 12:31:13  2019-05-09 09:59:31  2019-05-13 11:32:00
9                    NaN  2019-05-13 12:33:16  2019-05-09 10:04:32  2019-05-13 11:33:01
10                   NaN  2019-0

In [23]:
#Describing 'diagnostic_df' into NaN_Data
NaN_Data = diagnostic_df.describe()
NaN_Data

Unnamed: 0,temp3_89:e8,temp3_10:8e,temp3_2f:2d,temp3_33:2a
count,4,112,137,30
unique,4,112,137,30
top,2019-05-13 11:23:44,2019-05-13 12:56:43,2019-05-10 10:52:44,2019-05-13 13:15:50
freq,1,1,1,1


In [32]:
# NaN_Data is the describe() table of diagnostic_df; its rows are labelled
# ('count', 'unique', ...), so read the 'count' row by label rather than by
# integer position.
NaN_Data_89e8 = NaN_Data.loc['count', 'temp3_89:e8'] #Total NaN in node 89:e8
NaN_Data_108e = NaN_Data.loc['count', 'temp3_10:8e'] #Total NaN in node 10:8e
NaN_Data_2f2d = NaN_Data.loc['count', 'temp3_2f:2d'] #Total NaN in node 2f:2d
NaN_Data_332a = NaN_Data.loc['count', 'temp3_33:2a'] #Total NaN in node 33:2a

In [33]:
NaN_Data_89e8

4

In [10]:
# Grouping original dataset (df) into 'groupby' to get total datapoints in each node
groupby = df.groupby('nodeAddress').count()
groupby

Unnamed: 0_level_0,timeStamp,packteID,nodeRSSI,nodeVolt,temperature1,temperature2,temperature3,pressure,luminosity,rainfall,solarPanelVolt,solarPanelBattVolt,solarPanelCurr
nodeAddress,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
fc:c2:3d:00:00:00:89:e8,1758,1758,1758,1758,1758,0,1758,1758,1758,1758,1758,1758,1758
fc:c2:3d:00:00:01:10:8e,7394,7394,7394,7394,7394,0,7394,7394,7394,7394,7394,7394,7394
fc:c2:3d:00:00:01:2f:2d,1038,1038,1038,1038,1038,0,1038,1038,1038,1038,1038,1038,1038
fc:c2:3d:00:00:01:33:2a,2917,2917,2917,2917,2917,0,2917,2917,2917,2917,2917,2917,2917


In [34]:
#For all dataset
total_data_89e8 = groupby.loc['fc:c2:3d:00:00:00:89:e8' , 'temperature3'] #Total Datapoints in node 89:e8
total_data_108e = groupby.loc['fc:c2:3d:00:00:01:10:8e' , 'temperature3'] #Total Datapoints in node 10:8e
total_data_2f2d = groupby.loc['fc:c2:3d:00:00:01:2f:2d' , 'temperature3'] #Total Datapoints in node 2f:2d
total_data_332a = groupby.loc['fc:c2:3d:00:00:01:33:2a' , 'temperature3'] #Total Datapoints in node 33:2a

In [35]:
total_data_89e8

1758

In [37]:
# Calculating error %age for each node
err_89e8 = (NaN_Data_89e8 / total_data_89e8)*100
err_108e = (NaN_Data_108e / total_data_108e)*100
err_2f2d = (NaN_Data_2f2d / total_data_2f2d)*100
err_332a = (NaN_Data_332a / total_data_332a)*100

In [39]:
print("Error %age For Whole Data Set in each node")
print('Error in 89e8' , err_89e8)
print('Error in 108e' , err_108e)
print('Error in 2f2d' , err_2f2d)
print('Error in 332a' , err_332a)

For Whole Data Set
Error in 89e8 0.22753128555176336
Error in 108e 1.5147416824452258
Error in 2f2d 13.19845857418112
Error in 332a 1.0284538909838876


# Finding Error %age in each node per day

In [123]:
df.info() # df is our original dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13107 entries, 0 to 13106
Data columns (total 14 columns):
timeStamp             13107 non-null object
nodeAddress           13107 non-null object
packteID              13107 non-null int64
nodeRSSI              13107 non-null int64
nodeVolt              13107 non-null float64
temperature1          13107 non-null float64
temperature2          0 non-null float64
temperature3          13107 non-null float64
pressure              13107 non-null int64
luminosity            13107 non-null int64
rainfall              13107 non-null int64
solarPanelVolt        13107 non-null float64
solarPanelBattVolt    13107 non-null float64
solarPanelCurr        13107 non-null float64
dtypes: float64(7), int64(5), object(2)
memory usage: 1.4+ MB


In [124]:
#Creating a new dataframe 'test' i.e. copy of 'df'
test = df.copy() 

In [125]:
#Converting 'timeStamp' to 'datetime' type
test['timeStamp'] = pd.to_datetime(test['timeStamp'])  

In [126]:
# Creating a new column 'new_date' that holds only the date part of 'timeStamp'
test['new_date'] = [d.date() for d in test['timeStamp']]  


In [127]:
#Converting 'new_date' to 'datetime' type
test['new_date'] = pd.to_datetime(test['new_date'])  

In [128]:
test

Unnamed: 0,timeStamp,nodeAddress,packteID,nodeRSSI,nodeVolt,temperature1,temperature2,temperature3,pressure,luminosity,rainfall,solarPanelVolt,solarPanelBattVolt,solarPanelCurr,new_date
0,2019-05-08 22:12:19,fc:c2:3d:00:00:00:89:e8,932,-67,3.31,26.75,,19.09,896,0,0,0.02,4.08,-29.1,2019-05-08
1,2019-05-08 22:14:38,fc:c2:3d:00:00:01:10:8e,934,-91,3.35,25.37,,20.26,897,0,0,0.06,4.07,-30.4,2019-05-08
2,2019-05-08 22:15:39,fc:c2:3d:00:00:01:10:8e,936,-91,3.35,25.37,,20.23,897,0,0,0.06,4.07,-30.3,2019-05-08
3,2019-05-08 22:16:22,fc:c2:3d:00:00:01:2f:2d,937,-68,3.38,33.00,,15.65,897,0,0,0.03,3.58,-30.6,2019-05-08
4,2019-05-08 22:16:40,fc:c2:3d:00:00:01:10:8e,939,-90,3.35,25.37,,20.21,897,0,0,0.06,4.07,-30.3,2019-05-08
5,2019-05-08 22:17:21,fc:c2:3d:00:00:00:89:e8,940,-68,3.31,26.75,,18.98,896,0,0,0.02,4.07,-30.5,2019-05-08
6,2019-05-08 22:17:41,fc:c2:3d:00:00:01:10:8e,942,-90,3.35,25.37,,20.22,897,0,0,0.06,4.07,-29.6,2019-05-08
7,2019-05-08 22:18:42,fc:c2:3d:00:00:01:10:8e,944,-91,3.35,25.37,,20.19,897,0,0,0.06,4.07,-30.5,2019-05-08
8,2019-05-08 22:19:43,fc:c2:3d:00:00:01:10:8e,946,-90,3.35,25.37,,20.18,897,0,0,0.06,4.07,-30.2,2019-05-08
9,2019-05-08 22:20:44,fc:c2:3d:00:00:01:10:8e,948,-92,3.35,25.37,,20.19,897,0,0,0.06,4.07,-30.1,2019-05-08


In [130]:
#Printing the first element of 'new_date'
kaish = test['new_date'][0]
print(kaish)

2019-05-08 00:00:00


In [131]:
#Getting all the indexes where date is 2019-05-13 in 'date' variable
date = np.where(test.new_date == '2019-05-13 00:00:00')

In [132]:
date

(array([5865, 5866, 5867, ..., 7982, 7983, 7984]),)

In [133]:
#Converting 'date' to numpy array
date = np.array(date)

In [134]:
#Getting the first and last index for which date is 2019-05-13
date_max = date.max()
date_min = date.min()

In [135]:
#Now test will contain only datapoints for which date is 2019-05-13
test = test[date_min:(date_max + 1)]

In [136]:
#Check the start and Stop index
test.index

RangeIndex(start=5865, stop=7985, step=1)

In [138]:
#test

In [139]:
#Convert Wrong Values(Outliers) to NaN

nodes = test['nodeAddress'].unique()
def temp3(df):
    """Mark out-of-range 'temperature3' readings with the string 'NaN', in place.

    A reading is out of range when it is above 100 or below -1. The rows are
    selected through the frame's own index, so the function works for any
    slice of the data rather than one hard-coded index range. The marker is
    deliberately the string 'NaN' because the following cell locates the
    marked rows with `temperature3 == 'NaN'`.

    Parameters
    ----------
    df : DataFrame with a 'temperature3' column whose entries can be
        converted to float (previously written 'NaN' markers are accepted).

    Returns
    -------
    None. `df` is modified in place; the column becomes object dtype when at
    least one reading is marked.
    """
    readings = df['temperature3'].astype(float)
    out_of_range = (readings > 100) | (readings < -1)
    if out_of_range.any():
        # A float column cannot hold a string marker, so switch to object first.
        df['temperature3'] = df['temperature3'].astype(object)
        df.loc[out_of_range, 'temperature3'] = 'NaN'

temp3(test)

In [None]:
#Get indexes where temp3 is NaN
np.where(test.temperature3 == 'NaN')

In [141]:
test.head()

Unnamed: 0,timeStamp,nodeAddress,packteID,nodeRSSI,nodeVolt,temperature1,temperature2,temperature3,pressure,luminosity,rainfall,solarPanelVolt,solarPanelBattVolt,solarPanelCurr,new_date
5865,2019-05-13 00:04:27,fc:c2:3d:00:00:00:89:e8,15386,-31,3.35,32.25,,26.22,897,0,27918,0.6,3.45,-31.0,2019-05-13
5866,2019-05-13 00:09:28,fc:c2:3d:00:00:00:89:e8,15400,-90,3.27,32.12,,25.92,896,0,17625,0.6,3.28,-30.8,2019-05-13
5867,2019-05-13 00:14:29,fc:c2:3d:00:00:00:89:e8,15415,-96,3.27,32.12,,25.88,896,0,17625,0.6,3.27,-30.9,2019-05-13
5868,2019-05-13 00:19:30,fc:c2:3d:00:00:00:89:e8,15431,-96,3.25,32.12,,25.85,896,0,17625,0.6,3.26,-31.0,2019-05-13
5869,2019-05-13 00:24:31,fc:c2:3d:00:00:00:89:e8,15446,-32,3.25,32.12,,25.82,896,0,17625,0.6,3.26,-30.7,2019-05-13


In [142]:
#Grouping the test dataset by 'nodeAddress'
test.groupby('nodeAddress').count()

Unnamed: 0_level_0,timeStamp,packteID,nodeRSSI,nodeVolt,temperature1,temperature2,temperature3,pressure,luminosity,rainfall,solarPanelVolt,solarPanelBattVolt,solarPanelCurr,new_date
nodeAddress,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
fc:c2:3d:00:00:00:89:e8,275,275,275,275,275,0,275,275,275,275,275,275,275,275
fc:c2:3d:00:00:01:10:8e,932,932,932,932,932,0,932,932,932,932,932,932,932,932
fc:c2:3d:00:00:01:2f:2d,130,130,130,130,130,0,130,130,130,130,130,130,130,130
fc:c2:3d:00:00:01:33:2a,783,783,783,783,783,0,783,783,783,783,783,783,783,783


In [144]:
# Empty frame with one column per node; each column will receive the
# 'timeStamp' values at which that node reported an outlier temperature3.
diagnostic_df = pd.DataFrame(
    columns=[
        'temp3_89:e8',
        'temp3_10:8e',
        'temp3_2f:2d',
        'temp3_33:2a',
    ]
)

In [145]:
# Display diagnostic_df (only the column headers exist at this point).
diagnostic_df

Unnamed: 0,temp3_89:e8,temp3_10:8e,temp3_2f:2d,temp3_33:2a


In [146]:
# Preview the first five rows of `test` again before collecting the outlier timestamps.
test.head()

Unnamed: 0,timeStamp,nodeAddress,packteID,nodeRSSI,nodeVolt,temperature1,temperature2,temperature3,pressure,luminosity,rainfall,solarPanelVolt,solarPanelBattVolt,solarPanelCurr,new_date
5865,2019-05-13 00:04:27,fc:c2:3d:00:00:00:89:e8,15386,-31,3.35,32.25,,26.22,897,0,27918,0.6,3.45,-31.0,2019-05-13
5866,2019-05-13 00:09:28,fc:c2:3d:00:00:00:89:e8,15400,-90,3.27,32.12,,25.92,896,0,17625,0.6,3.28,-30.8,2019-05-13
5867,2019-05-13 00:14:29,fc:c2:3d:00:00:00:89:e8,15415,-96,3.27,32.12,,25.88,896,0,17625,0.6,3.27,-30.9,2019-05-13
5868,2019-05-13 00:19:30,fc:c2:3d:00:00:00:89:e8,15431,-96,3.25,32.12,,25.85,896,0,17625,0.6,3.26,-31.0,2019-05-13
5869,2019-05-13 00:24:31,fc:c2:3d:00:00:00:89:e8,15446,-32,3.25,32.12,,25.82,896,0,17625,0.6,3.26,-30.7,2019-05-13


In [147]:
# Record, for each node, the timeStamp of every outlier temperature3 reading
# in that node's column of diagnostic_df (row label = original row label).
#
# Readings and timestamps are taken from `df`, while node membership is taken
# from `test`; both frames are expected to share the same row labels.
# An outlier here is a reading above 50 or below 0.
# NOTE(review): these limits differ from the >100 / < -1 limits used by temp3;
# confirm which pair is intended.

# Node address -> diagnostic_df column that receives its outlier timestamps.
node_columns = {
    'fc:c2:3d:00:00:00:89:e8': 'temp3_89:e8',
    'fc:c2:3d:00:00:01:10:8e': 'temp3_10:8e',
    'fc:c2:3d:00:00:01:2f:2d': 'temp3_2f:2d',
    'fc:c2:3d:00:00:01:33:2a': 'temp3_33:2a',
}

# Nodes are processed one after another (in the order listed above) so the
# rows are appended to diagnostic_df node by node, each in ascending row order.
for address, column in node_columns.items():
    node_rows = test.loc[test['nodeAddress'] == address].index
    readings = df.loc[node_rows, 'temperature3'].astype(float)
    flagged_rows = readings[(readings > 50) | (readings < 0)].index
    for row in flagged_rows:
        diagnostic_df.loc[row, column] = df.loc[row, 'timeStamp']

In [148]:
#Following line will print whole diagnostic_df
# to_string() renders the complete frame as plain text, so no rows are elided.
print(diagnostic_df.to_string())

              temp3_89:e8          temp3_10:8e          temp3_2f:2d          temp3_33:2a
6244  2019-05-13 11:23:44                  NaN                  NaN                  NaN
6256  2019-05-13 11:28:45                  NaN                  NaN                  NaN
6267  2019-05-13 11:33:45                  NaN                  NaN                  NaN
6368  2019-05-13 12:18:59                  NaN                  NaN                  NaN
6291                  NaN  2019-05-13 11:44:21                  NaN                  NaN
6375                  NaN  2019-05-13 12:22:04                  NaN                  NaN
6380                  NaN  2019-05-13 12:24:07                  NaN                  NaN
6382                  NaN  2019-05-13 12:25:07                  NaN                  NaN
6387                  NaN  2019-05-13 12:27:10                  NaN                  NaN
6389                  NaN  2019-05-13 12:28:10                  NaN                  NaN
6392                 

In [149]:
# Compact diagnostic_df: inside every column drop the NaN gaps and renumber the
# remaining timestamps from 0, then put the columns side by side again.
diagnostic_df = pd.concat(
    [
        diagnostic_df[column_name].dropna().reset_index(drop=True)
        for column_name in diagnostic_df
    ],
    axis=1,
)


In [150]:
#Again printing whole diagnostic_df
# to_string() renders the complete frame as plain text, so no rows are elided.
print(diagnostic_df.to_string())

             temp3_89:e8          temp3_10:8e          temp3_2f:2d          temp3_33:2a
0    2019-05-13 11:23:44  2019-05-13 11:44:21  2019-05-13 11:40:07  2019-05-13 11:20:42
1    2019-05-13 11:28:45  2019-05-13 12:22:04  2019-05-13 11:45:09  2019-05-13 11:21:43
2    2019-05-13 11:33:45  2019-05-13 12:24:07  2019-05-13 12:40:25  2019-05-13 11:24:51
3    2019-05-13 12:18:59  2019-05-13 12:25:07  2019-05-13 12:45:30  2019-05-13 11:25:52
4                    NaN  2019-05-13 12:27:10  2019-05-13 14:00:48  2019-05-13 11:26:53
5                    NaN  2019-05-13 12:28:10                  NaN  2019-05-13 11:27:54
6                    NaN  2019-05-13 12:29:12                  NaN  2019-05-13 11:29:59
7                    NaN  2019-05-13 12:30:13                  NaN  2019-05-13 11:30:59
8                    NaN  2019-05-13 12:31:13                  NaN  2019-05-13 11:32:00
9                    NaN  2019-05-13 12:33:16                  NaN  2019-05-13 11:33:01
10                   NaN  2019-0

In [151]:
# Summarise diagnostic_df column by column. The 'count' row of the summary is
# the number of non-null timestamps, i.e. the number of outliers per node.
NaN_Data = diagnostic_df.describe()
NaN_Data

Unnamed: 0,temp3_89:e8,temp3_10:8e,temp3_2f:2d,temp3_33:2a
count,4,112,5,30
unique,4,112,5,30
top,2019-05-13 11:23:44,2019-05-13 12:56:43,2019-05-13 12:45:30,2019-05-13 13:15:50
freq,1,1,1,1


In [152]:
# Number of outliers per node, read from the 'count' row of NaN_Data.
# The row is selected by its label: positional `[0]` on a label-indexed Series
# is deprecated in pandas 2.x and silently depends on the order of the rows.
NaN_Data_89e8 = NaN_Data.loc['count', 'temp3_89:e8'] #Total NaN in node 89:e8
NaN_Data_108e = NaN_Data.loc['count', 'temp3_10:8e'] #Total NaN in node 10:8e
NaN_Data_2f2d = NaN_Data.loc['count', 'temp3_2f:2d'] #Total NaN in node 2f:2d
NaN_Data_332a = NaN_Data.loc['count', 'temp3_33:2a'] #Total NaN in node 33:2a

In [153]:
# Grouping original dataset (test) into 'groupby' to get total datapoints in each node
# count() tallies the non-null cells of every column per nodeAddress; the
# 'temperature3' column of this table serves as the per-node total below.
groupby = test.groupby('nodeAddress').count()
groupby

Unnamed: 0_level_0,timeStamp,packteID,nodeRSSI,nodeVolt,temperature1,temperature2,temperature3,pressure,luminosity,rainfall,solarPanelVolt,solarPanelBattVolt,solarPanelCurr,new_date
nodeAddress,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
fc:c2:3d:00:00:00:89:e8,275,275,275,275,275,0,275,275,275,275,275,275,275,275
fc:c2:3d:00:00:01:10:8e,932,932,932,932,932,0,932,932,932,932,932,932,932,932
fc:c2:3d:00:00:01:2f:2d,130,130,130,130,130,0,130,130,130,130,130,130,130,130
fc:c2:3d:00:00:01:33:2a,783,783,783,783,783,0,783,783,783,783,783,783,783,783


In [154]:
# Total number of temperature3 datapoints per node, taken from the `groupby` table
total_data_89e8 = groupby['temperature3'].loc['fc:c2:3d:00:00:00:89:e8']  # node 89:e8
total_data_108e = groupby['temperature3'].loc['fc:c2:3d:00:00:01:10:8e']  # node 10:8e
total_data_2f2d = groupby['temperature3'].loc['fc:c2:3d:00:00:01:2f:2d']  # node 2f:2d
total_data_332a = groupby['temperature3'].loc['fc:c2:3d:00:00:01:33:2a']  # node 33:2a

In [155]:
# Error percentage per node: outliers divided by total datapoints, scaled to percent
err_89e8 = NaN_Data_89e8 / total_data_89e8 * 100
err_108e = NaN_Data_108e / total_data_108e * 100
err_2f2d = NaN_Data_2f2d / total_data_2f2d * 100
err_332a = NaN_Data_332a / total_data_332a * 100

In [156]:
# Report the error percentage of every node, one line per node.
print("For Whole Data Set on date 2019-05-13")
for label, value in (
    ('Error in 89e8', err_89e8),
    ('Error in 108e', err_108e),
    ('Error in 2f2d', err_2f2d),
    ('Error in 332a', err_332a),
):
    print(label, value)

For Whole Data Set
Error in 89e8 1.4545454545454546
Error in 108e 12.017167381974248
Error in 2f2d 3.8461538461538463
Error in 332a 3.8314176245210727


# The code below this point is for testing purposes only

In [None]:
# Distinct node addresses of the full dataset (data1); rebinds the module-level `nodes`.
nodes = data1['nodeAddress'].unique()
def Temperature3_clean(df):
    """Smooth sudden jumps and flag implausible ``temperature3`` readings, node by node.

    For every node address in ``df``, each reading is compared with the
    previous reading of the same node (in row order):

    * if the two differ by more than 10 and were taken less than 30 minutes
      apart, the later reading is overwritten with the earlier one;
    * otherwise, if the magnitude of the later reading exceeds 100 and the
      gap is more than 30 minutes, it is replaced by the string 'NaN'.

    Corrections are applied while scanning, so a corrected reading becomes the
    reference for the next comparison of that node. Every row is examined,
    including the last row of the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``nodeAddress``, ``temperature3`` and ``timeStamp``
        columns and unique row labels. ``timeStamp`` may hold datetimes or
        strings that ``pandas.to_datetime`` can parse. Modified in place.

    Returns
    -------
    None
    """
    # Parse once up front so the gap can be computed even when the column
    # still holds the raw strings read from the CSV.
    stamps = pd.to_datetime(df['timeStamp'])

    for address in df['nodeAddress'].unique():
        labels = list(df.loc[df['nodeAddress'] == address].index)
        # Walk over consecutive readings of this node.
        for prev_label, cur_label in zip(labels, labels[1:]):
            # Read the previous value from df (not a cached copy) so an
            # earlier correction is taken into account.
            prev_val = float(df.loc[prev_label, 'temperature3'])
            cur_val = float(df.loc[cur_label, 'temperature3'])
            gap = stamps.loc[cur_label] - stamps.loc[prev_label]
            minutes = gap.total_seconds() / 60

            if abs(cur_val - prev_val) > 10 and minutes < 30.0:
                df.loc[cur_label, 'temperature3'] = prev_val
            elif abs(cur_val) > 100 and minutes > 30.0:
                # Switch to object dtype explicitly before storing the string
                # marker instead of relying on implicit upcasting.
                if df['temperature3'].dtype != object:
                    df['temperature3'] = df['temperature3'].astype(object)
                df.loc[cur_label, 'temperature3'] = 'NaN'

In [None]:
# pip install tqdm
# `tqdm.tqdm_notebook` is deprecated; import the notebook progress bar from
# `tqdm.notebook` and keep the old name so existing calls keep working.
from tqdm.notebook import tqdm as tqdm_notebook

# works on any iterable, including cursors. 
# for iterables with len(), no need to specify 'total'.
# NOTE(review): `items` and `total` are placeholders that are not defined in
# the visible part of this notebook; bind them before running this cell.
for rec in tqdm_notebook(items, 
                         total=total, 
                         desc="Processing records"):
    # any code processing the elements in the iterable
    len(rec.keys())

In [4]:
# Implementing Tqdm Process bar
# The snippet is disabled by wrapping it in a triple-quoted string; as the last
# expression of the cell, the string itself is what the cell displays.
'''kaish = range(1, 1001)

from tqdm import tqdm_notebook

for i in tqdm_notebook(kaish , desc = 'Processing records'):
    print(i)'''

"kaish = range(1, 1001)\n\nfrom tqdm import tqdm_notebook\n\nfor i in tqdm_notebook(kaish , desc = 'Processing records'):\n    print(i)"

In [20]:
for n in tqdm_notebook(nodes , desc = 'Processing records'):

4