# First Test of New Box

In [1]:
import pandas as pd
import numpy as np

import ujson
from pprint import pprint
from validictory import validate

## Load raw data

In [2]:
# Capture files to load, in the order their lines should appear in rawData.
rawDataPaths = [
    "data/2016-12-17T19'29'25.354Z - test 1.txt",
    "data/2016-12-17T20'03'28.413Z - test 2.txt",
]

# Collect the undecoded lines of every file into one list (file order, then
# line order).  Each line is decoded with ujson.loads further down.
rawData = []
for path in rawDataPaths:
    with open(path) as f:
        rawData.extend(f)

## Scan for errors

As shown below, there are many malformed messages; typically each one is missing a single character. They appear to occur only in the new boxes, which probably indicates a problem with the Serial-to-Ethernet converter.

In [3]:
# Print the message of every ERROR-level serverlog record.
for line in rawData:
    # Cheap substring pre-filter so that only candidate lines are decoded.
    if 'serverlog' not in line:
        continue
    record = ujson.loads(line)
    # The substring can also occur inside another channel's payload, so
    # confirm the channel before treating 'data' as a log entry.
    if record.get('chan') != 'serverlog':
        continue
    payload = record['data']
    if payload['level'] == 'ERROR':
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(payload['message'])
        print('')

could not parse message: {"kind": "caps", "ms": 13866137, "shunts": 0, "i": 135770, "v": 24.935, "c_in": 86.911, "c_out_fwd": 0.700, "c_out_rev: 0.000, "c_shunt": 0.000, "temp": 18.375, "fan": false, "error": 0}
 | 10.0.0.110 | SyntaxError: Unexpected token c

could not parse message: {"kind": "caps", "ms": 13871142, "shunts": 0, "i": 135819, "v": 27.665, "c_in": 62.511 "c_out_fwd": 1.797, "c_out_rev": 0.000, "c_shunt": 0.000, "temp": 18.375, "fan": false, "error": 0}
 | 10.0.0.110 | SyntaxError: Unexpected string

could not parse message: {"kind": "inv", "ms": 14151262, "inv": true, "soft": true, v": 25.750, "i": 141487}
 | 10.0.0.117 | SyntaxError: Unexpected token v

could not parse message: {"kind": "inv" "ms": 14196271, "inv": true, "soft": true, "v": 25.521, "i": 141937}
 | 10.0.0.117 | SyntaxError: Unexpected string

could not parse message: {"kind": "caps", "ms": 14356144, "shunts": 0, "i": 140570, "v": 27.375, "c_in": 53.454, "c_out_fwd: 48.382, "c_out_rev": 0.000, "c_shun

## Scan for invalid data

The errors above caught messages that could not be parsed. However, there could easily be messages that parsed correctly but still had characters dropped from their keys or values. Here, we try to find those instances.

### Parse and organize the data

In [4]:
# Decode each raw line and bucket the records by channel:
#   parsedData[chan] -> list of decoded records, in input order.
parsedData = {}

for rawLine in rawData:
    record = ujson.loads(rawLine)
    # setdefault creates the channel's list on first sight; decoding one line
    # at a time avoids building a second, throwaway list of every record.
    parsedData.setdefault(record['chan'], []).append(record)

In [5]:
# Pretty-print the last record of each channel to show what the payloads look
# like.  Single-argument print(...) behaves the same on Python 2 and 3.
for chan, entries in parsedData.items():
    print(chan)
    pprint(entries[-1])
    print('')

serverlog
{u'chan': u'serverlog',
 u'data': {u'level': u'ERROR',
           u'message': u'could not parse message: {"kind": "caps", "ms": 17221157, "shunts": 0, "i": 168630, "v": 24.868, "c_in": 39.666, "c_out_fwd": 25.857, "c_out_rev": 0.000, "c_shunt": 0.000 "temp": 32.187, "fan": false, "error": 0}\r\n | 10.0.0.110 | SyntaxError: Unexpected string'},
 u'time': u'2016-12-17T20:25:33.351Z'}

logger.state.clap
{u'chan': u'logger.state.clap',
 u'data': True,
 u'time': u'2016-12-17T20:23:38.369Z'}

stats.network
{u'chan': u'stats.network',
 u'data': {u'4-ac': [{u'c_ckts': [0.028, 0.028, 0.028, 0.028],
                      u'device_id': u'10459',
                      u'labels': [u'T4 Sub L',
                                  u'T4 Sub R',
                                  u'T4 J Lights L',
                                  u'T4 J Lights R'],
                      u'v': 2.624},
                     {u'c_ckts': [0.028, 0.028, 0.028, 0.028],
                      u'device_id': u'11916',
   

### Define schemas for basic validation

In [6]:
# Schema for the envelope around every record: an object declaring the keys
# 'chan', 'data' and 'time', with no additional keys allowed.  The 'data'
# payload itself is left unconstrained here.
wrapperSchema = dict(
    type='object',
    properties=dict(
        chan=dict(type='string'),
        data=dict(),
        time=dict(type='string'),
    ),
    additionalProperties=False,
    minProperties=3,
)
# Try the schema on a single record before running it over everything.
validate(parsedData['serverlog'][0], wrapperSchema)

In [7]:
# Check the envelope of every record on every channel.  validate() raises on
# the first record that does not match, which stops the cell.
for chan, entries in parsedData.items():
    # Formatting into one string keeps the output identical to the old
    # two-argument print statement while remaining valid on Python 3.
    print('validating wrappers for %s' % (chan,))
    for entry in entries:
        validate(entry, wrapperSchema)

validating wrappers for serverlog
validating wrappers for logger.state.clap
validating wrappers for stats.network
validating wrappers for network.data
validating wrappers for stats.labels
validating wrappers for stats.server
validating wrappers for logger.state.recording_state


In [8]:
# Schema for the 'data' payload of a serverlog record: string 'level' and
# string 'message', nothing else.
serverLogDataSchema = dict(
    type='object',
    properties=dict(
        level=dict(type='string'),
        message=dict(type='string'),
    ),
    additionalProperties=False,
    minProperties=2,
)
# Try the schema on the first serverlog payload.
validate(parsedData['serverlog'][0]['data'], serverLogDataSchema)

In [9]:
# Run the serverlog payload schema over the 'data' of every serverlog record.
for logEntry in parsedData['serverlog']:
    validate(logEntry['data'], serverLogDataSchema)

In [10]:
# Schema for the 'data' payload of a stats.server record.
serverStatsDataSchema = dict(
    type='object',
    properties=dict(
        appUptime=dict(type='integer'),
        arch=dict(type='string'),
        freedisk=dict(type='integer'),
        freemem=dict(type='integer'),
        loadavg=dict(type='array'),
        logs_overloaded=dict(type='boolean'),
        time=dict(type='integer'),
        totaldisk=dict(type='integer'),
        totalmem=dict(type='integer'),
        uptime=dict(type='integer'),
    ),
    additionalProperties=False,
    minProperties=10,
)
# Try the schema on the first stats.server payload.
validate(parsedData['stats.server'][0]['data'], serverStatsDataSchema)

In [11]:
# Run the stats.server payload schema over the 'data' of every such record.
for statsEntry in parsedData['stats.server']:
    validate(statsEntry['data'], serverStatsDataSchema)

In [12]:
# Schema for the 'data' payload of a network.data record.  At this stage the
# inner 'msg' only has to be an object carrying a string 'kind'; the
# kind-specific fields are checked by the per-kind schemas in msgSchemas.
basicNetworkDataSchema = dict(
    type='object',
    properties=dict(
        address=dict(type='string'),
        msg=dict(
            type='object',
            properties=dict(kind=dict(type='string')),
            required=['kind'],
        ),
        size=dict(type='integer'),
    ),
    additionalProperties=False,
    minProperties=3,
)
# Try the schema on the first network.data payload.
validate(parsedData['network.data'][0]['data'], basicNetworkDataSchema)

In [13]:
# Validate every network.data payload, reporting failures instead of stopping
# at the first one.
for entry in parsedData['network.data']:
    try:
        validate(entry['data'], basicNetworkDataSchema)
    except Exception as err:  # 'as' form is valid on Python 2.6+ and 3
        # Formatting into one string reproduces the old two-argument print
        # output; the 1-tuple stops the dict being used as a format mapping.
        print("ERROR on %s" % (entry,))
        print(err)

ERROR on {u'data': {u'msg': {u'uid': u'0000', u'i': 169132, u'knd': u'inv', u'ms': 16916278, u'v': 26.747, u'soft': True, u'inv': True}, u'size': 85, u'address': u'10.0.0.117'}, u'chan': u'network.data', u'time': u'2016-12-17T20:20:28.320Z'}
Required field 'kind' is missing


### Validate various kinds of network data

In [14]:
# Message kinds that have a schema defined in msgSchemas.
knownKinds = ['caps', 'inv', 'acnet', 'bike', '4-ac']

# Group network.data records by the 'kind' of their inner message:
#   parsedNetworkData[kind] -> list of full records, in input order.
parsedNetworkData = {}
for entry in parsedData['network.data']:
    kind = entry['data']['msg'].get('kind')
    if not kind:
        # Missing/empty 'kind' (e.g. a corrupted key name); such records are
        # reported by the basicNetworkDataSchema pass and are skipped here.
        continue
    if kind not in knownKinds:
        # A corrupted 'kind' value; report the record and leave it out.
        print("ERROR - unknown kind %s" % (kind,))
        print(entry)
        print('')
        continue
    parsedNetworkData.setdefault(kind, []).append(entry)

ERROR - unknown kind iv
{u'data': {u'msg': {u'kind': u'iv', u'uid': u'0000', u'i': 163633, u'inv': True, u'ms': 16366276, u'v': 28.114, u'soft': True}, u'size': 85, u'address': u'10.0.0.117'}, u'chan': u'network.data', u'time': u'2016-12-17T20:11:18.303Z'}

ERROR - unknown kind cap
{u'data': {u'msg': {u'kind': u'cap', u'error': 0, u'uid': u'0000', u'temp': 27.625, u'i': 162704, u'c_in': 53.795, u'fan': False, u'shunts': 1, u'ms': 16616176, u'v': 28.185, u'c_shunt': 9.107, u'c_out_fwd': 47.38, u'c_out_rev': 0}, u'size': 188, u'address': u'10.0.0.110'}, u'chan': u'network.data', u'time': u'2016-12-17T20:15:28.366Z'}



In [15]:
# Pretty-print the last record of each message kind to see which fields each
# kind carries.  Single-argument print(...) behaves the same on Python 2 and 3.
for kind, entries in parsedNetworkData.items():
    print(kind)
    pprint(entries[-1])
    print('')

4-ac
{u'chan': u'network.data',
 u'data': {u'address': u'10.0.0.129',
           u'msg': {u'c_1': 0.028,
                    u'c_2': 0.028,
                    u'c_3': 0.028,
                    u'c_4': 0.028,
                    u'i': 172213,
                    u'kind': u'4-ac',
                    u'ms': 17285817,
                    u'uid': u'10459',
                    u'v': 2.624,
                    u'vers': u'v0.1.1'},
           u'size': 148},
 u'time': u'2016-12-17T20:26:42.198Z'}

inv
{u'chan': u'network.data',
 u'data': {u'address': u'10.0.0.117',
           u'msg': {u'i': 172870,
                    u'inv': False,
                    u'kind': u'inv',
                    u'ms': 17290147,
                    u'soft': False,
                    u'uid': u'0000',
                    u'v': 23.785},
           u'size': 88},
 u'time': u'2016-12-17T20:26:42.191Z'}

bike
{u'chan': u'network.data',
 u'data': {u'address': u'10.0.0.131',
           u'msg': {u'c_bits': 0,
              

In [16]:
# Registry of per-kind message schemas: msgSchemas[kind] -> schema for the
# 'msg' object of network.data records of that kind.
msgSchemas = {}

In [17]:
# Schema for the 'msg' object of 'inv' messages.
msgSchemas['inv'] = dict(
    type='object',
    properties=dict(
        i=dict(type='integer'),
        inv=dict(type='boolean'),
        kind=dict(type='string'),
        ms=dict(type='integer'),
        soft=dict(type='boolean'),
        uid=dict(type='string'),
        v=dict(type='number'),
    ),
    additionalProperties=False,
    minProperties=7,
)
# Try the schema on the first 'inv' message.
validate(parsedNetworkData['inv'][0]['data']['msg'], msgSchemas['inv'])

In [18]:
# Schema for the 'msg' object of 'caps' messages.
msgSchemas['caps'] = dict(
    type='object',
    properties=dict(
        c_in=dict(type='number'),
        c_out_fwd=dict(type='number'),
        c_out_rev=dict(type='number'),
        c_shunt=dict(type='number'),
        error=dict(type='integer'),
        fan=dict(type='boolean'),
        i=dict(type='integer'),
        kind=dict(type='string'),
        ms=dict(type='integer'),
        shunts=dict(type='integer'),
        temp=dict(type='number'),
        uid=dict(type='string'),
        v=dict(type='number'),
    ),
    additionalProperties=False,
    minProperties=13,
)
# Try the schema on the first 'caps' message.
validate(parsedNetworkData['caps'][0]['data']['msg'], msgSchemas['caps'])

In [19]:
# Schema for the 'msg' object of 'acnet' messages.
msgSchemas['acnet'] = dict(
    type='object',
    properties=dict(
        btn=dict(type='integer'),
        c_t1=dict(type='number'),
        c_t2=dict(type='number'),
        c_t3=dict(type='number'),
        c_t4=dict(type='number'),
        error=dict(type='integer'),
        fan=dict(type='boolean'),
        i=dict(type='integer'),
        kind=dict(type='string'),
        ms=dict(type='integer'),
        reset=dict(type='integer'),
        server=dict(type='integer'),
        temp=dict(type='number'),
        tiers=dict(type='integer'),
        uid=dict(type='string'),
        v_ac=dict(type='number'),
        v_dc=dict(type='number'),
        v_t1=dict(type='boolean'),
        v_t2=dict(type='boolean'),
        v_t3=dict(type='boolean'),
        v_t4=dict(type='boolean'),
    ),
    additionalProperties=False,
    minProperties=21,
)
# Try the schema on the first 'acnet' message.
validate(parsedNetworkData['acnet'][0]['data']['msg'], msgSchemas['acnet'])

In [20]:
# Schema for the 'msg' object of 'bike' messages.
msgSchemas['bike'] = dict(
    type='object',
    properties=dict(
        c_bits=dict(type='number'),
        c_out=dict(type='number'),
        i=dict(type='integer'),
        kind=dict(type='string'),
        ms=dict(type='integer'),
        uid=dict(type='string'),
        v=dict(type='number'),
        v_bits=dict(type='number'),
        vers=dict(type='string'),
    ),
    additionalProperties=False,
    minProperties=9,
)
# Try the schema on the first 'bike' message.
validate(parsedNetworkData['bike'][0]['data']['msg'], msgSchemas['bike'])

In [21]:
# Schema for the 'msg' object of '4-ac' messages.
msgSchemas['4-ac'] = dict(
    type='object',
    properties=dict(
        c_1=dict(type='number'),
        c_2=dict(type='number'),
        c_3=dict(type='number'),
        c_4=dict(type='number'),
        i=dict(type='integer'),
        kind=dict(type='string'),
        ms=dict(type='integer'),
        uid=dict(type='string'),
        v=dict(type='number'),
        vers=dict(type='string'),
    ),
    additionalProperties=False,
    minProperties=10,
)
# Try the schema on the first '4-ac' message.
validate(parsedNetworkData['4-ac'][0]['data']['msg'], msgSchemas['4-ac'])

In [22]:
# Keep only the network messages that satisfy the schema for their kind:
#   validNetworkData[kind] -> list of records whose 'msg' validated.
# Failing records are printed together with the validation error and dropped.
validNetworkData = {}
for kind, entries in parsedNetworkData.items():
    accepted = validNetworkData[kind] = []
    for entry in entries:
        try:
            validate(entry['data']['msg'], msgSchemas[kind])
        except Exception as err:  # 'as' form is valid on Python 2.6+ and 3
            # Two spaces after "on" reproduce the old output exactly; the
            # 1-tuple stops the dict being used as a format mapping.
            print("ERROR on  %s" % (entry,))
            print(err)
            print('')
        else:
            accepted.append(entry)

ERROR on  {u'data': {u'msg': {u'kind': u'caps', u'error': 0, u'uid': u'0000', u'temp': 18.312, u'i': 135721, u'c_in': 0, u'c_shnt': 0, u'fan': False, u'shunts': 0, u'ms': 13861135, u'v': 22.594, u'c_out_fwd': 0.554, u'c_out_rev': 0}, u'size': 187, u'address': u'10.0.0.110'}, u'chan': u'network.data', u'time': u'2016-12-17T19:29:33.187Z'}
Value {u'kind': u'caps', u'error': 0, u'uid': u'0000', u'temp': 18.312, u'i': 135721, u'c_in': 0, u'c_shnt': 0, u'fan': False, u'shunts': 0, u'ms': 13861135, u'v': 22.594, u'c_out_fwd': 0.554, u'c_out_rev': 0} for field '<obj>' contains additional property 'c_shnt' not defined by 'properties' or 'patternProperties' and additionalProperties  is False

ERROR on  {u'data': {u'msg': {u'kind': u'caps', u'error': 0, u'uid': u'0000', u'temp': 19.187, u'i': 138513, u'c_in': 56.778, u'cout_fwd': 51.941, u'fan': False, u'shunts': 0, u'ms': 14146145, u'v': 26.456, u'c_shunt': 0, u'c_out_rev': 0}, u'size': 189, u'address': u'10.0.0.110'}, u'chan': u'network.data',

At this point, all of the remaining data is well-formed. Note, however, that it may still be inaccurate: a character dropped from inside a value (for example a digit or the decimal point of a number) still passes validation.

## Create big, flat data table

Now we try to normalize the data into one big table

In [23]:
def flatten(input):
    """Collapse a nested dict into a single-level dict of its leaf values.

    Every value that is a dict (or dict subclass) is descended into; every
    other value, including lists, is copied to the result under its own key.

    Key names of the enclosing dicts are discarded rather than used as
    prefixes, so:
      * a key that holds a dict never appears in the result (an empty nested
        dict therefore contributes nothing);
      * if the same leaf key occurs at more than one place, the occurrence
        visited last in iteration order wins.

    The input is not modified.

    :param input: dict to flatten (name shadows the builtin; kept so that
        existing keyword callers keep working).
    :returns: new flat dict of leaf key -> value.
    """
    output = {}

    def _flat(obj):
        # Depth-first walk that writes leaves straight into `output`.
        for k, v in obj.items():
            # isinstance also covers dict subclasses such as OrderedDict,
            # which the previous `type(v) == dict` test treated as leaves.
            if isinstance(v, dict):
                _flat(v)
            else:
                output[k] = v

    _flat(input)
    return output

In [24]:
parsedData.keys()

[u'serverlog',
 u'logger.state.clap',
 u'stats.network',
 u'network.data',
 u'stats.labels',
 u'stats.server',
 u'logger.state.recording_state']

In [25]:
validNetworkData.keys()

[u'4-ac', u'inv', u'bike', u'acnet', u'caps']

In [26]:
# One flat dict per record: the logger.state.clap records first, followed by
# every network message that passed its per-kind schema.
flattenedDataArray = [flatten(entry) for entry in parsedData['logger.state.clap']]
for entries in validNetworkData.values():
    flattenedDataArray += [flatten(entry) for entry in entries]

In [27]:
len(flattenedDataArray)

561537

In [28]:
# Build one table from the flat dicts: one row per record, one column per key.
# Keys that a given record lacks end up as missing values in that row.
df = pd.DataFrame(flattenedDataArray)

In [29]:
# Replace the 'time' column with its pd.to_datetime() conversion.
df['time'] = pd.to_datetime(df['time'])

In [31]:
df.columns

Index([  u'address',       u'btn',       u'c_1',       u'c_2',       u'c_3',
             u'c_4',    u'c_bits',      u'c_in',     u'c_out', u'c_out_fwd',
       u'c_out_rev',   u'c_shunt',      u'c_t1',      u'c_t2',      u'c_t3',
            u'c_t4',      u'chan',      u'data',     u'error',       u'fan',
               u'i',       u'inv',      u'kind',        u'ms',     u'reset',
          u'server',    u'shunts',      u'size',      u'soft',      u'temp',
           u'tiers',      u'time',       u'uid',         u'v',      u'v_ac',
          u'v_bits',      u'v_dc',      u'v_t1',      u'v_t2',      u'v_t3',
            u'v_t4',      u'vers'],
      dtype='object')

In [32]:
# Remove the 'data', 'vers', 'i' and 'ms' columns from the table.
df = df.drop(['data', 'vers', 'i', 'ms'], axis=1)

In [41]:
# Show the rows whose c_in reading is above 1000.
df[df.c_in > 1000]

Unnamed: 0,address,btn,c_1,c_2,c_3,c_4,c_bits,c_in,c_out,c_out_fwd,...,time,uid,v,v_ac,v_bits,v_dc,v_t1,v_t2,v_t3,v_t4
545404,10.0.0.110,,,,,,,57545,,56.352,...,2016-12-17 19:43:08.233000,0,25.133,,,,,,,


In [44]:
# Drop the rows with c_in above 1000 by condition instead of by the hard-coded
# row label 545404: the label depends on the order in which rows were
# assembled, and dropping a label that is already gone raises, so the old
# cell could not be re-run.  Rows where c_in is missing are kept.
# NOTE(review): the one row matched (c_in = 57545) presumably lost its decimal
# point in transit -- confirm before trusting the threshold on other captures.
df = df.drop(df[df.c_in > 1000].index)

In [54]:
# Show the 'caps' rows whose temp reading is below 15.
df[(df.kind == 'caps') & (df.temp < 15)]

Unnamed: 0,address,btn,c_1,c_2,c_3,c_4,c_bits,c_in,c_out,c_out_fwd,...,time,uid,v,v_ac,v_bits,v_dc,v_t1,v_t2,v_t3,v_t4
551880,10.0.0.110,,,,,,,53.603,,47.146,...,2016-12-17 20:09:48.299000,0,25.667,,,,,,,


In [56]:
# Drop the 'caps' rows with temp below 15 by condition instead of by the
# hard-coded row label 551880: the label depends on the order in which rows
# were assembled, and dropping a label that is already gone raises, so the
# old cell could not be re-run.
df = df.drop(df[(df.kind == 'caps') & (df.temp < 15)].index)

In [57]:
len(df)

561535

In [59]:
# Use the 'time' column as the index; verify_integrity=True makes pandas raise
# if two rows share the same 'time' value.
df = df.set_index('time', verify_integrity=True)

In [60]:
# Sort rows by the index (the 'time' values once that column has been made the
# index), then write the table to 'all_data.pkl' in the working directory.
df = df.sort_index()
df.to_pickle('all_data.pkl')