# A MongoDB performance test for sensor data

In [1]:
from pymongo import MongoClient

def get_database():
    """Return a handle to the 'sensor-data-performance-test' database.

    The connection string is taken from the ``MONGODB_URI`` environment
    variable. When that variable is unset, the local development default
    below is used, so existing setups keep working unchanged.

    Returns:
        The pymongo ``Database`` object named 'sensor-data-performance-test'.
    """
    import os  # local import: this cell runs before the notebook's import cell

    # The fallback is a throwaway local credential only. Set MONGODB_URI for
    # anything else so real secrets never have to be committed with the notebook.
    connection_string = os.environ.get("MONGODB_URI", "mongodb://root:root@127.0.0.1")
    client = MongoClient(connection_string)
    return client['sensor-data-performance-test']

# Open the database handle once; the cells below reuse `db`.
db = get_database()
# Bare expression so the notebook renders the Database repr.
db

Database(MongoClient(host=['127.0.0.1:27017'], document_class=dict, tz_aware=False, connect=True), 'sensor-data-performance-test')

In [2]:
# Show which collections exist before the import cell runs.
db.list_collection_names()

[]

# Read the CSV files and insert them into MongoDB

In [3]:
import os
import pandas as pd
import re

In [4]:
def parse_header(header):
    """Extract the variable name from the first ``{...}`` group of a column header.

    Args:
        header: Raw column label that is expected to contain a ``{name}`` group.

    Returns:
        The text of the first ``{...}`` match with every brace character removed.

    Raises:
        ParsingHeaderErrorException: If ``header`` contains no ``{...}`` group.
    """
    first_group = re.search(r'\{.*?}', header)
    if first_group is None:
        raise ParsingHeaderErrorException("No header variable within {} found")

    # Strip all braces, not just the outer pair, from the matched text.
    braced = first_group.group(0)
    return braced.replace("{", "").replace("}", "")


class ParsingHeaderErrorException(Exception):
    """Raised by parse_header when a column header has no ``{...}`` variable."""


In [5]:
# Import every CSV in ./sensor-data into one time-series collection per
# (project, sensor) pair, then report how many files and rows were processed.
file_count = 0
row_count = 0
file_headers_error_count = 0
file_empty_count = 0
start_time = pd.Timestamp.now()

# Ask the server for the collection names once instead of once per file; the
# set is kept in sync below whenever this loop creates a collection.
existing_collections = set(db.list_collection_names())

for file in sorted(os.listdir("sensor-data")):
    try:
        # File names are expected to look like <project>_<sensor>_<suffix>.csv.
        # A non-matching name raises IndexError here and is reported by the
        # generic handler at the bottom.
        project, sensor, _ = re.findall(r'(.*)_(.*)_(.*)\.csv', file)[0]
        filename = os.fsdecode("sensor-data/" + file)
        df = pd.read_csv(filename, sep=";", encoding="ISO-8859-1")

        # Give the first column a "{timestamp}" label so parse_header turns it
        # into "timestamp". Assign a fresh label list instead of writing into
        # df.columns.values: an Index is immutable, and mutating its backing
        # array is unsupported and may not take effect on every pandas version.
        df.columns = ["{timestamp}", *df.columns[1:]]
        # parse_header raises ParsingHeaderErrorException for any label
        # without a {...} group; such files are counted and skipped below.
        df = df.rename(columns=parse_header)

        df['timestamp'] = pd.to_datetime(df['timestamp'], dayfirst=True)
        collection_name = f"sensor_{project}_{sensor}"
        if collection_name not in existing_collections:
            # NOTE(review): "metadata" is declared as metaField, but this loop
            # never adds such a field; unless the CSV supplies one it stays
            # unset. Confirm this is intended.
            db.create_collection(
                collection_name,
                timeseries={
                    "timeField": "timestamp",
                    "metaField": "metadata",
                    "granularity": "minutes"
                }
            )
            existing_collections.add(collection_name)
        db[collection_name].insert_many(df.to_dict('records'))
        file_count += 1
        row_count += len(df.index)
    except ParsingHeaderErrorException:
        file_headers_error_count += 1
    except pd.errors.EmptyDataError:
        file_empty_count += 1
    except pd.errors.ParserError:
        print("ParserError")
    except pd.errors.IndexingError:
        print("IndexingError")
    except UnicodeDecodeError:
        print("UnicodeDecodeError")
    except Exception as e:
        # Best effort: report anything unexpected and move on to the next file.
        print(e)

end_time = pd.Timestamp.now()
print("Finished")
print(f"{file_count} files read")
print(f"{row_count} rows inserted")
print(f"Files with empty data: {file_empty_count}")
print(f"Files with header errors: {file_headers_error_count}")
print(f"Read duration: {int(end_time.timestamp() - start_time.timestamp())} seconds")

Finished
18121 files read
223853 rows inserted
Files with empty data: 1
Files with header errors: 13
Read duration: 50 seconds


In [6]:
# Peek at the first ten collection names after the import. In the recorded
# output the sensor collections are interleaved with `system.buckets.<name>`
# entries and `system.views`.
db.list_collection_names()[0:10]

['sensor_DEU2_Becken3',
 'system.buckets.sensor_DEU2_Becken3',
 'sensor_DEU1_125',
 'system.buckets.sensor_DEU1_125',
 'sensor_DEU1_I-3',
 'system.buckets.sensor_DEU1_I-3',
 'system.views',
 'sensor_DEU1_127',
 'system.buckets.sensor_DEU1_127',
 'sensor_DEU1_126']

In [7]:
# Handle to one imported collection; the name follows the
# sensor_{project}_{sensor} pattern used by the import cell.
sensor_collection_DEU1_I2 = db['sensor_DEU1_I-2']

In [8]:
# Show a single document as a one-row frame; index=[0] supplies the row label
# pandas needs when building a frame from a dict of scalar values.
pd.DataFrame(sensor_collection_DEU1_I2.find_one(), index=[0])

Unnamed: 0,timestamp,ph,ldo,h_level,t_intern,h,_id,ec_25,t,ec,v_batt
0,2019-06-06 14:00:00,7.11,3.555,4.06,33.89,4.06,64eee76fa0f3b3c23d13b416,0.529239,12.44,0.383,5.29


In [9]:
# Daily average / minimum / maximum of field `t`, computed server-side.
daily_t_pipeline = [
    # Stage 1: keep documents with 2020-01-01 <= timestamp < 2021-01-02.
    # NOTE(review): the exclusive upper bound is 2021-01-02, so documents from
    # 2021-01-01 match as well. Confirm whether 2021-01-01 was intended.
    {"$match": {"timestamp": {
        "$gte": pd.to_datetime("2020-01-01"),
        "$lt": pd.to_datetime("2021-01-02"),
    }}},
    # Stage 2: one group per calendar day, keyed by a {year, month, day} document.
    {"$group": {
        "_id": {
            "year": {"$year": "$timestamp"},
            "month": {"$month": "$timestamp"},
            "day": {"$dayOfMonth": "$timestamp"},
        },
        "avg": {"$avg": "$t"},
        "min": {"$min": "$t"},
        "max": {"$max": "$t"},
    }},
    # Stage 3: ascending by the group key (year, month, day in that field order).
    {"$sort": {"_id": 1}},
]

results = list(sensor_collection_DEU1_I2.aggregate(daily_t_pipeline))
res = pd.DataFrame(results)
res

Unnamed: 0,_id,avg,min,max
0,"{'year': 2020, 'month': 1, 'day': 1}",13.107500,13.09,13.12
1,"{'year': 2020, 'month': 1, 'day': 2}",13.120000,13.12,13.12
2,"{'year': 2020, 'month': 1, 'day': 3}",13.117500,13.08,13.12
3,"{'year': 2020, 'month': 1, 'day': 4}",13.125833,13.12,13.14
4,"{'year': 2020, 'month': 1, 'day': 5}",13.132500,13.12,13.14
...,...,...,...,...
362,"{'year': 2020, 'month': 12, 'day': 28}",13.168750,13.15,13.17
363,"{'year': 2020, 'month': 12, 'day': 29}",13.168750,13.15,13.17
364,"{'year': 2020, 'month': 12, 'day': 30}",13.167500,13.15,13.17
365,"{'year': 2020, 'month': 12, 'day': 31}",13.170000,13.17,13.17


In [10]:
# Collect the distinct top-level field names used by the documents of the collection.
cursor = sensor_collection_DEU1_I2.aggregate([
    # Turn every document into an array of {k, v} pairs ...
    {"$project": {
        "data": {"$objectToArray": "$$ROOT"}
    }},
    # ... keep only the key names ...
    {"$project": {"data": "$data.k"}},
    # ... emit one document per key ...
    {"$unwind": "$data"},
    # ... and fold them into one set. The constant `_id` puts every key in a
    # single group; the value itself is never read.
    {"$group": {
        "_id": 'null',
        "keys": {"$addToSet": "$data"}
    }}
])

# An empty collection produces no group document at all, so fall back to an
# empty key list instead of failing with IndexError on `[0]`.
collection_keys = next(cursor, {'keys': []})['keys']
hide_keys = ['_id', 'metadata']
# Sorted so the result is reproducible between kernel runs; plain set
# iteration order for strings is not.
keys = sorted(set(collection_keys) - set(hide_keys))

print(keys)

['h', 'ec', 't_intern', 'h_level', 'ldo', 'ec_25', 'timestamp', 't', 'ph', 'v_batt']


In [11]:
# Ask the server for database-level statistics (object count, data/storage/index sizes).
db.command("dbstats")

{'db': 'sensor-data-performance-test',
 'collections': 25,
 'views': 24,
 'objects': 9797,
 'avgObjSize': 913.9963254057365,
 'dataSize': 8954422.0,
 'storageSize': 1277952.0,
 'indexes': 1,
 'indexSize': 20480.0,
 'totalSize': 1298432.0,
 'scaleFactor': 1.0,
 'fsUsedSize': 431450853376.0,
 'fsTotalSize': 494384795648.0,
 'ok': 1.0}

In [12]:
# Count every document in the collection; the empty filter matches all of them.
sensor_collection_DEU1_I2.count_documents({})

44471

In [13]:
# Load the whole collection into a DataFrame. list(...) drains the cursor, so
# every document is held in memory at once.
pd.DataFrame(list(sensor_collection_DEU1_I2.find()))

Unnamed: 0,timestamp,ph,ldo,h_level,t_intern,h,_id,ec_25,t,ec,v_batt
0,2019-06-06 14:00:00,7.11,3.555,4.06,33.89,4.06,64eee76fa0f3b3c23d13b416,0.529239,12.44,0.383,5.29
1,2019-06-06 15:00:00,7.11,3.568,4.06,30.71,4.06,64eee76fa0f3b3c23d13b417,0.528918,12.46,0.383,5.28
2,2019-06-06 16:00:00,7.11,3.569,4.06,24.21,4.06,64eee76fa0f3b3c23d13b418,0.528918,12.46,0.383,5.27
3,2019-06-06 17:00:00,7.11,3.563,4.06,22.46,4.06,64eee76fa0f3b3c23d13b419,0.528918,12.46,0.383,5.27
4,2019-06-06 18:00:00,7.11,3.563,4.06,22.3,4.06,64eee76fa0f3b3c23d13b41a,0.529239,12.44,0.383,5.26
...,...,...,...,...,...,...,...,...,...,...,...
44466,2023-04-06 08:02:52,,3.08,5.31,0.87,5.31,64eee77aa0f3b3c23d1461c8,1827004.144,12.67,1331411.0,4.99
44467,2023-04-06 09:00:00,8.28,3.098,5.31,7.86,5.31,64eee77aa0f3b3c23d1461c9,1827004.144,12.67,1331411.0,5.04
44468,2023-04-06 10:00:00,8.27,3.115,5.31,13.41,5.31,64eee77aa0f3b3c23d1461ca,1827004.144,12.67,1331411.0,5.07
44469,2023-04-06 11:00:00,8.29,3.131,5.31,18.33,5.31,64eee77aa0f3b3c23d1461cb,1827004.144,12.67,1331411.0,5.09


In [14]:
# Raw `t` readings for the selected window, with the timestamp rendered both
# as a number and as an ISO-style string.
t_series_pipeline = [
    # Keep documents with 2020-01-01 <= timestamp < 2021-01-02.
    {"$match": {"timestamp": {
        "$gte": pd.to_datetime("2020-01-01"),
        "$lt": pd.to_datetime("2021-01-02"),
    }}},
    {"$project": {
        # $toLong on a date yields milliseconds since the epoch; dividing by
        # 1000 turns that into seconds (returned as a float).
        "timestamp": {"$divide": [{"$toLong": "$timestamp"}, 1000]},
        # The same instant as a formatted string.
        "datetime": {"$dateToString": {
            "format": "%Y-%m-%dT%H:%M:%S.000Z",
            "date": "$timestamp",
        }},
        "t": 1,
        "_id": 0,
    }},
]

cursor = sensor_collection_DEU1_I2.aggregate(t_series_pipeline)
list(cursor)

[{'t': 13.1,
  'timestamp': 1577836800.0,
  'datetime': '2020-01-01T00:00:00.000Z'},
 {'t': 13.1,
  'timestamp': 1577840400.0,
  'datetime': '2020-01-01T01:00:00.000Z'},
 {'t': 13.1,
  'timestamp': 1577844000.0,
  'datetime': '2020-01-01T02:00:00.000Z'},
 {'t': 13.09,
  'timestamp': 1577847600.0,
  'datetime': '2020-01-01T03:00:00.000Z'},
 {'t': 13.1,
  'timestamp': 1577851200.0,
  'datetime': '2020-01-01T04:00:00.000Z'},
 {'t': 13.1,
  'timestamp': 1577854800.0,
  'datetime': '2020-01-01T05:00:00.000Z'},
 {'t': 13.1,
  'timestamp': 1577858400.0,
  'datetime': '2020-01-01T06:00:00.000Z'},
 {'t': 13.1,
  'timestamp': 1577862000.0,
  'datetime': '2020-01-01T07:00:00.000Z'},
 {'t': 13.1,
  'timestamp': 1577865600.0,
  'datetime': '2020-01-01T08:00:00.000Z'},
 {'t': 13.12,
  'timestamp': 1577869200.0,
  'datetime': '2020-01-01T09:00:00.000Z'},
 {'t': 13.1,
  'timestamp': 1577872800.0,
  'datetime': '2020-01-01T10:00:00.000Z'},
 {'t': 13.12,
  'timestamp': 1577876400.0,
  'datetime': '2020-