# Import libraries

In [1]:
!pip install pymongo
from pymongo import MongoClient
from datetime import datetime
import json

Collecting pymongo
  Downloading pymongo-4.4.0-cp39-cp39-macosx_10_9_universal2.whl (512 kB)
[K     |████████████████████████████████| 512 kB 2.6 MB/s eta 0:00:01
[?25hCollecting dnspython<3.0.0,>=1.16.0
  Downloading dnspython-2.3.0-py3-none-any.whl (283 kB)
[K     |████████████████████████████████| 283 kB 10.5 MB/s eta 0:00:01
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.3.0 pymongo-4.4.0


# Create client connection

In [38]:
client = MongoClient('mongodb://127.0.0.1:27017/')
db = client.get_database("test")
collection = db.get_collection("smart-meter-benchmark")

# Simple Retrieval

## First Query Workload

In [100]:
# Convert to datetime
start = datetime(2012, 6, 1)
end = datetime(2013, 6, 1)

# Specify query
query = {
    "LCLid": "MAC000131",
    "day": {'$lt': end, '$gte': start}
}

# Start timer
start_time = time.time()

# Retrieve all documents
result_cursor = collection.find(query, {"energy_min": 1, "energy_max": 1})

# End timer
end_time = time.time()

# Calculate execution time
execution_time = (end_time - start_time)*1000

# Print values
count = 0
for document in result_cursor:
    energy_min = document.get("energy_min")
    energy_max = document.get("energy_max")
    print(f"energy_min: {energy_min}, energy_max: {energy_max}")
    count += 1
    if count == 5:
        break
    
# Print execution time
print(f"Query execution time: {execution_time} milliseconds")

energy_min: 0.064, energy_max: 0.312
energy_min: 0.065, energy_max: 0.299
energy_min: 0.064, energy_max: 0.7829999999999999
energy_min: 0.066, energy_max: 1.1619999
energy_min: 0.065, energy_max: 0.742
Query execution time: 0.32806396484375 milliseconds


## Second Query Workload

In [101]:
# Convert to datetime
start = datetime(2012, 6, 1)
end = datetime(2013, 6, 1)

# Specify query
query = {
    "LCLid": "MAC000131",
    "day": {'$lt': end, '$gte': start}
}

# Start timer
start_time = time.time()

# Retrieve all documents
result_cursor = collection.find(query, {"energy_min": 1, "energy_max": 1})

# End timer
end_time = time.time()

# Calculate execution time
execution_time = (end_time - start_time)*1000

# Print values
count = 0
for document in result_cursor:
    energy_min = document.get("energy_min")
    energy_max = document.get("energy_max")
    print(f"energy_min: {energy_min}, energy_max: {energy_max}")
    count += 1
    if count == 5:
        break
    
# Print execution time
print(f"Query execution time: {execution_time} milliseconds")

energy_min: 0.064, energy_max: 0.312
energy_min: 0.065, energy_max: 0.299
energy_min: 0.064, energy_max: 0.7829999999999999
energy_min: 0.066, energy_max: 1.1619999
energy_min: 0.065, energy_max: 0.742
Query execution time: 0.3161430358886719 milliseconds


## Third Query Workload

In [102]:
# Convert to datetime
start = datetime(2012, 6, 1)
end = datetime(2013, 6, 1)

# Specify query
query = {
    "LCLid": "MAC000131",
    "day": {'$lt': end, '$gte': start}
}

# Start timer
start_time = time.time()

# Retrieve all documents
result_cursor = collection.find(query, {"energy_min": 1, "energy_max": 1})

# End timer
end_time = time.time()

# Calculate execution time
execution_time = (end_time - start_time)*1000

# Print values
count = 0
for document in result_cursor:
    energy_min = document.get("energy_min")
    energy_max = document.get("energy_max")
    print(f"energy_min: {energy_min}, energy_max: {energy_max}")
    count += 1
    if count == 5:
        break
    
# Print execution time
print(f"Query execution time: {execution_time} milliseconds")

energy_min: 0.064, energy_max: 0.312
energy_min: 0.065, energy_max: 0.299
energy_min: 0.064, energy_max: 0.7829999999999999
energy_min: 0.066, energy_max: 1.1619999
energy_min: 0.065, energy_max: 0.742
Query execution time: 0.24890899658203125 milliseconds


## Overall Query Execution Time for Simple Retrieval

In [103]:
print(f"Average Query Execution Time: {((0.33+0.32+0.25)/3)} milliseconds")

Average Query Execution Time: 0.3 milliseconds


# Filter and Aggregation

## First Query Workload

In [172]:
# Convert to datetime
start = datetime(2012, 6, 1)
end = datetime(2013, 6, 2)

# Specify query
pipeline = [
    {
        '$match': {
            'day': {'$lt': end, '$gte': start}
        }
    },
    {
        '$group': {
            '_id': '$LCLid',
            'totalSum': {
                '$sum': '$energy_sum'
            }
        }
    },
    {
        '$sort': {
            '_id': 1
        }
    }
]

# Start timer
start_time = time.time()

# Execute query
results = collection.aggregate(pipeline)

# End timer
end_time = time.time()

# Calculate eecution time
execution_time = (end_time - start_time) * 1000

# Print results
for result in results:
    LCLid = result['_id']
    totalSum = result['totalSum']
    print(f"LCLid: {LCLid}, Total Sum: {totalSum}")

# Print query execution time
print(f"Query execution time: {execution_time} milliseconds")

LCLid: MAC000131, Total Sum: 3691.9949997
LCLid: MAC000132, Total Sum: 5258.8470011
LCLid: MAC000221, Total Sum: 4917.5459997
LCLid: MAC000228, Total Sum: 2707.6260003
LCLid: MAC000234, Total Sum: 4213.725
LCLid: MAC000235, Total Sum: 1528.1550001
Query execution time: 8.754968643188477 milliseconds


## Second Query Workload

In [178]:
# Convert to datetime
start = datetime(2012, 6, 1)
end = datetime(2013, 6, 2)

# Specify query
pipeline = [
    {
        '$match': {
            'day': {'$lt': end, '$gte': start}
        }
    },
    {
        '$group': {
            '_id': '$LCLid',
            'totalSum': {
                '$sum': '$energy_sum'
            }
        }
    },
    {
        '$sort': {
            '_id': 1
        }
    }
]

# Start timer
start_time = time.time()

# Execute query
results = collection.aggregate(pipeline)

# End timer
end_time = time.time()

# Calculate eecution time
execution_time = (end_time - start_time) * 1000

# Print results
for result in results:
    LCLid = result['_id']
    totalSum = result['totalSum']
    print(f"LCLid: {LCLid}, Total Sum: {totalSum}")

# Print query execution time
print(f"Query execution time: {execution_time} milliseconds")

LCLid: MAC000131, Total Sum: 3691.9949997
LCLid: MAC000132, Total Sum: 5258.8470011
LCLid: MAC000221, Total Sum: 4917.5459997
LCLid: MAC000228, Total Sum: 2707.6260003
LCLid: MAC000234, Total Sum: 4213.725
LCLid: MAC000235, Total Sum: 1528.1550001
Query execution time: 10.632991790771484 milliseconds


## Third Query Workload

In [182]:
# Convert to datetime
start = datetime(2012, 6, 1)
end = datetime(2013, 6, 2)

# Specify query
pipeline = [
    {
        '$match': {
            'day': {'$lt': end, '$gte': start}
        }
    },
    {
        '$group': {
            '_id': '$LCLid',
            'totalSum': {
                '$sum': '$energy_sum'
            }
        }
    },
    {
        '$sort': {
            '_id': 1
        }
    }
]

# Start timer
start_time = time.time()

# Execute query
results = collection.aggregate(pipeline)

# End timer
end_time = time.time()

# Calculate eecution time
execution_time = (end_time - start_time) * 1000

# Print results
for result in results:
    LCLid = result['_id']
    totalSum = result['totalSum']
    print(f"LCLid: {LCLid}, Total Sum: {totalSum}")

# Print query execution time
print(f"Query execution time: {execution_time} milliseconds")

LCLid: MAC000131, Total Sum: 3691.9949997
LCLid: MAC000132, Total Sum: 5258.8470011
LCLid: MAC000221, Total Sum: 4917.5459997
LCLid: MAC000228, Total Sum: 2707.6260003
LCLid: MAC000234, Total Sum: 4213.725
LCLid: MAC000235, Total Sum: 1528.1550001
Query execution time: 9.136199951171875 milliseconds


## Overall Query Execution Time for Filter and Aggregation

In [183]:
print(f"Average Query Execution Time: {((8.75+10.63+9.13)/3)} milliseconds")

Average Query Execution Time: 9.503333333333336 milliseconds


# Metadata

In [156]:
database_names = client.list_database_names()
print("Database Names:")
for name in database_names:
    print(name)

collection_names = db.list_collection_names()
print("\nCollection Names:")
for name in collection_names:
    print(name)

index_info = collection.index_information()
print("\nIndex Information:")

print(index_info)

Database Names:
admin
config
local
test

Collection Names:
smart-meter-benchmark

Index Information:
{'_id_': {'v': 2, 'key': [('_id', 1)]}}


In [157]:
# Retrieve all ObjectIDs from the collection
object_ids = collection.distinct("_id", {})

count = 0
for object_id in object_ids:
    print(f"ObjectId: {object_id}")
    count += 1
    if count == 5:
        break

ObjectId: 64a2f7f5a67a64dabfab0b26
ObjectId: 64a2f7f5a67a64dabfab0b27
ObjectId: 64a2f7f5a67a64dabfab0b28
ObjectId: 64a2f7f5a67a64dabfab0b29
ObjectId: 64a2f7f5a67a64dabfab0b2a


In [158]:
# Print some collection statistics
command = {"collStats": collection.name}
stats = db.command(command)

print("Namespace of collection:", stats["ns"])
print("Total size of collection in bytes:", stats["size"])
print("Number of documents:", stats["count"])
print("Average size of an object:", stats["avgObjSize"])
print("Storage size of collection in bytes:", stats["storageSize"])

Namespace of collection: test.smart-meter-benchmark
Total size of collection in bytes: 964458
Number of documents: 4871
Average size of an object: 198
Storage size of collection in bytes: 348160


In [163]:
# Print all the available statistics
print(stats)

{'ns': 'test.smart-meter-benchmark', 'size': 964458, 'count': 4871, 'avgObjSize': 198, 'numOrphanDocs': 0, 'storageSize': 348160, 'freeStorageSize': 0, 'capped': False, 'wiredTiger': {'metadata': {'formatVersion': 1}, 'creationString': 'access_pattern_hint=none,allocation_size=4KB,app_metadata=(formatVersion=1),assert=(commit_timestamp=none,durable_timestamp=none,read_timestamp=none,write_timestamp=off),block_allocation=best,block_compressor=snappy,cache_resident=false,checksum=on,colgroups=,collator=,columns=,dictionary=0,encryption=(keyid=,name=),exclusive=false,extractor=,format=btree,huffman_key=,huffman_value=,ignore_in_memory_cache_size=false,immutable=false,import=(compare_timestamp=oldest_timestamp,enabled=false,file_metadata=,metadata_file=,repair=false),internal_item_max=0,internal_key_max=0,internal_key_truncate=true,internal_page_max=4KB,key_format=q,key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=64MB,log=(enabled=true),lsm=(auto_throttle=true,b