# Import libraries

In [None]:
from pymongo import MongoClient
from datetime import datetime
import json
import gzip
import time

# Data compression

In [1]:
# Read json data
json_file = "/Users/jafarabdurrahmaan/Dropbox/00_Uni/Masterthesis/A_Thesis/1_Components/13_Benchmark_Study_Components/4_Document/MongoDB/iter2/daily_dataset_small.json"
with open(json_file, "r") as file:
    json_data = json.load(file)

gzip_file = "daily_dataset_small.json.gz"

# Compress into gzip file
with gzip.open(gzip_file, "wt", encoding="utf-8") as file:
    json.dump(json_data, file)

# Create client connection

In [9]:
client = MongoClient('mongodb://127.0.0.1:27017/')
db = client['test']
collection = db['smart-meter-benchmark_c']

# Data Insertion

In [13]:
# Access the desired database and collection
db = client.get_database("test")
collection = db.get_collection("smart-meter-benchmark_c")

# Start timer
start_time = time.time()

# Read file
with gzip.open("/Users/jafarabdurrahmaan/Dropbox/00_Uni/Masterthesis/A_Thesis/1_Components/13_Benchmark_Study_Components/4_Document/MongoDB/iter2/daily_dataset_small.json.gz", 'rt') as f:
    data = json.load(f)

# Specify data types
for document in data:
    day_str = document["day"]
    document["day"] = datetime.strptime(day_str, "%Y-%m-%d")
    document["energy_median"] = float(document["energy_median"])
    document["energy_mean"] = float(document["energy_mean"])
    document["energy_max"] = float(document["energy_max"])
    document["energy_count"] = int(document["energy_count"])
    document["energy_std"] = float(document["energy_std"])
    document["energy_sum"] = float(document["energy_sum"])
    document["energy_min"] = float(document["energy_min"])

# End timer    
end_time = time.time()

# Calculate ingestion time
execution_time = end_time - start_time
    
# Insert into collection
collection.insert_many(data)

# Print ingestion time
print(f"Data insertion took: {execution_time} seconds")

Data insertion took: 0.09462404251098633 seconds


# Simple Retrieval

## First Query Workload

In [21]:
# Convert to datetime
start = datetime(2012, 6, 1)
end = datetime(2013, 6, 1)

# Specify query
query = {
    "LCLid": "MAC000131",
    "day": {'$lt': end, '$gte': start}
}

# Start timer
start_time = time.time()

# Retrieve all documents
result_cursor = collection.find(query, {"energy_min": 1, "energy_max": 1})

# End timer
end_time = time.time()

# Calculate execution time
execution_time = (end_time - start_time)*1000

# Print values
count = 0
for document in result_cursor:
    energy_min = document.get("energy_min")
    energy_max = document.get("energy_max")
    print(f"energy_min: {energy_min}, energy_max: {energy_max}")
    count += 1
    if count == 5:
        break
    
# Print execution time
print(f"Query execution time: {execution_time} milliseconds")

energy_min: 0.064, energy_max: 0.312
energy_min: 0.065, energy_max: 0.299
energy_min: 0.064, energy_max: 0.7829999999999999
energy_min: 0.066, energy_max: 1.1619999
energy_min: 0.065, energy_max: 0.742
Query execution time: 0.19788742065429688 milliseconds


## Second Query Workload

In [22]:
# Convert to datetime
start = datetime(2012, 6, 1)
end = datetime(2013, 6, 1)

# Specify query
query = {
    "LCLid": "MAC000131",
    "day": {'$lt': end, '$gte': start}
}

# Start timer
start_time = time.time()

# Retrieve all documents
result_cursor = collection.find(query, {"energy_min": 1, "energy_max": 1})

# End timer
end_time = time.time()

# Calculate execution time
execution_time = (end_time - start_time)*1000

# Print values
count = 0
for document in result_cursor:
    energy_min = document.get("energy_min")
    energy_max = document.get("energy_max")
    print(f"energy_min: {energy_min}, energy_max: {energy_max}")
    count += 1
    if count == 5:
        break
    
# Print execution time
print(f"Query execution time: {execution_time} milliseconds")

energy_min: 0.064, energy_max: 0.312
energy_min: 0.065, energy_max: 0.299
energy_min: 0.064, energy_max: 0.7829999999999999
energy_min: 0.066, energy_max: 1.1619999
energy_min: 0.065, energy_max: 0.742
Query execution time: 0.18095970153808594 milliseconds


## Third Query Workload

In [23]:
# Convert to datetime
start = datetime(2012, 6, 1)
end = datetime(2013, 6, 1)

# Specify query
query = {
    "LCLid": "MAC000131",
    "day": {'$lt': end, '$gte': start}
}

# Start timer
start_time = time.time()

# Retrieve all documents
result_cursor = collection.find(query, {"energy_min": 1, "energy_max": 1})

# End timer
end_time = time.time()

# Calculate execution time
execution_time = (end_time - start_time)*1000

# Print values
count = 0
for document in result_cursor:
    energy_min = document.get("energy_min")
    energy_max = document.get("energy_max")
    print(f"energy_min: {energy_min}, energy_max: {energy_max}")
    count += 1
    if count == 5:
        break
    
# Print execution time
print(f"Query execution time: {execution_time} milliseconds")

energy_min: 0.064, energy_max: 0.312
energy_min: 0.065, energy_max: 0.299
energy_min: 0.064, energy_max: 0.7829999999999999
energy_min: 0.066, energy_max: 1.1619999
energy_min: 0.065, energy_max: 0.742
Query execution time: 0.15211105346679688 milliseconds


## Overall Query Execution Time for Simple Retrieval

In [24]:
print(f"Average Query Execution Time: {((0.2+0.18+0.15)/3)} milliseconds")

Average Query Execution Time: 0.17666666666666667 milliseconds


# Filter and Aggregation

## First Query Workload

In [41]:
# Convert to datetime
start = datetime(2012, 6, 1)
end = datetime(2013, 6, 2)

# Specify query
pipeline = [
    {
        '$match': {
            'day': {'$lt': end, '$gte': start}
        }
    },
    {
        '$group': {
            '_id': '$LCLid',
            'totalSum': {
                '$sum': '$energy_sum'
            }
        }
    },
    {
        '$sort': {
            '_id': 1
        }
    }
]

# Start timer
start_time = time.time()

# Execute query
results = collection.aggregate(pipeline)

# End timer
end_time = time.time()

# Calculate eecution time
execution_time = (end_time - start_time) * 1000

# Print results
for result in results:
    LCLid = result['_id']
    totalSum = result['totalSum']
    print(f"LCLid: {LCLid}, Total Sum: {totalSum}")

# Print query execution time
print(f"Query execution time: {execution_time} milliseconds")

LCLid: MAC000131, Total Sum: 3691.9949997
LCLid: MAC000132, Total Sum: 5258.8470011
LCLid: MAC000221, Total Sum: 4917.5459997
LCLid: MAC000228, Total Sum: 2707.6260003
LCLid: MAC000234, Total Sum: 4213.725
LCLid: MAC000235, Total Sum: 1528.1550001
Query execution time: 13.033866882324219 milliseconds


## Second Query Workload

In [49]:
# Convert to datetime
start = datetime(2012, 6, 1)
end = datetime(2013, 6, 2)

# Specify query
pipeline = [
    {
        '$match': {
            'day': {'$lt': end, '$gte': start}
        }
    },
    {
        '$group': {
            '_id': '$LCLid',
            'totalSum': {
                '$sum': '$energy_sum'
            }
        }
    },
    {
        '$sort': {
            '_id': 1
        }
    }
]

# Start timer
start_time = time.time()

# Execute query
results = collection.aggregate(pipeline)

# End timer
end_time = time.time()

# Calculate eecution time
execution_time = (end_time - start_time) * 1000

# Print results
for result in results:
    LCLid = result['_id']
    totalSum = result['totalSum']
    print(f"LCLid: {LCLid}, Total Sum: {totalSum}")

# Print query execution time
print(f"Query execution time: {execution_time} milliseconds")

LCLid: MAC000131, Total Sum: 3691.9949997
LCLid: MAC000132, Total Sum: 5258.8470011
LCLid: MAC000221, Total Sum: 4917.5459997
LCLid: MAC000228, Total Sum: 2707.6260003
LCLid: MAC000234, Total Sum: 4213.725
LCLid: MAC000235, Total Sum: 1528.1550001
Query execution time: 8.449077606201172 milliseconds


## Third Query Workload

In [54]:
# Convert to datetime
start = datetime(2012, 6, 1)
end = datetime(2013, 6, 2)

# Specify query
pipeline = [
    {
        '$match': {
            'day': {'$lt': end, '$gte': start}
        }
    },
    {
        '$group': {
            '_id': '$LCLid',
            'totalSum': {
                '$sum': '$energy_sum'
            }
        }
    },
    {
        '$sort': {
            '_id': 1
        }
    }
]

# Start timer
start_time = time.time()

# Execute query
results = collection.aggregate(pipeline)

# End timer
end_time = time.time()

# Calculate eecution time
execution_time = (end_time - start_time) * 1000

# Print results
for result in results:
    LCLid = result['_id']
    totalSum = result['totalSum']
    print(f"LCLid: {LCLid}, Total Sum: {totalSum}")

# Print query execution time
print(f"Query execution time: {execution_time} milliseconds")

LCLid: MAC000131, Total Sum: 3691.9949997
LCLid: MAC000132, Total Sum: 5258.8470011
LCLid: MAC000221, Total Sum: 4917.5459997
LCLid: MAC000228, Total Sum: 2707.6260003
LCLid: MAC000234, Total Sum: 4213.725
LCLid: MAC000235, Total Sum: 1528.1550001
Query execution time: 9.007930755615234 milliseconds


## Overall Query Execution Time for Filter and Aggregation

In [55]:
print(f"Average Query Execution Time: {((13+8.45+9)/3)} milliseconds")

Average Query Execution Time: 10.15 milliseconds


# Metadata

In [56]:
# Build the collStats command
command = {"collStats": collection.name}

# Execute the command on the database
stats = db.command(command)

# Print some statistics
print("Namespace of collection:", stats["ns"])
print("Total size of collection in bytes:", stats["size"])
print("Number of documents:", stats["count"])
print("Average size of an object:", stats["avgObjSize"])
print("Storage size of collection in bytes:", stats["storageSize"])

Namespace of collection: test.smart-meter-benchmark_c
Total size of collection in bytes: 964458
Number of documents: 4871
Average size of an object: 198
Storage size of collection in bytes: 348160
