# Import libraries

In [52]:
import csv
import gzip
import os
import random
import time

import pandas as pd
from azure.cosmos import CosmosClient, PartitionKey, exceptions

# Compress data

In [22]:
# Prepare csv file: download the small daily smart-meter dataset
df = pd.read_csv('https://raw.githubusercontent.com/jabdurrahmaan/smart-meter-benchmark/main/datasets/daily_dataset_small.csv')

# Generate random values for id.
# Cosmos DB requires `id` to be unique within a logical partition, so draw
# 5-character base-36 ids and reject duplicates instead of sampling blindly.
ID_ALPHABET = '0123456789abcdefghijklmnopqrstuvwxyz'
ID_LENGTH = 5

if len(df) > len(ID_ALPHABET) ** ID_LENGTH:
    # Without this guard the rejection loop below could never terminate.
    raise ValueError("Dataset has more rows than there are distinct ids of this length")

random_values = []
seen_ids = set()
while len(random_values) < len(df):
    candidate = ''.join(random.choices(ID_ALPHABET, k=ID_LENGTH))
    if candidate not in seen_ids:
        seen_ids.add(candidate)
        random_values.append(candidate)
df['id'] = random_values

# Save csv locally, relative to the working directory so the notebook does not
# depend on one user's absolute folder layout
df.to_csv('daily_dataset_small_c.csv', index=False)

In [23]:
def compress_csv_file(input_file, output_file):
    """Gzip-compress a file.

    Reads ``input_file`` in binary mode and writes its content, line by line,
    into a gzip archive at ``output_file``. Returns nothing.
    """
    with open(input_file, 'rb') as source:
        with gzip.open(output_file, 'wb') as target:
            for raw_line in source:
                target.write(raw_line)

# Source CSV and target gzip archive, both relative to the working directory
input_file, output_file = 'daily_dataset_small_c.csv', 'daily_dataset_small_c.gz'
compress_csv_file(input_file=input_file, output_file=output_file)

# Create client connection

In [19]:
# Connection settings are read from the environment so that the account key is
# never stored in the notebook (and therefore never committed or shared).
endpoint = os.environ.get(
    "COSMOS_ENDPOINT",
    "https://smart-meter-benchmark.documents.azure.com:443/",
)
key = os.environ.get("COSMOS_KEY", "")
# A key copied out of a connection string can carry surrounding whitespace or
# the trailing ';' separator; neither is part of the key itself.
key = key.strip().rstrip(";")
if not key:
    raise RuntimeError(
        "Set the COSMOS_KEY environment variable to the Cosmos DB account key "
        "before running this cell."
    )
client = CosmosClient(endpoint, key)

# Create database and container

In [26]:
database_name = "smart-meter-benchmark_c"

# Create the database; if it already exists, take a handle to the existing one
# so the cell can be re-run.
try:
    database = client.create_database(id=database_name)
    print(f"Database created: {database.id}")
except exceptions.CosmosResourceExistsError:
    database = client.get_database_client(database=database_name)
    print("Database already exists.")


container_id = "energy_c"
# NOTE(review): the documents written by the insertion cell carry "id" and
# "partitionKey" fields but no "id_c" field, so this path looks like it does
# not match the data -- confirm the intended partition key.
partition_key_path = "/id_c"

# Create the container; if it already exists, still bind `container` so that
# later cells work after a kernel restart.
try:
    partition_key = PartitionKey(path=partition_key_path)
    container = database.create_container(
        id=container_id,
        partition_key=partition_key,
        offer_throughput=400,
    )
    print(f"Container created: {container.id}")
except exceptions.CosmosResourceExistsError:
    container = database.get_container_client(container_id)
    print("Container already exists.")

Container already exists.
Container already exists.


# Data insertion

In [25]:
# Read gzip data.
# The path is relative to the working directory, which is where the
# compression step writes 'daily_dataset_small_c.gz'.
csv_file = "daily_dataset_small_c.gz"

# Start measuring time. perf_counter() is monotonic, so the measurement is not
# affected by system clock adjustments during the run.
start_time = time.perf_counter()

# Insert items into container, one create_item call per CSV row.
# newline="" lets the csv module handle line endings itself; the encoding is
# pinned so the result does not depend on the platform's locale default.
with gzip.open(csv_file, "rt", encoding="utf-8", newline="") as file:
    reader = csv.DictReader(file)
    for row in reader:
        # DictReader yields strings; convert the numeric columns explicitly.
        item = {
            "id": row["id"],
            "LCLid": row["LCLid"],
            "day": row["day"],
            "energy_median": float(row["energy_median"]),
            "energy_mean": float(row["energy_mean"]),
            "energy_max": float(row["energy_max"]),
            "energy_count": int(row["energy_count"]),
            "energy_std": float(row["energy_std"]),
            "energy_sum": float(row["energy_sum"]),
            "energy_min": float(row["energy_min"]),
            "partitionKey": row["id"]
        }

        container.create_item(item)

# Stop measuring time
end_time = time.perf_counter()

# Calculate execution time (covers decompression, parsing and all inserts)
execution_time = end_time - start_time
print(f"Data insertion took {execution_time:.2f} seconds")

Data insertion took 215.95 seconds


# Simple Retrieval

## First Query Workload

In [30]:
# Specify database and container
database_name = "smart-meter-benchmark_c"
container_name = "energy_c"
database = client.get_database_client(database_name)
container = database.get_container_client(container_name)

# Define query: min/max energy for one meter over a one-year day range
query = "SELECT c.energy_min, c.energy_max FROM c WHERE c.LCLid = 'MAC000131' AND c.day >= '2012-06-01' AND c.day <= '2013-06-01'"

# Start measuring time. perf_counter() is a monotonic high-resolution clock,
# which suits sub-second measurements better than the wall clock.
start_time = time.perf_counter()

# Execute query. The timed region covers fetching the full result list and
# printing the preview below.
try:
    items = list(container.query_items(query, enable_cross_partition_query=True))
    # Print only the first five results as a preview
    for item in items[:5]:
        print(item)
except exceptions.CosmosHttpResponseError as e:
    print("Error. {0}: {1}".format(e.status_code, e.message))

# Stop measuring time
end_time = time.perf_counter()

# Calculate execution time
execution_time = end_time - start_time
print(f"Query took {execution_time:.2f} seconds")

{'energy_min': 0.064, 'energy_max': 0.312}
{'energy_min': 0.065, 'energy_max': 0.299}
{'energy_min': 0.064, 'energy_max': 0.7829999999999999}
{'energy_min': 0.066, 'energy_max': 1.1619999}
{'energy_min': 0.065, 'energy_max': 0.742}
Query took 0.05 seconds


## Second Query Workload

In [28]:
# Specify database and container
database_name = "smart-meter-benchmark_c"
container_name = "energy_c"
database = client.get_database_client(database_name)
container = database.get_container_client(container_name)

# Define query: min/max energy for one meter over a one-year day range
query = "SELECT c.energy_min, c.energy_max FROM c WHERE c.LCLid = 'MAC000131' AND c.day >= '2012-06-01' AND c.day <= '2013-06-01'"

# Start measuring time. perf_counter() is a monotonic high-resolution clock,
# which suits sub-second measurements better than the wall clock.
start_time = time.perf_counter()

# Execute query. The timed region covers fetching the full result list and
# printing the preview below.
try:
    items = list(container.query_items(query, enable_cross_partition_query=True))
    # Print only the first five results as a preview
    for item in items[:5]:
        print(item)
except exceptions.CosmosHttpResponseError as e:
    print("Error. {0}: {1}".format(e.status_code, e.message))

# Stop measuring time
end_time = time.perf_counter()

# Calculate execution time
execution_time = end_time - start_time
print(f"Query took {execution_time:.2f} seconds")

{'energy_min': 0.064, 'energy_max': 0.312}
{'energy_min': 0.065, 'energy_max': 0.299}
{'energy_min': 0.064, 'energy_max': 0.7829999999999999}
{'energy_min': 0.066, 'energy_max': 1.1619999}
{'energy_min': 0.065, 'energy_max': 0.742}
Query took 0.04 seconds


## Third Query Workload

In [29]:
# Specify database and container
database_name = "smart-meter-benchmark_c"
container_name = "energy_c"
database = client.get_database_client(database_name)
container = database.get_container_client(container_name)

# Define query: min/max energy for one meter over a one-year day range
query = "SELECT c.energy_min, c.energy_max FROM c WHERE c.LCLid = 'MAC000131' AND c.day >= '2012-06-01' AND c.day <= '2013-06-01'"

# Start measuring time. perf_counter() is a monotonic high-resolution clock,
# which suits sub-second measurements better than the wall clock.
start_time = time.perf_counter()

# Execute query. The timed region covers fetching the full result list and
# printing the preview below.
try:
    items = list(container.query_items(query, enable_cross_partition_query=True))
    # Print only the first five results as a preview
    for item in items[:5]:
        print(item)
except exceptions.CosmosHttpResponseError as e:
    print("Error. {0}: {1}".format(e.status_code, e.message))

# Stop measuring time
end_time = time.perf_counter()

# Calculate execution time
execution_time = end_time - start_time
print(f"Query took {execution_time:.2f} seconds")

{'energy_min': 0.064, 'energy_max': 0.312}
{'energy_min': 0.065, 'energy_max': 0.299}
{'energy_min': 0.064, 'energy_max': 0.7829999999999999}
{'energy_min': 0.066, 'energy_max': 1.1619999}
{'energy_min': 0.065, 'energy_max': 0.742}
Query took 0.06 seconds


## Overall Query Execution Time for Simple Retrieval

In [31]:
# Execution times in seconds, as reported by the three query runs above
query_times_s = [0.05, 0.04, 0.06]

# Mean over the runs, converted to milliseconds; formatted to two decimals so
# floating-point rounding noise does not leak into the reported figure
average_ms = sum(query_times_s) / len(query_times_s) * 1000
print(f"Average Query Execution Time: {average_ms:.2f} milliseconds")

Average Query Execution Time: 49.99999999999999 milliseconds
