# Bulk Ops: Single Inserts vs. `bulk_write` in PyMongo

In [1]:
from pymongo import MongoClient

# Client for the MongoDB server at mongodb:27017, and the database this notebook works in
client = MongoClient('mongodb://mongodb:27017/')
db = client['bulk_ops']

# Drop anything left over from a previous run so the notebook starts from empty collections
for stale_name in ('customers', 'subscriptions'):
    db[stale_name].drop()

# Recreate both collections explicitly (the last call is the cell's displayed value)
db.create_collection('customers')
db.create_collection('subscriptions')

Collection(Database(MongoClient(host=['mongodb:27017'], document_class=dict, tz_aware=False, connect=True), 'bulk_ops'), 'subscriptions')

In [2]:
import json

def load_jsonl(file_path):
    """Lazily yield one decoded JSON value per non-blank line of a JSON Lines file.

    Args:
        file_path: Path to a text file holding one JSON document per line.

    Yields:
        The object produced by ``json.loads`` for each non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default encoding,
    # so the same file parses identically on every machine.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank or whitespace-only lines carry no record; json.loads would raise on them.
            if not line.strip():
                continue
            yield json.loads(line)

## Single Ops

In [3]:
from bson import ObjectId, DBRef
import time

def insert_single_ops(file_path):
    """Load a JSONL file and write every document with its own ``insert_one`` call.

    Each record is read via ``load_jsonl`` and must provide a ``"customer"``
    mapping (with a ``"merchant"`` key) and a ``"subscriptions"`` iterable.
    One customer document is inserted per record, followed by one subscription
    document per entry, each holding a ``DBRef`` back to that customer.

    The elapsed time is printed; because ``load_jsonl`` is lazy, the measurement
    also covers reading and parsing the file, not only the database writes.

    Args:
        file_path: Path to the JSON Lines input file.
    """
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    for record in load_jsonl(file_path):
        customer = record["customer"]
        # Generate the _id client-side so subscriptions can reference it below.
        customer_obj_id = ObjectId()
        customer_doc = {
            "_id": customer_obj_id,
            "merchant": customer["merchant"],
            "data": customer
        }
        db.customers.insert_one(customer_doc)

        for subscription in record["subscriptions"]:
            subscription_doc = {
                "_id": ObjectId(),
                "customer": DBRef("customers", customer_obj_id),
                "data": subscription
            }
            db.subscriptions.insert_one(subscription_doc)

    end_time = time.perf_counter()
    print("Single Operations Write Time:", end_time - start_time)

In [4]:
# Path to the sample data (read line by line by load_jsonl); the bulk run reuses this name
file_path = '10k.jsonl'

# Run the one-insert-per-document variant; it prints its own elapsed time
print("Running single operations...")
insert_single_ops(file_path)

Running single operations...
Single Operations Write Time: 5.350555419921875


In [5]:
# Empty both collections so the bulk run starts from the same state as the single-op run
for used_name in ('customers', 'subscriptions'):
    db[used_name].drop()

# Recreate them explicitly (the last call is the cell's displayed value)
db.create_collection('customers')
db.create_collection('subscriptions')

Collection(Database(MongoClient(host=['mongodb:27017'], document_class=dict, tz_aware=False, connect=True), 'bulk_ops'), 'subscriptions')

## Bulk Ops

In [6]:
from pymongo import InsertOne

def insert_bulk_ops(file_path):
    """Load a JSONL file and write all documents with one ``bulk_write`` per collection.

    Each record is read via ``load_jsonl`` and must provide a ``"customer"``
    mapping (with a ``"merchant"`` key) and a ``"subscriptions"`` iterable.
    All ``InsertOne`` operations are accumulated in memory first, then the
    customers batch is sent, followed by the subscriptions batch.

    The elapsed time is printed; it covers reading and parsing the file as well
    as the two bulk writes.

    Args:
        file_path: Path to the JSON Lines input file.
    """
    customer_ops = []
    subscription_ops = []

    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    for record in load_jsonl(file_path):
        customer = record["customer"]
        # Small but important detail: generate the _id client-side, because the
        # subscription documents must reference it before anything is written.
        customer_obj_id = ObjectId()
        customer_doc = {
            "_id": customer_obj_id,
            "merchant": customer["merchant"],
            "data": customer
        }
        customer_ops.append(InsertOne(customer_doc))

        for subscription in record["subscriptions"]:
            subscription_doc = {
                "_id": ObjectId(),
                "customer": DBRef("customers", customer_obj_id),
                "data": subscription
            }
            subscription_ops.append(InsertOne(subscription_doc))

    # Only issue a bulk_write when there is something to send (empty input file,
    # or records without subscriptions).
    if customer_ops:
        db.customers.bulk_write(customer_ops)
    if subscription_ops:
        db.subscriptions.bulk_write(subscription_ops)

    end_time = time.perf_counter()
    print("Bulk Operations Write Time:", end_time - start_time)

In [7]:
# Run the batched variant on the same file_path used for the single-op run; it prints its own elapsed time
print("Running bulk operations...")
insert_bulk_ops(file_path)

Running bulk operations...
Bulk Operations Write Time: 0.4874753952026367
