# Setup

In [16]:
MONGODB_START_FROM_SCRATCH = True
DOCKER_INTERNAL_HOST = "host.docker.internal"
DOCKER_DNS = ["10.15.20.1"]

MONGODB_REPLICA_SET = "replica_set_0"
MONGODB_TOTAL_NODES = 3

MONGODB_NODE_IPS = ["10.15.20.35"] * MONGODB_TOTAL_NODES
MONGODB_NODE_NAMES = [f"mongodb-node-{i + 1}" for i in range(MONGODB_TOTAL_NODES)]
MONGODB_NODE_HOSTNAMES = [
    f"{MONGODB_NODE_NAMES[i]}.lvilla17.vpn.itam.mx" for i in range(MONGODB_TOTAL_NODES)
]
MONGODB_NODE_PORTS = [27010 + (i + 1) for i in range(0, MONGODB_TOTAL_NODES)]

MONGODB_WORKDIR = "/data/db"

MONGO_INITDB_ROOT_USERNAME = "admin"
MONGO_INITDB_ROOT_PASSWORD = "admin"
MONGO_INITDB_DATABASE = "admin"

In [17]:
import os
from pathlib import Path

LOCALHOST_WORKDIR = f"{os.path.join(os.path.relpath(Path.cwd()))}"
DOCKER_MOUNTDIR = os.path.join(LOCALHOST_WORKDIR, "mount")
MONGODB_LOCAL_CLUSTER_KEY_PATH = os.path.join(DOCKER_MOUNTDIR, "mongo-keyfile")

mount_path = Path(DOCKER_MOUNTDIR)
mount_path.mkdir(parents=True, exist_ok=True)

### Create session

In [18]:
from pymongo import MongoClient

nodes_ports = [
    f"{MONGODB_NODE_HOSTNAMES[i]}:{MONGODB_NODE_PORTS[i]}"
    for i in range(MONGODB_TOTAL_NODES)
]
connection_string = (
    f"mongodb://{MONGO_INITDB_ROOT_USERNAME}:{MONGO_INITDB_ROOT_PASSWORD}@"
    f"{','.join(nodes_ports)}/"
    f"?replicaSet={MONGODB_REPLICA_SET}&authSource=admin&w=majority"
)
print(f"Connection URL: {connection_string}")

client = MongoClient(connection_string)

db = client["db"]
users_collection = db["users"]

Connection URL: mongodb://admin:admin@mongodb-node-1.lvilla17.vpn.itam.mx:27011,mongodb-node-2.lvilla17.vpn.itam.mx:27012,mongodb-node-3.lvilla17.vpn.itam.mx:27013/?replicaSet=replica_set_0&authSource=admin&w=majority


### Insert

In [19]:
from faker import Faker

fake = Faker()

In [20]:
# %%timeit -n 2 -r 2
# -n 1: run only 2 loop
# -r 1: repeat only 2 time

import random

print("Generating batch...")

users_batch = [
    {
        "name": (
            fake.unique.name() if random.random() > 0.5 else fake.unique.name().upper()
        ),
        "email": fake.ascii_free_email(),
        "profile": {
            "job": fake.job(),
            "company": fake.company(),
            "location": {
                "lat": float(fake.latitude()),
                "lng": float(fake.longitude()),
            },
        },
        "tags": [fake.word() for _ in range(random.randint(2, 5))],
        "login_count": random.randint(1, 1000),
        "last_login": fake.date_time_this_year().isoformat(),
        "active": fake.boolean(chance_of_getting_true=75),
    }
    for _ in range(10000)
]
print("Inserting batch...")
users_collection.insert_many(users_batch)

Generating batch...
Inserting batch...


InsertManyResult([ObjectId('697836b1b54156c31b6a59c3'), ObjectId('697836b1b54156c31b6a59c4'), ObjectId('697836b1b54156c31b6a59c5'), ObjectId('697836b1b54156c31b6a59c6'), ObjectId('697836b1b54156c31b6a59c7'), ObjectId('697836b1b54156c31b6a59c8'), ObjectId('697836b1b54156c31b6a59c9'), ObjectId('697836b1b54156c31b6a59ca'), ObjectId('697836b1b54156c31b6a59cb'), ObjectId('697836b1b54156c31b6a59cc'), ObjectId('697836b1b54156c31b6a59cd'), ObjectId('697836b1b54156c31b6a59ce'), ObjectId('697836b1b54156c31b6a59cf'), ObjectId('697836b1b54156c31b6a59d0'), ObjectId('697836b1b54156c31b6a59d1'), ObjectId('697836b1b54156c31b6a59d2'), ObjectId('697836b1b54156c31b6a59d3'), ObjectId('697836b1b54156c31b6a59d4'), ObjectId('697836b1b54156c31b6a59d5'), ObjectId('697836b1b54156c31b6a59d6'), ObjectId('697836b1b54156c31b6a59d7'), ObjectId('697836b1b54156c31b6a59d8'), ObjectId('697836b1b54156c31b6a59d9'), ObjectId('697836b1b54156c31b6a59da'), ObjectId('697836b1b54156c31b6a59db'), ObjectId('697836b1b54156c31b6a59

### Query

In [21]:
query = {"active": True, "login_count": {"$gt": 500}}
results = users_collection.find(query)
print(f"Found {users_collection.count_documents(query)} highly active users.")

Found 3784 highly active users.


In [22]:
projection = {"name": 1, "email": 1, "profile.job": 1, "_id": 0}
cursor = users_collection.find({"tags": "work"}, projection).limit(100)
for user in cursor:
    print(user)

{'name': 'Joshua Washington', 'email': 'benjamin65@yahoo.com', 'profile': {'job': 'Museum/gallery exhibitions officer'}}
{'name': 'DONALD SNYDER', 'email': 'jackgray@yahoo.com', 'profile': {'job': 'Merchandiser, retail'}}
{'name': 'Dr. Thomas Norris', 'email': 'anthonywilliams@yahoo.com', 'profile': {'job': 'Intelligence analyst'}}
{'name': 'PATRICIA ALLEN', 'email': 'nicholasjackson@hotmail.com', 'profile': {'job': 'Surveyor, minerals'}}
{'name': 'Alexander Mitchell', 'email': 'michael60@hotmail.com', 'profile': {'job': 'Nurse, adult'}}
{'name': 'GARY HAMILTON', 'email': 'stevensmichelle@gmail.com', 'profile': {'job': 'Pharmacist, community'}}
{'name': 'JENNIFER STEIN', 'email': 'linda93@gmail.com', 'profile': {'job': 'Immunologist'}}
{'name': 'JAIME PHELPS MD', 'email': 'sarah65@hotmail.com', 'profile': {'job': 'Sports administrator'}}
{'name': 'VALERIE KING', 'email': 'benjaminhernandez@hotmail.com', 'profile': {'job': 'Health physicist'}}
{'name': 'Regina Baker', 'email': 'teresamu

In [23]:
pipeline = [
    {"$match": {"active": True}},  # Stage 1: Filter only active users
    {  # Stage 2: Group by the nested 'job' field
        "$group": {
            "_id": "$profile.job",
            "avg_logins": {"$avg": "$login_count"},
            "user_count": {"$sum": 1},
        }
    },
    {"$sort": {"avg_logins": -1}},  # Stage 3: Sort by average logins descending
    {
        "$project": {
            "_id": 0,  # Hide the original _id
            "job_title": "$_id",  # Rename _id to job_title
            "stats": {  # Create a nested object for stats
                "average": "$avg_logins",
                "total_users": "$user_count",
            },
        }
    },
    {"$limit": 100},  # Stage 4: Limit to top 100 most active professions
]
results = list(users_collection.aggregate(pipeline))
for res in results:
    print(res)

{'job_title': 'Prison officer', 'stats': {'average': 771.4, 'total_users': 10}}
{'job_title': 'Community development worker', 'stats': {'average': 767.7142857142857, 'total_users': 7}}
{'job_title': 'Ergonomist', 'stats': {'average': 750.7, 'total_users': 10}}
{'job_title': 'Educational psychologist', 'stats': {'average': 735.2, 'total_users': 5}}
{'job_title': 'Bonds trader', 'stats': {'average': 728.6363636363636, 'total_users': 11}}
{'job_title': 'Engineer, agricultural', 'stats': {'average': 718.7692307692307, 'total_users': 13}}
{'job_title': 'Gaffer', 'stats': {'average': 707.3333333333334, 'total_users': 6}}
{'job_title': 'Geochemist', 'stats': {'average': 707.0, 'total_users': 6}}
{'job_title': 'Farm manager', 'stats': {'average': 701.2727272727273, 'total_users': 11}}
{'job_title': 'Medical illustrator', 'stats': {'average': 698.5555555555555, 'total_users': 9}}
{'job_title': 'Engineer, mining', 'stats': {'average': 697.3, 'total_users': 10}}
{'job_title': 'Building surveyor',

In [24]:
northern_users = users_collection.count_documents({"profile.location.lat": {"$gt": 0}})
print(f"Users in Northern Hemisphere: {northern_users}")

Users in Northern Hemisphere: 5034


In [25]:
# Standard Sort (Z-A-a-z) vs. Collation Sort (A-a-B-b...)
cursor = users_collection.find({}).sort("name", 1).collation({"locale": "en", "strength": 2}).limit(100)

for user in cursor:
    print(user["name"])

AARON ACOSTA
AARON BELL
AARON BURGESS
AARON CHAMBERS
Aaron Cooper
AARON DECKER
AARON DIAZ
Aaron Dominguez
AARON DOYLE
AARON FERRELL
Aaron Garcia
Aaron Gould
Aaron Harper
Aaron Holden
Aaron Howard
Aaron Johnson
AARON KELLY
AARON KIM
Aaron Larson
Aaron Long
Aaron Martinez
Aaron Meadows
Aaron Moore
Aaron Murphy
AARON NAVARRO
Aaron Olson
AARON ORTIZ
AARON PIERCE
AARON POOLE
AARON RIDDLE
AARON SANCHEZ
Aaron Terry
AARON THOMAS
AARON WAGNER
AARON WILLIAMSON
Aaron Wright
ABIGAIL BEASLEY
Abigail Bernard
Abigail Dougherty
ABIGAIL DOYLE
Abigail Gregory
Abigail Hawkins
Abigail Heath
ABIGAIL HERNANDEZ
Abigail Holloway
ABIGAIL MCCARTHY
Abigail Mercer
ABIGAIL PALMER
Abigail Stanton
ADAM BAKER
ADAM BATES
ADAM BRADFORD
Adam Brown
ADAM BURTON
Adam Cannon
ADAM CARTER
Adam Coffey
Adam Coleman
ADAM COX
ADAM DIXON
ADAM FLORES
Adam Fritz
ADAM GONZALES
ADAM GRIFFIN
ADAM HAYDEN
ADAM HOLLAND
ADAM HORNE
Adam Hughes
Adam Landry
Adam Martin
Adam Meza
Adam Mitchell
Adam Morales
ADAM MORGAN
Adam Nelson
Adam Perez
Ad

### Update

In [26]:
# 1. Get a single user to test with
target_user = users_collection.find_one({"active": True})
user_id = target_user["_id"]
initial_logins = target_user.get("login_count", 0)

print(f"User: {target_user['name']}")
print(f"Initial login count: {initial_logins}")

# 2. Increment the login counter for JUST this user
users_collection.update_one(
    {"_id": user_id}, 
    {"$inc": {"login_count": 1}}
)

# 3. Query again to see the change
updated_user = users_collection.find_one({"_id": user_id})
new_logins = updated_user.get("login_count", 0)

print(f"Updated login count: {new_logins}")
print(f"Change confirmed: {new_logins == initial_logins + 1}")

User: DENNIS JONES
Initial login count: 332
Updated login count: 333
Change confirmed: True


In [27]:
from pymongo import ReturnDocument

# This performs the update and returns the NEW version of the document immediately
updated_doc = users_collection.find_one_and_update(
    {"_id": user_id},
    {"$inc": {"login_count": 1}},
    return_document=ReturnDocument.AFTER
)

print(f"New count from single-step operation: {updated_doc['login_count']}")

New count from single-step operation: 334


In [28]:
query = {"profile.job": {"$regex": ".*engineer.*", "$options": "i"}}
update = {"$set": {"is_technical": True}}
result = users_collection.update_many(query, update)
print(f"Updated {result.modified_count} engineers.")

Updated 963 engineers.


In [29]:
query = {"email": "example@user.com"}
new_values = {"$set": {"active": False}}
users_collection.update_one(query, new_values)

UpdateResult({'n': 0, 'electionId': ObjectId('7fffffff0000000000000002'), 'opTime': {'ts': Timestamp(1769486007, 966), 't': 2}, 'nModified': 0, 'ok': 1.0, 'updatedExisting': False}, acknowledged=True)

### Delete

In [30]:
delete_result = users_collection.delete_many({})
print(f"Deleted {delete_result.deleted_count} documents.")

Deleted 10000 documents.


In [31]:
db.drop_collection(users_collection)
print("Deleted users collection.")

Deleted users collection.
