In [None]:
%pip install pymongo

In [2]:
from pymongo import MongoClient
from urllib.parse import quote_plus

In [3]:
# Direct MongoDB connection
username = quote_plus('mdsGlobalUser')
password = quote_plus('mds$uper$ecurePassword123')
host = 'mdsmongodb.sci.pitt.edu'
port = 27017
auth_db = 'admin'

# Format the connection string
uri = f"mongodb://{username}:{password}@{host}:{port}/?authSource={auth_db}"

client = MongoClient(uri)

# Test connection
client.admin.command('ping')
print("✓ Successfully connected to MongoDB")
print(f"Connected to: {host}")

✓ Successfully connected to MongoDB
Connected to: mdsmongodb.sci.pitt.edu


In [4]:
# Use the sakila database
database = client['sakila']
print(f"Using database: {database.name}")

Using database: sakila


In [5]:
# sakila has 'film' collection (similar to video_movies)
collection = database.get_collection('film')

### Aggregation _$group_

In [6]:
# Group by release_year to see unique years
query = [ 
    { "$group" : { "_id" : "$release_year" } } ]

data = collection.aggregate(query)

for doc in data:
    print(doc)

{'_id': 2006}


### Aggregation _limit_


In [7]:
# Group by release_year and limit to 5
query = [ 
    { "$group" : { "_id" : "$release_year" } },
    { "$limit" : 5}
    ]
data = collection.aggregate(query)

for doc in data:
    print(doc)

{'_id': 2006}


### Aggregation _project_


**In SQL, this would be the equivalent of:**

SELECT title, release_year FROM film LIMIT 5;

In [8]:
# Project only title and release_year fields
query = [ 
    { "$project" : {
        "title" : 1,
        "release_year" : 1,
        "_id" : 0
    }},
    { "$limit" : 5}
    ]
data = collection.aggregate(query)

for doc in data:
    print(doc)

{'title': 'ACADEMY DINOSAUR', 'release_year': 2006}
{'title': 'ACE GOLDFINGER', 'release_year': 2006}
{'title': 'ADAPTATION HOLES', 'release_year': 2006}
{'title': 'AFFAIR PREJUDICE', 'release_year': 2006}
{'title': 'AFRICAN EGG', 'release_year': 2006}


### Aggregation _sort_

In [9]:
# Sort by release_year descending
query = [ 
    { "$project" : {
        "title" : 1,
        "release_year" : 1,
        "_id" : 0
    }},
    { "$sort" : {"release_year" : -1}},
    { "$limit" : 5}
    ]
data = collection.aggregate(query)

for doc in data:
    print(doc)

{'title': 'AFFAIR PREJUDICE', 'release_year': 2006}
{'title': 'AFRICAN EGG', 'release_year': 2006}
{'title': 'ACE GOLDFINGER', 'release_year': 2006}
{'title': 'ACADEMY DINOSAUR', 'release_year': 2006}
{'title': 'ADAPTATION HOLES', 'release_year': 2006}


### Aggregation _$match_

SELECT title, release_year FROM film
WHERE release_year = 2006
ORDER BY title ASC
LIMIT 5;

In [10]:
# Match films from 2006, sort by title
query = [ 
    { "$match" : { "release_year" : 2006}},
    { "$project" : {
        "title" : 1,
        "release_year" : 1,
        "_id" : 0
    }},
    { "$sort" : {"title" : 1}},
    { "$limit" : 5}
    ]
data = collection.aggregate(query)

for doc in data:
    print(doc)

{'title': 'ACADEMY DINOSAUR', 'release_year': 2006}
{'title': 'ACADEMY DINOSAUR', 'release_year': 2006}
{'title': 'ACE GOLDFINGER', 'release_year': 2006}
{'title': 'ACE GOLDFINGER', 'release_year': 2006}
{'title': 'ADAPTATION HOLES', 'release_year': 2006}


### Aggregation _$addFields_

SELECT rating, AVG(rental_rate) as avgRate FROM film
GROUP BY rating
LIMIT 5;

In [11]:
# Group by rating and calculate average rental_rate
query = [ 
    { "$group": {
        "_id": "$rating",
        "avgRate": { "$avg": "$rental_rate" },
        "count": { "$sum": 1 }
    }},
    { "$sort": { "_id": 1 }},
    { "$limit" : 5}
]
data = collection.aggregate(query)

for doc in data:
    print(doc)

{'_id': 'G', 'avgRate': 2.888876404494382, 'count': 356}
{'_id': 'NC-17', 'avgRate': 2.970952380952381, 'count': 420}
{'_id': 'PG', 'avgRate': 3.051855670103093, 'count': 388}
{'_id': 'PG-13', 'avgRate': 3.034843049327354, 'count': 446}
{'_id': 'R', 'avgRate': 2.938717948717949, 'count': 390}


### Aggregation _$count_

In [12]:
# Count films from 2006
query = [ 
    { "$match" : { "release_year" : 2006}},
    { "$count" : "totalFilms2006"}    ]
data = collection.aggregate(query)

for doc in data:
    print(doc)

{'totalFilms2006': 2000}
