# MongoDB/PyMongo Query Basics: Mastodon Public Timeline

Work in driver/navigator pairs with a single laptop. Talk through each idea before you code so both partners understand the plan.

## 1. Setup

Import the core libraries we will need for HTTP requests, JSON inspection, and quick analyses.

In [None]:
import requests
import re
from pymongo import MongoClient
from pprint import pprint
import pandas as pd
from bs4 import BeautifulSoup
from IPython.display import Image, display

## 2. Fetch the timeline data

Use the provided Mastodon public timeline endpoint. Confirm we received an HTTP 200 response and note the content type.

In [None]:
# Public-timeline endpoint on hci.social; the query string requests up to 40 statuses.
SOURCE_URL = "https://hci.social/api/v1/timelines/public?limit=40"

# The timeout bounds how long we wait on the server; raise_for_status() turns an
# HTTP error status into an exception instead of letting us parse an error page.
response = requests.get(SOURCE_URL, timeout=10)
response.raise_for_status()

# Report what came back before any parsing is attempted.
print(
    f"Status code: {response.status_code}",
    f"Content type: {response.headers.get('content-type')}",
    sep="\n",
)

## 3. Load the JSON payload

Convert the HTTP response into Python objects. Verify we received a list of status dictionaries.

In [None]:
# Decode the JSON body of the HTTP response into plain Python objects.
posts = response.json()

# Summarise the payload: how many items arrived and what container holds them.
print(
    f"Number of posts retrieved: {len(posts)}",
    f"Type of top-level object: {type(posts).__name__}",
    sep="\n",
)

# Only inspect the first element when there is one.
if posts:
    print(f"Type of an individual post: {type(posts[0]).__name__}")

## 4. Explore the raw structure

Start by looking at the keys provided for each post and pretty-printing one representative object.

In [None]:
# Pretty-print the first status (when there is one) so its nested structure is easy to scan.
first_post = posts[0] if posts else None
if first_post is not None:
    pprint(first_post)

In [None]:
# List the field names of the first status in alphabetical order,
# or say so when the API returned nothing.
if not posts:
    print('No posts returned from the API.')
else:
    sample_post = posts[0]
    # Iterating a dict yields its keys, so sorted() returns the sorted key names.
    print(sorted(sample_post))

# 5. Load posts into a local MongoDB database
Create a new database and collection, then insert all the posts we retrieved from the Mastodon API.

In [None]:
# Connect using MongoClient's default settings and select the `mastodon_test` database.
client = MongoClient()
db = client["mastodon_test"]
# A bare name on the last line of a cell makes the notebook display it.
db

In [None]:
# create new collection in mastodon_test
# TODO: replace `...` with an expression that gets a collection from `db`.
# The bare `coll` on the last line displays the result as the cell output.
coll = ...
coll

In [None]:
# insert posts into collection
# TODO: complete this statement so it inserts `posts` into `coll`.
# NOTE: `coll...` is a fill-in stub; it is not runnable Python as written.
coll...

In [None]:
# confirm number of documents inserted
# ...

In [None]:
# show collections in mastodon_test
# TODO: replace `...` with the call on `db` that lists its collections.
...

In [None]:
# show a sample post from the collection
# TODO: replace `...` with a query that returns a single document from `coll`.
sample = ...
pprint(sample)

# 6. Basic queries

In [None]:
# List all the distinct account usernames in the collection
# TODO: replace `...`. Later cells index (`usernames[0]`) and iterate over `usernames`,
# so the result needs to be an indexable sequence of usernames.
usernames = ...

print(f"Unique usernames: {usernames}")

In [None]:
# Return all of the posts from one of those users
# Uses the first entry of `usernames` (built in the previous cell) as the user to look up.
username_to_query = usernames[0]

# TODO: replace `...` with a query for that user's posts; the loop below iterates over the result.
user_posts = ...

print(f"Posts from user '{username_to_query}':")

for post in user_posts:
    pprint(post)

In [None]:
# Print just the content of those posts
# TODO: replace `...` with the query; `username_to_query` comes from the cell above.
# The query is issued again here rather than reusing the earlier `user_posts` —
# presumably because a PyMongo cursor can only be iterated once (confirm).
user_posts = ...

print(f"Contents of posts from user '{username_to_query}':\n")

for post in user_posts:
    # TODO: replace `...` with this post's content string; the '\n' leaves a blank line between posts.
    print(... + '\n')

In [None]:
# Count the number of posts per account (acct)
# For each username, get their post count, then append the acct and post count to a list of lists,
# convert to a dataframe, sort in descending order (to show the users with the most posts), and display it
# NOTE(review): the heading says `acct`, but the loop iterates `usernames` — confirm which
# account field is intended, since the two may differ once other servers' posts are added.

posts_per_user = []

for username in usernames:
    # TODO: replace `...` with the number of documents in `coll` for this username.
    post_count = ...
    posts_per_user.append([username, post_count])

# One row per username; highest post_count first, with the index renumbered from 0.
df = pd.DataFrame(posts_per_user, columns=['username', 'post_count'])
df = df.sort_values(by='post_count', ascending=False).reset_index(drop=True)
df

In [None]:
# List all of the hashtags used in the posts
# TODO: replace `...`; the next cell iterates over `all_hashtags`, one hashtag at a time.
all_hashtags = ...

print(f"All hashtags used in posts: {all_hashtags}")

In [None]:
# Count how many times each hashtag was used across all posts and list in descending order
# (Use the one-by-one method used above — we will use a more advanced method later)

hashtag_counts = []

for hashtag in all_hashtags:
    # TODO: replace `...` with this hashtag's count in `coll`.
    count = ...
    hashtag_counts.append([hashtag, count])

# One row per hashtag, sorted so the highest count is row 0 (index renumbered from 0).
df_hashtags = pd.DataFrame(hashtag_counts, columns=['hashtag', 'count'])
df_hashtags = df_hashtags.sort_values(by='count', ascending=False).reset_index(drop=True)
df_hashtags

In [None]:
# Now take one of the most popular hashtags and return all posts that used it
# Return the post content only

# Row 0 of `df_hashtags` holds the highest count because that frame was sorted in descending order.
popular_hashtag = df_hashtags.iloc[0]['hashtag']
# TODO: replace `...` with the query for posts that used `popular_hashtag`.
posts_with_popular_hashtag = ...

print(f"Posts with hashtag '{popular_hashtag}':\n")

for post in posts_with_popular_hashtag:
    # TODO: replace `...` with this post's content string.
    print(... + '\n')

In [None]:
# Now use BeautifulSoup to strip the HTML tags from those posts and print just the plain text content
# TODO: replace `...` with the query for posts that used `popular_hashtag` (set in the cell above).
posts_with_popular_hashtag = ...

print(f"Plain text contents of posts with hashtag '{popular_hashtag}':\n")

for post in posts_with_popular_hashtag:
    # Parse the HTML stored in post['content'] and keep only its text.
    soup = BeautifulSoup(post['content'], 'html.parser')
    plain_text = soup.get_text()
    print(plain_text + '\n')

In [None]:
# Find one post that includes an image and display it
# `media_attachments` is a list of attachment documents. The dotted path matches a post
# when at least one attachment has type 'image'. Testing only for a non-empty list would
# also match posts whose attachments are video, audio, or gifv, which would not display
# as an image below.
post_with_image = coll.find_one({'media_attachments.type': 'image'})

if post_with_image:
    # TODO: replace `...` with the URL of an image attachment on `post_with_image`.
    image_url = ...
    display(Image(url=image_url))
else:
    print("No post with an image found.")

# 7. MOAR DATA

In [None]:
# Download 40 posts each from two other servers and insert them into the same collection
additional_servers = ["https://kind.social", "https://infosec.exchange"]

for server_url in additional_servers:
    # Same public-timeline path and limit as SOURCE_URL, pointed at a different server.
    response = requests.get(f"{server_url}/api/v1/timelines/public?limit=40", timeout=10)
    # Raises on an HTTP error status, which stops the loop before this server's posts are inserted.
    response.raise_for_status()
    additional_posts = response.json()
    # TODO: complete this statement so it inserts `additional_posts` into `coll`.
    # NOTE: `coll...` is a fill-in stub; it is not runnable Python as written.
    coll...
    print(f"Inserted {len(additional_posts)} posts from {server_url}")

# confirm total number of documents in collection after additions
# TODO: replace `...` with the number of documents now in `coll`.
total_docs = ...
print(f"Total number of documents in collection after additions: {total_docs}")

In [None]:
# Now download 40 more posts from the original server and insert them into the collection
# Only insert posts that are not already present in the collection (check by 'id' field)
# NOTE(review): by this point the collection also holds posts from other servers; presumably
# 'id' values are only guaranteed unique per server — confirm whether that matters here.

response = requests.get(SOURCE_URL, timeout=10)
response.raise_for_status()
new_posts = response.json()

# pull existing IDs from the collection
# TODO: replace `...` with the 'id' values already stored in `coll`.
existing_ids = ...

# filter new posts to only those not already in the collection
# TODO: replace `...` with the condition that keeps a post only when its 'id' is not in `existing_ids`.
# As written, `...` is truthy, so nothing is filtered out until this is filled in.
new_posts_to_insert = [post for post in new_posts if ...]

# Skip the insert entirely when every downloaded post was already present.
if new_posts_to_insert:
    # TODO: complete this statement so it inserts `new_posts_to_insert` into `coll`.
    # NOTE: `coll...` is a fill-in stub; it is not runnable Python as written.
    coll...

print(f"Inserted {len(new_posts_to_insert)} new posts from the original server.")

# 7a. Now re-run some of the earlier queries to see how the results have changed

_Do not proceed to the sections below until you have read the assignment for Oct 23._

# 8. Using query operators

In [None]:
# Choose two users and print the content of all their posts (use $or)
# Rows 0 and 1 of `df` are the two usernames with the highest post_count, because `df`
# was sorted in descending order; this needs at least two rows in `df`.
user1 = df.iloc[0]['username']
user2 = df.iloc[1]['username']

# TODO: replace `...` with a `$or` query that matches posts from either user.
posts_from_two_users = ...

print(f"Posts from users '{user1}' and '{user2}':\n")

for post in posts_from_two_users:
    # TODO: replace `...` with this post's content string.
    print(... + '\n')

In [None]:
# Choose two users and print the content of all their posts (use $in)
# Reuses `user1` and `user2` from the previous cell.
users_to_query = [user1, user2]

# TODO: replace `...` with a `$in` query that matches any username in `users_to_query`.
posts_from_users_in = ...

print(f"Posts from users '{user1}' and '{user2}':\n")

for post in posts_from_users_in:
    # TODO: replace `...` with this post's content string.
    print(... + '\n')

In [None]:
# Pick a minimum follower count and return all posts from accounts with at least that many followers (use $gte)
# Use BeautifulSoup to strip HTML tags and print just the plain text content

min_followers = 5000

# TODO: replace `...` with a `$gte` query comparing the account's follower count to `min_followers`.
posts_from_popular_accounts = ...

print(f"Posts from accounts with at least {min_followers} followers:\n")

for post in posts_from_popular_accounts:
    # TODO: replace both `...` — parse the post's HTML content, then extract its text
    # (the BeautifulSoup cell in section 6 shows the same two steps filled in).
    soup = ...
    plain_text = ...
    print(plain_text + '\n')

In [None]:
# Pick a maximum follower count and return all posts from accounts with no more than that many followers (use $lte)
# Use BeautifulSoup to strip HTML tags and print just the plain text content

max_followers = 100

# TODO: replace `...` with a `$lte` query comparing the account's follower count to `max_followers`.
posts_from_less_popular_accounts = ...

print(f"Posts from accounts with no more than {max_followers} followers:\n")

for post in posts_from_less_popular_accounts:
    # TODO: replace both `...` — parse the post's HTML content, then extract its text
    # (the BeautifulSoup cell in section 6 shows the same two steps filled in).
    soup = ...
    plain_text = ...
    print(plain_text + '\n')

In [None]:
# Find all emojis used in user profiles (account.emojis — use $ne) and display them along with the username

# TODO: replace `...` with a `$ne` query on account.emojis
# (compare the `media_attachments` query in section 6 for the pattern).
users_with_emojis = ...

print("Usernames with emojis in their profiles:\n")

for user in users_with_emojis:
    # TODO: replace `...` with the username stored in this document.
    username = ...
    # TODO: fill in both `...`: what to keep from each emoji, and the list of emojis to loop over.
    emojis = [... for emoji in ...]
    print(f"{username}: {emojis}")


In [None]:
# Pick a keyword and return all posts that contain it (use $regex; include '$options': 'i' for a case-insensitive search)
# Use BeautifulSoup to strip HTML tags and print just the plain text content

keyword = "mastodon"

# TODO: replace `...` with a `$regex` query that searches the post content for `keyword`.
posts_with_keyword = ...

print(f"Posts containing the keyword '{keyword}':\n")

for post in posts_with_keyword:
    # TODO: replace both `...` — parse the post's HTML content, then extract its text
    # (the BeautifulSoup cell in section 6 shows the same two steps filled in).
    soup = ...
    plain_text = ...
    print(plain_text + '\n')

## Why can't we use `$text` here?

_Put your answer here._

# 9. More Advanced Queries (see what you can do — these aren't graded, so just give them a try!)

In [None]:
# List the unique usernames of accounts with more than 1000 followers
# TODO: replace `...`; the next two cells iterate over `popular_users`, one username at a time.
popular_users = ...

print(f"Usernames with more than 1000 followers: {popular_users}")

In [None]:
# For those popular users, count how many posts each has in the collection and display in descending order as a dataframe

popular_user_post_counts = []

for username in popular_users:
    # TODO: replace `...` with the number of documents in `coll` for this username.
    post_count = ...
    popular_user_post_counts.append([username, post_count])

# One row per username; highest post_count first, with the index renumbered from 0.
df_popular_users = pd.DataFrame(popular_user_post_counts, columns=['username', 'post_count'])
df_popular_users = df_popular_users.sort_values(by='post_count', ascending=False).reset_index(drop=True)
df_popular_users

In [None]:
# For those popular users, list how many followers each account has
# (use find_one() for now, as the follower count could change between posts — we'll do a more advanced method later)

popular_user_followers = []

for username in popular_users:
    # TODO: replace `...` with a find_one() lookup of one document for this username.
    user_doc = ...
    # TODO: replace `...` with the follower count read from `user_doc`.
    followers_count = ...
    popular_user_followers.append([username, followers_count])

# One row per username; highest followers_count first, with the index renumbered from 0.
df_popular_user_followers = pd.DataFrame(popular_user_followers, columns=['username', 'followers_count'])
df_popular_user_followers = df_popular_user_followers.sort_values(by='followers_count', ascending=False).reset_index(drop=True)
df_popular_user_followers

In [None]:
# Find all posts that contain hashtags (use $exists and $ne)
# Print the post count and the hashtags used

# TODO: replace the first `...` with the query and the second with the number of matching posts.
posts_with_hashtags = ...
post_count = ...

hashtags = []

for post in posts_with_hashtags:
    # TODO: fill in both `...`: the value to keep from each tag, and this post's list of tags.
    new_hashtags = [... for tag in ...]
    hashtags.extend(new_hashtags)

hashtags = list(set(hashtags))  # get unique hashtags (going through set() does not preserve order)

print(f"Number of posts with hashtags: {post_count}")
print("Hashtags used in those posts:")
print(hashtags)

## Be sure the notebook is complete and all cells have been run before submission! Then upload to Canvas.