# MongoDB/PyMongo Aggregations: Mastodon Public Timeline

Work in driver/navigator pairs with a single laptop. Talk through each idea before you code so both partners understand the plan.

## 1. Setup

Import the core libraries we will need for connecting to MongoDB, inspecting documents, and quick analyses.

In [None]:
from pymongo import MongoClient
from pprint import pprint
import pandas as pd
from bs4 import BeautifulSoup

## 2. Retrieve posts from your local MongoDB database

In [None]:
# open a connection to MongoDB using the driver's default settings
client = MongoClient()
# select the mastodon_test database (dictionary-style access to the same database)
db = client['mastodon_test']
# last expression in the cell: the notebook displays the database handle
db

In [None]:
# list the names of the collections in mastodon_test
# (this is the cell's last expression, so the notebook displays the returned value)
db.list_collection_names()

In [None]:
# bind the posts collection of mastodon_test to a short name for the queries that follow
coll = db['posts']
# last expression in the cell: the notebook displays the collection handle
coll

In [None]:
# confirm the number of documents in the collection
# (the empty filter {} places no condition on the documents counted)
coll.count_documents({})

In [None]:
# fetch a single post from the collection and pretty-print it
# so the full document structure (nested fields, arrays) is visible
sample = coll.find_one()
pprint(sample)

## 3. Pipeline basics
MongoDB analogs of MySQL's SELECT, LIMIT, ORDER BY, etc.

In [None]:
# Use .aggregate() to fetch one post and display ONLY the post author (account.acct),
# the post date (created_at), and the post content (content)

# projection spec: 1 keeps a field, 0 drops it (here, the automatic _id)
fields = {
    '_id': 0,
    'account.acct': 1,
    'created_at': 1,
    'content': 1,
}
pipeline = [
    {'$project': fields},
    {'$limit': 1},
]

result = coll.aggregate(pipeline)
for post in result:
    # account.acct is still nested under 'account' in the projected document
    author = post['account']
    print(author['acct'])
    print(post['created_at'])
    print(post['content'])

In [None]:
# Same display as before, plus the author's follower count, for three posts

fields = {
    '_id': 0,
    'account.acct': 1,
    'created_at': 1,
    'content': 1,
    'account.followers_count': 1,
}
pipeline = [
    {'$project': fields},
    {'$limit': 3},
]

result = coll.aggregate(pipeline)
for post in result:
    # both account fields live in the nested 'account' sub-document
    author = post['account']
    print(author['acct'])
    print(author['followers_count'], 'followers')
    print(post['created_at'])
    print(post['content'])
    print()  # blank line between posts

In [None]:
# Same display again, but for the three posts whose authors have the highest follower counts

fields = {
    '_id': 0,
    'account.acct': 1,
    'created_at': 1,
    'content': 1,
    'account.followers_count': 1,
}
# -1 = descending, so the largest follower counts come first
by_followers_desc = {'$sort': {'account.followers_count': -1}}
pipeline = [
    {'$project': fields},
    by_followers_desc,
    {'$limit': 3},
]

result = coll.aggregate(pipeline)
for post in result:
    author = post['account']
    print(author['acct'])
    print(author['followers_count'], 'followers')
    print(post['created_at'])
    print(post['content'])
    print()  # blank line between posts

## 4. Flattening and unwinding

In [None]:
# Inspect the raw document structure produced by the previous query:
# same stages, but LIMIT 1 and pprint() the whole document instead of picking fields

fields = {
    '_id': 0,
    'account.acct': 1,
    'created_at': 1,
    'content': 1,
    'account.followers_count': 1,
}
by_followers_desc = {'$sort': {'account.followers_count': -1}}
pipeline = [
    {'$project': fields},
    by_followers_desc,
    {'$limit': 1},
]

result = coll.aggregate(pipeline)
for raw_doc in result:
    # print the whole document so the nesting under 'account' is visible
    pprint(raw_doc)

In [None]:
# Flatten the account information into top-level fields so the resulting
# dictionary has no nested elements

# '$account.<field>' copies a nested value into a new top-level field
flat_fields = {
    '_id': 0,
    'acct': '$account.acct',
    'created_at': 1,
    'content': 1,
    'followers_count': '$account.followers_count',
}
# sort on the new flattened name, largest follower count first
by_followers_desc = {'$sort': {'followers_count': -1}}
pipeline = [
    {'$project': flat_fields},
    by_followers_desc,
    {'$limit': 1},
]

result = coll.aggregate(pipeline)
for flat_doc in result:
    pprint(flat_doc)

In [None]:
# Count the number of hashtags attached to each post (first five posts shown)

flat_fields = {
    '_id': 0,
    'acct': '$account.acct',
    'created_at': 1,
    'content': 1,
    'followers_count': '$account.followers_count',
    # $size yields the length of the post's tags array
    'tags_count': {'$size': '$tags'},
}
pipeline = [
    {'$project': flat_fields},
    {'$limit': 5},
]

result = coll.aggregate(pipeline)
for post in result:
    pprint(post)

In [None]:
# Find the five posts that carry the most hashtags

flat_fields = {
    '_id': 0,
    'acct': '$account.acct',
    'created_at': 1,
    'content': 1,
    'followers_count': '$account.followers_count',
    # computed here so the following $sort stage can order by it
    'tags_count': {'$size': '$tags'},
}
most_tags_first = {'$sort': {'tags_count': -1}}
pipeline = [
    {'$project': flat_fields},
    most_tags_first,
    {'$limit': 5},
]

result = coll.aggregate(pipeline)
for post in result:
    pprint(post)

In [None]:
# Use $unwind to emit one document per hashtag in a post.
# Each output document has exactly three fields:
#   acct       - the author (account.acct)
#   created_at - the post date
#   hashtag    - the hashtag's name only, not the whole tag dictionary
# Only the first 10 documents are shown.

one_doc_per_tag = {'$unwind': '$tags'}
# after $unwind, 'tags' holds a single tag, so '$tags.name' is one value
tag_fields = {
    '_id': 0,
    'acct': '$account.acct',
    'created_at': 1,
    'hashtag': '$tags.name',
}
pipeline = [
    one_doc_per_tag,
    {'$project': tag_fields},
    {'$limit': 10},
]

result = coll.aggregate(pipeline)
for tag_doc in result:
    pprint(tag_doc)

## 5. Accumulators

In [None]:
# Repeat the unwind pipeline without its limit, then group the unwound hashtags
# to count how many times each hashtag appears in the collection.
# Show the 10 most common hashtags, highest count first.

one_doc_per_tag = {'$unwind': '$tags'}
tag_fields = {
    '_id': 0,
    'acct': '$account.acct',
    'created_at': 1,
    'hashtag': '$tags.name',
}
# one group per distinct hashtag; each unwound document adds 1 to its group's count
count_per_hashtag = {
    '$group': {
        '_id': '$hashtag',
        'count': {'$sum': 1},
    }
}
pipeline = [
    one_doc_per_tag,
    {'$project': tag_fields},
    count_per_hashtag,
    {'$sort': {'count': -1}},
    {'$limit': 10},
]

result = coll.aggregate(pipeline)
for tag_count in result:
    pprint(tag_count)

In [None]:
# The same hashtag count without a $project stage:
# $group can read the nested '$tags.name' path directly

one_doc_per_tag = {'$unwind': '$tags'}
count_per_hashtag = {
    '$group': {
        '_id': '$tags.name',
        'count': {'$sum': 1},
    }
}
pipeline = [
    one_doc_per_tag,
    count_per_hashtag,
    {'$sort': {'count': -1}},
    {'$limit': 10},
]

result = coll.aggregate(pipeline)
for tag_count in result:
    pprint(tag_count)

In [None]:
# A hashtag appearing in many posts is not necessarily influential or trending.
# Assume a post's influence is proportional to its author's follower count, and
# sum that follower count over every post using each hashtag.
# This is one (simplistic) way that analysts calculate "hashtag reach".

one_doc_per_tag = {'$unwind': '$tags'}
# every unwound post adds its author's follower count to the hashtag's total,
# so an author who uses the hashtag in several posts is counted once per post
reach_per_hashtag = {
    '$group': {
        '_id': '$tags.name',
        'reach': {'$sum': '$account.followers_count'},
    }
}
pipeline = [
    one_doc_per_tag,
    reach_per_hashtag,
    {'$sort': {'reach': -1}},
    {'$limit': 10},
]

result = coll.aggregate(pipeline)
for tag_reach in result:
    pprint(tag_reach)

In [None]:
# For fun and practice: same shape as the reach pipeline, but compute the
# AVERAGE follower count per hashtag instead of the total

one_doc_per_tag = {'$unwind': '$tags'}
# $avg replaces $sum: mean follower count over the posts using each hashtag
avg_per_hashtag = {
    '$group': {
        '_id': '$tags.name',
        'avg_followers': {'$avg': '$account.followers_count'},
    }
}
pipeline = [
    one_doc_per_tag,
    avg_per_hashtag,
    {'$sort': {'avg_followers': -1}},
    {'$limit': 10},
]

result = coll.aggregate(pipeline)
for tag_avg in result:
    pprint(tag_avg)

In [None]:
# Return both the total reach and the average follower count per hashtag,
# and load the results into a Pandas dataframe for easier viewing

one_doc_per_tag = {'$unwind': '$tags'}
# a single $group stage can compute several accumulators at once
stats_per_hashtag = {
    '$group': {
        '_id': '$tags.name',
        'reach': {'$sum': '$account.followers_count'},
        'avg_followers': {'$avg': '$account.followers_count'},
    }
}
pipeline = [
    one_doc_per_tag,
    stats_per_hashtag,
    {'$sort': {'reach': -1}},
    {'$limit': 10},
]

result = coll.aggregate(pipeline)

# one row per hashtag; the group key '_id' is renamed to a friendlier 'hashtag' column
rows = [
    {
        'hashtag': doc['_id'],
        'reach': doc['reach'],
        'avg_followers': doc['avg_followers'],
    }
    for doc in result
]

df = pd.DataFrame(rows)
df

In [None]:
# In the previous notebook, we counted followers per user by calling find_one()
# for each account in a loop — lots of queries.
# Here a single aggregation query does the same job.
# A user's follower count may change between posts, so take the maximum
# follower count seen for each user across all of their posts.

# one group per account; $max keeps the largest follower count in the group
max_per_user = {
    '$group': {
        '_id': '$account.acct',
        'max_followers': {'$max': '$account.followers_count'},
    }
}
pipeline = [
    max_per_user,
    {'$sort': {'max_followers': -1}},
    {'$limit': 10},
]

result = coll.aggregate(pipeline)
for user_stats in result:
    pprint(user_stats)

In [None]:
# Pick one popular hashtag seen above and find the five users who used it
# and have the highest follower counts.
# Grouping by account.acct again (with the maximum follower count per user)
# keeps each user from appearing more than once.

hashtag_to_check = 'nokings'  # replace with a real hashtag from above

# $match comes first so only posts tagged with the chosen hashtag reach $group
only_this_hashtag = {'$match': {'tags.name': hashtag_to_check}}
max_per_user = {
    '$group': {
        '_id': '$account.acct',
        'max_followers': {'$max': '$account.followers_count'},
    }
}
pipeline = [
    only_this_hashtag,
    max_per_user,
    {'$sort': {'max_followers': -1}},
    {'$limit': 5},
]

result = coll.aggregate(pipeline)
for user_stats in result:
    pprint(user_stats)

# Save this file WITH ALL OUTPUT SHOWING and submit to Canvas