In [224]:
%config Completer.use_jedi = False
from pymongo import MongoClient
import pandas as pd
import datetime as dt
from collections import defaultdict

# Connect to the MongoDB server on localhost and select the `resilient`
# database that the rest of the notebook reads from.
client = MongoClient(host='localhost', port=27017)
db = client['resilient']

# r/all Analysis

## Connecting to the r/all comment collection.

In [243]:
# ac: handle to the `all` collection, which stores the r/all comments.
ac = db['all']

## Getting the number of comments in the collection.

In [244]:
# Report the collection's estimated document count with thousands separators.
ac_count = ac.estimated_document_count()
print(f'There are {ac_count:,} r/all comments.')

There are 100,000 r/all comments.


## Counting the number of comments for each author.

In [245]:
# aa: r/all authors with the number of comments they've made.
# The pipeline runs on `ac`, the r/all comment collection handle, so this
# cell does not depend on any name left over from an earlier kernel session.
aa = pd.DataFrame(ac.aggregate([
    {'$project': {'_id': 1, 'author': 1}}, # Only using author field.
    {'$group': {'_id': '$author', 'num_comments': {'$sum': 1}}}, # One row per author.
    {'$sort': {'num_comments': -1}} # Sort descending.
], allowDiskUse=True))

# Show the top authors, the frame's shape, and a caption.
print(
    aa.head(),
    aa.shape,
    '--- Number of comments per author. ---',
    sep='\n'
)

                    _id  num_comments
0         AutoModerator          3248
1                FMKBot           192
2  SecureMarionberry993            67
3            gyreor_654            65
4            leckerbrot            53
(66571, 2)
--- Number of comments per author. ---


## Counting the number of comments in each subreddit.

In [246]:
# asu: r/all comments per subreddit.
# Pipeline: keep only the subreddit field, count comments per subreddit,
# then order the groups from most to fewest comments.
asu_pipeline = [
    {'$project': {'_id': 1, 'subreddit': 1}},
    {'$group': {'_id': '$subreddit', 'num_comments': {'$sum': 1}}},
    {'$sort': {'num_comments': -1}},
]
asu_cursor = ac.aggregate(asu_pipeline, allowDiskUse=True)
asu = pd.DataFrame(asu_cursor)

# Preview, shape, and caption, one per line.
print(asu.head())
print(asu.shape)
print('--- Number of comments per subreddit. ---')

             _id  num_comments
0      AskReddit          3139
1  AmItheAsshole          1137
2      worldnews          1124
3      Eldenring          1114
4      teenagers           978
(13401, 2)
--- Number of comments per subreddit. ---


# r/popular Analysis

## Connecting to r/popular snapshots collection.

In [235]:
# pss: handle to the `popular` collection, which stores the r/popular snapshots.
pss = db['popular']

## Grouping r/popular thread IDs, identifying their subreddit, and listing their snapshots (along with snapshot attributes).

Each thread comes with its first snapshot timestamp (`min_utc`), its last snapshot timestamp (`max_utc`), and the number of seconds between the two, known as the thread's lifespan (`lifespan`).

In [247]:
# pt: r/popular threads with their subreddit and associated snapshots.
# Snapshots are sorted by retrieval time first so that `$first` picks values
# from the earliest snapshot and `$push` lists snapshots chronologically.
pt = pd.DataFrame(pss.aggregate([
    {'$sort': {'retrieved_utc': 1}},
    {'$group': {'_id': '$id',
                'subreddit': {'$first': '$subreddit'},
                'created_utc': {'$first': '$created_utc'},
                'num_snapshots': {'$sum': 1},
                'max_utc': {'$max': '$retrieved_utc'},
                'min_utc': {'$min': '$retrieved_utc'},
                'snapshots': {'$push': 
                              {'retrieved_utc': '$retrieved_utc',
                               'rank': '$rank',
                               'score': '$score',
                               'upvote_ratio': '$upvote_ratio',
                               'num_comments': '$num_comments'}}}},
    {'$sort': {'num_snapshots': -1}}
], allowDiskUse=True)).rename(columns={'_id': 'id'})

# `created_utc` is parsed as epoch seconds. Parse it as timezone-aware UTC and
# then *convert* to US/Central. Localizing the naive UTC values directly as
# US/Central would relabel the wall-clock time, shifting every instant by the
# UTC offset, and can raise on DST gaps or overlaps.
pt['datetime'] = \
    pd.to_datetime(pt['created_utc'], unit='s', utc=True) \
    .dt.tz_convert('US/Central')

# Lifespan: difference between the last and first snapshot retrieval times.
pt['lifespan'] = pt['max_utc'] - pt['min_utc']

print(
    pt[['id', 'subreddit', 'num_snapshots']].head(),
    pt.shape,
    '--- Snapshots grouped by threads. ---',
    sep='\n'
)

       id           subreddit  num_snapshots
0  tkzhsx   interestingasfuck             10
1  tkt1a5              Tinder             10
2  tl4kfi      PoliticalHumor             10
3  tktx7r  WhitePeopleTwitter             10
4  tkyg1n            Teachers             10
(102, 9)
--- Snapshots grouped by threads. ---


## Getting the number of r/popular threads for each subreddit.

In [248]:
# psc: r/popular thread count per subreddit.
# Count thread ids within each subreddit, largest counts first.
ids_by_subreddit = pt.groupby('subreddit')['id']
psc = ids_by_subreddit.count().sort_values(ascending=False)

# Preview, shape, and caption, one per line.
print(psc.head())
print(psc.shape)
print('--- Number of r/popular thread per subreddit. ---')

subreddit
WhitePeopleTwitter    2
Cringetopia           2
interestingasfuck     2
mildlyinfuriating     2
cats                  2
Name: id, dtype: int64
(89,)
--- Number of r/popular thread per subreddit. ---


## Average lifespan of r/popular threads for each subreddit.

In [251]:
# pals: r/popular average lifespan for each subreddit.
# Average the per-thread lifespan within each subreddit, longest first.
lifespan_by_subreddit = pt.groupby('subreddit')['lifespan']
pals = lifespan_by_subreddit.mean().sort_values(ascending=False)

# Preview, shape, and caption, one per line.
print(pals.head())
print(pals.shape)
print('--- Average lifespan for r/popular threads for each subreddit. ---')

subreddit
worldnews          1099
ShitLiberalsSay    1099
OldSchoolCool      1099
OnePiece           1099
OnePunchMan        1099
Name: lifespan, dtype: int64
(89,)
--- Average lifespan for r/popular threads for each subreddit. ---
