In [262]:
import praw
import os
import glob
import time
from tqdm import tqdm
from dotenv import load_dotenv
from pymongo import MongoClient
import datetime as dt
import numpy as np
import pandas as pd
%config Completer.use_jedi = False

# NOTE(review): hard-coded absolute path — the notebook only runs on a machine
# that has this directory. Presumably this is the project root that holds the
# `.env` file read below; confirm, and consider a configurable project dir.
os.chdir('/shared/jackie/resilient-communities')

# Populate os.environ from a .env file before the credentials are read.
load_dotenv()

# Reddit API client. Every credential comes from the environment (the `_2`
# suffixed variables); `os.environ.get` yields None for any that are unset.
reddit = praw.Reddit(
    client_id=os.environ.get('REDDIT_CLIENT_ID_2'),
    client_secret=os.environ.get('REDDIT_CLIENT_SECRET_2'),
    user_agent=os.environ.get('REDDIT_USER_AGENT_2'),
    username=os.environ.get('REDDIT_USERNAME_2'),
    password=os.environ.get('REDDIT_PASSWORD_2')
)

# MongoDB handles. `client.<name>` selects a database and `<database>.<name>`
# selects a collection, so the first two are collections in the `resilient`
# database and the last two are whole databases.
client = MongoClient('localhost', 27017)
all_comments = client.resilient.all
popular_threads = client.resilient.popular_threads
pushshift_comments = client.pushshift_comments
# NOTE(review): this is the SAME database as `pushshift_comments` above. It
# looks like a copy-paste slip; presumably `client.pushshift_threads` was
# intended. TODO confirm the database name before relying on this handle.
pushshift_threads = client.pushshift_comments

# Get the total number of comments from PRAW stream.

We stream all comments into a single MongoDB collection.

In [287]:
# Size of the streamed-comment collection (an estimate, not an exact count),
# shown as a one-row table with thousands separators.
comment_total = all_comments.estimated_document_count()
pd.DataFrame({'N': [comment_total]}, index=['Comments']).style.format('{:,}')

Unnamed: 0,N
Comments,525300000


# Get r/popular threads stored in the collection.

Each r/popular thread and its respective features are stored statically in a MongoDB collection. There are other collections (e.g., a collection for all Reddit comments and a collection for r/popular snapshots), but calculations should be run on the r/popular collection. We derive features from those other collections in scripts. The scripts include: `store-popular-threads.py`, `generate-praw-features.py`, and `requery-comments.py`.

In [291]:
def get_popular_threads(filter=None, columns=None, n=None):
    """Load r/popular threads from MongoDB into a DataFrame indexed by thread id.

    Parameters
    ----------
    filter : dict, optional
        MongoDB query document passed to ``popular_threads.find``. ``None``
        (the default) matches every thread.
    columns : list of str, optional
        Document fields to return, in this order. ``None`` (the default)
        returns a frame with the ``id`` index and no columns.
    n : int, optional
        Maximum number of threads to fetch. ``None`` fetches everything.
        Note that PyMongo treats ``limit(0)`` as "no limit", so ``n=0`` also
        returns every matching thread.

    Returns
    -------
    pandas.DataFrame
        One row per matching thread, indexed by ``id``, with exactly the
        requested ``columns``. When nothing matches, the frame is empty but
        still carries the requested columns and the ``id`` index, so
        downstream column lookups keep working.

    Raises
    ------
    KeyError
        If threads matched but none of them has a requested column.
    """
    # None sentinels instead of mutable `{}` / `[]` defaults, which are shared
    # between calls and easy to corrupt by accident.
    query = {} if filter is None else filter
    columns = [] if columns is None else list(columns)

    # `id` is always projected because it becomes the index. `_id` is excluded
    # by default so it is not transferred only to be thrown away afterwards.
    fields = {'_id': 0}
    fields.update({field: 1 for field in ['id'] + columns})

    cursor = popular_threads.find(query, fields)

    if n is not None:
        cursor = cursor.limit(n)

    df = pd.DataFrame(list(cursor))

    if len(df) == 0:
        # Keep the result shape consistent with the non-empty case.
        return pd.DataFrame(columns=columns, index=pd.Index([], name='id'))

    return df.set_index('id')[columns]

get_popular_threads(
    columns=['author', 'title', 'subreddit', 'created_utc']
)

Unnamed: 0_level_0,author,title,subreddit,created_utc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tkwdrr,No-Gap240,Thanks a latte!,antiwork,1.648049e+09
tkl164,BrokenShutters,A truly awful aged like milk,agedlikemilk,1.648007e+09
tl0bmh,BolaSquirrel,PSA to invaders,Eldenring,1.648054e+09
tl4kfi,Ok_Inevitable_9461,...and they call the left snowflakes,PoliticalHumor,1.648059e+09
tkv6er,Father-TedCrilly,Worlds fattest man around the 1900’s.,AbsoluteUnits,1.648046e+09
...,...,...,...,...
v7vbxz,MrMoonchild_,Not everyone is a sport illustrator swimsuit m...,JoeRogan,1.654710e+09
v7vc22,beerbellybegone,Don't disrespect Grandma,quityourbullshit,1.654710e+09
v7vdbm,beerbellybegone,Preacher speaking God's word,MurderedByWords,1.654710e+09
v7wym2,SnooCupcakes8607,You bullied the wrong student my guy,instant_regret,1.654714e+09


# What does the feature coverage look like?

Answers: how many r/popular threads have basic, comment, removed, and newcomer features?

In [292]:
def display_popular_thread_coverage():
    """Summarise how many r/popular threads carry each derived feature set.

    A thread counts towards a feature set when that set's timestamp field
    (``praw_utc``, ``requery_utc`` or ``newcomer_utc``) is present and not
    null. Counting is done by MongoDB via ``count_documents`` rather than by
    loading every matching thread into a DataFrame just to take its length.

    Returns
    -------
    pandas.io.formats.style.Styler
        Rows ``Total``, ``Comment``, ``Removed`` and ``Newcomer`` with columns
        ``N`` (thread count) and ``Percentage`` (share of the total, as a
        fraction). ``Percentage`` is NaN when the collection is empty.
    """
    # The total is an estimate, while the per-feature counts below are exact.
    total_threads = popular_threads.estimated_document_count()

    coverage_fields = {
        'Comment': 'praw_utc',
        'Removed': 'requery_utc',
        'Newcomer': 'newcomer_utc',
    }

    counts = {'Total': total_threads}
    for label, field in coverage_fields.items():
        counts[label] = popular_threads.count_documents({field: {'$ne': None}})

    def share(count):
        # Avoid ZeroDivisionError on an empty collection.
        return count / total_threads if total_threads else float('nan')

    return pd.DataFrame.from_dict({
        label: {'N': count, 'Percentage': share(count)}
        for label, count in counts.items()
    }, orient='index').style.format({'N': '{:,}', 'Percentage': '{:.2f}'})

display_popular_thread_coverage()

Unnamed: 0,N,Percentage
Total,34381,1.0
Comment,33883,0.99
Removed,14541,0.42
Newcomer,6057,0.18


# List the subreddits that reach r/popular in the dataset and the number of threads from each that reached it.

In [293]:
def list_popular_subreddits():
    """Count stored r/popular threads per subreddit.

    Returns a single-column DataFrame indexed by subreddit name, ordered from
    the most to the least frequent subreddit.
    """
    subreddit_of_thread = get_popular_threads(columns=['subreddit'])['subreddit']
    threads_per_subreddit = subreddit_of_thread.value_counts()
    return threads_per_subreddit.to_frame()

list_popular_subreddits()

Unnamed: 0,subreddit
memes,615
MadeMeSmile,539
worldnews,532
interestingasfuck,519
funny,501
...,...
law,1
BMW,1
britishcolumbia,1
trans,1


In [312]:
def display_pushshift_coverage(subreddits=None, progress=False):
    """Tabulate comment count and date range for Pushshift subreddit collections.

    Each subreddit is read from the collection of the same name in the
    ``pushshift_comments`` database.

    Parameters
    ----------
    subreddits : iterable of str, optional
        Subreddits (collection names) to report on. ``None`` (the default)
        keeps the notebook's existing behaviour of reporting on
        ``'MadeMeCry'`` only. Pass
        ``pushshift_comments.list_collection_names()`` to cover every
        collection; note this runs one aggregation per collection.
    progress : bool, optional
        Show a tqdm progress bar while iterating. Off by default.

    Returns
    -------
    pandas.io.formats.style.Styler
        One row per subreddit, in the order given, with columns ``N``
        (estimated document count), ``Start`` and ``End`` (earliest and latest
        ``created_utc``). ``Start``/``End`` are ``NaT`` for a collection with
        no documents or no ``created_utc`` values.
    """
    if subreddits is None:
        subreddits = ['MadeMeCry']
    else:
        subreddits = list(subreddits)

    def to_datetime(timestamp):
        # $min/$max yield null when no document carries `created_utc`.
        if timestamp is None:
            return pd.NaT
        # NOTE(review): `fromtimestamp` without a tz gives a naive datetime in
        # the kernel's local timezone, although the field is named
        # `created_utc`. Confirm whether UTC display is wanted.
        return dt.datetime.fromtimestamp(timestamp)

    table = {}

    for subreddit in tqdm(subreddits, disable=not progress):

        collection = pushshift_comments[subreddit]

        # With `_id: None` the whole collection is a single group, so the
        # cursor yields at most one document, and none if it is empty.
        bounds = next(collection.aggregate([
            {'$project': {'created_utc': 1}},
            {'$group': {'_id': None,
                        'min_utc': {'$min': '$created_utc'},
                        'max_utc': {'$max': '$created_utc'}}}
        ], allowDiskUse=True), None) or {}

        table[subreddit] = {
            'N': collection.estimated_document_count(),
            'Start': to_datetime(bounds.get('min_utc')),
            'End': to_datetime(bounds.get('max_utc'))
        }

    # Explicit columns keep the `N` formatter valid when `subreddits` is empty.
    return pd.DataFrame.from_dict(
        table, orient='index', columns=['N', 'Start', 'End']
    ).reindex(subreddits).style.format(formatter='{:,}', subset=['N'])

display_pushshift_coverage()

asdf


Unnamed: 0,N,Start,End
MadeMeCry,158549,2020-01-01 01:23:05,2022-06-08 12:34:30


In [309]:
# NOTE(review): leftover debugging cell — it prints a placeholder string and
# nothing else in the visible notebook uses it; safe to delete.
print('asdfa')

asdfa
