In [129]:
import os
import pandas as pd
import praw
import time
import pickle

from dotenv import load_dotenv
from plotnine import *
from praw.models import MoreComments

# Load the Reddit API credentials from the local .env file into the environment,
# then read each one explicitly (a missing key raises KeyError right here).
load_dotenv('.env')
username = os.environ['username']
client_id = os.environ['client_id']
client_secret = os.environ['client_secret']
pw = os.environ['pw']

# PRAW client used by the scraping cell further down (reddit.subreddit(...)).
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent='jwhendy-scraper',
    username=username,
    password=pw,
)

In [3]:
# Scrape sizing knobs.
sub_limit = 150   # posts requested from each sub's "top" listing
n_top = 50        # comment_limit used when a post is fetched with sort='top'
n_old = 1000      # comment_limit used when a post is fetched with sort='old'

# Subreddits to scrape / analyse, in the order they are processed.
subs = [
    'askreddit', 'dataisbeautiful', 'funny', 'adviceanimals', 'friendsafari',
    'pics', 'wtf', 'gaming', 'videos', 'teenagers',
    'todayilearned', 'worldnews', 'nba', 'soccer', 'hockey',
    'debatereligion', 'askscience', 'announcements', 'aww', 'music',
    'movies', 'news', 'showerthoughts', 'science', 'iama',
    'food', 'jokes', 'explainlikeimfive', 'gifs', 'books',
    'lifeprotips', 'art', 'blog', 'earthporn', 'mildlyinteresting',
    'diy', 'sports', 'nottheonion', 'space', 'gadgets',
    'television', 'documentaries', 'photoshopbattles', 'listentothis', 'upliftingnews',
    'tifu', 'internetisbeautiful', 'history', 'philosophy', 'futurology',
    'oldschoolcool', 'writingprompts', 'nosleep', 'personalfinance', 'creepy',
    'twoxchromosomes', 'memes',
]

def get_post_info(post, sub_name):
    """Flatten one submission into a plain dict of the fields kept for analysis.

    Parameters
    ----------
    post : object exposing ``id``, ``title``, ``num_comments``, ``score`` and
        ``created_utc`` attributes (a PRAW submission in this notebook).
    sub_name : value stored under the ``'sub'`` key.

    Returns
    -------
    dict with keys ``sub, post_id, title, comments, score, created_utc`` in that
    order. The order matters: downstream code renames merged columns by position.
    """
    return dict(
        sub=sub_name,
        post_id=post.id,
        title=post.title,
        comments=post.num_comments,
        score=post.score,
        created_utc=post.created_utc,
    )

def get_comment_info(comment, sort):
    """Flatten one comment into a plain dict of the fields kept for analysis.

    Parameters
    ----------
    comment : object exposing ``submission.id``, ``id``, ``created_utc``,
        ``score`` and ``parent_id`` (a PRAW comment in this notebook).
    sort : label of the comment sort the comment was fetched under; stored as-is.

    Returns
    -------
    dict with keys ``post_id, comment_id, created_utc, score, top_level, sort``
    in that order. The order matters: downstream code renames merged columns by
    position.
    """
    record = {}
    record['post_id'] = comment.submission.id
    record['comment_id'] = comment.id
    record['created_utc'] = comment.created_utc
    record['score'] = comment.score
    # A parent id carrying the 't3_' prefix is what this notebook treats as top-level.
    record['top_level'] = comment.parent_id.startswith('t3_')
    record['sort'] = sort

    return record

def get_comments(post, sort, n):
    """Fetch a post's comments under a given sort and flatten them to dicts.

    Sets ``post.comment_sort`` and ``post.comment_limit`` before touching
    ``post.comments``, then walks the flattened comment list. ``MoreComments``
    placeholders are skipped rather than expanded.

    Parameters
    ----------
    post : submission whose comments are read (its sort/limit attributes are mutated).
    sort : comment sort to request; also recorded on every returned row.
    n : value assigned to ``post.comment_limit``.

    Returns
    -------
    list of dicts as produced by ``get_comment_info``.
    """
    post.comment_sort = sort
    post.comment_limit = n

    rows = []
    for item in post.comments.list():
        if isinstance(item, MoreComments):
            continue
        rows.append(get_comment_info(item, sort))

    return rows


def save_cmnt_df(cmnt_list, sub_name, n, perc, posts=None):
    """Join comment rows to their posts, derive per-post metrics and write a CSV.

    Parameters
    ----------
    cmnt_list : list of dicts as produced by ``get_comment_info``.
    sub_name, n, perc : only used to build the output file name
        ``./data/n-{n}_perc-{perc}_{sub_name}.csv``.
    posts : optional list of dicts as produced by ``get_post_info``. When omitted
        the notebook-level ``post_list`` is used, which is what earlier versions
        of this function always did (hidden global state; prefer passing it).

    Returns
    -------
    None. The resulting frame is written to disk.

    Added columns: ``in_old`` (comment id also seen under sort 'old'),
    ``t_delta_min`` (minutes between post and comment), ``comment_score_pct``
    (comment score / sum of retained comment scores for the post) and ``nth``
    (0-based position within the post after sorting by creation time).
    """
    if posts is None:
        # Backward-compatible fallback to the module-level list.
        posts = post_list

    df = pd.DataFrame(cmnt_list).merge(pd.DataFrame(posts), on='post_id')
    # Positional rename: relies on the key order of get_comment_info / get_post_info.
    df.columns = ['post_id', 'comment_id', 'comment_created_utc', 'comment_score', 'top_level',
                  'sort', 'sub', 'post_title', 'comments_n', 'post_score', 'post_created_utc']
    df = df.sort_values(['post_id', 'comment_created_utc', 'sort']).reset_index(drop=True)

    old_ids = df.loc[df['sort'] == 'old', 'comment_id'].unique()
    df['in_old'] = df['comment_id'].isin(old_ids)

    # Within a (post, comment) pair the rows are ordered by 'sort', so keep='last'
    # retains the row whose sort label sorts last.
    df = df.drop_duplicates(subset=['post_id', 'comment_id'], keep='last')

    df['t_delta_min'] = (df['comment_created_utc'] - df['post_created_utc']) / 60
    df['comment_score_pct'] = (
        df['comment_score'] / df.groupby('post_id')['comment_score'].transform('sum'))
    # cumcount is the built-in for "0..len-1 within each group".
    df['nth'] = df.groupby('post_id').cumcount()

    os.makedirs('./data', exist_ok=True)
    df.to_csv(f'./data/n-{n}_perc-{perc}_{sub_name}.csv', index=False)

In [26]:
### Original scraping pass. Disabled by default: it queries Reddit for every sub in
### `subs` and overwrites the pickles in ./data. Set RUN_SCRAPE = True to re-run it.
RUN_SCRAPE = False

if RUN_SCRAPE:
    os.makedirs('./data', exist_ok=True)

    for sub_name in subs:
        print(sub_name)
        post_list, top_list, old_list = [], [], []
        sub = reddit.subreddit(sub_name)

        # First pass: post metadata plus comments under the 'top' sort.
        for post in sub.top(limit=sub_limit):
            post_list.append(get_post_info(post, sub_name))
            top_list.extend(get_comments(post, sort='top', n=n_top))

        # Second pass over a fresh listing for the 'old' sort.
        # NOTE(review): presumably needed because PRAW only honours comment_sort set
        # before a submission's comments are first fetched -- confirm against PRAW docs.
        for post in sub.top(limit=sub_limit):
            old_list.extend(get_comments(post, sort='old', n=n_old))

        # Cache the raw records; `with` guarantees each file is flushed and closed.
        for prefix, records in (('posts', post_list), ('top', top_list), ('old', old_list)):
            with open(f'./data/{prefix}_{sub_name}.pickle', 'wb') as fh:
                pickle.dump(records, fh)

askreddit


In [88]:
def interpolate_n(x):
    """Fill ``nth`` for rows outside the 'old' sample from an average comment rate.

    The rate is the mean of ``rate`` over the rows from the 75% position up to
    (but excluding) the last row. Rows where ``in_old`` is False get
    ``nth = t_delta_min * mean_rate``; other rows are left alone.

    Parameters
    ----------
    x : DataFrame with ``rate``, ``in_old``, ``nth`` and ``t_delta_min`` columns.

    Returns
    -------
    The same DataFrame object, modified in place.
    """
    tail_start = int(len(x) * 0.75)
    mean_rate = x['rate'].iloc[tail_start:-1].mean()

    outside_old = x['in_old'] == False
    x.loc[outside_old, 'nth'] = x.loc[outside_old, 't_delta_min'] * mean_rate

    return x

def in_oldest(x):
    """Flag rows that fall inside the time window covered by the 'old' sample.

    The window ends at the largest ``dt_min_post`` among rows whose ``sort`` is
    ``'old'``; every row at or before that time gets ``in_old = True``.

    Parameters
    ----------
    x : DataFrame with ``sort`` and ``dt_min_post`` columns.

    Returns
    -------
    The same DataFrame object with an ``in_old`` boolean column added in place.
    """
    is_old_row = x['sort'] == 'old'
    cutoff = x.loc[is_old_row, 'dt_min_post'].max()
    x['in_old'] = x['dt_min_post'].le(cutoff)

    return x

def lists_to_df(post_list, top_list, old_list, n_top):
    """Combine scraped post/comment records into one analysis-ready DataFrame.

    Parameters
    ----------
    post_list : list of dicts as produced by ``get_post_info``.
    top_list : list of dicts as produced by ``get_comment_info`` with sort 'top'.
    old_list : list of dicts as produced by ``get_comment_info`` with sort 'old'.
    n_top : number of highest-scoring 'top' comments to keep per post.

    Returns
    -------
    DataFrame with one row per (post, comment), ordered by post and comment
    creation time, with columns

        post_id, comment_id, comment_created_utc, comment_score, top_level, sort,
        sub, post_title, comments_n, post_score, post_created_utc,
        dt_min_post, dt_min_cmnt, in_old, rank, comment_score_pct

    ``dt_min_post`` is minutes since the post, ``dt_min_cmnt`` minutes since the
    post's earliest retrieved comment, ``in_old`` whether the comment falls inside
    the time span covered by the 'old' sample, ``rank`` the score rank within the
    post (NaN for rows kept from the 'old' sample) and ``comment_score_pct`` the
    comment's share of the summed scores of the post's retained comments.
    """
    top_df = (pd.DataFrame(top_list)
              .sort_values(['post_id', 'score'], ascending=False)
              .groupby('post_id')
              .head(n_top))
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
    df = pd.concat([top_df, pd.DataFrame(old_list)]).merge(pd.DataFrame(post_list), on='post_id')
    # Positional rename: relies on the key order of get_comment_info / get_post_info.
    df.columns = ['post_id', 'comment_id', 'comment_created_utc', 'comment_score', 'top_level',
                  'sort', 'sub', 'post_title', 'comments_n', 'post_score', 'post_created_utc']

    ### time since post / since first retrieved comment, in minutes (never negative)
    df['dt_min_post'] = ((df['comment_created_utc'] - df['post_created_utc']) / 60).clip(lower=0)
    first_comment = df.groupby('post_id')['comment_created_utc'].transform('min')
    df['dt_min_cmnt'] = ((df['comment_created_utc'] - first_comment) / 60).clip(lower=0)

    ### is the comment within the time span covered by the post's 'old' sample?
    # Vectorised form of the per-post in_oldest() rule: compare against the latest
    # 'old' comment of the same post. Posts without 'old' rows get NaN -> False.
    old_times = df['dt_min_post'].where(df['sort'] == 'old')
    latest_old = old_times.groupby(df['post_id']).transform('max')
    df['in_old'] = df['dt_min_post'] <= latest_old

    ### sort by sort, creation time; drop dupes to retain only old not in top
    # ('old' < 'top' alphabetically, so keep='last' keeps the 'top' copy of a comment)
    df = df.sort_values(['post_id', 'sort', 'comment_created_utc']).reset_index(drop=True)
    df = df.drop_duplicates(subset=['post_id', 'comment_id'], keep='last')

    ### sort by score, assign rank (1 = highest score); rows from the 'old' sample get NaN
    df = df.sort_values(['post_id', 'comment_score'], ascending=False).reset_index(drop=True)
    score_rank = df.groupby('post_id').cumcount() + 1
    df['rank'] = score_rank.where(df['sort'] != 'old')

    ### final ordering: by creation time within each post
    df = df.sort_values(['post_id', 'comment_created_utc']).reset_index(drop=True)

    ### score dominance: share of the post's retained comment score
    df['comment_score_pct'] = (
        df['comment_score'] / df.groupby('post_id')['comment_score'].transform('sum'))

    ### label cleanup
    # NOTE(review): the "~500" in the old label is a fixed string, not derived from data.
    df.loc[df['sort'] == 'old', 'sort'] = 'oldest ~500 comments/post'
    df.loc[df['sort'] == 'top', 'sort'] = f'top {n_top} comments/post'

    return df


def unpickle(sub_name):
    """Load the cached scrape results for one subreddit.

    Reads ``./data/posts_{sub_name}.pickle``, ``./data/top_{sub_name}.pickle`` and
    ``./data/old_{sub_name}.pickle``.

    Parameters
    ----------
    sub_name : subreddit name used in the pickle file names.

    Returns
    -------
    tuple ``(post_list, top_list, old_list)`` with the unpickled objects.

    Raises
    ------
    FileNotFoundError if any of the three files is missing.
    """
    loaded = []
    for prefix in ('posts', 'top', 'old'):
        # SECURITY: pickle.load can execute arbitrary code. Only load files that
        # this notebook wrote itself, never pickles from an untrusted source.
        # `with` closes each handle instead of leaving it to the garbage collector.
        with open(f'./data/{prefix}_{sub_name}.pickle', 'rb') as fh:
            loaded.append(pickle.load(fh))

    post_list, top_list, old_list = loaded
    return post_list, top_list, old_list


# One frame per subreddit from the cached pickles (top 10 comments per post), stacked.
df_list = [lists_to_df(*unpickle(sub_name), n_top=10) for sub_name in subs]
df = pd.concat(df_list)

# Keep posts reporting more than 500 comments ...
df = df[df['comments_n'].gt(500)]
# ... comments made at most 1440 minutes after the post ...
df = df[df['dt_min_post'].le(1440)]
# ... and only posts that still have more than 250 rows after those filters.
df = df.groupby('post_id').filter(lambda post_rows: len(post_rows) > 250)

df.tail()

Unnamed: 0,post_id,comment_id,comment_created_utc,comment_score,top_level,sort,sub,post_title,comments_n,post_score,post_created_utc,dt_min_post,dt_min_cmnt,in_old,rank,comment_score_pct
67315,hxq59l,fz8p761,1595713000.0,1,True,oldest ~500 comments/post,memes,Slap it quick!,1448,142964,1595697000.0,278.716667,277.683333,True,,4.8e-05
67316,hxq59l,fz8p7gs,1595713000.0,1,True,oldest ~500 comments/post,memes,Slap it quick!,1448,142964,1595697000.0,278.783333,277.75,True,,4.8e-05
67317,hxq59l,fz8p8wy,1595714000.0,1,True,oldest ~500 comments/post,memes,Slap it quick!,1448,142964,1595697000.0,279.166667,278.133333,True,,4.8e-05
67318,hxq59l,fz8pa5g,1595714000.0,20,False,oldest ~500 comments/post,memes,Slap it quick!,1448,142964,1595697000.0,279.483333,278.45,True,,0.000958
67319,hxq59l,fz8pa8q,1595714000.0,8,False,oldest ~500 comments/post,memes,Slap it quick!,1448,142964,1595697000.0,279.5,278.466667,True,,0.000383


In [118]:
# Total number of comment rows retained after filtering.
df.shape[0]

3118914

In [120]:
# Number of distinct posts retained after filtering.
len(df['post_id'].drop_duplicates())

7036

In [121]:
# Distribution of dt_min_cmnt (minutes) for the rows labelled as top-10 comments.
df.loc[df['sort'].eq('top 10 comments/post'), 'dt_min_cmnt'].describe()

count    70335.000000
mean        92.169330
std         95.744594
min          0.000000
25%         25.500000
50%         66.983333
75%        130.391667
max       1348.416667
Name: dt_min_cmnt, dtype: float64

In [96]:
### Comment score share vs. comment timing, one facet per sort label.
# NOTE(review): x is dt_min_cmnt (minutes since the post's first retrieved comment),
# while the axis label reads "time since submission" -- confirm which one is intended.
top_label = 'top 10 comments/post'

# Reference line at the median delay of the top comments, shown in that facet only.
# A one-row frame is enough: mapping a constant over the full subset drew one
# identical line per comment row, and the value was a hand-copied median (67 min).
vline_df = pd.DataFrame({
    'sort': [top_label],
    'median_hr': [df.loc[df['sort'] == top_label, 'dt_min_cmnt'].median() / 60],
})

p = (
    ggplot(df, aes(x='dt_min_cmnt/60', y='comment_score_pct'))
    + geom_point(size=0.2, alpha=0.2)
    + facet_wrap('~sort', ncol=2)
    + scale_x_continuous(name='time since submission, hours',
                         breaks=[0, 6, 12, 18, 24], limits=[0, 24])
    + scale_y_continuous(name='comment score/sum(top 10 + oldest)')
    + theme_minimal()
    + theme(text=element_text(family='Hack', size=12),
            axis_text=element_text(family='Hack', size=10))
    + geom_vline(aes(xintercept='median_hr'), data=vline_df,
                 color='red', size=0.5, linetype='dashed')
    + theme(subplots_adjust={'wspace': 0.2})
)
# The figure is not displayed inline; the next cell writes it to disk.

In [97]:
# Write the faceted scatter plot to disk (7 x 4 at 200 dpi).
p.save(
    'oldest-vs-top_delta.png',
    dpi=200,
    width=7,
    height=4,
)

  warn("Saving {0} x {1} {2} image.".format(


In [99]:
# Order subreddits by how quickly their top-10 comments arrive:
# mean dt_min_cmnt per post first, then the mean of those per-post means per sub.
top10_rows = df.copy().loc[df['sort'] == 'top 10 comments/post']
per_post_mean = top10_rows.groupby(['sub', 'post_id'], as_index=False).agg({'dt_min_cmnt': 'mean'})
df_c = per_post_mean.groupby(['sub'], as_index=False)['dt_min_cmnt'].agg('mean')

# Sub names from smallest to largest mean delay; used as the facet order below.
sub_order = df_c.sort_values('dt_min_cmnt')['sub'].unique()
sub_order

array(['soccer', 'nba', 'hockey', 'blog', 'announcements', 'movies',
       'iama', 'news', 'funny', 'gifs', 'gaming', 'mildlyinteresting',
       'sports', 'pics', 'aww', 'creepy', 'television', 'todayilearned',
       'worldnews', 'wtf', 'space', 'teenagers', 'nottheonion', 'videos',
       'memes', 'friendsafari', 'adviceanimals', 'gadgets',
       'oldschoolcool', 'twoxchromosomes', 'dataisbeautiful', 'tifu',
       'art', 'upliftingnews', 'showerthoughts', 'askreddit', 'books',
       'music', 'diy', 'futurology', 'lifeprotips', 'food',
       'internetisbeautiful', 'earthporn', 'science', 'listentothis',
       'personalfinance', 'jokes', 'philosophy', 'photoshopbattles',
       'documentaries', 'askscience', 'history', 'explainlikeimfive',
       'nosleep', 'writingprompts', 'debatereligion'], dtype=object)

In [125]:
# Copy of df whose 'sub' column is categorical in sub_order, so facets follow that order.
df_d = df.assign(sub=pd.Categorical(df['sub'], categories=sub_order))

# Scaled density of comment timing per subreddit, one curve per sort label.
p = (
    ggplot(df_d, aes(x='dt_min_cmnt/60', color='sort'))
    + geom_density(aes(y='stat(scaled)'))
    + facet_wrap('~sub', ncol=8)
    + scale_x_continuous(name='time since submission, hours', breaks=[0, 6, 12, 18, 24])
    + theme_minimal()
    + theme(text=element_text(family='Hack', size=12),
            axis_text=element_text(family='Hack', size=10),
            plot_background=element_rect(fill='white'))
    + scale_y_continuous(name='density')
    + scale_color_manual(name=' ',
                         breaks=['top 10 comments/post', 'oldest ~500 comments/post'],
                         labels=['top 10 comments', 'oldest ~500 comments'],
                         values=['black', 'red'])
    + theme(subplots_adjust={'wspace': 0.2})
)
# The figure is not displayed inline; the next cell writes it to disk.

In [126]:
# Write the per-subreddit density grid to disk (16 x 14 at 200 dpi).
p.save(
    'oldest-vs-top_by-sub.png',
    dpi=200,
    width=16,
    height=14,
)

  warn("Saving {0} x {1} {2} image.".format(
