In [1]:
# !pip install praw
# !pip install requests-cache
# !pip install pymongo

In [1]:
import requests_cache
# Install requests-cache globally under the cache name 'web_cache'.
# NOTE(review): presumably meant to cache the HTTP calls made by the Reddit client
# created further down — confirm that it actually takes effect for that session.
requests_cache.install_cache('web_cache')

In [2]:
import praw
from pymongo import MongoClient
import json
import datetime
import pandas as pd
from tqdm.notebook import tqdm
from pathlib import Path
from IPython.display import JSON

In [3]:
# MongoDB handles: client -> 'bad-vis' database -> 'reddit-raw' collection.
mongo = MongoClient(host='172.17.0.1', port=27017)
db = mongo.get_database('bad-vis')
raw = db.get_collection('reddit-raw')

In [3]:
# Folder (relative to the notebook's working directory) that holds the crawl dumps.
data_dir = Path('data')

In [4]:
# Load the Reddit API credentials from a JSON file in the working directory.
# No placeholder value is bound beforehand: if the file cannot be read, the cell
# fails here instead of leaving a wrongly-typed `credentials` behind for later cells.
with open('./reddit_credentials.json', encoding='utf-8') as f:
    credentials = json.load(f)

# Fail early, with a readable message, if a field needed by praw.Reddit(...) is absent.
_missing_credential_keys = (
    {'client_id', 'client_secret', 'password', 'user_agent', 'username'} - set(credentials)
)
if _missing_credential_keys:
    raise KeyError(
        f'reddit_credentials.json is missing keys: {sorted(_missing_credential_keys)}'
    )

In [5]:
# Build the Reddit client, forwarding the five credential fields as keyword arguments.
reddit = praw.Reddit(**{
    field: credentials[field]
    for field in ('client_id', 'client_secret', 'password', 'user_agent', 'username')
})

In [6]:
# Smoke test: fetch the two newest r/dataisugly submissions and display them.
submissions = list(reddit.subreddit('dataisugly').new(limit=2))
submissions

[Submission(id='enn8n7'), Submission(id='eni5xa')]

In [7]:
# Module-level list consulted by digest_submission's error handler: its current
# length is reported as the `idx` of a failed digest.
error_submissions = []

def digest_submission_preview (submission):
    """Return the source URL of the submission's first preview image.

    Returns '' when the submission has no `preview` attribute. Any exception
    raised while reading the preview is printed and handed back as a
    'Preview error: ...' string rather than propagated.
    """
    try:
        if not hasattr(submission, 'preview'):
            return ''
        first_image = submission.preview['images'][0]
        return first_image['source']['url']
    except Exception as err:
        print(f'Reddit submissions preview digest error: {err}')
        return f'Preview error: {err}'

def digest_submission (submission):
    """Flatten a submission object into a plain, JSON-serialisable dict.

    Args:
        submission: object exposing `title`, `permalink`, `created_utc`,
            `author`, `selftext`, `url`, `id`, `num_comments` and `score`.

    Returns:
        A dict with the keys title, permalink, datetime, author, selftext,
        link, preview, id, num_comments and score. If reading any field
        raises, the submission is appended to the module-level
        `error_submissions` list and a stub dict
        `{'title': 'digest error', 'idx': <position in error_submissions>,
        'error': <message>}` is returned instead, so one bad item does not
        abort a whole crawl.
    """
    try:
        return {
            'title': submission.title,
            'permalink': submission.permalink,
            # created_utc is a UTC epoch timestamp: convert it explicitly in UTC so the
            # stored value does not depend on the crawling machine's local timezone.
            # tzinfo is dropped to keep the same naive ISO format as before.
            'datetime': datetime.datetime.fromtimestamp(
                submission.created_utc, tz=datetime.timezone.utc
            ).replace(tzinfo=None).isoformat(),
            # author is falsy for some submissions (handled here as an empty name)
            'author': submission.author.name if submission.author else '',
            'selftext': submission.selftext,
            'link': submission.url,
            'preview': digest_submission_preview(submission),
            'id': submission.id,
            'num_comments': submission.num_comments,
            'score': submission.score,
#             'upvote_ratio': submission.upvote_ratio # require extra request
        }
    except Exception as inst:
        # Record the failing submission so that `idx` really points at it and the
        # error list can be inspected after the crawl.
        idx = len(error_submissions)
        error_submissions.append(submission)
        print(f'Reddit submissions digest error: {inst} idx: {idx}')
        return {
            'title': 'digest error',
            'idx': idx,
            'error': str(inst)
        }

def submissions_iter (r, sort, limit):
    """Return the listing of subreddit `r` selected by `sort`.

    'top_all' and 'top_year' map to the top listing for 'all' / 'year';
    every other `sort` value yields the newest-first listing. `limit` is
    forwarded to the listing call unchanged.
    """
    subreddit = reddit.subreddit(r)
    for sort_key, time_filter in (('top_all', 'all'), ('top_year', 'year')):
        if sort == sort_key:
            return subreddit.top(time_filter, limit=limit)
    # Anything else (including 'new') falls through to the newest-first listing.
    return subreddit.new(limit=limit)

def retrieve_submissions (r='dataisugly', sort='top_all', limit=100):
    """Lazily digest the submissions of subreddit `r`.

    The listing chosen by `sort` is wrapped in a progress bar whose total is
    `limit`; the returned generator applies digest_submission to each item
    as it is consumed.
    """
    listing = submissions_iter(r, sort, limit)
    progress = tqdm(listing, total=limit)
    return (digest_submission(entry) for entry in progress)

In [8]:
# crawler_queue = [
#     {'file': 'r_dataisugly_test.json', 'r': 'dataisugly', 'sort': 'top_all', 'limit': 10}
# ]

# One crawl job per listing of r/dataisugly, 1000 submissions each; every job
# names the JSON file its results are written to.
crawler_queue = [
    {'file': f'r_dataisugly_{listing}_1000.json', 'r': 'dataisugly', 'sort': listing, 'limit': 1000}
    for listing in ('top_all', 'top_year', 'new')
]

In [9]:
# Run every crawl job and dump its digested submissions to <data_dir>/<file> as JSON.
data_dir.mkdir(parents=True, exist_ok=True)  # on a first run the output folder may not exist yet
for c in crawler_queue:
    # The keyword must be `sort`: retrieve_submissions has no `sor` parameter,
    # so the misspelled keyword raised TypeError before anything was crawled.
    submissions = list(retrieve_submissions(r=c['r'], sort=c['sort'], limit=c['limit']))
    with open(data_dir/c['file'], 'w') as f:
        json.dump(submissions, f)

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [10]:
# Display the module-level error list to check for digest failures after the crawl.
error_submissions

[]

In [14]:
# Read each crawl job's JSON dump back and stack the frames in queue order.
# Each frame keeps its own row labels, so labels repeat across jobs.
df = pd.concat([
    pd.read_json(data_dir / job['file'])
    for job in crawler_queue
])
df

Unnamed: 0,title,permalink,datetime,author,selftext,link,preview,id,num_comments,score
0,Accurate,/r/dataisugly/comments/66i283/accurate/,2017-04-20 14:33:08,Taipan100,,https://i.redd.it/9fxvy22dypsy.jpg,https://preview.redd.it/9fxvy22dypsy.jpg?auto=...,66i283,47,2073
1,"""fuckin idk just flip the graph upside down""",/r/dataisugly/comments/e2cvur/fuckin_idk_just_...,2019-11-27 08:03:07,khodor2012,,https://i.imgur.com/sTVYOIH.jpg,https://external-preview.redd.it/7KwLWyf4c1EvB...,e2cvur,27,1211
2,I present to you the printed press in Britain,/r/dataisugly/comments/6h6cr9/i_present_to_you...,2017-06-14 09:12:54,Anotimpuri,,https://i.redd.it/w5kjne5dvk3z.jpg,https://preview.redd.it/w5kjne5dvk3z.jpg?auto=...,6h6cr9,16,1199
3,One banana is most certainly better than two,/r/dataisugly/comments/4xjubo/one_banana_is_mo...,2016-08-13 16:44:06,AnAverageCat,,http://imgur.com/a/DD28T,https://external-preview.redd.it/I4-QVL487d_fh...,4xjubo,39,1168
4,When Venn diagram goes wrong,/r/dataisugly/comments/boo6ld/when_venn_diagra...,2019-05-14 20:34:31,seretidediskus,,https://i.redd.it/yke7x1e5m8y21.jpg,https://preview.redd.it/yke7x1e5m8y21.jpg?auto...,boo6ld,14,1043
...,...,...,...,...,...,...,...,...,...,...
995,"[OC] England's homeownership rate by age, 1981...",/r/dataisugly/comments/9bzlxi/oc_englands_home...,2018-09-01 02:23:37,TentacleYuri,,https://i.redd.it/k15ne8sc7ij11.png,https://preview.redd.it/k15ne8sc7ij11.png?auto...,9bzlxi,8,147
996,How Important is the MCAT? More important than...,/r/dataisugly/comments/9bxosl/how_important_is...,2018-08-31 21:39:57,nosretttapL,,https://i.redd.it/7ufxjhi3zhj11.jpg,https://preview.redd.it/7ufxjhi3zhj11.jpg?auto...,9bxosl,1,22
997,"Having the same spacing is nice and all, but s...",/r/dataisugly/comments/9bvo4v/having_the_same_...,2018-08-31 17:37:03,srgnknd,,https://i.imgur.com/DUHG2u4.png,https://external-preview.redd.it/TCIf15X0ZCPUc...,9bvo4v,1,18
998,This is how another instructor sent me his stu...,/r/dataisugly/comments/9bva50/this_is_how_anot...,2018-08-31 16:52:11,dirtydog85,"&#x200B;\n\n[Also, the red overall \\""column\\...",https://www.reddit.com/r/dataisugly/comments/9...,,9bva50,11,85


In [17]:
# Export the combined frame as a single CSV in data_dir; index=False leaves the row labels out.
df.to_csv(data_dir/'r_dataisugly.csv', index=False)