In [2]:
# !pip install praw
# !pip install requests-cache
# !pip install pymongo

In [2]:
import requests
import requests_cache
!mkdir -p web_cache
requests_cache.install_cache('web_cache/redditcrawler')
# requests_cache.clear()

In [3]:
import praw
from pymongo import MongoClient
import json
from datetime import datetime
from tqdm.notebook import tqdm
from IPython.display import JSON
from collections import Counter

from lib.parallel import parallel
from lib.labels_from_tags import labels_from_tags

In [4]:
# MongoDB connection and the collections used throughout this notebook.
# NOTE(review): 172.17.0.1 is presumably the Docker bridge gateway of the
# machine this was run on -- confirm before running elsewhere.
mongo = MongoClient('172.17.0.1', 27017)
db = mongo['bad-vis']
reddit_raw = db['reddit-raw']        # written by digest_reddit_submission
pushshift_raw = db['pushshift-raw']  # written by digest_pushshift_submission
reddit_merge = db['reddit-merge']    # written by add_merge
reddit_cache = db['reddit-cache']    # read/written by retrieve_submission
posts = db['posts']                  # written by digest_all_submissions

In [5]:
# Load the Reddit API credentials from a local JSON file.
# No placeholder value is assigned beforehand: if the file is missing or
# malformed the cell fails here, instead of leaving an empty string behind
# that only breaks later when it is indexed like a dict.
with open('./reddit_credentials.json') as f:
    credentials = json.load(f)

In [6]:
# PRAW client shared by every Reddit request in this notebook; all five
# settings are read from the `credentials` dict loaded above.
reddit = praw.Reddit(client_id=credentials['client_id'],
                     client_secret=credentials['client_secret'],
                     password=credentials['password'],
                     user_agent=credentials['user_agent'],
                     username=credentials['username'])

In [7]:
# Disable PRAW's rate limiter, since responses are served from the requests cache
# from types import MethodType

# def new_delay (self):
#     pass

# reddit._core._rate_limiter.delay = MethodType(new_delay, reddit._core._rate_limiter)

# Reddit

In [8]:
class PRAWJSONEncoder(json.JSONEncoder):
    """JSON encoder that can serialise PRAW model objects.

    From https://gist.github.com/jarhill0/6e6495706252d52573950c3820f533b0
    """

    def default(self, obj):
        """Encode a PRAW object as a dict of its public attributes.

        Objects that are not ``PRAWBase`` instances are handed to the
        standard ``JSONEncoder.default``.
        """
        if not isinstance(obj, praw.models.base.PRAWBase):
            return super().default(obj)

        # Attributes whose name starts with an underscore are left out.
        return {
            key: value
            for key, value in obj.__dict__.items()
            if not key.startswith('_')
        }

def digest_reddit_submission (submission):
    """Store one PRAW submission in ``reddit_raw``, keyed by its ``id``.

    The submission is round-tripped through JSON (using ``PRAWJSONEncoder``)
    to obtain a plain document. When a document with the same id already
    exists its fields are overwritten with ``$set``; otherwise the document
    is inserted. Returns the pymongo result of whichever operation ran.
    """
    document = json.loads(json.dumps(submission, cls=PRAWJSONEncoder))
    selector = {'id': submission.id}

    already_stored = reddit_raw.find_one(selector, {'_id': 1})
    if already_stored:
        return reddit_raw.update_one(selector, {'$set': document})
    return reddit_raw.insert_one(document)

def reddit_submissions_iter (r, sort, limit):
    """Return a PRAW listing for subreddit ``r``.

    ``sort`` selects the listing: ``'top_all'`` and ``'top_year'`` use the
    ``top`` listing with the matching time filter; any other value falls
    back to the ``new`` listing. ``limit`` is forwarded unchanged.
    """
    subreddit = reddit.subreddit(r)

    if sort == 'top_all':
        period = 'all'
    elif sort == 'top_year':
        period = 'year'
    else:
        return subreddit.new(limit=limit)

    return subreddit.top(period, limit=limit)

def retrieve_reddit_submissions (r='dataisugly', sort='top_all', limit=100):
    """Lazily digest the submissions of one subreddit listing.

    Returns a generator; nothing is stored until it is iterated. Each item
    it yields is the result of ``digest_reddit_submission`` for one
    submission. Progress is shown with a tqdm bar whose total is ``limit``.
    """
    listing = reddit_submissions_iter(r, sort, limit)
    progress = tqdm(listing, total=limit)
    return (digest_reddit_submission(item) for item in progress)

In [9]:
# crawler_queue = [
#     {'r': 'dataisugly', 'sort': 'top_all', 'limit': 10}
# ]

reddit_crawler_queue = [
    {'r': 'dataisugly', 'sort': 'top_all', 'limit': 1000},
    {'r': 'dataisugly', 'sort': 'top_year', 'limit': 1000},
    {'r': 'dataisugly', 'sort': 'new', 'limit': 1000}
]

In [10]:
for c in reddit_crawler_queue:
    # retrieve_reddit_submissions is lazy: consume it fully so that every
    # submission is actually digested. The resulting list is discarded.
    list(retrieve_reddit_submissions(r=c['r'], sort=c['sort'], limit=c['limit']))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




# Pushshift

In [11]:
def digest_pushshift_submission (submission):
    """Insert a Pushshift submission into ``pushshift_raw`` unless it exists.

    Returns the pymongo insert result for a new document, or an empty
    string when a document with the same ``id`` is already stored.
    """
    known = pushshift_raw.find_one({'id': submission['id']}, {'_id': 1})
    if known:
        return ''
    return pushshift_raw.insert_one(submission)

def retrieve_pushshift (r, before_epoch, limit=1000):
    """Fetch one page of submissions for subreddit ``r`` from Pushshift.

    Requests up to ``limit`` submissions created before ``before_epoch``,
    sorted by ``created_utc`` descending, and returns the ``data`` list of
    the JSON reply. A non-200 status is only reported (status code and
    query are printed); the reply is still parsed afterwards.
    """
    query = dict(
        subreddit=r,
        sort='desc',
        sort_type='created_utc',
        before=before_epoch,
        size=limit,
    )
    response = requests.get('https://api.pushshift.io/reddit/search/submission/', params=query)

    if response.status_code != 200:
        print(response.status_code)
        print(json.dumps(query))

    body = response.json()
    return body['data']

# def retrieve (r, before_epoch, limit=200):
#     api = PushshiftAPI(reddit)
#     submissions = api.search_submissions(before=before_epoch, subreddit=r, limit=limit)
#     return [s for s in submissions]

def pushshift_submissions_iter (r, limit=200):
    """Yield Pushshift submissions of subreddit ``r``, newest first.

    Pages backwards in time starting from 2020-01-01 (as interpreted by
    ``datetime(2020, 1, 1).timestamp()``), asking ``retrieve_pushshift`` for
    ``limit`` submissions per page.

    Each new page is requested with ``before`` set to one second after the
    oldest submission of the previous page, so that oldest submission is
    returned (and yielded) again; callers must tolerate such repeats.
    Iteration stops when a page ends with the same submission as the
    previous one, or when a page comes back empty.
    """
    before_epoch = int(datetime(2020, 1, 1).timestamp())
    last_submission_id = None
    while True:
        submissions = retrieve_pushshift(r, before_epoch, limit)
        if not submissions:
            # An empty page has no last element to compare against; without
            # this guard the indexing below raises IndexError.
            break

        oldest = submissions[-1]
        if last_submission_id == oldest['id']:
            print(f"Last: {oldest['id']} {oldest['created_utc']} {datetime.fromtimestamp(oldest['created_utc']).isoformat()}")
            break

        yield from submissions

        # +1 so the oldest submission is included again on the next page,
        # which is what the termination check above relies on.
        before_epoch = oldest['created_utc'] + 1
        last_submission_id = oldest['id']
        print(f"New epoch: {before_epoch} {datetime.fromtimestamp(before_epoch).isoformat()}")

def retrieve_pushshift_submissions (r, limit=200):
    """Lazily digest every Pushshift submission of subreddit ``r``.

    Returns a generator; each item is the result of
    ``digest_pushshift_submission`` for one submission produced by
    ``pushshift_submissions_iter``. Progress is shown with a tqdm bar.
    """
    progress = tqdm(pushshift_submissions_iter(r, limit))
    return (digest_pushshift_submission(item) for item in progress)

In [12]:
pushshift_crawler_queue = [
    {'r': 'dataisugly', 'limit': 1000}
]

In [13]:
for c in pushshift_crawler_queue:
    submissions = [s for s in retrieve_pushshift_submissions(r=c['r'], limit=c['limit'])]

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

New epoch: 1551741197 2019-03-04T23:13:17
New epoch: 1519462761 2018-02-24T08:59:21
New epoch: 1490703092 2017-03-28T12:11:32
New epoch: 1459722151 2016-04-03T22:22:31
New epoch: 1428266609 2015-04-05T20:43:29
New epoch: 1353713658 2012-11-23T23:34:18
Last: 13otwh 1353713657 2012-11-23T23:34:17



# Merge to reddit-merge

In [14]:
reddit_merge.drop()

In [15]:
def retrieve_submission (submission_id):
    """Return the Reddit submission ``submission_id`` as a plain document.

    A document already present in ``reddit_cache`` is returned as-is.
    Otherwise the submission is requested through PRAW, converted with
    ``PRAWJSONEncoder``, stored in ``reddit_cache`` and returned. Any
    exception during fetching/conversion is printed together with the id
    and ``None`` is returned.
    """
    cached = reddit_cache.find_one({'id': submission_id})
    if cached:
        return cached

    try:
        fetched = reddit.submission(submission_id)
        _ = fetched.name  # attribute access triggers data fetching
        document = json.loads(json.dumps(fetched, cls=PRAWJSONEncoder))
    except Exception as e:
        print(submission_id, e)
        return None

    reddit_cache.update_one(
        {'id': submission_id},
        {'$setOnInsert': document},
        upsert=True,
    )
    return document

def add_merge (s, duplicated_with=None, retrieve=False):
    """Add submission ``s`` and everything it duplicates to ``reddit_merge``.

    Parameters
    ----------
    s : dict
        Submission document; only ``id`` is required when ``retrieve`` is
        true. The dict may be modified (``crosspost_parent_list`` is removed,
        ``duplicated_with`` is added).
    duplicated_with : set of str, optional
        Ids already known to be duplicates of ``s``. The passed set is
        copied, never modified.
    retrieve : bool
        When true, the document stored is the one returned by
        ``retrieve_submission``; if ``s`` has a ``permalink`` it is kept on
        that document under ``alt``.

    Returns
    -------
    int
        Number of documents newly added to ``reddit_merge`` by this call,
        including those added recursively for duplicates. ``0`` when ``s``
        is already merged or could not be retrieved.

    Duplicates are collected from ``crosspost_parent_list`` (of ``s`` and of
    its ``alt``) and from a ``url`` that points at another Reddit post.
    """
    if reddit_merge.find_one({'id': s['id']}, {'_id': 1}):
        return 0

    # Private copy: the caller's set must not change under its feet.
    duplicated_with = set() if duplicated_with is None else set(duplicated_with)

    if 'crosspost_parent_list' in s:
        duplicated_with.update(p['id'] for p in s['crosspost_parent_list'])

    if retrieve:
        submission = retrieve_submission(s['id'])
        if not submission:
            return 0
        if 'permalink' in s:
            submission['alt'] = s
        s = submission

    if 'crosspost_parent_list' in s:
        duplicated_with.update(p['id'] for p in s['crosspost_parent_list'])
        del s['crosspost_parent_list']
    if 'alt' in s and 'crosspost_parent_list' in s['alt']:
        duplicated_with.update(p['id'] for p in s['alt']['crosspost_parent_list'])
        del s['alt']['crosspost_parent_list']

    # A link to another Reddit post: the 7th '/'-separated token of
    # https://www.reddit.com/r/<sub>/comments/<id>/... is taken as its id.
    url = s.get('url') or ''
    if 'reddit.com/r' in url:
        tokens = url.split('/')
        if len(tokens) > 6 and tokens[6]:
            duplicated_with.add(tokens[6])

    # Recursive calls receive a set that already contains the callee's own
    # id; a post must not be recorded as a duplicate of itself.
    duplicated_with.discard(s['id'])

    s['duplicated_with'] = list(duplicated_with)
    reddit_merge.update_one({'id': s['id']}, {'$setOnInsert': s}, upsert=True)

    # Every duplicate is told about this post and about its siblings.
    lineage = duplicated_with | {s['id']}
    added = sum(
        add_merge({'id': d}, duplicated_with=lineage, retrieve=True)
        for d in duplicated_with
    )

    return added + 1

In [16]:
result = parallel(add_merge, pushshift_raw.find(), params_dict={'retrieve': True}, total=pushshift_raw.estimated_document_count(), n_jobs=-1)
print(f"Retrieved {sum(result)}/{len(result)}")

HBox(children=(FloatProgress(value=0.0, max=5962.0), HTML(value='')))

bpca3e received 403 HTTP response
af71fc received 403 HTTP response

Retrieved 6477/5962


In [17]:
result = parallel(add_merge, reddit_raw.find(), total=reddit_raw.estimated_document_count(), n_jobs=-1)
print(f"Retrieved {sum(result)}/{len(result)}")

HBox(children=(FloatProgress(value=0.0, max=2097.0), HTML(value='')))


Retrieved 690/2097


In [18]:
Counter([s['id'] for s in reddit_merge.find()]).most_common(5)

[('ehsr2o', 1), ('eh9a95', 1), ('eh7wox', 1), ('ehafnl', 1), ('ehsqv3', 1)]

# Digest into posts

In [19]:
posts.drop()

In [20]:
from collections.abc import MutableMapping, Sequence
from collections import Counter

# from https://stackoverflow.com/questions/51488240/python-get-json-keys-as-full-path
def get_paths(source):
    """List every key/index path inside a nested dict/list structure.

    Each path is a list of keys (for mappings) and integer indices (for
    non-string sequences), ordered so that a container's own path comes
    right before the paths of its contents. Anything that is neither a
    mutable mapping nor a non-string sequence yields no paths.

    From https://stackoverflow.com/questions/51488240/python-get-json-keys-as-full-path
    """
    if isinstance(source, MutableMapping):
        children = source.items()
    elif isinstance(source, Sequence) and not isinstance(source, str):
        children = enumerate(source)
    else:
        return []

    collected = []
    for key, child in children:
        collected.append([key])  # path of the child itself
        collected.extend([key] + tail for tail in get_paths(child))  # paths below it
    return collected

c = Counter([str(p) for s in reddit_merge.find() for p in get_paths(s)])
with open('reddit_attrs.txt', 'w') as f:
    f.write(str(c))

In [21]:
class Submission ():
    """Normalised view over one raw Reddit submission document.

    Wraps a submission dict (as stored in ``reddit-merge``) and exposes the
    fields listed in ``_attrs`` as attributes/properties. ``digest()``
    collects them into the dict that is stored in ``posts``.

    The constructor requires the keys ``subreddit``, ``id``, ``title`` and
    ``num_comments``; ``subreddit`` may be a plain name or a dict carrying
    ``display_name``.
    """

    # Attribute names exported by ``digest``, in output order.
    _attrs = [
        'id',
        'post_id',
        'datetime',
        'url',
        'title',
        'content',
        'author',
        'thumbnail',
        'preview',
        'removed',
        'ups',
        'num_comments',
        'external_link',
        'source',
        'source_platform',
        'source_url',
        'duplicated_posts',
        'tags',
        'media_type',
        'labels',
        'alt',
        'preview_alt',
        'thumbnail_alt',
        'external_link_alt'
    ]

    # Values of ``media['type']`` that mark a submission as video.
    _video_type = {
        'gfycat.com',
        'streamable.com',
        'vimeo.com',
        'youtube.com'
    }

    # Non-URL thumbnail values that are expected and therefore not reported.
    _thumbnail_placeholders = {'default', 'self', 'spoiler', 'nsfw', 'image'}

    # Marker for "alt digest not computed yet" (None is a valid cached value).
    _UNSET = object()
    _alt_digest = _UNSET

    def __init__ (self, s, duplicated_post=None):
        # ``duplicated_post`` is accepted for backward compatibility; unused.
        self._s = s
        self._alt_digest = Submission._UNSET
        subreddit = s['subreddit']
        self._subreddit = subreddit if isinstance(subreddit, str) else subreddit['display_name']

        self.id = s['id']
        self.post_id = f'reddit/{self._subreddit}/{self.id}'
        self.title = s['title']
        self.content = s.get('selftext', '')
        self.ups = s.get('ups', 0)
        self.num_comments = s['num_comments']
        self.source = self._subreddit
        self.source_platform = 'reddit'
        self.source_url = f'https://www.reddit.com/{s["subreddit_name_prefixed"]}' if 'subreddit_name_prefixed' in s else ''

    @staticmethod
    def _empty_image ():
        """Return a fresh placeholder for a missing preview/thumbnail."""
        return {
            'url': '',
            'width': 0,
            'height': 0
        }

    def digest (self):
        """Return a dict with every attribute named in ``_attrs``."""
        return {a:getattr(self, a) for a in Submission._attrs}

    @property
    def alt (self):
        """Digest of the alternate record under ``alt``, or None.

        Computed once and cached: the ``*_alt`` properties and ``digest``
        read it repeatedly, and each computation is a full digest.
        """
        if self._alt_digest is Submission._UNSET:
            if 'alt' not in self._s:
                self._alt_digest = None
            else:
                self._alt_digest = Submission(self._s['alt']).digest()
        return self._alt_digest

    @property
    def preview_alt (self):
        """Preview of the alternate record when it has one that differs."""
        alt = self.alt
        if alt and alt['preview']['url'] and alt['preview']['url'] != self.preview['url']:
            return alt['preview']
        return None

    @property
    def thumbnail_alt (self):
        """Thumbnail of the alternate record when it has one that differs."""
        alt = self.alt
        if alt and alt['thumbnail']['url'] and alt['thumbnail']['url'] != self.thumbnail['url']:
            return alt['thumbnail']
        return None

    @property
    def external_link_alt (self):
        """External link of the alternate record when set and different."""
        alt = self.alt
        if alt and alt['external_link'] and alt['external_link'] != self.external_link:
            return alt['external_link']
        return None

    @property
    def datetime (self):
        """ISO-8601 string built from ``created_utc``.

        NOTE(review): ``fromtimestamp`` without a tz argument converts to the
        kernel's local time, so this is UTC only when the kernel runs in
        UTC -- confirm that is intended.
        """
        return datetime.fromtimestamp(self._s['created_utc']).isoformat()

    @property
    def url (self):
        """Absolute URL of the post, or '' when there is no permalink."""
        if 'permalink' not in self._s:
            return ''
        permalink = self._s['permalink']
        if permalink.startswith('/'):
            return f'https://reddit.com{permalink}'
        # Not a site-relative path: report it and pass it through unchanged.
        print(f'url: {permalink}')
        return permalink

    @property
    def author (self):
        """Author name ('' when missing or empty); accepts str or dict."""
        author = self._s.get('author')
        if not author:
            return ''
        return author if isinstance(author, str) else author['name']

    @property
    def external_link (self):
        """The submitted URL when it differs from the post URL, else ''."""
        if 'url' in self._s and self._s['url'] != self.url:
            return self._s['url']
        return ''

    @property
    def duplicated_posts (self):
        """``post_id`` of each id in ``duplicated_with`` found in reddit_merge."""
        duplicated_posts = []
        for submission_id in self._s.get('duplicated_with', []):
            s = reddit_merge.find_one({'id': submission_id})
            if s:
                duplicated_posts.append(Submission(s).post_id)
        return duplicated_posts

    @property
    def tags (self):
        """The link flair text as a one-element list, or [] when absent."""
        flair = self._s.get('link_flair_text')
        return [flair] if flair else []

    @property
    def labels (self):
        """Labels derived from ``tags`` via ``labels_from_tags``."""
        return {
            'auto': labels_from_tags(self.tags)
        }

    @property
    def media_type (self):
        """'video', 'image' (a preview URL exists) or 'text'."""
        if self._s.get('is_video'):
            return 'video'
        media = self._s.get('media')
        if media and 'type' in media and media['type'] in Submission._video_type:
            return 'video'
        if self.preview['url']:
            return 'image'
        return 'text'

    @property
    def preview (self):
        """Source image of the first preview as url/width/height.

        Returns the empty placeholder when there is no preview or when the
        preview structure cannot be read (the error is printed).
        """
        if 'preview' not in self._s:
            return Submission._empty_image()
        try:
            source = self._s['preview']['images'][0]['source']
            return {
                'url': source['url'],
                'width': source['width'],
                'height': source['height']
            }
        except Exception as inst:
            print(f'Reddit submission preview digest error: {inst}')
            print(f'Reddit submission: {self._s["id"]} {self.url}')
            return Submission._empty_image()

    @property
    def thumbnail (self):
        """Thumbnail as url/width/height.

        Uses the ``thumbnail`` field when it is an http(s) URL (missing
        sizes become -1); a non-URL value gives the empty placeholder and is
        reported unless it is a known placeholder word. Without a
        ``thumbnail`` field the first preview resolution is used. Any error
        is printed and the empty placeholder returned.
        """
        try:
            if 'thumbnail' in self._s:
                thumb = self._s['thumbnail']
                if not thumb.startswith('http'):
                    if thumb not in Submission._thumbnail_placeholders:
                        print(f"Reddit submission thumbnail invalid url: {self.id} {self._s['thumbnail']}")
                    return Submission._empty_image()
                return {
                    'url': thumb,
                    'width': self._s.get('thumbnail_width', -1),
                    'height': self._s.get('thumbnail_height', -1)
                }
            elif 'preview' in self._s:
                first = self._s['preview']['images'][0]['resolutions'][0]
                return {
                    'url': first['url'],
                    'width': first['width'],
                    'height': first['height']
                }
            else:
                return Submission._empty_image()
        except Exception as inst:
            print(f'Reddit submission thumbnail digest error: {inst}')
            print(f'Reddit submission: {self._s["id"]} {self.url}')
            return Submission._empty_image()

    @property
    def removed (self):
        """True when ``removed_by_category`` is present and truthy."""
        return bool(self._s.get('removed_by_category'))

In [22]:
def digest_all_submissions ():
    """Digest every document of ``reddit_merge`` into ``posts``.

    Each document is wrapped in ``Submission`` and its digest replaces (or
    creates, via upsert) the ``posts`` document with the same ``post_id``.
    Returns None.
    """
    for raw in tqdm(reddit_merge.find()):
        digested = Submission(raw)
        selector = {'post_id': digested.post_id}
        posts.replace_one(selector, digested.digest(), upsert=True)

In [23]:
digest_all_submissions()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




# Fix problematic posts

In [24]:
posts.find_one({'id': '4f927p'}) # invalid external_link: imgur.com/(null)

{'_id': ObjectId('5ef6128b10a328604705d300'),
 'id': '4f927p',
 'post_id': 'reddit/dataisugly/4f927p',
 'datetime': '2016-04-17T23:25:13',
 'url': 'https://reddit.com/r/dataisugly/comments/4f927p/the_truth_about_abuse/',
 'title': 'The Truth about Abuse',
 'content': '[deleted]',
 'author': '',
 'thumbnail': {'url': '', 'width': 0, 'height': 0},
 'preview': {'url': '', 'width': 0, 'height': 0},
 'removed': False,
 'ups': 0,
 'num_comments': 0,
 'external_link': 'http://imgur.com/(null)',
 'source': 'dataisugly',
 'source_platform': 'reddit',
 'source_url': 'https://www.reddit.com/r/dataisugly',
 'duplicated_posts': [],
 'tags': [],
 'media_type': 'text',
 'labels': {'auto': []},
 'alt': {'id': '4f927p',
  'post_id': 'reddit/dataisugly/4f927p',
  'datetime': '2016-04-17T23:25:13',
  'url': 'https://reddit.com/r/dataisugly/comments/4f927p/the_truth_about_abuse/',
  'title': 'The Truth about Abuse',
  'content': '',
  'author': 'sharkpony',
  'thumbnail': {'url': '', 'width': 0, 'height':

In [25]:
posts.update_one({'id': '4f927p'},
                 {'$set': {
                     'skip': True,
                     'crawler_remarks': 'invalid link: imgur.com/(null)'}})

<pymongo.results.UpdateResult at 0x7f89946b65f0>

In [26]:
posts.find_one({'external_link': 'http://[Imgur](http://i.imgur.com/ozwpju8.png)'}) # invalid external_link

{'_id': ObjectId('5ef6128d10a328604705dc14'),
 'id': '2zczew',
 'post_id': 'reddit/dataisugly/2zczew',
 'datetime': '2015-03-17T15:44:00',
 'url': 'https://reddit.com/r/dataisugly/comments/2zczew/the_cost_of_remediation_kind_of/',
 'title': 'The cost of remediation... kind of.',
 'content': '[deleted]',
 'author': '',
 'thumbnail': {'url': '', 'width': 0, 'height': 0},
 'preview': {'url': '', 'width': 0, 'height': 0},
 'removed': False,
 'ups': 1,
 'num_comments': 0,
 'external_link': 'http://[Imgur](http://i.imgur.com/ozwpju8.png)',
 'source': 'dataisugly',
 'source_platform': 'reddit',
 'source_url': 'https://www.reddit.com/r/dataisugly',
 'duplicated_posts': [],
 'tags': [],
 'media_type': 'text',
 'labels': {'auto': []},
 'alt': {'id': '2zczew',
  'post_id': 'reddit/dataisugly/2zczew',
  'datetime': '2015-03-17T15:44:00',
  'url': 'https://reddit.com/r/dataisugly/comments/2zczew/the_cost_of_remediation_kind_of/',
  'title': 'The cost of remediation... kind of.',
  'content': '',
  

In [27]:
posts.update_one({'external_link': 'http://[Imgur](http://i.imgur.com/ozwpju8.png)'},
                 {'$set': {
                     'external_link': 'http://i.imgur.com/ozwpju8.png',
                     'crawler_remarks': 'corrected invalid link'}})

<pymongo.results.UpdateResult at 0x7f89946b6640>