In [1]:
from pymongo import MongoClient
from datetime import datetime
from tqdm.notebook import tqdm

from lib.labels_from_tags import labels_from_tags

In [2]:
# Open the MongoDB connection and grab the two collections used below.
mongo = MongoClient(host='172.17.0.1', port=27017)
db = mongo.get_database('bad-vis')
raw = db.get_collection('reddit-merge')  # raw Reddit submissions, read via raw.find()
posts = db.get_collection('posts')  # digested posts are upserted here

In [3]:
from collections.abc import MutableMapping, Sequence
from collections import Counter

# from https://stackoverflow.com/questions/51488240/python-get-json-keys-as-full-path
def get_paths(source):
    """Return every key/index path found inside a nested structure.

    Each path is a list of dict keys and/or list indices leading from
    ``source`` to one of its descendants. A parent's path is emitted right
    before the paths of its children (depth-first, pre-order).

    Dict-like values (``MutableMapping``) contribute their keys and list-like
    values (``Sequence`` other than ``str``) contribute their indices; anything
    else is a leaf and yields no paths.
    """
    if isinstance(source, MutableMapping):
        children = source.items()
    elif isinstance(source, Sequence) and not isinstance(source, str):
        # Strings are sequences too, but they are treated as leaves.
        children = enumerate(source)
    else:
        return []

    collected = []
    for key, child in children:
        collected.append([key])  # path to the child itself
        # Paths below the child, each prefixed with the child's key/index.
        collected.extend([key, *tail] for tail in get_paths(child))
    return collected

# Tally how often each attribute path occurs across all raw documents and
# dump the counter to a text file for inspection.
c = Counter(
    str(path)
    for doc in raw.find()
    for path in get_paths(doc)
)
with open('reddit_attrs.txt', 'w') as f:
    print(c, end='', file=f)

In [4]:
class Submission:
    """Normalised view over one raw Reddit submission document.

    Wraps the raw dict ``s`` and exposes the fields listed in ``_attrs`` as
    plain attributes or properties; ``digest()`` collects them into a dict.

    Args:
        s: Raw submission dict. ``id``, ``subreddit``, ``title``, ``selftext``,
            ``ups``, ``num_comments`` and ``subreddit_name_prefixed`` are read
            unconditionally in ``__init__``; ``created_utc`` is required by
            the ``datetime`` property. Other keys are optional.
        duplicated_post: Optional ``post_id`` to report from the
            ``duplicated_post`` property instead of deriving one.
    """

    # Attribute/property names exported by digest(), in output order.
    _attrs = [
        'id',
        'post_id',
        'datetime',
        'url',
        'title',
        'content',
        'author',
        'thumbnail',
        'preview',
        'removed',
        'ups',
        'num_comments',
        'external_link',
        'source',
        'source_url',
        'duplicated_post',
        'tags',
        'media_type',
        'labels'
    ]

    # Values of s['media']['type'] that are classified as video.
    _video_type = {
        'gfycat.com',
        'streamable.com',
        'vimeo.com',
        'youtube.com'
    }

    # Non-URL values of s['thumbnail'] that are accepted without a warning.
    _placeholder_thumbnails = {'default', 'self', 'spoiler', 'nsfw', 'image'}

    def __init__(self, s, duplicated_post=None):
        self._s = s
        self._duplicated_post = duplicated_post
        # 'subreddit' is either a plain name or a mapping carrying 'display_name'.
        subreddit = s['subreddit']
        self._subreddit = subreddit if isinstance(subreddit, str) else subreddit['display_name']

        self.id = s['id']
        self.post_id = f'reddit/{self._subreddit}/{self.id}'
        self.title = s['title']
        self.content = s['selftext']
        self.ups = s['ups']
        self.num_comments = s['num_comments']
        self.source = 'Reddit'
        self.source_url = f'https://www.reddit.com/{s["subreddit_name_prefixed"]}'

        # The first crosspost parent (if any) is wrapped as its own Submission
        # whose duplicated_post refers back to this post. A missing, null or
        # empty parent list all mean "no crosspost".
        parents = s.get('crosspost_parent_list') or []
        self._crosspost = Submission(parents[0], self.post_id) if parents else None

    @staticmethod
    def _empty_image():
        """Return a fresh placeholder for a missing preview/thumbnail."""
        return {'url': '', 'width': 0, 'height': 0}

    def digest(self):
        """Return a dict holding every attribute named in ``_attrs``."""
        return {a: getattr(self, a) for a in Submission._attrs}

    @property
    def datetime(self):
        """ISO-8601 creation time in UTC, without a timezone suffix."""
        from datetime import timezone

        # Convert explicitly in UTC: a bare fromtimestamp() would apply the
        # local timezone of whichever machine runs the notebook.
        created = datetime.fromtimestamp(self._s['created_utc'], tz=timezone.utc)
        return created.replace(tzinfo=None).isoformat()

    @property
    def url(self):
        """Absolute URL built from 'permalink', or '' when it is absent."""
        if 'permalink' not in self._s:
            return ''
        permalink = self._s['permalink']
        if permalink.startswith('/'):
            return f'https://reddit.com{permalink}'
        # Not a site-relative path: log it and pass it through unchanged.
        print(f'url: {permalink}')
        return permalink

    @property
    def author(self):
        """Author name, or '' when the author is missing/empty."""
        author = self._s.get('author')
        if not author:
            return ''
        # Like 'subreddit', the author may be a name or a mapping with 'name'.
        return author if isinstance(author, str) else author['name']

    @property
    def external_link(self):
        """The submission's 'url' when it points away from the post itself, else ''."""
        link = self._s.get('url')
        if not link:
            return ''
        if link == self.url:
            return ''
        # A link to this post's own permalink on any reddit.com host
        # (www., old., http/https) is not external either.
        permalink = self._s.get('permalink') or ''
        if permalink.startswith('/') and link.endswith('reddit.com' + permalink):
            return ''
        return link

    @property
    def duplicated_post(self):
        """``post_id`` of the post this one duplicates, or '' if none is known."""
        if self._duplicated_post:
            return self._duplicated_post
        if self._crosspost:
            return self._crosspost.post_id
        link = self.external_link
        if 'viz.wtf' in link:
            # The id is taken from the first all-digit URL segment.
            numeric_parts = [t for t in link.split('/') if t.isdigit()]
            if numeric_parts:
                return f'tumblr/wtf-viz/{numeric_parts[0]}'
        return ''

    @property
    def tags(self):
        """Single-element list with the link flair text, or [] without flair."""
        flair = self._s.get('link_flair_text')
        return [flair] if flair else []

    @property
    def labels(self):
        """Labels derived from ``tags`` via ``labels_from_tags``, under 'auto'."""
        return {
            'auto': labels_from_tags(self.tags)
        }

    @property
    def media_type(self):
        """Classify the submission as 'video', 'image' or 'text'."""
        if self._s.get('is_video'):
            return 'video'
        media = self._s.get('media')
        if media and 'type' in media and media['type'] in Submission._video_type:
            return 'video'
        if self.preview['url']:
            return 'image'
        return 'text'

    @property
    def preview(self):
        """url/width/height of the first preview image's source, or a placeholder.

        Best-effort: any error while reading the nested preview structure is
        printed and the placeholder is returned.
        """
        try:
            if 'preview' in self._s:
                source = self._s['preview']['images'][0]['source']
                return {
                    'url': source['url'],
                    'width': source['width'],
                    'height': source['height']
                }
            return self._empty_image()
        except Exception as inst:
            print(f'Reddit submission preview digest error: {inst}')
            print(f'Reddit submission: {self._s["id"]} {self.url}')
            return self._empty_image()

    @property
    def thumbnail(self):
        """url/width/height of the thumbnail, or a placeholder.

        Uses 'thumbnail' when the key is present (non-http values give the
        placeholder). Without that key, falls back to the first resolution of
        the first preview image. Errors are printed and yield the placeholder.
        """
        try:
            if 'thumbnail' in self._s:
                thumb = self._s['thumbnail']
                if thumb.startswith('http'):
                    return {
                        'url': thumb,
                        'width': self._s['thumbnail_width'],
                        'height': self._s['thumbnail_height']
                    }
                if thumb not in Submission._placeholder_thumbnails:
                    print(f"Reddit submission thumbnail invalid url: {self.id} {thumb}")
                return self._empty_image()
            if 'preview' in self._s:
                smallest = self._s['preview']['images'][0]['resolutions'][0]
                return {
                    'url': smallest['url'],
                    'width': smallest['width'],
                    'height': smallest['height']
                }
            return self._empty_image()
        except Exception as inst:
            print(f'Reddit submission thumbnail digest error: {inst}')
            print(f'Reddit submission: {self._s["id"]} {self.url}')
            return self._empty_image()

    @property
    def removed(self):
        """Value of 'removed_by_category', or '' when missing or empty."""
        return self._s.get('removed_by_category') or ''

In [6]:
def digest_all_submissions():
    """Digest every raw submission and upsert the result into ``posts``.

    For each document in ``raw`` a ``Submission`` is built and its digest is
    stored under its ``post_id``. If the submission carries a crosspost
    parent, the parent's digest is stored as well. ``replace_one`` swaps the
    whole stored document, so the last write for a given ``post_id`` wins.
    """
    # A bare cursor has no length, so give tqdm the expected number of
    # documents; otherwise the progress bar is indeterminate.
    expected = raw.estimated_document_count()
    for s in tqdm(raw.find(), total=expected):
        submission = Submission(s)
        posts.replace_one({'post_id': submission.post_id}, submission.digest(), upsert=True)
        crosspost = submission._crosspost
        if crosspost:
            posts.replace_one({'post_id': crosspost.post_id}, crosspost.digest(), upsert=True)

In [7]:
# Run the full digest; this upserts documents into the `posts` collection.
digest_all_submissions()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [8]:
# Spot-check a single digested post by its Reddit id.
posts.find_one({'id': '4f927p'})

{'_id': ObjectId('5e36b8a188ff1ac1a2751292'),
 'id': '4f927p',
 'post_id': 'reddit/dataisugly/4f927p',
 'datetime': '2016-04-17T23:25:13',
 'url': 'https://reddit.com/r/dataisugly/comments/4f927p/the_truth_about_abuse/',
 'title': 'The Truth about Abuse',
 'content': '[deleted]',
 'author': '',
 'thumbnail': {'url': '', 'width': 0, 'height': 0},
 'preview': {'url': '', 'width': 0, 'height': 0},
 'removed': '',
 'ups': 0,
 'num_comments': 0,
 'external_link': 'http://imgur.com/(null)',
 'source': 'Reddit',
 'source_url': 'https://www.reddit.com/r/dataisugly',
 'duplicated_post': '',
 'tags': [],
 'media_type': 'text',
 'labels': {'auto': []}}

In [9]:
# Remove the post with this Reddit id from the digested collection.
# NOTE(review): the reason for deleting it is not recorded here; re-running
# digest_all_submissions() will upsert it again if it is still in `raw`.
posts.delete_one({'id': '4f927p'})

<pymongo.results.DeleteResult at 0x7f15072a7ec8>