In [1]:
from pymongo import MongoClient
import json
from datetime import datetime
from tqdm.notebook import tqdm
from pathlib import Path
from IPython.display import JSON
from bs4 import BeautifulSoup as bs

from lib.labels_from_tags import labels_from_tags

In [2]:
mongo = MongoClient('172.17.0.1', 27017)
db = mongo['bad-vis']
raw = db['tumblr-raw']
posts = db['posts']

In [3]:
from collections.abc import MutableMapping, Sequence
from collections import Counter

# from https://stackoverflow.com/questions/51488240/python-get-json-keys-as-full-path
def get_paths(source):
    paths = []
    if isinstance(source, MutableMapping):  # found a dict-like structure...
        for k, v in source.items():  # iterate over it; Python 2.x: source.iteritems()
            paths.append([k])  # add the current child path
            paths += [[k] + x for x in get_paths(v)]  # get sub-paths, extend with the current
    # else, check if a list-like structure, remove if you don't want list paths included
    elif isinstance(source, Sequence) and not isinstance(source, str):
        #                          Python 2.x: use basestring instead of str ^
        for i, v in enumerate(source):
            paths.append([i])
            paths += [[i] + x for x in get_paths(v)]  # get sub-paths, extend with the current
    return paths

c = Counter([str(p) for s in raw.find() for p in get_paths(s)])
with open('tumblr_attrs.txt', 'w') as f:
    f.write(str(c))

In [4]:
class Post ():
    _attrs = [
        'id',
        'post_id',
        'datetime',
        'url',
        'title',
        'content',
        'thumbnail',
        'preview',
        'num_comments',
        'source',
        'source_platform',
        'source_url',
        'tags',
        'media_type',
        'labels'
    ]

    def __init__ (self, p):
        self._p = p
        self.id = p['id']
        self.post_id = f'tumblr/{p["blog_name"]}/{p["id"]}'
        self.url = p['post_url']
        self.title = p['summary']
        self.num_comments = p['note_count']
        self.source = p['blog']['name']
        self.source_platform = 'tumblr'
        self.source_url = p['blog']['url']

    def digest (self):
        return {a:getattr(self, a) for a in Post._attrs}

    @property
    def datetime (self):
        return datetime.fromtimestamp(self._p['timestamp']).isoformat()

    @property
    def content (self):
        if 'caption' in self._p:
            return bs(self._p['caption'], 'html.parser').get_text()
        else:
            return ''

    @property
    def tags (self):
        return self._p['tags']

    @property
    def labels (self):
        return {
            'auto': labels_from_tags(self.tags)
        }

    @property
    def media_type (self):
        if self.preview['url'] == '':
            return self._p['type']
        else:
            return 'image'

    @property
    def preview (self):
        if 'photos' in self._p:
            return {
                'url': self._p['photos'][0]['original_size']['url'],
                'width': self._p['photos'][0]['original_size']['width'],
                'height': self._p['photos'][0]['original_size']['height']
            }
        elif self._p['type'] == 'video':
            print('Video type')
            if self._p['video_type'] == "unknown":
                print("Unknown video type")
                print(self._p['player'][-1]['embed_code'])
            else:
                print(f'Tumblr post {self._p["id"]} has video: {self._p["permalink_url"]}')
        elif self._p['type'] == 'text':
            print('Text type')
            if 'body' in self._p:
                img = bs(self._p['body'], 'html.parser').find('img')
                if img:
                    print(f'Tumblr posts {self._p["id"]} has img in body: {img["data-orig-width"]} {img["data-orig-height"]} {img["src"]}')
                    print()
                    return {
                        'url': img['src'],
                        'width': img['data-orig-width'],
                        'height': img['data-orig-height'],
                    }

        print(f'Tumblr posts missing photos: {self._p["id"]}')
        print()
        return {
            'url': '',
            'width': 0,
            'height': 0
        }

    @property
    def thumbnail (self):
        if 'photos' in self._p:
            thumbnails = [s for s in self._p['photos'][0]['alt_sizes'] if s['width'] > 140 or s['height'] > 140]
            thumbnail = self._p['photos'][0]['alt_sizes'][0] if len(thumbnails) == 0 else thumbnails[-1]
            return {
                'url': thumbnail['url'],
                'width': thumbnail['width'],
                'height': thumbnail['height']
            }
        else:
            return {
                'url': '',
                'width': 0,
                'height': 0
            }

In [5]:
def digest_all_posts ():
    for p in tqdm(raw.find()):
        post = Post(p)
        posts.replace_one({'post_id': post.post_id}, post.digest(), upsert=True)
    return

In [6]:
digest_all_posts()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Text type
Tumblr posts 184638752491 has img in body: 1204 950 https://66.media.tumblr.com/bfe6995aeeee5056aaa49bf797d1907d/tumblr_inline_pqyyx3YYtI1w54cvo_540.png

Text type
Tumblr posts 184638752491 has img in body: 1204 950 https://66.media.tumblr.com/bfe6995aeeee5056aaa49bf797d1907d/tumblr_inline_pqyyx3YYtI1w54cvo_540.png

Video type
Unknown video type
<blockquote class="twitter-tweet" data-lang="en"><p lang="en" dir="ltr">Look how infant mortality has fallen with growth, across states of India! Each state has its own story: Maharashtra better than most southern states, Bihar stagnating under RJD, Odisha&#39;s gains despite low growth...what do you see? <br>Cool <a href="https://twitter.com/hashtag/Datavisualization?src=hash&amp;ref_src=twsrc%5Etfw">#Datavisualization</a> from Prof. <a href="https://twitter.com/muditkapoor?ref_src=twsrc%5Etfw">@muditkapoor</a> <a href="https://t.co/d49U3GVo5M">pic.twitter.com/d49U3GVo5M</a></p>&mdash; Shamika Ravi (@ShamikaRavi) <a href="https://twi