In [None]:
# !pip install imgur-downloader
# !pip install python-magic
# !apt install libmagic-dev
# !pip install requests_toolbelt

In [None]:
# No web cache is used; a download is skipped when a file for the post already exists on disk.

In [None]:
from pymongo import MongoClient
import requests
from requests_toolbelt import SSLAdapter
from pathlib import Path
import glob
from PIL import Image
from io import BytesIO
import magic
# from imgur_downloader import ImgurDownloader
from lib.libpatching.ImgurDownloader import ImgurDownloader
from collections import Counter

from lib.parallel import parallel

In [None]:
# Connect to MongoDB and bind the collections used in this notebook.
# NOTE(review): 172.17.0.1 is presumably the Docker bridge address of the host -- confirm
# and consider moving host/port into configuration.
mongo = MongoClient('172.17.0.1', 27017)
db = mongo['bad-vis']
# Source documents; download_image reads 'id', 'post_id' and the link fields from them.
posts = db['posts']
# Output collections, one per download kind, (re)written by the persistence cell below.
dlthumbnailresults = db['dlthumbnailresults']
dlpreviewresults = db['dlpreviewresults']
dlexternalresults = db['dlexternalresults']

In [None]:
# Root directory for downloads; download_image writes to images_dir/<outdir>/<post_id><ext>.
images_dir = Path('./images')

In [None]:
# Smoke test: fetch the preview image of one post and display it inline.
# Assumes the post has a 'preview' dict with a 'url' key -- this cell raises otherwise.
post = posts.find_one()
r = requests.get(post['preview']['url'])
# Decode the response body in memory; the bare name below renders the image as cell output.
pil_image = Image.open(BytesIO(r.content))
pil_image

In [None]:
# Construct the patched ImgurDownloader with a placeholder '(null)' link.
# NOTE(review): looks like a manual check of how the downloader copes with a bogus URL -- confirm or remove.
ImgurDownloader('http://imgur.com/(null)')

In [None]:
def get_content_type_ext (content_type, req=None):
    """Translate a MIME content type into a file extension.

    Args:
        content_type: Content-Type value (may be None or empty); matching is
            case-insensitive and prefix based, so parameters such as
            '; charset=utf-8' are ignored.
        req: Optional response object. When it is truthy and the header does
            not name a known image/video type, the type is re-detected from
            `req.content` via `magic.from_buffer` and mapped again.

    Returns:
        The extension including the leading dot ('.jpg', '.png', '.gif',
        '.webp', '.svg', '.bmp', '.mp4', '.html', '.pdf'), or '' when the
        type is not recognised (a message is printed in that case).
    """
    mime = (content_type or '').lower()

    # Media types that are accepted straight from the header.
    media_exts = (
        ('image/jpeg', '.jpg'),
        ('image/jpg', '.jpg'),
        ('image/png', '.png'),
        ('image/gif', '.gif'),
        ('image/webp', '.webp'),
        ('image/svg', '.svg'),
        ('image/bmp', '.bmp'),
        ('video/mp4', '.mp4'),
    )
    for prefix, ext in media_exts:
        if mime.startswith(prefix):
            return ext

    # Header was not a known media type: sniff the body before trusting it.
    # The recursive call passes no response, so it cannot recurse again.
    if req:
        return get_content_type_ext(magic.from_buffer(req.content, mime=True))

    if mime.startswith('text/html'):
        return '.html'
    if mime.startswith('application/pdf'):
        return '.pdf'

    print(f'Unknow content-type: {mime}')
    return ''

# (substring looked for in the link, status recorded) for external hosts that are never fetched.
_EXTERNAL_SKIP_RULES = (
    ('viz.wtf', 'skip_wtf-viz'),
    ('twitter.com', 'skip_twitter'),
    ('youtube.com', 'skip_youtube'),
    ('youtu.be', 'skip_youtube'),
    ('vimeo.com', 'skip_vimeo'),
    ('reddit.com/r', 'skip_subreddit'),
    ('reddit.com/user', 'skip_redditor'),
)


def _resolve_link (post, t):
    """Return the link stored under post[t], or '' when the field is absent or unreadable."""
    if t not in post:
        return ''
    value = post[t]
    if isinstance(value, str):
        return value
    try:
        return value['url']
    except (TypeError, KeyError, IndexError):
        # Field is None, lacks 'url', or is not a mapping: treat it as "no link"
        # instead of raising before the caller's error handling is in place.
        return ''


def _fetch (link):
    """GET `link` with a 60 s timeout, retrying once on SSL failure or timeout."""
    try:
        return requests.get(link, timeout=60)
    except requests.exceptions.SSLError:
        # Retry with a TLSv1 adapter for servers the default handshake rejects.
        # NOTE(review): verify=False disables certificate verification for this retry.
        with requests.Session() as session:
            session.mount('https://', SSLAdapter('TLSv1'))
            # The body is read eagerly (no streaming), so the session can be closed right after.
            return session.get(link, timeout=60, verify=False)
    except requests.exceptions.Timeout:
        return requests.get(link, timeout=60)


def download_image (post, t='preview', outdir=None):
    """Download the file referenced by one field of a post and report what happened.

    Args:
        post: Mapping with 'id' and 'post_id' and, optionally, the field `t`,
            which holds either a URL string or a mapping with a 'url' key.
        t: Name of the post field to read the link from.
        outdir: Sub-directory of `images_dir` to write into; defaults to `t`.

    Returns:
        A dict with 'id', 'post_id', 'link', 'dest' (destination path without
        extension) and 'status'. Status is one of 'exists', 'invalid_url',
        'imgur_dne', 'downloaded', a 'skip_*' value, 'not_found',
        'http_error', 'invalid_content_type', 'skip_content_type' or
        'download_error'. After an HTTP request 'request_status' is set, and
        'ext'/'output' are added once the content type is resolved/written.

    Errors raised while downloading are caught, printed and reported as
    'download_error' so that one bad post does not abort a batch run.
    """
    link = _resolve_link(post, t)
    if not outdir:
        outdir = t
    result = {
        'id': post['id'],
        'post_id': post['post_id'],
        'link': link,
        'dest': images_dir/outdir/post['post_id']
    }

    # File check instead of a web cache: any path starting with dest counts as done.
    if glob.glob(f'{result["dest"]}*'):
        result['status'] = 'exists'
        return result
    result['dest'].parent.mkdir(parents=True, exist_ok=True)

    if not link or link == 'None' or not isinstance(link, str):
        result['status'] = 'invalid_url'
        return result

    try:
        if t == 'external_link':
            is_direct_image = link.endswith(('.jpg', '.png', '.gif'))
            if 'imgur.com' in link and not is_direct_image:
                # Imgur page/album link: delegate to the downloader.
                _filename, skipped = ImgurDownloader(link.split('#')[0], # remove image identifier hash
                                                     dir_download=result['dest'].parent,
                                                     file_name=result['id'],
                                                     delete_dne=True).save_images()
                result['status'] = 'imgur_dne' if skipped != 0 else 'downloaded'
                return result
            for needle, status in _EXTERNAL_SKIP_RULES:
                if needle in link:
                    result['status'] = status
                    return result

        r = _fetch(link)

        result['request_status'] = r.status_code
        if result['request_status'] >= 400:
            result['status'] = 'not_found' if result['request_status'] == 404 else 'http_error'
            return result

        result['ext'] = get_content_type_ext(r.headers.get('content-type'), r)
        if result['ext'] == '':
            result['status'] = 'invalid_content_type'
            print(f'id: {result["id"]} link: {result["link"]}')
            return result
        if result['ext'] in ('.html', '.pdf'):
            result['status'] = 'skip_content_type'
            return result

        result['output'] = f'{result["dest"]}{result["ext"]}'
        with open(result['output'], 'wb') as f:
            f.write(r.content)
        result['status'] = 'downloaded'
        return result
    except Exception as inst:
        # Deliberately broad: this runs in bulk and must always hand back a result.
        print(f"Download error: {post['post_id']} {link} {inst}")
        result['status'] = 'download_error'
        return result

In [None]:
# Smoke test on a single post: download its 'preview' image (outdir defaults to 'preview').
for post in [posts.find_one()]:
    result = download_image(post, t='preview')
    print(result)

In [None]:
# Same call repeated: if the previous cell wrote the file, download_image's on-disk
# check should now report status 'exists' instead of downloading again.
for post in [posts.find_one()]:
    result = download_image(post, t='preview')
    print(result)

In [None]:
# Smoke test on a single post: download its 'thumbnail' (outdir defaults to 'thumbnail').
for post in [posts.find_one()]:
    result = download_image(post, t='thumbnail')
    print(result)

In [None]:
# Smoke test of the external-link path on one specific post (id '5chz6p').
# Assumes that post exists; find_one returns None otherwise and download_image would raise.
for post in [posts.find_one({'id': '5chz6p'})]:
    result = download_image(post, t='external_link', outdir='external_link')
    print(result)

In [None]:
# Download the 'preview' image of every post.
# NOTE(review): `parallel` is a project helper; presumably it calls
# download_image(post, **params_dict) across n_jobs workers and returns the
# result dicts as a list -- confirm in lib/parallel.
dl_preview_results = parallel(download_image, posts.find(), params_dict={'t': 'preview', 'outdir': 'preview'}, total=posts.estimated_document_count(), n_jobs=20)

In [None]:
# Download the 'thumbnail' of every post.
# NOTE(review): `parallel` is a project helper; presumably it calls
# download_image(post, **params_dict) across n_jobs workers and returns the
# result dicts as a list -- confirm in lib/parallel.
dl_thumbnail_results = parallel(download_image, posts.find(), params_dict={'t': 'thumbnail', 'outdir': 'thumbnail'}, total=posts.estimated_document_count(), n_jobs=20)

In [None]:
# Download the 'external_link' target of every post (imgur handled specially, some hosts skipped).
# NOTE(review): `parallel` is a project helper; presumably it calls
# download_image(post, **params_dict) across n_jobs workers and returns the
# result dicts as a list -- confirm in lib/parallel.
dl_external_link_results = parallel(download_image, posts.find(), params_dict={'t': 'external_link', 'outdir': 'external_link'}, total=posts.estimated_document_count(), n_jobs=20)

In [None]:
# Persist each batch of download results into its own collection.
# Each result list is paired with the collection of the same kind; the original
# pairing stored preview results in the thumbnail collection and vice versa.
dl_results = [
    (dl_preview_results, dlpreviewresults),
    (dl_thumbnail_results, dlthumbnailresults),
    (dl_external_link_results, dlexternalresults)
]

for results, outdb in dl_results:
    for r in results:
        # 'dest' is a pathlib.Path; store it as a plain string in MongoDB.
        r['dest'] = str(r['dest'])

    # Replace the previous run's contents wholesale.
    outdb.drop()
    # insert_many rejects an empty document list, so skip it when nothing was produced.
    if results:
        outdb.insert_many(results)

In [None]:
# Tally of download outcomes for the thumbnail run.
Counter(entry['status'] for entry in dl_thumbnail_results)

In [None]:
# Tally of download outcomes for the preview run.
Counter(entry['status'] for entry in dl_preview_results)

In [None]:
# Tally of download outcomes for the external-link run.
Counter(entry['status'] for entry in dl_external_link_results)

In [None]:
# Stored external-link results whose imgur download reported skipped images.
list(dlexternalresults.find({'status': 'imgur_dne'}))