In [1]:
# Install a requests_cache cache named 'web_cache/archive' before any HTTP
# request is made in this notebook.
# NOTE(review): presumably this makes every later `requests` call (including the
# Wayback Machine lookups below) reuse cached responses on re-runs — confirm
# against the requests_cache version in use.
import requests_cache
requests_cache.install_cache('web_cache/archive')

In [2]:
from pymongo import MongoClient
import requests
from pathlib import Path
import pandas as pd
import glob
from PIL import Image
from io import BytesIO
from matplotlib.pyplot import imshow
import numpy as np
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import json
import magic
import time
from collections import Counter

from lib.parallel import parallel

from IPython.display import JSON

In [3]:
# Connect to the MongoDB instance holding the 'bad-vis' database.
# NOTE(review): the host/port are hard-coded; 172.17.0.1 looks like a Docker
# bridge gateway address — confirm, and consider moving it to configuration.
mongo = MongoClient('172.17.0.1', 27017)
db = mongo['bad-vis']
posts = db['posts']
# Result collections that are scanned below for downloads needing recovery.
dlthumbnailresults = db['dlthumbnailresults']
dlpreviewresults = db['dlpreviewresults']
dlexternalresults = db['dlexternalresults']
dlthumbnailaltresults = db['dlthumbnailaltresults']
dlpreviewaltresults = db['dlpreviewaltresults']
dlexternalaltresults = db['dlexternalaltresults']
# Output collection: this notebook drops it and rewrites it with the recovery results.
dlarchiveresults = db['dlarchiveresults']

In [4]:
# Root directory for image files, relative to the notebook's working directory.
# download_image() writes recovered files to images_dir/<outdir>/<post_id><ext>.
images_dir = Path('../images')

# Find URLs that need recovery

In [6]:
# Result statuses that mark a download as failed or unusable, and therefore
# worth retrying through the Wayback Machine.
RECOVERABLE_STATUSES = [
    'not_found',
    'http_error',
    'download_error',
    'invalid_content_type',
    'skip_content_type',
]

# Every result collection that is scanned for recoverable entries.
dbs = [
    dlthumbnailresults,
    dlpreviewresults,
    dlexternalresults,
    dlthumbnailaltresults,
    dlpreviewaltresults,
    dlexternalaltresults
]

recover_downloads = []
# The loop variable is deliberately NOT called `db`: that name is the MongoDB
# database handle defined earlier, and rebinding it here would leave `db`
# pointing at a collection for every later cell.
for collection in dbs:
    # Filter on the server with $in instead of fetching every document and
    # comparing statuses in Python; documents without a 'status' are skipped.
    recover_downloads += list(collection.find({'status': {'$in': RECOVERABLE_STATUSES}}))
len(recover_downloads)

2360

## Also recover invalid images

In [7]:
# imagefiles = db['imagefiles']
# invalid_image_phashes = set(json.load(open('handmade/invalid_image_phashes.json')))

# invalid_images = [{
#     'id': i['id'],
#     'post_id': i['post_id'],
#     'link': posts.find_one({'post_id': i['post_id']})['external_link'],
#     'dest': i['file_path'],
#     'status': 'invalid_image',
#     'request_status': 500
# } for i in imagefiles.find() if i['phash'] in invalid_image_phashes]
# len(invalid_images)

# recover_downloads += invalid_images
# len(recover_downloads)

# Test recovering

In [8]:
# recover_downloads[0]

In [9]:
# r = requests.get(f"https://archive.org/wayback/available?url={recover_downloads[0]['link']}")
# r_json = r.json()
# JSON(r_json)

In [10]:
# url = r_json['archived_snapshots']['closest']['url'].replace('/http', 'if_/http')
# url

In [11]:
# r = requests.get(url)
# pil_image = Image.open(BytesIO(r.content))
# imshow(np.asarray(pil_image))

# Recover

In [12]:
def get_content_type_ext(content_type, req=None):
    """Map a Content-Type value to the file extension used when saving.

    Parameters
    ----------
    content_type : str or None
        Value of the Content-Type header (parameters such as ``; charset=...``
        are tolerated because only the prefix is matched). ``None`` or an
        empty string is treated as an unknown type instead of raising.
    req : optional
        Response object. When it is truthy and the header does not name a
        supported image type, the MIME type is sniffed from ``req.content``
        with ``magic.from_buffer`` and mapped instead of trusting the header.

    Returns
    -------
    str
        One of '.jpg', '.png', '.gif', '.webp', '.svg', '.bmp', '.html',
        '.pdf', or '' when the type is not recognised (a message is printed
        in that case).
    """
    # A response may carry no Content-Type header at all; normalise that to ''
    # so it falls through to content sniffing / "unknown" rather than crashing.
    content_type = (content_type or '').lower()

    image_extensions = (
        ('image/jpeg', '.jpg'),
        ('image/jpg', '.jpg'),
        ('image/png', '.png'),
        ('image/gif', '.gif'),
        ('image/webp', '.webp'),
        ('image/svg', '.svg'),
        ('image/bmp', '.bmp'),
    )
    for prefix, ext in image_extensions:
        if content_type.startswith(prefix):
            return ext

    # Header is not a known image type: when the response is available, decide
    # from the actual bytes. This intentionally runs before the html/pdf checks
    # so a mislabelled image is still recognised.
    if req:
        return get_content_type_ext(magic.from_buffer(req.content, mime=True))

    if content_type.startswith('text/html'):
        return '.html'
    if content_type.startswith('application/pdf'):
        return '.pdf'

    print(f'Unknow content-type: {content_type}')
    return ''

def download_image(post, t='preview', outdir=None):
    """Try to recover one image from the Internet Archive's Wayback Machine.

    Parameters
    ----------
    post : dict
        Record with at least 'id' and 'post_id'. ``post[t]`` holds the
        original URL, either as a string or as a mapping with a 'url' key.
    t : str
        Key in ``post`` that holds the URL to recover.
    outdir : str, optional
        Sub-directory of ``images_dir`` to write into; defaults to ``t``.

    Returns
    -------
    dict
        Always contains 'id', 'post_id', 'link' and 'dest' (destination path
        without extension). The outcome is reported under one of two keys:

        * 'status': 'exists', 'invalid_url', 'invalid_content_type',
          'skip_content_type', 'downloaded' or 'download_error'
        * 'archive_status': 'archive_query_error', 'archive_not_found' or
          'archive_error' (no 'status' key is set in these cases)

        Depending on how far the recovery got, 'archive_link',
        'archive_request_status', 'ext' and 'output' are also present.
    """
    if t in post:
        link = post[t] if isinstance(post[t], str) else post[t]['url']
    else:
        link = ''
    if not outdir:
        outdir = t
    result = {
        'id': post['id'],
        'post_id': post['post_id'],
        'link': link,
        'dest': images_dir/outdir/post['post_id']
    }

    # Any file that starts with the destination path (whatever its extension)
    # counts as already recovered.
    if len(glob.glob(f'{result["dest"]}*')) != 0:
        result['status'] = 'exists'
        return result
    result['dest'].parent.mkdir(parents=True, exist_ok=True)

    if not link or link == 'None' or not isinstance(link, str):
        result['status'] = 'invalid_url'
        return result

    archive_api = 'https://archive.org/wayback/available'
    # Human-readable form of the availability query, used only in log output.
    archive_url = f"{archive_api}?url={link}"
    # Bound before the try so the except handler can always inspect it, even
    # when the very first request raises.
    r = None
    try:
        with requests.Session() as session:
            # Pass the link via `params` so it is percent-encoded; interpolating
            # it raw breaks the query whenever the link contains '&' or '#'.
            r = session.get(archive_api, params={'url': link})
            if r.status_code >= 400:
                result['archive_status'] = 'archive_query_error'
                return result
            r_json = json.loads(r.text)
            if 'closest' not in r_json['archived_snapshots']:
                result['archive_status'] = 'archive_not_found'
                return result

            # Insert 'if_' after the snapshot timestamp to request the raw
            # archived file. Only the first '/http' is rewritten so that a
            # '/http' inside the original URL is left untouched.
            snapshot_url = r_json['archived_snapshots']['closest']['url']
            result['archive_link'] = snapshot_url.replace('/http', 'if_/http', 1)
            r = session.get(result['archive_link'])
            result['archive_request_status'] = r.status_code
            if result['archive_request_status'] >= 400:
                result['archive_status'] = 'archive_error'
                return result

            # Default to '' so a response without a Content-Type header is
            # classified from its bytes instead of raising.
            result['ext'] = get_content_type_ext(r.headers.get('content-type', ''), r)
            if result['ext'] == '':
                result['status'] = 'invalid_content_type'
                print(f'id: {result["id"]} link: {result["link"]}')
                return result
            if result['ext'] == '.html' or result['ext'] == '.pdf':
                result['status'] = 'skip_content_type'
                return result

            result['output'] = f'{result["dest"]}{result["ext"]}'
            with open(result['output'], 'wb') as f:
                f.write(r.content)
            result['status'] = 'downloaded'
            return result
    except Exception as inst:
        # Best-effort recovery: record the failure and keep the batch running.
        status_code = getattr(r, 'status_code', None)
        print(f"Download error: {post['post_id']} {link} {inst} {status_code}")
        print(f"Archive: {post['post_id']} {archive_url}")
        result['status'] = 'download_error'
        return result

In [13]:
# !rm images/archive/reddit/dataisugly/763yfl.png

In [14]:
# !ls -al images/archive/reddit/dataisugly/763yfl.png

In [15]:
# for post in [dlexternalresults.find_one({'id': '763yfl'})]:
#     result = download_image(post, t='link', outdir='archive')
#     print(result)

In [16]:
# Run download_image over every recoverable entry.
# NOTE(review): `parallel` comes from lib.parallel; presumably `params_dict` is
# forwarded to download_image as keyword arguments (URL read from the 'link'
# field, files written under images_dir/'archive') — confirm against the helper.
dl_archive_results = parallel(
    download_image,
    recover_downloads,
    params_dict=dict(t='link', outdir='archive'),
    total=len(recover_downloads),
)

HBox(children=(FloatProgress(value=0.0, max=2360.0), HTML(value='')))

Unknow content-type: application/octet-stream
id: 6896pd link: http://gervise.com/681-2
Unknow content-type: application/octet-stream
id: 3wkmlo link: http://www.unz.com/isteve/how-many-would-emigrate-almost-7-billion-people-live-in-countries-poorer-than-us/
Unknow content-type: application/octet-stream
id: 3fow34 link: http://www.cdc.gov/mmwr/preview/mmwrhtml/mm6032a3.htm
Unknow content-type: text/xml
id: 2waaf6 link: http://www.msn.com/en-us/news/us/map-shows-loudest-and-quietest-places-in-the-us/ar-BBhGetT
Unknow content-type: application/octet-stream
id: 1oxrh5 link: http://www.badfigure.org/



In [17]:
# 'dest' is a pathlib.Path; store it as a plain string in the documents.
for r in dl_archive_results:
    r['dest'] = str(r['dest'])

# Replace the results of any previous run wholesale.
dlarchiveresults.drop()
# insert_many rejects an empty document list, so skip the insert (leaving the
# collection empty) when there was nothing to recover.
if dl_archive_results:
    dlarchiveresults.insert_many(dl_archive_results)

<pymongo.results.InsertManyResult at 0x7ff493b9d140>

In [18]:
# Tally outcomes: an archive-level failure is reported under 'archive_status',
# every other outcome under 'status'.
Counter(
    r['status'] if 'archive_status' not in r else r['archive_status']
    for r in dl_archive_results
)

Counter({'archive_query_error': 557,
         'archive_not_found': 1146,
         'exists': 169,
         'downloaded': 12,
         'skip_content_type': 471,
         'invalid_content_type': 5})