In [1]:
# !python3 -m pip install pyppeteer
# !apt install libxrender1 libxtst6 libxi6

# !pip install ffmpy

# !pip install webp
# !pip uninstall Pillow
# !pip install Pillow

# !pip install piexif

In [2]:
import os
import shutil
from pathlib import Path

In [3]:
# Working directories, both relative to the current working directory:
# the image tree that the conversion passes walk, and the folder that
# receives the original files once they have been converted.
images_dir, recycle_dir = Path('./images'), Path('./recycle')

In [4]:
# Create the recycle folder (and any missing parents) up front; exist_ok
# makes re-running this cell a no-op when the folder is already there.
recycle_dir.mkdir(parents=True, exist_ok=True)

# Convert .svg to .png

Tried svglib, cairosvg and phantomjs; all of them are error-prone (wrong fonts, wrong colors, etc.).

Finally settled on puppeteer: the output images are rendered just as Chrome renders them.

In [5]:
from pyppeteer import launch

In [6]:
async def svg2png(in_svg_path, out_png_path):
    """Render an SVG file to a PNG with a headless browser (pyppeteer).

    The SVG is opened as a ``file://`` page, the viewport is resized to the
    width and height reported by the document's root element, and a
    screenshot of that viewport is written to ``out_png_path``.

    Args:
        in_svg_path: Path of the SVG file to render (relative or absolute).
        out_png_path: Path the PNG screenshot is written to.

    Raises:
        Any error raised while launching the browser, loading the page,
        reading the dimensions or taking the screenshot. The browser is
        closed before the error propagates.
    """
    # as_uri() percent-encodes characters such as spaces, '#' and '?' that
    # would otherwise corrupt a hand-assembled file:// URL.
    svg_url = Path(os.path.abspath(in_svg_path)).as_uri()

    browser = await launch(args=['--no-sandbox'])
    try:
        page = await browser.newPage()
        await page.goto(svg_url)
        # Read the size from the root element so the screenshot matches the
        # drawing rather than the browser's default viewport.
        dimensions = await page.evaluate('''() => {
            return {
                width: document.documentElement.width.baseVal.value,
                height: document.documentElement.height.baseVal.value
            }
        }''')
        await page.setViewport({
            'width': int(dimensions['width']),
            'height': int(dimensions['height']),
        })
        await page.screenshot({'path': out_png_path})
    finally:
        # Always shut the browser down, even when rendering failed, so a bad
        # SVG does not leave a stray browser process behind.
        await browser.close()

In [7]:
for root, dirs, files in os.walk(images_dir):
    for name in files:
        try:
            if name.endswith('.svg'):
                basename, ext = os.path.splitext(name)
                f_in = os.path.join(root, name)
                f_out = os.path.join(root, f"{basename}.png")
                print(f_in)
                await svg2png(f_in, f_out)
                f_mv = recycle_dir/name
                !mv {f_in} {f_mv}
        except Exception as inst:
            print(f"Error converting image: {name} {inst}")

# Convert Imgur .gifv files, which are downloaded as .mp4, to .gif

In [8]:
import ffmpy

In [9]:
# Convert every .mp4 under images_dir to a sibling .gif with ffmpeg, then
# move the original .mp4 into recycle_dir. A failure on one file is reported
# and the walk continues with the next file.
for root, dirs, files in os.walk(images_dir):
    for name in files:
        if not name.endswith('.mp4'):
            continue
        try:
            basename, ext = os.path.splitext(name)
            f_in = os.path.join(root, name)
            f_out = os.path.join(root, f"{basename}.gif")
            print(f_in)
            # Drop a stale output from an earlier run before converting
            # (presumably so ffmpeg does not stop at its overwrite prompt).
            # Done in Python rather than `!rm` so odd file names are safe and
            # a missing file produces no error noise.
            if os.path.exists(f_out):
                os.remove(f_out)
            ffmpy.FFmpeg(inputs={f_in: None}, outputs={f_out: None}).run()
            # shutil.move instead of `!mv`: no shell quoting problems, and a
            # failed move raises instead of passing silently.
            shutil.move(f_in, str(recycle_dir / name))
        except Exception as inst:
            print(f"Error converting video: {name} {inst}")

# Convert .bmp to .png

In [10]:
import imageio

In [11]:
# Re-encode every .bmp under images_dir as a sibling .png, then move the
# original .bmp into recycle_dir. As in the svg/mp4 passes, a failure on one
# file is reported and the walk continues with the next file.
for root, dirs, files in os.walk(images_dir):
    for name in files:
        basename, ext = os.path.splitext(name)
        if ext != '.bmp':
            continue
        f_in = os.path.join(root, name)
        f_out = os.path.join(root, f"{basename}.png")
        print(f_in)
        try:
            im = imageio.imread(f_in)
            imageio.imwrite(f_out, im)
            # shutil.move instead of `!mv`: no shell quoting problems, and a
            # failed move raises instead of passing silently.
            shutil.move(f_in, str(recycle_dir / name))
        except Exception as inst:
            print(f"Error converting image: {name} {inst}")

# Convert .webp to .png

In [12]:
from PIL import Image

In [13]:
# Re-encode every .webp under images_dir as a sibling .png, then move the
# original .webp into recycle_dir. As in the svg/mp4 passes, a failure on one
# file is reported and the walk continues with the next file.
for root, dirs, files in os.walk(images_dir):
    for name in files:
        basename, ext = os.path.splitext(name)
        if ext != '.webp':
            continue
        f_in = os.path.join(root, name)
        f_out = os.path.join(root, f"{basename}.png")
        print(f_in)
        try:
            # The context manager closes the source file before it is moved,
            # so no open handle is left pointing at the recycled file.
            with Image.open(f_in) as im:
                im.save(f_out, 'png')
            # shutil.move instead of `!mv`: no shell quoting problems, and a
            # failed move raises instead of passing silently.
            shutil.move(f_in, str(recycle_dir / name))
        except Exception as inst:
            print(f"Error converting image: {name} {inst}")

# Correct extension

In [14]:
import os

In [15]:
# id: '2w19o1' downloaded from 'http://i.imgur.com/GE7LJR6.pngh' as .pngh

# Extension found on disk -> extension the file is renamed to.
extension_fixes = {'.jpeg': '.jpg', '.pngh': '.png'}

for folder, _subdirs, filenames in os.walk(images_dir):
    for filename in filenames:
        stem, suffix = os.path.splitext(filename)
        if suffix in extension_fixes:
            current_path = os.path.join(folder, filename)
            print(current_path)
            os.rename(current_path, os.path.join(folder, stem) + extension_fixes[suffix])

# Delete invalid file

In [16]:
# id: '4f927p' downloaded from 'http://imgur.com/(null)' as '4f927p(null'

# Permanently remove every file whose name has no extension.
# NOTE(review): os.path.splitext also reports an empty extension for
# dotfiles such as '.gitkeep', so those are removed as well -- confirm
# that is intended.
for folder, _subdirs, filenames in os.walk(images_dir):
    for filename in filenames:
        if os.path.splitext(filename)[1] != '':
            continue
        doomed_path = os.path.join(folder, filename)
        print(doomed_path)
        os.remove(doomed_path)

# Repair corrupted files

In [17]:
import piexif

In [18]:
# import warnings
# warnings.filterwarnings("error")

In [19]:
# Error digesting image: 63027680638.jpg Corrupt EXIF data.  Expecting to read 2 bytes but only got 0.
# im = Image.open('images/preview/tumblr/wtf-viz/63027680638.jpg')
# piexif.remove('images/preview/tumblr/wtf-viz/63027680638.jpg')

In [20]:
# Error digesting image: 63027680638.jpg Corrupt EXIF data.  Expecting to read 2 bytes but only got 0.
# im = Image.open('images/thumbnail/tumblr/wtf-viz/63027680638.jpg')
# piexif.remove('images/thumbnail/tumblr/wtf-viz/63027680638.jpg')

In [21]:
# Reported failure:
#   Error digesting image: 2jtymg.jpg Image size (135364608 pixels) exceeds
#   limit of 89478485 pixels, could be decompression bomb DOS attack.
# File size: 39744830 bytes (~37.9MB).
# The image is simply huge; nothing to repair. Opened here only to inspect it.
im = Image.open('images/archive/reddit/dataisugly/2jtymg.jpg')



In [22]:
# Reported failure:
#   Error digesting image: 9g6pzu.png Image size (100000000 pixels) exceeds
#   limit of 89478485 pixels, could be decompression bomb DOS attack.
# File size: 2208964 bytes (~2.1MB).
# The image is simply huge; nothing to repair. Opened here only to inspect it.
im = Image.open('images/external_link/reddit/dataisugly/9g6pzu.png')



In [23]:
# Reported failure:
#   Error digesting image: anqyqb.png Palette images with Transparency
#   expressed in bytes should be converted to RGBA images
# Happens when converting a transparent .png to grayscale; nothing to do.
# The two lines below repeat that step: open the file, then convert it to
# mode 'L' (grayscale).
im = Image.open('images/preview/reddit/dataisbeautiful/anqyqb.png')
imL = im.convert('L')

  "Palette images with Transparency expressed in bytes should be "
