In [None]:
import os
import toml
import imghdr
from urllib.parse import urlsplit
import requests

# Load the user's configuration. When settings.toml does not exist yet, create
# it from the bundled template and stop so the user can fill it in.
try:
    settings = toml.load("settings.toml")
except FileNotFoundError:
    # Only a missing file triggers the bootstrap. Any other failure (for
    # example a syntax error in an existing settings.toml) propagates, so a
    # file the user has already edited is never overwritten by the template.
    setup = toml.load("settings.template.toml")
    with open("settings.toml", "w") as f:
        f.write(toml.dumps(setup))
    print("Fill settings.toml and try again")
    # quit() is a helper meant for interactive sessions; raising SystemExit
    # stops execution right here with the same exit status.
    raise SystemExit(1) from None

def save_file(url: str, file_path: str):
    """Hand ``file_path`` back unchanged.

    ``url`` is accepted for interface compatibility but is not used here:
    nothing is downloaded or written by this function.
    """
    destination = file_path
    return destination


def get_reddit_media(url: str, settings: dict):
    """Resolve a Reddit-hosted media URL to its path in the media folder.

    The local name is the last component of the URL path. When the URL string
    as a whole carries no extension (i.reddituploads.com links do not have
    one), ``.jpg`` is appended to both the local name and the URL.

    Returns whatever ``save_file`` returns for the (possibly amended) URL and
    ``<media_folder>/<name>``, where ``media_folder`` is read from
    ``settings["media"]``.
    """
    base_name = os.path.basename(urlsplit(url).path)
    has_extension = bool(os.path.splitext(url)[-1])
    if not has_extension:
        # Extension-less link: treat it as a JPEG.
        base_name = base_name + '.jpg'
        url = url + '.jpg'
    target = f'{settings["media"]["media_folder"]}/{base_name}'
    return save_file(url, target)


def get_imgur_image_media(url: str, settings: dict):
    """Retrieves a single image from an Imgur i.imgur.com link

    The local file name is the last component of the URL path and the file is
    placed under ``settings["media"]["media_folder"]`` via ``save_file``.

    Returns:
        The path returned by ``save_file``.

    Raises:
        ValueError: when the file name does not contain ``.jpg`` and the saved
            file is not detected as a GIF. The file is deleted (best effort)
            before the error is raised.
    """
    # .gifv links are requested as .mp4, then the whole URL is lower-cased.
    # NOTE(review): lower() also lower-cases the image ID inside the URL -
    # confirm that this is intended.
    file_url = url.replace(".gifv", ".mp4").lower()
    file_name = os.path.basename(urlsplit(url).path)
    file_path = save_file(file_url, f'{settings["media"]["media_folder"]}/{file_name}')  # Saves the image
    # Finally lets check if the imgur file is not a thumbnail
    if ".jpg" not in file_name and imghdr.what(file_path) != "gif":
        try:
            os.remove(file_path)
        except OSError as e:
            # Deletion is best effort; only filesystem errors are tolerated so
            # KeyboardInterrupt / SystemExit are no longer swallowed here.
            print(f'[EROR] Error while deleting media file: {str(e)}')
        raise ValueError(
            f"Imgur media at {url} is not available as a GIF; the downloaded file was discarded"
        )
    return file_path


def get_imgur_endpoint(url: str, object: str, settings: dict):
    """Retrieves the info of any object/ID pair from the API

    Args:
        url: Link whose last path segment (minus any file extension) is the
            object ID.
        object: API object kind placed in the endpoint path; the callers in
            this file pass "image", "album" or "gallery".
        settings: Configuration dict; ``settings["media"]["imgur_client"]`` is
            sent as the Client-ID.

    Returns:
        The decoded JSON body of the response.

    Raises:
        AssertionError: when the response status code is not 200.
    """
    # Take the ID from the URL *path* only, so that a query string, a fragment
    # or a trailing slash cannot end up in (or blank out) the ID.
    last_segment = urlsplit(url).path.rstrip('/').split('/')[-1]
    object_id = last_segment.split('.')[0]  # drop the file extension, if any
    endpoint = f"https://api.imgur.com/3/{object}/{object_id}"
    response = requests.get(
        endpoint,
        headers={'Authorization': f'Client-ID {settings["media"]["imgur_client"]}'},
        timeout=30
    )
    # Make sure we got a 200 response code. Raised explicitly instead of via
    # an assert statement so the check is not stripped under `python -O`; the
    # exception type and message are unchanged for existing callers.
    if response.status_code != 200:
        raise AssertionError(f"Response code for URL \"{endpoint}\" was {response.status_code}")
    return response.json()


def get_imgur_image(url: str, settings: dict):
    """Look up a single Imgur image through the API and fetch its media.

    Returns the result of ``get_imgur_image_media`` for the link reported by
    the API, or ``None`` when the reported type does not contain "image".
    """
    payload = get_imgur_endpoint(url, "image", settings)
    if "image" not in payload["data"]["type"]:
        return None
    # Hand the direct link from the API response to the image downloader.
    return get_imgur_image_media(payload["data"]["link"], settings)


def get_imgur_album(url: str, settings: dict):
    """Look up an Imgur album through the API and fetch its first image.

    Only the first entry of the album's image list is considered. Returns the
    result of ``get_imgur_image_media`` for that entry's link, or ``None``
    when the entry's type does not contain "image".
    """
    album = get_imgur_endpoint(url, "album", settings)
    if "image" not in album["data"]["images"][0]["type"]:
        return None
    first_link = album["data"]["images"][0]["link"]
    return get_imgur_image_media(first_link, settings)


def get_imgur_gallery(url: str, settings: dict):
    """Resolve an Imgur gallery link to the album or image behind it.

    The gallery entry is fetched from the API; when its type contains "image"
    the same URL is handed to the album handler (``is_album`` truthy) or the
    single-image handler. Any other type yields ``None``.
    """
    entry = get_imgur_endpoint(url, "gallery", settings)
    if "image" not in entry["data"]["type"]:
        return None
    handler = get_imgur_album if entry["data"]["is_album"] else get_imgur_image
    return handler(url, settings)


def get_imgur_media(url: str, settings: dict):
    """Dispatch an Imgur link to the album, gallery or single-image handler.

    The handler is picked from the URL: "/a/" means album, "/gallery/" means
    gallery, anything else is treated as a single image. An empty
    ``settings["media"]["imgur_client"]`` fails the assertion up front.
    """
    assert settings["media"]["imgur_client"] != "", "Imgur client must not be empty"
    if "/a/" in url:
        handler = get_imgur_album
    elif "/gallery/" in url:
        handler = get_imgur_gallery
    else:
        handler = get_imgur_image
    return handler(url, settings)

def get_media(url):
    """Retrieves static images and GIFs from popular image hosts

    Reddit-hosted links ('redd.it', 'reddituploads.com') and Imgur links
    ('imgur.com') are passed to their handlers together with the module-level
    ``settings``.

    Returns:
        The handler's result, or ``None`` when the host is not recognised or
        the handler raised an ordinary exception (a warning is printed in
        that case).
    """
    try:
        if 'redd.it' in url or 'reddituploads.com' in url:  # Reddit-hosted images
            return get_reddit_media(url, settings)
        if 'imgur.com' in url:  # Imgur
            return get_imgur_media(url, settings)
    except Exception as e:
        # Catch Exception rather than BaseException so that KeyboardInterrupt
        # and SystemExit still stop a long-running batch over many URLs.
        print(f"[WARN] Exception occurred: {e}")
    return None

In [None]:
import pandas as pd
import json

# Read the raw metadata JSON into memory as text.
with open("dataset/metadata_feet+feetpics.json", "r") as f:
    data_str = f.read()

# Parse the text and load it into a DataFrame.
data_dict = json.loads(data_str)
df = pd.DataFrame(data_dict)

# Resolve every row's media link (row position 8 - presumably the URL column,
# verify against the dataset) to a local file name; get_media yields None for
# links it cannot handle.
df['file_name'] = [get_media(row[8]) for row in df.values]

In [None]:
import hashlib

def get_md5(uri):
    """Return the hex MD5 digest of the file at ``uri``.

    An empty string is returned when ``uri`` is of a type ``open`` rejects
    (TypeError) or when the file does not exist.
    """
    try:
        with open(uri, 'rb') as handle:
            digest = hashlib.md5(handle.read())
    except (TypeError, FileNotFoundError):
        return ""
    return digest.hexdigest()

# Hash the file referenced at row position 9 (presumably the file_name column
# - verify); get_md5 yields "" for missing or invalid paths.
df['file_hash'] = [get_md5(row[9]) for row in df.values]

In [None]:
# Flag rows whose value at row position 10 (presumably the file_hash column -
# verify) is neither empty nor the one known digest below, which going by the
# column name marks a "not found" placeholder file - confirm.
df['is_not_404_or_empty'] = [
    row[10] not in ("d835884373f4d6c8f24742ceabe74946", "")
    for row in df.values
]

In [None]:
def is_still(uri):
    """Tell whether ``uri`` names a still image (JPEG or PNG) by its extension.

    The match is case-insensitive and requires the dot, so ``photo.JPG``
    counts as a still while a name that merely ends in the letters ``jpg``
    does not.

    Returns:
        True for names ending in .jpg, .jpeg or .png; False otherwise,
        including for values that are not strings (e.g. ``None``).
    """
    # Explicit type check instead of a bare `except:`, which would also have
    # swallowed KeyboardInterrupt and SystemExit.
    if not isinstance(uri, str):
        return False
    return uri.lower().endswith((".jpg", ".jpeg", ".png"))

# Mark rows whose value at row position 9 (presumably the file_name column -
# verify) is recognised by is_still.
df['is_still'] = [is_still(row[9]) for row in df.values]

In [None]:
# Keep only the rows that pass every check: flagged as not-404/non-empty,
# non-null hash, non-null file name, and flagged as a still image.
df_clean = df[
    df['is_not_404_or_empty']
    & df['file_hash'].notna()
    & df['file_name'].notna()
    & df['is_still']
]

In [None]:
# Write the filtered metadata to CSV, leaving the DataFrame index out.
df_clean.to_csv(path_or_buf="dataset/metadata_usable.csv", index=False)