# In [2]:
import os
import tempfile
import multiprocessing
import urllib
import cv2
import pandas as pd
from tqdm import tqdm_notebook

# In [None]:
# Administrative constants
# Directory holding the VSO listing files: process_anps opens files from it
# and the module-level code lists its contents with os.listdir.
vso_dir = './vso/'


def process_anps(file_name):
    """Download the images listed in one adjective-noun-pair (ANP) file.

    The file name is expected to look like ``<adjective>_<noun>.<ext>``.
    Every non-blank line of the file is split on single spaces and its
    second field is used as an image URL.  Each URL is fetched into a
    temporary file, decoded with ``cv2.imread`` and the temporary file is
    removed again.

    Args:
        file_name: Name of a listing file located inside ``vso_dir``.

    Returns:
        A tuple ``(adj, noun, img)``.  ``img`` is whatever ``cv2.imread``
        returned for the last listed URL, or ``None`` when the file lists
        no URLs at all.

    Raises:
        IndexError: If the file name contains no ``_`` or a non-blank line
            has no second space-separated field.
        Exception: Any error raised while downloading propagates unchanged.
    """
    # urlretrieve lives in urllib.request on Python 3 and in urllib on
    # Python 2; import locally so the function works on both.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    # Get the ANP from the file name
    name_parts = file_name.split('_')
    adj = name_parts[0]
    noun = name_parts[1].split('.')[0]

    # Blank lines carry no URL, so skip them instead of failing on them.
    with open(os.path.join(vso_dir, file_name), 'r') as file_handle:
        file_lines = [line for line in file_handle if line.strip()]

    # Stays None when the file lists no images, so the return below never
    # hits an unbound local.
    img = None

    # NOTE(review): only the image from the last line survives the loop;
    # confirm whether every image was meant to be returned.
    for line in tqdm_notebook(file_lines, total=len(file_lines), desc='Images for this ANP'):
        # strip() drops the trailing newline when the URL is the last field.
        url = line.split(' ')[1].strip()

        # Write the image to a temporary file.  mkstemp hands back an open
        # descriptor as well as the path; only the path is needed, so close
        # the descriptor right away to avoid leaking one per image.
        fd, img_path = tempfile.mkstemp()
        os.close(fd)
        try:
            urlretrieve(url, img_path)
            img = cv2.imread(img_path)

        # Delete the temporary file
        finally:
            try:
                os.remove(img_path)
            except OSError:
                # Best-effort cleanup: the file may already be gone.
                pass

    return adj, noun, img


# Get all of the attribute-noun-pairs in Columbia's
# visual sentiment ontology (VSO) dataset: every entry of vso_dir is handed
# to process_anps, spread over a pool of 4 worker processes.
file_names = os.listdir(vso_dir)
pool = multiprocessing.Pool(4)
try:
    iterator = pool.imap(process_anps, file_names)
    # list() drains the iterator, so all results are collected (in input
    # order) before the pool is shut down below.
    data = list(tqdm_notebook(iterator, total=len(file_names), desc='ANPs'))
finally:
    # Always release the worker processes.  On success every result has
    # already been received; on failure the remaining work is abandoned.
    pool.terminate()
    pool.join()

# Parse the information into a dataframe with one row per listing file.
# NOTE(review): the original comment also mentioned saving, but no save
# step is visible in this section.
data_dict = {'Adjectives': [datum[0] for datum in data],
             'Nouns': [datum[1] for datum in data],
             'RGB': [datum[2] for datum in data]}
df = pd.DataFrame(data_dict)